Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 45517, column 39
The result of the left shift is undefined due to shifting by '4294967291', which is greater or equal to the width of type 'int'
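
In C++, a left shift whose (promoted) shift amount is greater than or equal to the bit width of the left operand's type has undefined behavior. 4294967291 is the unsigned wrap-around value of -5, which usually means a shift amount went negative or underflowed before reaching the flagged expression. The sketch below, written under those assumptions, only illustrates this class of diagnostic; it is not the code at line 45517, and the function and variable names are invented.

static int shiftExample(unsigned Amt) {      // e.g. Amt == 3
  return 1 << (Amt - 8);                     // Amt - 8 wraps to 4294967291 (unsigned),
                                             // and shifting an 'int' by >= 32 bits is UB
}
static int shiftExampleGuarded(unsigned Amt) {
  // A range check keeps the shift amount inside [0, 31] and avoids the undefined shift.
  return (Amt >= 8 && Amt - 8 < 32) ? (1 << (Amt - 8)) : 0;
}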

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Absolute difference.
233 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
234 setOperationAction(Op , MVT::i8 , Custom);
235 setOperationAction(Op , MVT::i16 , Custom);
236 setOperationAction(Op , MVT::i32 , Custom);
237 if (Subtarget.is64Bit())
238 setOperationAction(Op , MVT::i64 , Custom);
239 }
240
241 // Signed saturation subtraction.
242 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
243 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
244 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
245 if (Subtarget.is64Bit())
246 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
247
248 // Funnel shifts.
249 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
250 // For slow shld targets we only lower for code size.
251 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
252
253 setOperationAction(ShiftOp , MVT::i8 , Custom);
254 setOperationAction(ShiftOp , MVT::i16 , Custom);
255 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
256 if (Subtarget.is64Bit())
257 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
258 }
259
260 if (!Subtarget.useSoftFloat()) {
261 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
262 // operation.
263 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
264 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
265 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
266 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
267 // We have an algorithm for SSE2, and we turn this into a 64-bit
268 // FILD or VCVTUSI2SS/SD for other targets.
269 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
270 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
271 // We have an algorithm for SSE2->double, and we turn this into a
272 // 64-bit FILD followed by conditional FADD for other targets.
273 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
275
276 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
277 // this operation.
278 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
279 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
280 // SSE has no i16 to fp conversion, only i32. We promote in the handler
281 // to allow f80 to use i16 and f64 to use i16 with sse1 only
282 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
283 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
284 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
285 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
286 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
287 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
288 // are Legal, f80 is custom lowered.
289 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
290 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
291
292 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
293 // this operation.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
295 // FIXME: This doesn't generate invalid exception when it should. PR44019.
296 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
297 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
298 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
299 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
300 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
301 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
302 // are Legal, f80 is custom lowered.
303 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
304 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
305
306 // Handle FP_TO_UINT by promoting the destination to a larger signed
307 // conversion.
308 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
311 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
312 // FIXME: This doesn't generate invalid exception when it should. PR44019.
313 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
314 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
315 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
316 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
317 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
318
319 setOperationAction(ISD::LRINT, MVT::f32, Custom);
320 setOperationAction(ISD::LRINT, MVT::f64, Custom);
321 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
322 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
323
324 if (!Subtarget.is64Bit()) {
325 setOperationAction(ISD::LRINT, MVT::i64, Custom);
326 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
327 }
328 }
329
330 if (Subtarget.hasSSE2()) {
331 // Custom lowering for saturating float to int conversions.
332 // We handle promotion to larger result types manually.
333 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
334 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
335 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
336 }
337 if (Subtarget.is64Bit()) {
338 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
339 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
340 }
341 }
342
343 // Handle address space casts between mixed sized pointers.
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
345 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
346
347 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
348 if (!Subtarget.hasSSE2()) {
349 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
350 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
351 if (Subtarget.is64Bit()) {
352 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
353 // Without SSE, i64->f64 goes through memory.
354 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
355 }
356 } else if (!Subtarget.is64Bit())
357 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
370 setOperationAction(ISD::MULHS, VT, Expand);
371 setOperationAction(ISD::MULHU, VT, Expand);
372 setOperationAction(ISD::SDIV, VT, Expand);
373 setOperationAction(ISD::UDIV, VT, Expand);
374 setOperationAction(ISD::SREM, VT, Expand);
375 setOperationAction(ISD::UREM, VT, Expand);
376 }
377
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
379 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
382 setOperationAction(ISD::BR_CC, VT, Expand);
383 setOperationAction(ISD::SELECT_CC, VT, Expand);
384 }
385 if (Subtarget.is64Bit())
386 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
388 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
389 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
390
391 setOperationAction(ISD::FREM , MVT::f32 , Expand);
392 setOperationAction(ISD::FREM , MVT::f64 , Expand);
393 setOperationAction(ISD::FREM , MVT::f80 , Expand);
394 setOperationAction(ISD::FREM , MVT::f128 , Expand);
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
397 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
398 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
404 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
409 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
410
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
413 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
414 if (Subtarget.is64Bit()) {
415 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
416 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
417 }
418 }
419
420 if (Subtarget.hasLZCNT()) {
421 // When promoting the i8 variants, force them to i32 for a shorter
422 // encoding.
423 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
424 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
425 } else {
426 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
427 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 continue;
429 setOperationAction(ISD::CTLZ , VT, Custom);
430 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
431 }
432 }
433
434 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
435 ISD::STRICT_FP_TO_FP16}) {
436 // Special handling for half-precision floating point conversions.
437 // If we don't have F16C support, then lower half float conversions
438 // into library calls.
439 setOperationAction(
440 Op, MVT::f32,
441 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
442 // There's never any support for operations beyond MVT::f32.
443 setOperationAction(Op, MVT::f64, Expand);
444 setOperationAction(Op, MVT::f80, Expand);
445 setOperationAction(Op, MVT::f128, Expand);
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
454 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
455 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
456 }
457
458 setOperationAction(ISD::PARITY, MVT::i8, Custom);
459 setOperationAction(ISD::PARITY, MVT::i16, Custom);
460 setOperationAction(ISD::PARITY, MVT::i32, Custom);
461 if (Subtarget.is64Bit())
462 setOperationAction(ISD::PARITY, MVT::i64, Custom);
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
469 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
470 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
471 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
472 if (Subtarget.is64Bit())
473 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
474 else
475 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
476 }
477
478 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
479
480 if (!Subtarget.hasMOVBE())
481 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
482
483 // X86 wants to expand cmov itself.
484 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
485 setOperationAction(ISD::SELECT, VT, Custom);
486 setOperationAction(ISD::SETCC, VT, Custom);
487 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
488 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
489 }
490 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
491 if (VT == MVT::i64 && !Subtarget.is64Bit())
492 continue;
493 setOperationAction(ISD::SELECT, VT, Custom);
494 setOperationAction(ISD::SETCC, VT, Custom);
495 }
496
497 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
498 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
499 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
500
501 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
502 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
503 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
505 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
506 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
507 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
508 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
509
510 // Darwin ABI issue.
511 for (auto VT : { MVT::i32, MVT::i64 }) {
512 if (VT == MVT::i64 && !Subtarget.is64Bit())
513 continue;
514 setOperationAction(ISD::ConstantPool , VT, Custom);
515 setOperationAction(ISD::JumpTable , VT, Custom);
516 setOperationAction(ISD::GlobalAddress , VT, Custom);
517 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
518 setOperationAction(ISD::ExternalSymbol , VT, Custom);
519 setOperationAction(ISD::BlockAddress , VT, Custom);
520 }
521
522 // 64-bit shl, sra, srl (iff 32-bit x86)
523 for (auto VT : { MVT::i32, MVT::i64 }) {
524 if (VT == MVT::i64 && !Subtarget.is64Bit())
525 continue;
526 setOperationAction(ISD::SHL_PARTS, VT, Custom);
527 setOperationAction(ISD::SRA_PARTS, VT, Custom);
528 setOperationAction(ISD::SRL_PARTS, VT, Custom);
529 }
530
531 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
532 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
533
534 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
535
536 // Expand certain atomics
537 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
538 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
539 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
540 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
541 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
542 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
544 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
545 }
546
547 if (!Subtarget.is64Bit())
548 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
549
550 if (Subtarget.canUseCMPXCHG16B())
551 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
552
553 // FIXME - use subtarget debug flags
554 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
555 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
556 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
557 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
558 }
559
560 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
561 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
562
563 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
564 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
565
566 setOperationAction(ISD::TRAP, MVT::Other, Legal);
567 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
568 if (Subtarget.isTargetPS())
569 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
570 else
571 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
572
573 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
574 setOperationAction(ISD::VASTART , MVT::Other, Custom);
575 setOperationAction(ISD::VAEND , MVT::Other, Expand);
576 bool Is64Bit = Subtarget.is64Bit();
577 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
578 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
579
580 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
581 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
582
583 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
584
585 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
586 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
587 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
588
589 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
590
591 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
592 setOperationAction(ISD::FABS, VT, Action);
593 setOperationAction(ISD::FNEG, VT, Action);
594 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
595 setOperationAction(ISD::FREM, VT, Action);
596 setOperationAction(ISD::FMA, VT, Action);
597 setOperationAction(ISD::FMINNUM, VT, Action);
598 setOperationAction(ISD::FMAXNUM, VT, Action);
599 setOperationAction(ISD::FMINIMUM, VT, Action);
600 setOperationAction(ISD::FMAXIMUM, VT, Action);
601 setOperationAction(ISD::FSIN, VT, Action);
602 setOperationAction(ISD::FCOS, VT, Action);
603 setOperationAction(ISD::FSINCOS, VT, Action);
604 setOperationAction(ISD::FSQRT, VT, Action);
605 setOperationAction(ISD::FPOW, VT, Action);
606 setOperationAction(ISD::FLOG, VT, Action);
607 setOperationAction(ISD::FLOG2, VT, Action);
608 setOperationAction(ISD::FLOG10, VT, Action);
609 setOperationAction(ISD::FEXP, VT, Action);
610 setOperationAction(ISD::FEXP2, VT, Action);
611 setOperationAction(ISD::FCEIL, VT, Action);
612 setOperationAction(ISD::FFLOOR, VT, Action);
613 setOperationAction(ISD::FNEARBYINT, VT, Action);
614 setOperationAction(ISD::FRINT, VT, Action);
615 setOperationAction(ISD::BR_CC, VT, Action);
616 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::SELECT, VT, Custom);
618 setOperationAction(ISD::SELECT_CC, VT, Action);
619 setOperationAction(ISD::FROUND, VT, Action);
620 setOperationAction(ISD::FROUNDEVEN, VT, Action);
621 setOperationAction(ISD::FTRUNC, VT, Action);
622 };
623
624 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
625 // f16, f32 and f64 use SSE.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
628 : &X86::FR16RegClass);
629 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
630 : &X86::FR32RegClass);
631 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
632 : &X86::FR64RegClass);
633
634 // Disable f32->f64 extload as we can only generate this in one instruction
635 // under optsize. So it's easier to pattern match (fpext (load)) for that
636 // case instead of needing to emit 2 instructions for extload in the
637 // non-optsize case.
638 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
639
640 for (auto VT : { MVT::f32, MVT::f64 }) {
641 // Use ANDPD to simulate FABS.
642 setOperationAction(ISD::FABS, VT, Custom);
643
644 // Use XORP to simulate FNEG.
645 setOperationAction(ISD::FNEG, VT, Custom);
646
647 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
649
650 // These might be better off as horizontal vector ops.
651 setOperationAction(ISD::FADD, VT, Custom);
652 setOperationAction(ISD::FSUB, VT, Custom);
653
654 // We don't support sin/cos/fmod
655 setOperationAction(ISD::FSIN , VT, Expand);
656 setOperationAction(ISD::FCOS , VT, Expand);
657 setOperationAction(ISD::FSINCOS, VT, Expand);
658 }
659
660 // Half type will be promoted by default.
661 setF16Action(MVT::f16, Promote);
662 setOperationAction(ISD::FADD, MVT::f16, Promote);
663 setOperationAction(ISD::FSUB, MVT::f16, Promote);
664 setOperationAction(ISD::FMUL, MVT::f16, Promote);
665 setOperationAction(ISD::FDIV, MVT::f16, Promote);
666 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
667 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
668 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
669
670 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
687 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
688 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
689 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
690 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
691 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
692 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
693 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
694 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
695 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
696 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
697 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
703 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
704 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
715 setOperationAction(ISD::FABS , MVT::f32, Custom);
716
717 // Use XORP to simulate FNEG.
718 setOperationAction(ISD::FNEG , MVT::f32, Custom);
719
720 if (UseX87)
721 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
725 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
726 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
727
728 // We don't support sin/cos/fmod
729 setOperationAction(ISD::FSIN , MVT::f32, Expand);
730 setOperationAction(ISD::FCOS , MVT::f32, Expand);
731 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
735 setOperationAction(ISD::FSIN, MVT::f64, Expand);
736 setOperationAction(ISD::FCOS, MVT::f64, Expand);
737 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
746 setOperationAction(ISD::UNDEF, VT, Expand);
747 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
748
749 // Always expand sin/cos functions even though x87 has an instruction.
750 setOperationAction(ISD::FSIN , VT, Expand);
751 setOperationAction(ISD::FCOS , VT, Expand);
752 setOperationAction(ISD::FSINCOS, VT, Expand);
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations of scalar.
781 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
782 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
783 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
784 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
785 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
786 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
787 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
788 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
789 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
790 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
791 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
792 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
793
794 // We don't support FMA.
795 setOperationAction(ISD::FMA, MVT::f64, Expand);
796 setOperationAction(ISD::FMA, MVT::f32, Expand);
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
801 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
802 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
803 {
804 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
811 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
819 setOperationAction(ISD::FSIN , MVT::f80, Expand);
820 setOperationAction(ISD::FCOS , MVT::f80, Expand);
821 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
822
823 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
824 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
825 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
826 setOperationAction(ISD::FRINT, MVT::f80, Expand);
827 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
828 setOperationAction(ISD::FMA, MVT::f80, Expand);
829 setOperationAction(ISD::LROUND, MVT::f80, Expand);
830 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
831 setOperationAction(ISD::LRINT, MVT::f80, Custom);
832 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
833
834 // Handle constrained floating-point operations of scalar.
835 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
836 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
837 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
838 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
839 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
840 if (isTypeLegal(MVT::f16)) {
841 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
842 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
843 } else {
844 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
845 }
846 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
847 // as Custom.
848 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
849 }
850
851 // f128 uses xmm registers, but most operations require libcalls.
852 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
853 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
854 : &X86::VR128RegClass);
855
856 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
857
858 setOperationAction(ISD::FADD, MVT::f128, LibCall);
859 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
860 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
861 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
862 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
863 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
864 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
866 setOperationAction(ISD::FMA, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
868
869 setOperationAction(ISD::FABS, MVT::f128, Custom);
870 setOperationAction(ISD::FNEG, MVT::f128, Custom);
871 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
872
873 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
874 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
875 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
876 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
877 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
878 // No STRICT_FSINCOS
879 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
880 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
881
882 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
883 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
884 // We need to custom handle any FP_ROUND with an f128 input, but
885 // LegalizeDAG uses the result type to know when to run a custom handler.
886 // So we have to list all legal floating point result types here.
887 if (isTypeLegal(MVT::f32)) {
888 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
889 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
890 }
891 if (isTypeLegal(MVT::f64)) {
892 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
893 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
894 }
895 if (isTypeLegal(MVT::f80)) {
896 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
897 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
898 }
899
900 setOperationAction(ISD::SETCC, MVT::f128, Custom);
901
902 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
905 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
908 }
909
910 // Always use a library call for pow.
911 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
912 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
915
916 setOperationAction(ISD::FLOG, MVT::f80, Expand);
917 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
918 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
919 setOperationAction(ISD::FEXP, MVT::f80, Expand);
920 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
921 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
922 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
923
924 // Some FP actions are always expanded for vector types.
925 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
926 MVT::v4f32, MVT::v8f32, MVT::v16f32,
927 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
928 setOperationAction(ISD::FSIN, VT, Expand);
929 setOperationAction(ISD::FSINCOS, VT, Expand);
930 setOperationAction(ISD::FCOS, VT, Expand);
931 setOperationAction(ISD::FREM, VT, Expand);
932 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
933 setOperationAction(ISD::FPOW, VT, Expand);
934 setOperationAction(ISD::FLOG, VT, Expand);
935 setOperationAction(ISD::FLOG2, VT, Expand);
936 setOperationAction(ISD::FLOG10, VT, Expand);
937 setOperationAction(ISD::FEXP, VT, Expand);
938 setOperationAction(ISD::FEXP2, VT, Expand);
939 }
940
941 // First set operation action for all vector types to either promote
942 // (for widening) or expand (for scalarization). Then we will selectively
943 // turn on ones that can be effectively codegen'd.
944 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
945 setOperationAction(ISD::SDIV, VT, Expand);
946 setOperationAction(ISD::UDIV, VT, Expand);
947 setOperationAction(ISD::SREM, VT, Expand);
948 setOperationAction(ISD::UREM, VT, Expand);
949 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
950 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
951 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
952 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
953 setOperationAction(ISD::FMA, VT, Expand);
954 setOperationAction(ISD::FFLOOR, VT, Expand);
955 setOperationAction(ISD::FCEIL, VT, Expand);
956 setOperationAction(ISD::FTRUNC, VT, Expand);
957 setOperationAction(ISD::FRINT, VT, Expand);
958 setOperationAction(ISD::FNEARBYINT, VT, Expand);
959 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
960 setOperationAction(ISD::MULHS, VT, Expand);
961 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
962 setOperationAction(ISD::MULHU, VT, Expand);
963 setOperationAction(ISD::SDIVREM, VT, Expand);
964 setOperationAction(ISD::UDIVREM, VT, Expand);
965 setOperationAction(ISD::CTPOP, VT, Expand);
966 setOperationAction(ISD::CTTZ, VT, Expand);
967 setOperationAction(ISD::CTLZ, VT, Expand);
968 setOperationAction(ISD::ROTL, VT, Expand);
969 setOperationAction(ISD::ROTR, VT, Expand);
970 setOperationAction(ISD::BSWAP, VT, Expand);
971 setOperationAction(ISD::SETCC, VT, Expand);
972 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
973 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
974 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
975 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
976 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
977 setOperationAction(ISD::TRUNCATE, VT, Expand);
978 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
979 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
980 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
981 setOperationAction(ISD::SELECT_CC, VT, Expand);
982 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
983 setTruncStoreAction(InnerVT, VT, Expand);
984
985 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
986 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
987
988 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
989 // types, we have to deal with them whether we ask for Expansion or not.
990 // Setting Expand causes its own optimisation problems though, so leave
991 // them legal.
992 if (VT.getVectorElementType() == MVT::i1)
993 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
994
995 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
996 // split/scalarized right now.
997 if (VT.getVectorElementType() == MVT::f16 ||
998 VT.getVectorElementType() == MVT::bf16)
999 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1000 }
1001 }
1002
1003 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1004 // with -msoft-float, disable use of MMX as well.
1005 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1006 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1007 // No operations on x86mmx supported, everything uses intrinsics.
1008 }
1009
1010 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1011 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1012 : &X86::VR128RegClass);
1013
1014 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1015 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1016
1017 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1018 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1019 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1020 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1021 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1022 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1023 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1024 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1025
1026 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1027 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1028
1029 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1030 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1031 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1032 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1033 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1034 }
1035
1036 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1037 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1038 : &X86::VR128RegClass);
1039
1040 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1041 // registers cannot be used even for integer operations.
1042 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1043 : &X86::VR128RegClass);
1044 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045 : &X86::VR128RegClass);
1046 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052
1053 setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom);
1054 setOperationAction(ISD::FMINIMUM, MVT::f64, Custom);
1055
1056 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1057 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1058 setOperationAction(ISD::SDIV, VT, Custom);
1059 setOperationAction(ISD::SREM, VT, Custom);
1060 setOperationAction(ISD::UDIV, VT, Custom);
1061 setOperationAction(ISD::UREM, VT, Custom);
1062 }
1063
1064 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1065 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1066 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1069 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1070 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1071 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1072 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1073 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1074 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1076 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1077 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1078 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1079 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1080
1081 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1082 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1083 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1084
1085 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1086 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1087 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1088
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1091 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1092 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1093 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1094 }
1095
1096 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1097 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1098 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1099
1100 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1101 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1102 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1103 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1104 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1105 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1106 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1107 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1108 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1109 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1110
1111 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1112 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1113 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1114 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1115
1116 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1117 setOperationAction(ISD::SETCC, VT, Custom);
1118 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1119 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1120 setOperationAction(ISD::CTPOP, VT, Custom);
1121 setOperationAction(ISD::ABS, VT, Custom);
1122
1123 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1124 // setcc all the way to isel and prefer SETGT in some isel patterns.
1125 setCondCodeAction(ISD::SETLT, VT, Custom);
1126 setCondCodeAction(ISD::SETLE, VT, Custom);
1127 }
1128
1129 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1130 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1131 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1132 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1133 setOperationAction(ISD::VSELECT, VT, Custom);
1134 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1135 }
1136
1137 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1138 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1139 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1140 setOperationAction(ISD::VSELECT, VT, Custom);
1141
1142 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1143 continue;
1144
1145 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1146 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1147 }
1148 setF16Action(MVT::v8f16, Expand);
1149 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1150 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1151 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1152 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1153
1154 // Custom lower v2i64 and v2f64 selects.
1155 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1156 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1157 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1158 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1159 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1160 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1161
1162 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1163 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1164 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1165 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1167 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1168
1169 // Custom legalize these to avoid over promotion or custom promotion.
1170 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1171 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1172 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1173 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1174 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1175 }
1176
1177 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1178 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1179 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1180 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1181
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1184
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1186 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1187
1188 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1189 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1190 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1191 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1192 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1193
1194 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1195 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1196 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1197 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1198
1199 // We want to legalize this to an f64 load rather than an i64 load on
1200 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1201 // store.
1202 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1203 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1204 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1205 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1206 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1207 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1208
1209 // Add 32-bit vector stores to help vectorization opportunities.
1210 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1212
1213 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1214 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1215 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1216 if (!Subtarget.hasAVX512())
1217 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1218
1219 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1220 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1221 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1222
1223 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1224
1225 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1226 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1227 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1228 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1229 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1230 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1231
1232 // In the customized shift lowering, the legal v4i32/v2i64 cases
1233 // in AVX2 will be recognized.
1234 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1235 setOperationAction(ISD::SRL, VT, Custom);
1236 setOperationAction(ISD::SHL, VT, Custom);
1237 setOperationAction(ISD::SRA, VT, Custom);
1238 if (VT == MVT::v2i64) continue;
1239 setOperationAction(ISD::ROTL, VT, Custom);
1240 setOperationAction(ISD::ROTR, VT, Custom);
1241 setOperationAction(ISD::FSHL, VT, Custom);
1242 setOperationAction(ISD::FSHR, VT, Custom);
1243 }
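// Editorial sketch, not part of the original X86ISelLowering.cpp: how the
// shift actions registered in the loop above are observed later. Generic DAG
// legalization queries the same table and, on Custom, hands the node to
// X86TargetLowering::LowerOperation(). The lambda name is hypothetical and
// the sketch is illustration only.
auto editorSketchIsCustomShift = [&](MVT VT) {
  // True for v16i8/v8i16/v4i32/v2i64 here; AVX2's legal v4i32/v2i64 cases
  // are recognized inside the custom lowering itself, as noted above.
  return getOperationAction(ISD::SHL, VT) == Custom;
};
(void)editorSketchIsCustomShift; // illustration only; never called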
1244
1245 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1246 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1247 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1248 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1249 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1250 }
1251
1252 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1253 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1254 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1255 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1256 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1257 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1258 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1259 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1260 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1261
1262 // These might be better off as horizontal vector ops.
1263 setOperationAction(ISD::ADD, MVT::i16, Custom);
1264 setOperationAction(ISD::ADD, MVT::i32, Custom);
1265 setOperationAction(ISD::SUB, MVT::i16, Custom);
1266 setOperationAction(ISD::SUB, MVT::i32, Custom);
1267 }
1268
1269 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1270 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1271 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1272 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1273 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1274 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1275 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1276 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1277 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1278 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1279 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1280 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1281 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1282 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1283
1284 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1285 }
1286
1287 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1288 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1289 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1290 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1292 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1293 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1294 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1295
1296 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1297 setOperationAction(ISD::ABDS, VT, Custom);
1298 setOperationAction(ISD::ABDU, VT, Custom);
1299 }
1300
1301 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1302 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1303 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1304
1305 // FIXME: Do we need to handle scalar-to-vector here?
1306 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1307 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1308
1309 // We directly match byte blends in the backend as they match the VSELECT
1310 // condition form.
1311 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1312
1313 // SSE41 brings specific instructions for doing vector sign extend even in
1314 // cases where we don't have SRA.
1315 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1316 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1317 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1318 }
1319
1320 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1321 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1322 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1323 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1324 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1325 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1326 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1327 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1328 }
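// Editorial sketch, not part of the original X86ISelLowering.cpp: the
// extending-load table filled in above is what isLoadExtLegal() consults;
// for example a sign-extending v8i8 -> v8i16 load maps onto PMOVSXBW.
// Hypothetical lambda, illustration only.
auto editorSketchHasSExtByteLoad = [&]() {
  return isLoadExtLegal(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8);
};
(void)editorSketchHasSExtByteLoad; // illustration only; never called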
1329
1330 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1331 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1332 // do the pre and post work in the vector domain.
1333 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1334 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1335 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1336 // so that DAG combine doesn't try to turn it into uint_to_fp.
1337 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1338 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1339 }
1340 }
1341
1342 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1343 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1347 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1348 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1349 setOperationAction(ISD::ROTL, VT, Custom);
1350 setOperationAction(ISD::ROTR, VT, Custom);
1351 }
1352
1353 // XOP can efficiently perform BITREVERSE with VPPERM.
1354 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1355 setOperationAction(ISD::BITREVERSE, VT, Custom);
1356
1357 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1358 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1359 setOperationAction(ISD::BITREVERSE, VT, Custom);
1360 }
1361
1362 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1363 bool HasInt256 = Subtarget.hasInt256();
1364
1365 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1366 : &X86::VR256RegClass);
1367 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1368 : &X86::VR256RegClass);
1369 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1370 : &X86::VR256RegClass);
1371 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1372 : &X86::VR256RegClass);
1373 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1374 : &X86::VR256RegClass);
1375 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1376 : &X86::VR256RegClass);
1377 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1378 : &X86::VR256RegClass);
1379
1380 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1381 setOperationAction(ISD::FFLOOR, VT, Legal);
1382 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1383 setOperationAction(ISD::FCEIL, VT, Legal);
1384 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1385 setOperationAction(ISD::FTRUNC, VT, Legal);
1386 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1387 setOperationAction(ISD::FRINT, VT, Legal);
1388 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1389 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1390 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1391 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1392 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1393
1394 setOperationAction(ISD::FROUND, VT, Custom);
1395
1396 setOperationAction(ISD::FNEG, VT, Custom);
1397 setOperationAction(ISD::FABS, VT, Custom);
1398 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1399 }
1400
1401 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1402 // even though v8i16 is a legal type.
1403 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1404 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1405 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1406 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1407 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1408 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1409 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1410
1411 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1412 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1413 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1414 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1415 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1416 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1417
1418 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1419 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1420 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1421 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1422 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1423 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1424 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1425 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1426 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1427 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1428 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1429
1430 if (!Subtarget.hasAVX512())
1431 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1432
1433 // In the customized shift lowering, the legal v8i32/v4i64 cases
1434 // in AVX2 will be recognized.
1435 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1436 setOperationAction(ISD::SRL, VT, Custom);
1437 setOperationAction(ISD::SHL, VT, Custom);
1438 setOperationAction(ISD::SRA, VT, Custom);
1439 setOperationAction(ISD::ABDS, VT, Custom);
1440 setOperationAction(ISD::ABDU, VT, Custom);
1441 if (VT == MVT::v4i64) continue;
1442 setOperationAction(ISD::ROTL, VT, Custom);
1443 setOperationAction(ISD::ROTR, VT, Custom);
1444 setOperationAction(ISD::FSHL, VT, Custom);
1445 setOperationAction(ISD::FSHR, VT, Custom);
1446 }
1447
1448 // These types need custom splitting if their input is a 128-bit vector.
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1450 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1451 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1452 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1453
1454 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1455 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1456 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1457 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1458 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1459 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1460 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1461
1462 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1463 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1464 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1465 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1466 }
1467
1468 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1471 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1472
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1474 setOperationAction(ISD::SETCC, VT, Custom);
1475 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1476 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1477 setOperationAction(ISD::CTPOP, VT, Custom);
1478 setOperationAction(ISD::CTLZ, VT, Custom);
1479
1480 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1481 // setcc all the way to isel and prefer SETGT in some isel patterns.
1482 setCondCodeAction(ISD::SETLT, VT, Custom);
1483 setCondCodeAction(ISD::SETLE, VT, Custom);
1484 }
1485
1486 if (Subtarget.hasAnyFMA()) {
1487 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1488 MVT::v2f64, MVT::v4f64 }) {
1489 setOperationAction(ISD::FMA, VT, Legal);
1490 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1491 }
1492 }
1493
1494 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1495 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1496 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1497 }
1498
1499 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1500 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1501 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1502 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1503
1504 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1505 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1506 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1509 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1510 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1512
1513 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1514 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1515
1516 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1517 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1518 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1519 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1520 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1521
1522 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1525 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1526 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1527 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1528 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1529 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1530 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1531 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1532 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1533 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1534
1535 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1536 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1537 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1538 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1539 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1544 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1545 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1546 }
1547
1548 if (HasInt256) {
1549 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1550 // when we have a 256-bit wide blend with immediate.
1551 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1552 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1553
1554 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1555 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1556 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1557 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1558 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1559 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1560 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1561 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1562 }
1563 }
1564
1565 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1566 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1567 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1568 setOperationAction(ISD::MSTORE, VT, Legal);
1569 }
1570
1571 // Extract subvector is special because the value type
1572 // (result) is 128-bit but the source is 256-bit wide.
1573 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1574 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1575 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1576 }
1577
1578 // Custom lower several nodes for 256-bit types.
1579 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1580 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1581 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1582 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1583 setOperationAction(ISD::VSELECT, VT, Custom);
1584 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1587 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1588 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1589 setOperationAction(ISD::STORE, VT, Custom);
1590 }
1591 setF16Action(MVT::v16f16, Expand);
1592 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1593 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1594 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1595 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1596
1597 if (HasInt256) {
1598 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1599
1600 // Custom legalize 2x32 to get a little better code.
1601 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1602 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1603
1604 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1605 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1606 setOperationAction(ISD::MGATHER, VT, Custom);
1607 }
1608 }
1609
1610 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1611 Subtarget.hasF16C()) {
1612 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1613 setOperationAction(ISD::FP_ROUND, VT, Custom);
1614 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1615 }
1616 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1617 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1618 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1619 }
1620 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1621 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1622 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1623 }
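// Editorial sketch, not part of the original X86ISelLowering.cpp:
// setOperationPromotedToType records both the Promote action and the wider
// type, so a v8f16 FADD is widened to v8f32, performed there (VCVTPH2PS /
// VADDPS / VCVTPS2PH under F16C), and rounded back. Hypothetical lambda,
// illustration only.
auto editorSketchHalfAddPromotesTo = [&]() {
  return getTypeToPromoteTo(ISD::FADD, MVT::v8f16); // MVT::v8f32
};
(void)editorSketchHalfAddPromotesTo; // illustration only; never called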
1624
1625 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1626 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1627 }
1628
1629 // This block controls legalization of the mask vector sizes that are
1630 // available with AVX512. 512-bit vectors are in a separate block controlled
1631 // by useAVX512Regs.
1632 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1633 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1634 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1635 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1636 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1637 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1638
1639 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1640 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1641 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1642
1643 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1644 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1645 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1646 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1647 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1648 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1649 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1650 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1651 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1652 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1653 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1654 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1655
1656 // There is no byte-sized k-register load or store without AVX512DQ.
1657 if (!Subtarget.hasDQI()) {
1658 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1659 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1660 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1661 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1662
1663 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1664 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1665 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1666 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1667 }
1668
1669 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1670 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1671 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1672 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1673 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1674 }
1675
1676 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1677 setOperationAction(ISD::VSELECT, VT, Expand);
1678
1679 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1682 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1683 setOperationAction(ISD::SELECT, VT, Custom);
1684 setOperationAction(ISD::TRUNCATE, VT, Custom);
1685
1686 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1687 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1688 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1689 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1690 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1691 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1692 }
1693
1694 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1695 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1696 }
1697
1698 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1699 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1700 // required-vector-width function attributes.
1701 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1702 bool HasBWI = Subtarget.hasBWI();
1703
1704 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1705 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1706 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1707 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1708 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1709 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1710 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1711
1712 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1713 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1714 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1715 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1716 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1717 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1718 if (HasBWI)
1719 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1720 }
1721
1722 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1723 setOperationAction(ISD::FNEG, VT, Custom);
1724 setOperationAction(ISD::FABS, VT, Custom);
1725 setOperationAction(ISD::FMA, VT, Legal);
1726 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1727 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1728 }
1729
1730 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1731 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1732 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1733 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1734 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1735 }
1736
1737 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1738 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1739 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1742 }
1743
1744 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1745 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1746 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1747 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1748 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1749 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1750
1751 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1752 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1753 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1754 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1755 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1756 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1757 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1758 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1759 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1760 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1761 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1762
1763 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1764 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1765 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1766 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1767 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1768 if (HasBWI)
1769 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1770
1771 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1772 // to 512-bit rather than use the AVX2 instructions so that we can use
1773 // k-masks.
1774 if (!Subtarget.hasVLX()) {
1775 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1776 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1777 setOperationAction(ISD::MLOAD, VT, Custom);
1778 setOperationAction(ISD::MSTORE, VT, Custom);
1779 }
1780 }
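// Editorial sketch, not part of the original X86ISelLowering.cpp: on
// subtargets without VLX the masked-load action registered just above is
// Custom, and the custom lowering widens the operation to 512 bits so a
// k-register mask can be used; with VLX the earlier AVX block already made
// it Legal. Hypothetical lambda, illustration only.
auto editorSketchMaskedLoadIsWidened = [&]() {
  return !Subtarget.hasVLX() &&
         getOperationAction(ISD::MLOAD, MVT::v8f32) == Custom;
};
(void)editorSketchMaskedLoadIsWidened; // illustration only; never called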
1781
1782 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1783 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1784 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1785 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1786 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1787 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1788 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1789 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1790 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1791 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1792 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1794 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1795
1796 if (HasBWI) {
1797 // Extends from v64i1 masks to 512-bit vectors.
1798 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1799 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1800 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1801 }
1802
1803 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1804 setOperationAction(ISD::FFLOOR, VT, Legal);
1805 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1806 setOperationAction(ISD::FCEIL, VT, Legal);
1807 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1808 setOperationAction(ISD::FTRUNC, VT, Legal);
1809 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1810 setOperationAction(ISD::FRINT, VT, Legal);
1811 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1812 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1813 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1814 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1815 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1816
1817 setOperationAction(ISD::FROUND, VT, Custom);
1818 }
1819
1820 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1821 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1822 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1823 }
1824
1825 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1827 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1828 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1829
1830 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1831 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1832 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1833 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1834
1835 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1836 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1837 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1839 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1840 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1841 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1842 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1843
1844 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1845 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1846
1847 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1848
1849 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1850 setOperationAction(ISD::SRL, VT, Custom);
1851 setOperationAction(ISD::SHL, VT, Custom);
1852 setOperationAction(ISD::SRA, VT, Custom);
1853 setOperationAction(ISD::ROTL, VT, Custom);
1854 setOperationAction(ISD::ROTR, VT, Custom);
1855 setOperationAction(ISD::SETCC, VT, Custom);
1856 setOperationAction(ISD::ABDS, VT, Custom);
1857 setOperationAction(ISD::ABDU, VT, Custom);
1858
1859 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1860 // setcc all the way to isel and prefer SETGT in some isel patterns.
1861 setCondCodeAction(ISD::SETLT, VT, Custom);
1862 setCondCodeAction(ISD::SETLE, VT, Custom);
1863 }
1864 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1865 setOperationAction(ISD::SMAX, VT, Legal);
1866 setOperationAction(ISD::UMAX, VT, Legal);
1867 setOperationAction(ISD::SMIN, VT, Legal);
1868 setOperationAction(ISD::UMIN, VT, Legal);
1869 setOperationAction(ISD::ABS, VT, Legal);
1870 setOperationAction(ISD::CTPOP, VT, Custom);
1871 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1872 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1873 }
1874
1875 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1876 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1877 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1878 setOperationAction(ISD::CTLZ, VT, Custom);
1879 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1880 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1884 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1885 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1886 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1887 }
1888
1889 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1890 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1891 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1892 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1893 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1894 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1895
1896 if (Subtarget.hasDQI()) {
1897 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1898 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1899 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1900 setOperationAction(Opc, MVT::v8i64, Custom);
1901 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1902 }
1903
1904 if (Subtarget.hasCDI()) {
1905 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1906 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1907 setOperationAction(ISD::CTLZ, VT, Legal);
1908 }
1909 } // Subtarget.hasCDI()
1910
1911 if (Subtarget.hasVPOPCNTDQ()) {
1912 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915
1916 // Extract subvector is special because the value type
1917 // (result) is 256-bit but the source is 512-bit wide.
1918 // 128-bit extracts were already made Legal under AVX1.
1919 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1920 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1921 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1922
1923 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1924 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1925 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1926 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1927 setOperationAction(ISD::SELECT, VT, Custom);
1928 setOperationAction(ISD::VSELECT, VT, Custom);
1929 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1931 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1932 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1933 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1934 }
1935 setF16Action(MVT::v32f16, Expand);
1936 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1937 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1938 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1939 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1940 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1941 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1942 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1943 }
1944
1945 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1946 setOperationAction(ISD::MLOAD, VT, Legal);
1947 setOperationAction(ISD::MSTORE, VT, Legal);
1948 setOperationAction(ISD::MGATHER, VT, Custom);
1949 setOperationAction(ISD::MSCATTER, VT, Custom);
1950 }
1951 if (HasBWI) {
1952 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1953 setOperationAction(ISD::MLOAD, VT, Legal);
1954 setOperationAction(ISD::MSTORE, VT, Legal);
1955 }
1956 } else {
1957 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1958 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1959 }
1960
1961 if (Subtarget.hasVBMI2()) {
1962 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1963 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1964 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965 setOperationAction(ISD::FSHL, VT, Custom);
1966 setOperationAction(ISD::FSHR, VT, Custom);
1967 }
1968
1969 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1970 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1971 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1972 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1973 }
1974 } // useAVX512Regs
1975
1976 // This block controls legalization for operations that don't have
1977 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1978 // narrower widths.
1979 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1980 // These operations are handled on non-VLX by artificially widening in
1981 // isel patterns.
1982
1983 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1984 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1985 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1986
1987 if (Subtarget.hasDQI()) {
1988 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1989 // v2f32 UINT_TO_FP is already custom under SSE2.
1990 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1991 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1992 "Unexpected operation action!");
1993 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1995 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1996 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1998 }
1999
2000 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2001 setOperationAction(ISD::SMAX, VT, Legal);
2002 setOperationAction(ISD::UMAX, VT, Legal);
2003 setOperationAction(ISD::SMIN, VT, Legal);
2004 setOperationAction(ISD::UMIN, VT, Legal);
2005 setOperationAction(ISD::ABS, VT, Legal);
2006 }
2007
2008 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2009 setOperationAction(ISD::ROTL, VT, Custom);
2010 setOperationAction(ISD::ROTR, VT, Custom);
2011 }
2012
2013 // Custom legalize 2x32 to get a little better code.
2014 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2015 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2016
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2018 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2019 setOperationAction(ISD::MSCATTER, VT, Custom);
2020
2021 if (Subtarget.hasDQI()) {
2022 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2023 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2024 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2025 setOperationAction(Opc, MVT::v2i64, Custom);
2026 setOperationAction(Opc, MVT::v4i64, Custom);
2027 }
2028 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2029 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2030 }
2031
2032 if (Subtarget.hasCDI()) {
2033 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2034 setOperationAction(ISD::CTLZ, VT, Legal);
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2040 setOperationAction(ISD::CTPOP, VT, Legal);
2041 }
2042 }
2043
2044 // This block controls legalization of v32i1/v64i1, which are available with
2045 // AVX512BW.
2046 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2047 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2048 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2049
2050 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2051 setOperationAction(ISD::VSELECT, VT, Expand);
2052 setOperationAction(ISD::TRUNCATE, VT, Custom);
2053 setOperationAction(ISD::SETCC, VT, Custom);
2054 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2055 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2056 setOperationAction(ISD::SELECT, VT, Custom);
2057 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2058 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2059 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2060 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2061 }
2062
2063 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2064 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2065
2066 // Extends from v32i1 masks to 256-bit vectors.
2067 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2068 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2069 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2070
2071 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2072 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2073 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2074 }
2075
2076 // These operations are handled on non-VLX by artificially widening in
2077 // isel patterns.
2078 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2079
2080 if (Subtarget.hasBITALG()) {
2081 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2082 setOperationAction(ISD::CTPOP, VT, Legal);
2083 }
2084 }
2085
2086 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2087 auto setGroup = [&] (MVT VT) {
2088 setOperationAction(ISD::FADD, VT, Legal);
2089 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2090 setOperationAction(ISD::FSUB, VT, Legal);
2091 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2092 setOperationAction(ISD::FMUL, VT, Legal);
2093 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2094 setOperationAction(ISD::FDIV, VT, Legal);
2095 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2096 setOperationAction(ISD::FSQRT, VT, Legal);
2097 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2098
2099 setOperationAction(ISD::FFLOOR, VT, Legal);
2100 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2101 setOperationAction(ISD::FCEIL, VT, Legal);
2102 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2103 setOperationAction(ISD::FTRUNC, VT, Legal);
2104 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2105 setOperationAction(ISD::FRINT, VT, Legal);
2106 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2107 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2108 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2109
2110 setOperationAction(ISD::FROUND, VT, Custom);
2111
2112 setOperationAction(ISD::LOAD, VT, Legal);
2113 setOperationAction(ISD::STORE, VT, Legal);
2114
2115 setOperationAction(ISD::FMA, VT, Legal);
2116 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2117 setOperationAction(ISD::VSELECT, VT, Legal);
2118 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2119 setOperationAction(ISD::SELECT, VT, Custom);
2120
2121 setOperationAction(ISD::FNEG, VT, Custom);
2122 setOperationAction(ISD::FABS, VT, Custom);
2123 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2124 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2125 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2126 };
2127
2128 // AVX512_FP16 scalar operations
2129 setGroup(MVT::f16);
2130 setOperationAction(ISD::FREM, MVT::f16, Promote);
2131 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2132 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2133 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2134 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2135 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2136 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2137 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2138 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2139 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2142 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2143 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2144 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2146
2147 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2148 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2149
2150 if (Subtarget.useAVX512Regs()) {
2151 setGroup(MVT::v32f16);
2152 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2153 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2155 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2156 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2157 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2158 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2159 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2160 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2161 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2162 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2164
2165 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2166 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2167 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2168 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2169 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2170 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2171 MVT::v32i16);
2172 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2173 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2174 MVT::v32i16);
2175 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2176 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2177 MVT::v32i16);
2178 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2179 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2180 MVT::v32i16);
2181
2182 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2183 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2184 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2185
2186 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2187 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2188
2189 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2190 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2191 }
2192
2193 if (Subtarget.hasVLX()) {
2194 setGroup(MVT::v8f16);
2195 setGroup(MVT::v16f16);
2196
2197 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2198 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2199 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2200 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2201 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2202 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2203 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2204 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2205 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2206 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2207
2208 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2209 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2210 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2211 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2212 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2213 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2214 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2215 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2216 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2217 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2218
2219 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2220 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2221 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2222
2223 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2224 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2225 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2226
2227 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2228 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2229 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2231
2232 // Need to custom widen these to prevent scalarization.
2233 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2234 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2235 }
2236 }
2237
2238 if (!Subtarget.useSoftFloat() &&
2239 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2240 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2241 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2242 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2243 // provide the method to promote BUILD_VECTOR. Set the operation action to
2244 // Custom to do the customization later.
2245 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2246 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2247 setF16Action(VT, Expand);
2248 setOperationAction(ISD::FADD, VT, Expand);
2249 setOperationAction(ISD::FSUB, VT, Expand);
2250 setOperationAction(ISD::FMUL, VT, Expand);
2251 setOperationAction(ISD::FDIV, VT, Expand);
2252 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2253 }
2254 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2255 }
2256
2257 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2258 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2259 setF16Action(MVT::v32bf16, Expand);
2260 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2261 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2262 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2263 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2264 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2265 }
2266
2267 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2268 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2269 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2270 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2271 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2272 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2273
2274 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2275 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2276 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2277 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2278 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2279
2280 if (Subtarget.hasBWI()) {
2281 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2282 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2283 }
2284
2285 if (Subtarget.hasFP16()) {
2286 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2287 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2288 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2289 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2290 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2291 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2292 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2293 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2295 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2296 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2297 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2298 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2299 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2300 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2301 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2302 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2303 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2304 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2305 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2306 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2307 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2308 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2309 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2310 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2311 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2312 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2313 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2314 }
2315
2316 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2317 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2318 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2319 }
2320
2321 if (Subtarget.hasAMXTILE()) {
2322 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2323 }
2324
2325 // We want to custom lower some of our intrinsics.
2326 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2327 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2328 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2329 if (!Subtarget.is64Bit()) {
2330 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2331 }
2332
2333 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2334 // handle type legalization for these operations here.
2335 //
2336 // FIXME: We really should do custom legalization for addition and
2337 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2338 // than generic legalization for 64-bit multiplication-with-overflow, though.
2339 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2340 if (VT == MVT::i64 && !Subtarget.is64Bit())
2341 continue;
2342 // Add/Sub/Mul with overflow operations are custom lowered.
2343 setOperationAction(ISD::SADDO, VT, Custom);
2344 setOperationAction(ISD::UADDO, VT, Custom);
2345 setOperationAction(ISD::SSUBO, VT, Custom);
2346 setOperationAction(ISD::USUBO, VT, Custom);
2347 setOperationAction(ISD::SMULO, VT, Custom);
2348 setOperationAction(ISD::UMULO, VT, Custom);
2349
2350 // Support carry in as value rather than glue.
2351 setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2352 setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2353 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2354 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2355 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2356 }
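// Editorial sketch, not part of the original X86ISelLowering.cpp: the kind of
// source construct that reaches the overflow nodes handled above. Clang
// lowers __builtin_add_overflow to llvm.sadd.with.overflow, SelectionDAG
// turns that into ISD::SADDO, and the custom lowering above maps it onto an
// add that reuses EFLAGS. Hypothetical lambda, illustration only.
auto editorSketchAddOverflows = [](int A, int B) {
  int Out;
  return __builtin_add_overflow(A, B, &Out); // becomes ISD::SADDO for i32
};
(void)editorSketchAddOverflows; // illustration only; never called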
2357
2358 if (!Subtarget.is64Bit()) {
2359 // These libcalls are not available in 32-bit.
2360 setLibcallName(RTLIB::SHL_I128, nullptr);
2361 setLibcallName(RTLIB::SRL_I128, nullptr);
2362 setLibcallName(RTLIB::SRA_I128, nullptr);
2363 setLibcallName(RTLIB::MUL_I128, nullptr);
2364 // The MULO libcall is not part of libgcc, only compiler-rt.
2365 setLibcallName(RTLIB::MULO_I64, nullptr);
2366 }
2367 // The MULO libcall is not part of libgcc, only compiler-rt.
2368 setLibcallName(RTLIB::MULO_I128, nullptr);
2369
2370 // Combine sin / cos into _sincos_stret if it is available.
2371 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2372 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2373 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2374 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2375 }
2376
2377 if (Subtarget.isTargetWin64()) {
2378 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2379 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2380 setOperationAction(ISD::SREM, MVT::i128, Custom);
2381 setOperationAction(ISD::UREM, MVT::i128, Custom);
2382 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2383 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2384 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2385 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2386 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2387 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2388 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2389 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2390 }
2391
2392 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2393 // is. We should promote the value to 64-bits to solve this.
2394 // This is what the CRT headers do - `fmodf` is an inline header
2395 // function casting to f64 and calling `fmod`.
2396 if (Subtarget.is32Bit() &&
2397 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2398 for (ISD::NodeType Op :
2399 {ISD::FCEIL, ISD::STRICT_FCEIL,
2400 ISD::FCOS, ISD::STRICT_FCOS,
2401 ISD::FEXP, ISD::STRICT_FEXP,
2402 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2403 ISD::FREM, ISD::STRICT_FREM,
2404 ISD::FLOG, ISD::STRICT_FLOG,
2405 ISD::FLOG10, ISD::STRICT_FLOG10,
2406 ISD::FPOW, ISD::STRICT_FPOW,
2407 ISD::FSIN, ISD::STRICT_FSIN})
2408 if (isOperationExpand(Op, MVT::f32))
2409 setOperationAction(Op, MVT::f32, Promote);
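// A minimal sketch of the CRT-style promotion described in the comment above,
// using a hypothetical shim rather than the actual MSVC header contents:
//
//   static inline float fmodf_shim(float x, float y) {
//     // Widen to f64, use the double-precision libcall, round back to f32.
//     return (float)fmod((double)x, (double)y);
//   }
//
// Marking the f32 operation as Promote makes the legalizer perform the same
// f32 -> f64 -> f32 round trip, so only the f64 libcall is ever needed.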
2410
2411 // We have target-specific dag combine patterns for the following nodes:
2412 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2413 ISD::SCALAR_TO_VECTOR,
2414 ISD::INSERT_VECTOR_ELT,
2415 ISD::EXTRACT_VECTOR_ELT,
2416 ISD::CONCAT_VECTORS,
2417 ISD::INSERT_SUBVECTOR,
2418 ISD::EXTRACT_SUBVECTOR,
2419 ISD::BITCAST,
2420 ISD::VSELECT,
2421 ISD::SELECT,
2422 ISD::SHL,
2423 ISD::SRA,
2424 ISD::SRL,
2425 ISD::OR,
2426 ISD::AND,
2427 ISD::ADD,
2428 ISD::FADD,
2429 ISD::FSUB,
2430 ISD::FNEG,
2431 ISD::FMA,
2432 ISD::STRICT_FMA,
2433 ISD::FMINNUM,
2434 ISD::FMAXNUM,
2435 ISD::SUB,
2436 ISD::LOAD,
2437 ISD::MLOAD,
2438 ISD::STORE,
2439 ISD::MSTORE,
2440 ISD::TRUNCATE,
2441 ISD::ZERO_EXTEND,
2442 ISD::ANY_EXTEND,
2443 ISD::SIGN_EXTEND,
2444 ISD::SIGN_EXTEND_INREG,
2445 ISD::ANY_EXTEND_VECTOR_INREG,
2446 ISD::SIGN_EXTEND_VECTOR_INREG,
2447 ISD::ZERO_EXTEND_VECTOR_INREG,
2448 ISD::SINT_TO_FP,
2449 ISD::UINT_TO_FP,
2450 ISD::STRICT_SINT_TO_FP,
2451 ISD::STRICT_UINT_TO_FP,
2452 ISD::SETCC,
2453 ISD::MUL,
2454 ISD::XOR,
2455 ISD::MSCATTER,
2456 ISD::MGATHER,
2457 ISD::FP16_TO_FP,
2458 ISD::FP_EXTEND,
2459 ISD::STRICT_FP_EXTEND,
2460 ISD::FP_ROUND,
2461 ISD::STRICT_FP_ROUND});
2462
2463 computeRegisterProperties(Subtarget.getRegisterInfo());
2464
2465 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2466 MaxStoresPerMemsetOptSize = 8;
2467 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2468 MaxStoresPerMemcpyOptSize = 4;
2469 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2470 MaxStoresPerMemmoveOptSize = 4;
2471
2472 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2473 // that needs to be benchmarked and balanced with the potential use of vector
2474 // load/store types (PR33329, PR33914).
2475 MaxLoadsPerMemcmp = 2;
2476 MaxLoadsPerMemcmpOptSize = 2;
2477
2478 // Default loop alignment, which can be overridden by -align-loops.
2479 setPrefLoopAlignment(Align(16));
2480
2481 // An out-of-order CPU can speculatively execute past a predictable branch,
2482 // but a conditional move could be stalled by an expensive earlier operation.
2483 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2484 EnableExtLdPromotion = true;
2485 setPrefFunctionAlignment(Align(16));
2486
2487 verifyIntrinsicTables();
2488
2489 // Default to having -disable-strictnode-mutation on
2490 IsStrictFPEnabled = true;
2491}
2492
2493// This has so far only been implemented for 64-bit MachO.
2494bool X86TargetLowering::useLoadStackGuardNode() const {
2495 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2496}
2497
2498bool X86TargetLowering::useStackGuardXorFP() const {
2499 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2500 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2501}
2502
2503SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2504 const SDLoc &DL) const {
2505 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2506 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2507 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2508 return SDValue(Node, 0);
2509}
2510
2511TargetLoweringBase::LegalizeTypeAction
2512X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2513 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2514 !Subtarget.hasBWI())
2515 return TypeSplitVector;
2516
2517 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2518 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2519 return TypeSplitVector;
2520
2521 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2522 VT.getVectorElementType() != MVT::i1)
2523 return TypeWidenVector;
2524
2525 return TargetLoweringBase::getPreferredVectorAction(VT);
2526}
2527
2528static std::pair<MVT, unsigned>
2529handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2530 const X86Subtarget &Subtarget) {
2531 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2532 // convention is one that uses k registers.
2533 if (NumElts == 2)
2534 return {MVT::v2i64, 1};
2535 if (NumElts == 4)
2536 return {MVT::v4i32, 1};
2537 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2538 CC != CallingConv::Intel_OCL_BI)
2539 return {MVT::v8i16, 1};
2540 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2541 CC != CallingConv::Intel_OCL_BI)
2542 return {MVT::v16i8, 1};
2543 // v32i1 passes in ymm unless we have BWI and the calling convention is
2544 // regcall.
2545 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2546 return {MVT::v32i8, 1};
2547 // Split v64i1 vectors if we don't have v64i8 available.
2548 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2549 if (Subtarget.useAVX512Regs())
2550 return {MVT::v64i8, 1};
2551 return {MVT::v32i8, 2};
2552 }
2553
2554 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2555 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2556 NumElts > 64)
2557 return {MVT::i8, NumElts};
2558
2559 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2560}
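// Illustrative results of the mapping above, assuming an AVX512 target and the
// default C calling convention (not regcall / Intel_OCL_BI):
//   v2i1  -> one v2i64, v4i1 -> one v4i32 (XMM)
//   v8i1  -> one v8i16, v16i1 -> one v16i8 (XMM)
//   v32i1 -> one v32i8 (YMM)
//   v64i1 -> one v64i8 with 512-bit registers enabled, two v32i8 with AVX512BW
//            but a 256-bit preference, and 64 i8 scalars without AVX512BW
//   v7i1  -> seven i8 scalars (non-power-of-two masks are scalarized)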
2561
2562MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2563 CallingConv::ID CC,
2564 EVT VT) const {
2565 if (VT.isVector()) {
2566 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2567 unsigned NumElts = VT.getVectorNumElements();
2568
2569 MVT RegisterVT;
2570 unsigned NumRegisters;
2571 std::tie(RegisterVT, NumRegisters) =
2572 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2573 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2574 return RegisterVT;
2575 }
2576
2577 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2578 return MVT::v8f16;
2579 }
2580
2581 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2582 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2583 !Subtarget.hasX87())
2584 return MVT::i32;
2585
2586 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2587 return getRegisterTypeForCallingConv(Context, CC,
2588 VT.changeVectorElementTypeToInteger());
2589
2590 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2591}
2592
2593unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2594 CallingConv::ID CC,
2595 EVT VT) const {
2596 if (VT.isVector()) {
2597 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2598 unsigned NumElts = VT.getVectorNumElements();
2599
2600 MVT RegisterVT;
2601 unsigned NumRegisters;
2602 std::tie(RegisterVT, NumRegisters) =
2603 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2604 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2605 return NumRegisters;
2606 }
2607
2608 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2609 return 1;
2610 }
2611
2612 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2613 // x87 is disabled.
2614 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2615 if (VT == MVT::f64)
2616 return 2;
2617 if (VT == MVT::f80)
2618 return 3;
2619 }
2620
2621 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2622 return getNumRegistersForCallingConv(Context, CC,
2623 VT.changeVectorElementTypeToInteger());
2624
2625 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2626}
2627
2628unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2629 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2630 unsigned &NumIntermediates, MVT &RegisterVT) const {
2631 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2632 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2633 Subtarget.hasAVX512() &&
2634 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2635 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2636 VT.getVectorNumElements() > 64)) {
2637 RegisterVT = MVT::i8;
2638 IntermediateVT = MVT::i1;
2639 NumIntermediates = VT.getVectorNumElements();
2640 return NumIntermediates;
2641 }
2642
2643 // Split v64i1 vectors if we don't have v64i8 available.
2644 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2645 CC != CallingConv::X86_RegCall) {
2646 RegisterVT = MVT::v32i8;
2647 IntermediateVT = MVT::v32i1;
2648 NumIntermediates = 2;
2649 return 2;
2650 }
2651
2652 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2653 NumIntermediates, RegisterVT);
2654}
2655
2656EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2657 LLVMContext& Context,
2658 EVT VT) const {
2659 if (!VT.isVector())
2660 return MVT::i8;
2661
2662 if (Subtarget.hasAVX512()) {
2663 // Figure out what this type will be legalized to.
2664 EVT LegalVT = VT;
2665 while (getTypeAction(Context, LegalVT) != TypeLegal)
2666 LegalVT = getTypeToTransformTo(Context, LegalVT);
2667
2668 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2669 if (LegalVT.getSimpleVT().is512BitVector())
2670 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2671
2672 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2673 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2674 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2675 // vXi16/vXi8.
2676 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2677 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2678 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2679 }
2680 }
2681
2682 return VT.changeVectorElementTypeToInteger();
2683}
2684
2685/// Helper for getByValTypeAlignment to determine
2686/// the desired ByVal argument alignment.
2687static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2688 if (MaxAlign == 16)
2689 return;
2690 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2691 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2692 MaxAlign = Align(16);
2693 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2694 Align EltAlign;
2695 getMaxByValAlign(ATy->getElementType(), EltAlign);
2696 if (EltAlign > MaxAlign)
2697 MaxAlign = EltAlign;
2698 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2699 for (auto *EltTy : STy->elements()) {
2700 Align EltAlign;
2701 getMaxByValAlign(EltTy, EltAlign);
2702 if (EltAlign > MaxAlign)
2703 MaxAlign = EltAlign;
2704 if (MaxAlign == 16)
2705 break;
2706 }
2707 }
2708}
2709
2710/// Return the desired alignment for ByVal aggregate
2711/// function arguments in the caller parameter area. For X86, aggregates
2712/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2713/// are at 4-byte boundaries.
2714uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2715 const DataLayout &DL) const {
2716 if (Subtarget.is64Bit()) {
2717 // Max of 8 and alignment of type.
2718 Align TyAlign = DL.getABITypeAlign(Ty);
2719 if (TyAlign > 8)
2720 return TyAlign.value();
2721 return 8;
2722 }
2723
2724 Align Alignment(4);
2725 if (Subtarget.hasSSE1())
2726 getMaxByValAlign(Ty, Alignment);
2727 return Alignment.value();
2728}
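// For example, on a 32-bit target with SSE enabled, a hypothetical aggregate
// such as
//   struct S { int I; __m128 V; };
// is reported as 16-byte aligned here because getMaxByValAlign finds the
// 128-bit vector member, while a struct of plain scalars stays at 4 bytes. On
// 64-bit targets the result is simply max(8, ABI alignment of the type).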
2729
2730/// It returns EVT::Other if the type should be determined using generic
2731/// target-independent logic.
2732/// For vector ops we check that the overall size isn't larger than our
2733/// preferred vector width.
2734EVT X86TargetLowering::getOptimalMemOpType(
2735 const MemOp &Op, const AttributeList &FuncAttributes) const {
2736 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2737 if (Op.size() >= 16 &&
2738 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2739 // FIXME: Check if unaligned 64-byte accesses are slow.
2740 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2741 (Subtarget.getPreferVectorWidth() >= 512)) {
2742 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2743 }
2744 // FIXME: Check if unaligned 32-byte accesses are slow.
2745 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2746 Subtarget.useLight256BitInstructions()) {
2747 // Although this isn't a well-supported type for AVX1, we'll let
2748 // legalization and shuffle lowering produce the optimal codegen. If we
2749 // choose an optimal type with a vector element larger than a byte,
2750 // getMemsetStores() may create an intermediate splat (using an integer
2751 // multiply) before we splat as a vector.
2752 return MVT::v32i8;
2753 }
2754 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2755 return MVT::v16i8;
2756 // TODO: Can SSE1 handle a byte vector?
2757 // If we have SSE1 registers we should be able to use them.
2758 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2759 (Subtarget.getPreferVectorWidth() >= 128))
2760 return MVT::v4f32;
2761 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2762 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2763 // Do not use f64 to lower memcpy if source is string constant. It's
2764 // better to use i32 to avoid the loads.
2765 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2766 // The gymnastics of splatting a byte value into an XMM register and then
2767 // only using 8-byte stores (because this is a CPU with slow unaligned
2768 // 16-byte accesses) makes that a loser.
2769 return MVT::f64;
2770 }
2771 }
2772 // This is a compromise. If we reach here, unaligned accesses may be slow on
2773 // this target. However, creating smaller, aligned accesses could be even
2774 // slower and would certainly be a lot more code.
2775 if (Subtarget.is64Bit() && Op.size() >= 8)
2776 return MVT::i64;
2777 return MVT::i32;
2778}
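// Rough examples of the choices above (assuming no NoImplicitFloat and fast
// unaligned accesses): a 64-byte memset with AVX-512 and a 512-bit preference
// uses v64i8 (v16i32 without AVX512BW), a 32-byte copy with AVX uses v32i8, a
// 16-byte copy with SSE2 uses v16i8, and smaller or more constrained cases
// fall back to i64 (64-bit targets) or i32 GPR stores.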
2779
2780bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2781 if (VT == MVT::f32)
2782 return Subtarget.hasSSE1();
2783 if (VT == MVT::f64)
2784 return Subtarget.hasSSE2();
2785 return true;
2786}
2787
2788static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2789 return (8 * Alignment.value()) % SizeInBits == 0;
2790}
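// For example, isBitAligned(Align(16), 128) is true (16 bytes == 128 bits),
// while isBitAligned(Align(8), 128) is false (64 % 128 != 0), so a 128-bit
// access at 8-byte alignment falls through to the unaligned-speed checks in
// isMemoryAccessFast below.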
2791
2792bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2793 if (isBitAligned(Alignment, VT.getSizeInBits()))
2794 return true;
2795 switch (VT.getSizeInBits()) {
2796 default:
2797 // 8-byte and under are always assumed to be fast.
2798 return true;
2799 case 128:
2800 return !Subtarget.isUnalignedMem16Slow();
2801 case 256:
2802 return !Subtarget.isUnalignedMem32Slow();
2803 // TODO: What about AVX-512 (512-bit) accesses?
2804 }
2805}
2806
2807bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2808 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2809 unsigned *Fast) const {
2810 if (Fast)
2811 *Fast = isMemoryAccessFast(VT, Alignment);
2812 // NonTemporal vector memory ops must be aligned.
2813 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2814 // NT loads can only be vector aligned, so if it's less aligned than the
2815 // minimum vector size (which we can split the vector down to), we might as
2816 // well use a regular unaligned vector load.
2817 // We don't have any NT loads pre-SSE41.
2818 if (!!(Flags & MachineMemOperand::MOLoad))
2819 return (Alignment < 16 || !Subtarget.hasSSE41());
2820 return false;
2821 }
2822 // Misaligned accesses of any size are always allowed.
2823 return true;
2824}
2825
2826bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2827 const DataLayout &DL, EVT VT,
2828 unsigned AddrSpace, Align Alignment,
2829 MachineMemOperand::Flags Flags,
2830 unsigned *Fast) const {
2831 if (Fast)
2832 *Fast = isMemoryAccessFast(VT, Alignment);
2833 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2834 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2835 /*Fast=*/nullptr))
2836 return true;
2837 // NonTemporal vector memory ops are special, and must be aligned.
2838 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2839 return false;
2840 switch (VT.getSizeInBits()) {
2841 case 128:
2842 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2843 return true;
2844 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2845 return true;
2846 return false;
2847 case 256:
2848 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2849 return true;
2850 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2851 return true;
2852 return false;
2853 case 512:
2854 if (Subtarget.hasAVX512())
2855 return true;
2856 return false;
2857 default:
2858 return false; // Don't have NonTemporal vector memory ops of this size.
2859 }
2860 }
2861 return true;
2862}
2863
2864/// Return the entry encoding for a jump table in the
2865/// current function. The returned value is a member of the
2866/// MachineJumpTableInfo::JTEntryKind enum.
2867unsigned X86TargetLowering::getJumpTableEncoding() const {
2868 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2869 // symbol.
2870 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2871 return MachineJumpTableInfo::EK_Custom32;
2872
2873 // Otherwise, use the normal jump table encoding heuristics.
2874 return TargetLowering::getJumpTableEncoding();
2875}
2876
2877bool X86TargetLowering::splitValueIntoRegisterParts(
2878 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2879 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 EVT ValueVT = Val.getValueType();
2882 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2883 unsigned ValueBits = ValueVT.getSizeInBits();
2884 unsigned PartBits = PartVT.getSizeInBits();
2885 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2886 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2888 Parts[0] = Val;
2889 return true;
2890 }
2891 return false;
2892}
2893
2894SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2895 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2896 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2897 bool IsABIRegCopy = CC.has_value();
2898 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2899 unsigned ValueBits = ValueVT.getSizeInBits();
2900 unsigned PartBits = PartVT.getSizeInBits();
2901 SDValue Val = Parts[0];
2902
2903 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2904 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2905 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2906 return Val;
2907 }
2908 return SDValue();
2909}
2910
2911bool X86TargetLowering::useSoftFloat() const {
2912 return Subtarget.useSoftFloat();
2913}
2914
2915void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2916 ArgListTy &Args) const {
2917
2918 // Only relabel X86-32 for C / Stdcall CCs.
2919 if (Subtarget.is64Bit())
2920 return;
2921 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2922 return;
2923 unsigned ParamRegs = 0;
2924 if (auto *M = MF->getFunction().getParent())
2925 ParamRegs = M->getNumberRegisterParameters();
2926
2927 // Mark the first N integer arguments as being passed in registers.
2928 for (auto &Arg : Args) {
2929 Type *T = Arg.Ty;
2930 if (T->isIntOrPtrTy())
2931 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2932 unsigned numRegs = 1;
2933 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2934 numRegs = 2;
2935 if (ParamRegs < numRegs)
2936 return;
2937 ParamRegs -= numRegs;
2938 Arg.IsInReg = true;
2939 }
2940 }
2941}
2942
2943const MCExpr *
2944X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2945 const MachineBasicBlock *MBB,
2946 unsigned uid,MCContext &Ctx) const{
2947 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2948 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2949 // entries.
2950 return MCSymbolRefExpr::create(MBB->getSymbol(),
2951 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2952}
2953
2954/// Returns relocation base for the given PIC jumptable.
2955SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2956 SelectionDAG &DAG) const {
2957 if (!Subtarget.is64Bit())
2958 // This doesn't have SDLoc associated with it, but is not really the
2959 // same as a Register.
2960 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2961 getPointerTy(DAG.getDataLayout()));
2962 return Table;
2963}
2964
2965/// This returns the relocation base for the given PIC jumptable,
2966/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2967const MCExpr *X86TargetLowering::
2968getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2969 MCContext &Ctx) const {
2970 // X86-64 uses RIP relative addressing based on the jump table label.
2971 if (Subtarget.isPICStyleRIPRel())
2972 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2973
2974 // Otherwise, the reference is relative to the PIC base.
2975 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2976}
2977
2978std::pair<const TargetRegisterClass *, uint8_t>
2979X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2980 MVT VT) const {
2981 const TargetRegisterClass *RRC = nullptr;
2982 uint8_t Cost = 1;
2983 switch (VT.SimpleTy) {
2984 default:
2985 return TargetLowering::findRepresentativeClass(TRI, VT);
2986 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2987 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2988 break;
2989 case MVT::x86mmx:
2990 RRC = &X86::VR64RegClass;
2991 break;
2992 case MVT::f32: case MVT::f64:
2993 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2994 case MVT::v4f32: case MVT::v2f64:
2995 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2996 case MVT::v8f32: case MVT::v4f64:
2997 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2998 case MVT::v16f32: case MVT::v8f64:
2999 RRC = &X86::VR128XRegClass;
3000 break;
3001 }
3002 return std::make_pair(RRC, Cost);
3003}
3004
3005unsigned X86TargetLowering::getAddressSpace() const {
3006 if (Subtarget.is64Bit())
3007 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
3008 return 256;
3009}
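// These values are the pseudo address spaces X86 uses for segment-relative
// addressing (256 == %gs, 257 == %fs): 64-bit user code uses %fs, while the
// kernel code model and 32-bit code use %gs. They correspond to the X86AS::FS
// and X86AS::GS constants used in getIRStackGuard below.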
3010
3011static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
3012 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
3013 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
3014}
3015
3016static Constant* SegmentOffset(IRBuilderBase &IRB,
3017 int Offset, unsigned AddressSpace) {
3018 return ConstantExpr::getIntToPtr(
3019 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3020 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3021}
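// For example, SegmentOffset(IRB, 0x28, 257 /* %fs */) produces roughly
//   inttoptr (i32 40 to <pointer in addrspace(257)>)
// and a load through that constant is selected as a %fs:0x28 access, which is
// how the glibc stack-guard slot is read on x86-64 (see getIRStackGuard below).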
3022
3023Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3024 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3025 // tcbhead_t; use it instead of the usual global variable (see
3026 // sysdeps/{i386,x86_64}/nptl/tls.h)
3027 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3028 if (Subtarget.isTargetFuchsia()) {
3029 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3030 return SegmentOffset(IRB, 0x10, getAddressSpace());
3031 } else {
3032 unsigned AddressSpace = getAddressSpace();
3033 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3034 // Note that some users may customize the base register and offset.
3035 int Offset = M->getStackProtectorGuardOffset();
3036 // If we don't set -stack-protector-guard-offset value:
3037 // %fs:0x28, unless we're using a Kernel code model, in which case
3038 // it's %gs:0x28. gs:0x14 on i386.
3039 if (Offset == INT_MAX)
3040 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3041
3042 StringRef GuardReg = M->getStackProtectorGuardReg();
3043 if (GuardReg == "fs")
3044 AddressSpace = X86AS::FS;
3045 else if (GuardReg == "gs")
3046 AddressSpace = X86AS::GS;
3047
3048 // Use the symbol guard if the user specified one.
3049 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3050 if (!GuardSymb.empty()) {
3051 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3052 if (!GV) {
3053 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3054 : Type::getInt32Ty(M->getContext());
3055 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3056 nullptr, GuardSymb, nullptr,
3057 GlobalValue::NotThreadLocal, AddressSpace);
3058 }
3059 return GV;
3060 }
3061
3062 return SegmentOffset(IRB, Offset, AddressSpace);
3063 }
3064 }
3065 return TargetLowering::getIRStackGuard(IRB);
3066}
3067
3068void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3069 // MSVC CRT provides functionalities for stack protection.
3070 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3071 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3072 // MSVC CRT has a global variable holding security cookie.
3073 M.getOrInsertGlobal("__security_cookie",
3074 Type::getInt8PtrTy(M.getContext()));
3075
3076 // MSVC CRT has a function to validate security cookie.
3077 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3078 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3079 Type::getInt8PtrTy(M.getContext()));
3080 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3081 F->setCallingConv(CallingConv::X86_FastCall);
3082 F->addParamAttr(0, Attribute::AttrKind::InReg);
3083 }
3084 return;
3085 }
3086
3087 StringRef GuardMode = M.getStackProtectorGuard();
3088
3089 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3090 if ((GuardMode == "tls" || GuardMode.empty()) &&
3091 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3092 return;
3093 TargetLowering::insertSSPDeclarations(M);
3094}
3095
3096Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3097 // MSVC CRT has a global variable holding security cookie.
3098 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3099 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3100 return M.getGlobalVariable("__security_cookie");
3101 }
3102 return TargetLowering::getSDagStackGuard(M);
3103}
3104
3105Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3106 // MSVC CRT has a function to validate security cookie.
3107 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3108 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3109 return M.getFunction("__security_check_cookie");
3110 }
3111 return TargetLowering::getSSPStackGuardCheck(M);
3112}
3113
3114Value *
3115X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3116 if (Subtarget.getTargetTriple().isOSContiki())
3117 return getDefaultSafeStackPointerLocation(IRB, false);
3118
3119 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3120 // definition of TLS_SLOT_SAFESTACK in
3121 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3122 if (Subtarget.isTargetAndroid()) {
3123 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3124 // %gs:0x48; %gs:0x24 on i386.
3125 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3126 return SegmentOffset(IRB, Offset, getAddressSpace());
3127 }
3128
3129 // Fuchsia is similar.
3130 if (Subtarget.isTargetFuchsia()) {
3131 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3132 return SegmentOffset(IRB, 0x18, getAddressSpace());
3133 }
3134
3135 return TargetLowering::getSafeStackPointerLocation(IRB);
3136}
3137
3138//===----------------------------------------------------------------------===//
3139// Return Value Calling Convention Implementation
3140//===----------------------------------------------------------------------===//
3141
3142bool X86TargetLowering::CanLowerReturn(
3143 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3144 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3145 SmallVector<CCValAssign, 16> RVLocs;
3146 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3147 return CCInfo.CheckReturn(Outs, RetCC_X86);
3148}
3149
3150const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3151 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3152 return ScratchRegs;
3153}
3154
3155ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3156 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3157 // tests at the moment, which is not what we expected.
3158 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3159 return RCRegs;
3160}
3161
3162 /// Lowers mask values (v*i1) to the local register values.
3163 /// \returns the DAG node after lowering to the register type.
3164static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3165 const SDLoc &Dl, SelectionDAG &DAG) {
3166 EVT ValVT = ValArg.getValueType();
3167
3168 if (ValVT == MVT::v1i1)
3169 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3170 DAG.getIntPtrConstant(0, Dl));
3171
3172 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3173 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3174 // Two stage lowering might be required
3175 // bitcast: v8i1 -> i8 / v16i1 -> i16
3176 // anyextend: i8 -> i32 / i16 -> i32
3177 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3178 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3179 if (ValLoc == MVT::i32)
3180 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3181 return ValToCopy;
3182 }
3183
3184 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3185 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3186 // One stage lowering is required
3187 // bitcast: v32i1 -> i32 / v64i1 -> i64
3188 return DAG.getBitcast(ValLoc, ValArg);
3189 }
3190
3191 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3192}
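// Concrete example of the two-stage path above (DAG-level, shown in IR-like
// notation): returning a v16i1 mask in an i32 location becomes
//   %m = bitcast <16 x i1> %mask to i16
//   %r = anyext i16 %m to i32
// whereas v32i1 -> i32 and v64i1 -> i64 are lowered with a single bitcast.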
3193
3194 /// Breaks a v64i1 value into two registers and adds the new node to the DAG
3195static void Passv64i1ArgInRegs(
3196 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3197 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3198 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3199 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3200 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3201 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3202 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3203        "The value should reside in two registers");
3204
3205 // Before splitting the value we cast it to i64
3206 Arg = DAG.getBitcast(MVT::i64, Arg);
3207
3208 // Splitting the value into two i32 types
3209 SDValue Lo, Hi;
3210 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3211
3212 // Attach the two i32 types into corresponding registers
3213 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3214 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3215}
3216
3217SDValue
3218X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3219 bool isVarArg,
3220 const SmallVectorImpl<ISD::OutputArg> &Outs,
3221 const SmallVectorImpl<SDValue> &OutVals,
3222 const SDLoc &dl, SelectionDAG &DAG) const {
3223 MachineFunction &MF = DAG.getMachineFunction();
3224 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3225
3226 // In some cases we need to disable registers from the default CSR list.
3227 // For example, when they are used as return registers (preserve_* and X86's
3228 // regcall) or for argument passing (X86's regcall).
3229 bool ShouldDisableCalleeSavedRegister =
3230 shouldDisableRetRegFromCSR(CallConv) ||
3231 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3232
3233 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3234 report_fatal_error("X86 interrupts may not return any value");
3235
3236 SmallVector<CCValAssign, 16> RVLocs;
3237 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3238 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3239
3240 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3241 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3242 ++I, ++OutsIndex) {
3243 CCValAssign &VA = RVLocs[I];
3244 assert(VA.isRegLoc() && "Can only return in registers!");
3245
3246 // Add the register to the CalleeSaveDisableRegs list.
3247 if (ShouldDisableCalleeSavedRegister)
3248 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3249
3250 SDValue ValToCopy = OutVals[OutsIndex];
3251 EVT ValVT = ValToCopy.getValueType();
3252
3253 // Promote values to the appropriate types.
3254 if (VA.getLocInfo() == CCValAssign::SExt)
3255 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3256 else if (VA.getLocInfo() == CCValAssign::ZExt)
3257 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3258 else if (VA.getLocInfo() == CCValAssign::AExt) {
3259 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3260 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3261 else
3262 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3263 }
3264 else if (VA.getLocInfo() == CCValAssign::BCvt)
3265 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3266
3267 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3268        "Unexpected FP-extend for return value.");
3269
3270 // Report an error if we have attempted to return a value via an XMM
3271 // register and SSE was disabled.
3272 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3273 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3274 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3275 } else if (!Subtarget.hasSSE2() &&
3276 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3277 ValVT == MVT::f64) {
3278 // When returning a double via an XMM register, report an error if SSE2 is
3279 // not enabled.
3280 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3281 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3282 }
3283
3284 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3285 // the RET instruction and handled by the FP Stackifier.
3286 if (VA.getLocReg() == X86::FP0 ||
3287 VA.getLocReg() == X86::FP1) {
3288 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3289 // change the value to the FP stack register class.
3290 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3291 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3292 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3293 // Don't emit a copytoreg.
3294 continue;
3295 }
3296
3297 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3298 // which is returned in RAX / RDX.
3299 if (Subtarget.is64Bit()) {
3300 if (ValVT == MVT::x86mmx) {
3301 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3302 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3303 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3304 ValToCopy);
3305 // If we don't have SSE2 available, convert to v4f32 so the generated
3306 // register is legal.
3307 if (!Subtarget.hasSSE2())
3308 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3309 }
3310 }
3311 }
3312
3313 if (VA.needsCustom()) {
3314 assert(VA.getValVT() == MVT::v64i1 &&
3315        "Currently the only custom case is when we split v64i1 to 2 regs");
3316
3317 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3318 Subtarget);
3319
3320 // Add the second register to the CalleeSaveDisableRegs list.
3321 if (ShouldDisableCalleeSavedRegister)
3322 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3323 } else {
3324 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3325 }
3326 }
3327
3328 SDValue Glue;
3329 SmallVector<SDValue, 6> RetOps;
3330 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3331 // Operand #1 = Bytes To Pop
3332 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3333 MVT::i32));
3334
3335 // Copy the result values into the output registers.
3336 for (auto &RetVal : RetVals) {
3337 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3338 RetOps.push_back(RetVal.second);
3339 continue; // Don't emit a copytoreg.
3340 }
3341
3342 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3343 Glue = Chain.getValue(1);
3344 RetOps.push_back(
3345 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3346 }
3347
3348 // Swift calling convention does not require we copy the sret argument
3349 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3350
3351 // All x86 ABIs require that for returning structs by value we copy
3352 // the sret argument into %rax/%eax (depending on ABI) for the return.
3353 // We saved the argument into a virtual register in the entry block,
3354 // so now we copy the value out and into %rax/%eax.
3355 //
3356 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3357 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3358 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3359 // either case FuncInfo->setSRetReturnReg() will have been called.
3360 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3361 // When we have both sret and another return value, we should use the
3362 // original Chain stored in RetOps[0], instead of the current Chain updated
3363 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3364
3365 // For the case of sret and another return value, we have
3366 // Chain_0 at the function entry
3367 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3368 // If we use Chain_1 in getCopyFromReg, we will have
3369 // Val = getCopyFromReg(Chain_1)
3370 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3371
3372 // getCopyToReg(Chain_0) will be glued together with
3373 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3374 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3375 // Data dependency from Unit B to Unit A due to usage of Val in
3376 // getCopyToReg(Chain_1, Val)
3377 // Chain dependency from Unit A to Unit B
3378
3379 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3380 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3381 getPointerTy(MF.getDataLayout()));
3382
3383 Register RetValReg
3384 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3385 X86::RAX : X86::EAX;
3386 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3387 Glue = Chain.getValue(1);
3388
3389 // RAX/EAX now acts like a return value.
3390 RetOps.push_back(
3391 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3392
3393 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3394 // this however for preserve_most/preserve_all to minimize the number of
3395 // callee-saved registers for these CCs.
3396 if (ShouldDisableCalleeSavedRegister &&
3397 CallConv != CallingConv::PreserveAll &&
3398 CallConv != CallingConv::PreserveMost)
3399 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3400 }
3401
3402 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3403 const MCPhysReg *I =
3404 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3405 if (I) {
3406 for (; *I; ++I) {
3407 if (X86::GR64RegClass.contains(*I))
3408 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3409 else
3410 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3410)
;
3411 }
3412 }
3413
3414 RetOps[0] = Chain; // Update chain.
3415
3416 // Add the glue if we have it.
3417 if (Glue.getNode())
3418 RetOps.push_back(Glue);
3419
3420 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3421 if (CallConv == CallingConv::X86_INTR)
3422 opcode = X86ISD::IRET;
3423 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3424}
3425
3426bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3427 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3428 return false;
3429
3430 SDValue TCChain = Chain;
3431 SDNode *Copy = *N->use_begin();
3432 if (Copy->getOpcode() == ISD::CopyToReg) {
3433 // If the copy has a glue operand, we conservatively assume it isn't safe to
3434 // perform a tail call.
3435 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3436 return false;
3437 TCChain = Copy->getOperand(0);
3438 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3439 return false;
3440
3441 bool HasRet = false;
3442 for (const SDNode *U : Copy->uses()) {
3443 if (U->getOpcode() != X86ISD::RET_GLUE)
3444 return false;
3445 // If we are returning more than one value, we can definitely
3446 // not make a tail call; see PR19530.
3447 if (U->getNumOperands() > 4)
3448 return false;
3449 if (U->getNumOperands() == 4 &&
3450 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3451 return false;
3452 HasRet = true;
3453 }
3454
3455 if (!HasRet)
3456 return false;
3457
3458 Chain = TCChain;
3459 return true;
3460}
3461
3462EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3463 ISD::NodeType ExtendKind) const {
3464 MVT ReturnMVT = MVT::i32;
3465
3466 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3467 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3468 // The ABI does not require i1, i8 or i16 to be extended.
3469 //
3470 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3471 // always extending i8/i16 return values, so keep doing that for now.
3472 // (PR26665).
3473 ReturnMVT = MVT::i8;
3474 }
3475
3476 EVT MinVT = getRegisterType(Context, ReturnMVT);
3477 return VT.bitsLT(MinVT) ? MinVT : VT;
3478}
3479
3480/// Reads two 32 bit registers and creates a 64 bit mask value.
3481 /// \param VA The current 32 bit value that needs to be assigned.
3482 /// \param NextVA The next 32 bit value that needs to be assigned.
3483 /// \param Root The parent DAG node.
3484 /// \param [in,out] InGlue Represents the SDValue in the parent DAG node used
3485 ///                        for glue purposes. If the DAG is already using a
3486 ///                        physical register instead of a virtual one, we
3487 ///                        should glue our new SDValue to the InGlue SDValue.
3488 /// \return a new SDValue that is 64 bits wide.
3489static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3490 SDValue &Root, SelectionDAG &DAG,
3491 const SDLoc &Dl, const X86Subtarget &Subtarget,
3492 SDValue *InGlue = nullptr) {
3493 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3494 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3495 assert(VA.getValVT() == MVT::v64i1 &&
3496        "Expecting first location of 64 bit width type");
3497 assert(NextVA.getValVT() == VA.getValVT() &&
3498        "The locations should have the same type");
3499 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3500        "The values should reside in two registers");
3501
3502 SDValue Lo, Hi;
3503 SDValue ArgValueLo, ArgValueHi;
3504
3505 MachineFunction &MF = DAG.getMachineFunction();
3506 const TargetRegisterClass *RC = &X86::GR32RegClass;
3507
3508 // Read a 32 bit value from the registers.
3509 if (nullptr == InGlue) {
3510 // When no physical register is present,
3511 // create an intermediate virtual register.
3512 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3513 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3514 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3515 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3516 } else {
3517 // When a physical register is available read the value from it and glue
3518 // the reads together.
3519 ArgValueLo =
3520 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3521 *InGlue = ArgValueLo.getValue(2);
3522 ArgValueHi =
3523 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3524 *InGlue = ArgValueHi.getValue(2);
3525 }
3526
3527 // Convert the i32 type into v32i1 type.
3528 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3529
3530 // Convert the i32 type into v32i1 type.
3531 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3532
3533 // Concatenate the two values together.
3534 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3535}
3536
3537/// The function will lower a register of various sizes (8/16/32/64)
3538/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3539 /// \returns a DAG node containing the operand after lowering to the mask type.
3540static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3541 const EVT &ValLoc, const SDLoc &Dl,
3542 SelectionDAG &DAG) {
3543 SDValue ValReturned = ValArg;
3544
3545 if (ValVT == MVT::v1i1)
3546 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3547
3548 if (ValVT == MVT::v64i1) {
3549 // On 32-bit targets, this case is handled by getv64i1Argument.
3550 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3551 // On 64-bit targets there is no need to truncate the value, only bitcast it.
3552 } else {
3553 MVT maskLen;
3554 switch (ValVT.getSimpleVT().SimpleTy) {
3555 case MVT::v8i1:
3556 maskLen = MVT::i8;
3557 break;
3558 case MVT::v16i1:
3559 maskLen = MVT::i16;
3560 break;
3561 case MVT::v32i1:
3562 maskLen = MVT::i32;
3563 break;
3564 default:
3565 llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3565)
;
3566 }
3567
3568 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3569 }
3570 return DAG.getBitcast(ValVT, ValReturned);
3571}
3572
3573/// Lower the result values of a call into the
3574 /// appropriate copies out of the corresponding physical registers.
3575///
3576SDValue X86TargetLowering::LowerCallResult(
3577 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3578 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3579 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3580 uint32_t *RegMask) const {
3581
3582 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3583 // Assign locations to each value returned by this call.
3584 SmallVector<CCValAssign, 16> RVLocs;
3585 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3586 *DAG.getContext());
3587 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3588
3589 // Copy all of the result registers out of their specified physreg.
3590 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3591 ++I, ++InsIndex) {
3592 CCValAssign &VA = RVLocs[I];
3593 EVT CopyVT = VA.getLocVT();
3594
3595 // In some calling conventions we need to remove the used registers
3596 // from the register mask.
3597 if (RegMask) {
3598 for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
3599 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
3600 }
3601
3602 // Report an error if there was an attempt to return FP values via XMM
3603 // registers.
3604 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3605 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3606 if (VA.getLocReg() == X86::XMM1)
3607 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3608 else
3609 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3610 } else if (!Subtarget.hasSSE2() &&
3611 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3612 CopyVT == MVT::f64) {
3613 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3614 if (VA.getLocReg() == X86::XMM1)
3615 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3616 else
3617 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3618 }
3619
3620 // If we prefer to use the value in xmm registers, copy it out as f80 and
3621 // use a truncate to move it from fp stack reg to xmm reg.
3622 bool RoundAfterCopy = false;
3623 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3624 isScalarFPTypeInSSEReg(VA.getValVT())) {
3625 if (!Subtarget.hasX87())
3626 report_fatal_error("X87 register return with X87 disabled");
3627 CopyVT = MVT::f80;
3628 RoundAfterCopy = (CopyVT != VA.getLocVT());
3629 }
3630
3631 SDValue Val;
3632 if (VA.needsCustom()) {
3633 assert(VA.getValVT() == MVT::v64i1 &&
3634        "Currently the only custom case is when we split v64i1 to 2 regs");
3635 Val =
3636 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3637 } else {
3638 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3639 .getValue(1);
3640 Val = Chain.getValue(0);
3641 InGlue = Chain.getValue(2);
3642 }
3643
3644 if (RoundAfterCopy)
3645 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3646 // This truncation won't change the value.
3647 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3648
3649 if (VA.isExtInLoc()) {
3650 if (VA.getValVT().isVector() &&
3651 VA.getValVT().getScalarType() == MVT::i1 &&
3652 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3653 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3654 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3655 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3656 } else
3657 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3658 }
3659
3660 if (VA.getLocInfo() == CCValAssign::BCvt)
3661 Val = DAG.getBitcast(VA.getValVT(), Val);
3662
3663 InVals.push_back(Val);
3664 }
3665
3666 return Chain;
3667}
3668
3669//===----------------------------------------------------------------------===//
3670// C & StdCall & Fast Calling Convention implementation
3671//===----------------------------------------------------------------------===//
3672// The StdCall calling convention is the standard for many Windows API
3673// routines and the like. It differs from the C calling convention only a
3674// little: the callee cleans up the stack, not the caller. Symbols are also
3675// decorated in some fancy way :) It doesn't support any vector arguments.
3676// For info on the fast calling convention see the Fast Calling Convention
3677// (tail call) implementation in LowerX86_32FastCCCallTo.
3678
3679/// Determines whether Args, either a set of outgoing arguments to a call, or a
3680/// set of incoming args of a call, contains an sret pointer that the callee
3681/// pops
3682template <typename T>
3683static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3684 const X86Subtarget &Subtarget) {
3685 // Not C++20 (yet), so no concepts available.
3686 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3687 std::is_same_v<T, ISD::InputArg>,
3688 "requires ISD::OutputArg or ISD::InputArg");
3689
3690 // Only 32-bit targets pop the sret. It's a 64-bit world these days, so
3691 // early-out for most compilations.
3692 if (!Subtarget.is32Bit())
3693 return false;
3694
3695 if (Args.empty())
3696 return false;
3697
3698 // Most calls do not have an sret argument; check the first arg next.
3699 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3700 if (!Flags.isSRet() || Flags.isInReg())
3701 return false;
3702
3703 // The MSVC ABI does not pop the sret.
3704 if (Subtarget.getTargetTriple().isOSMSVCRT())
3705 return false;
3706
3707 // MCUs don't pop the sret
3708 if (Subtarget.isTargetMCU())
3709 return false;
3710
3711 // Callee pops argument
3712 return true;
3713}
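
Concretely, a 32-bit SysV function returning a large aggregate by value receives a hidden sret pointer as its first stack argument and, on the targets this helper accepts, pops those 4 bytes itself (roughly a "ret $4"); MSVC and MCU targets leave the pointer for the caller to clean up. A hedged C++ sketch of the kind of function that takes this path, purely illustrative and not from this file:

// Returning an aggregate by value on i386 System V passes a hidden
// sret pointer; the callee pops it, so the caller must not.
struct Big { int Data[8]; };

Big makeBig() {            // lowered roughly as "... ; ret $4" on i386 SysV
  Big B{};
  B.Data[0] = 42;
  return B;                // stored through the hidden sret pointer
}

int useBig() { return makeBig().Data[0]; }
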
3714
3715/// Make a copy of an aggregate at the address specified by "Src" to the address
3716/// "Dst" with the size and alignment information specified by the corresponding
3717/// parameter attribute. The copy will be passed as a byval function parameter.
3718static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3719 SDValue Chain, ISD::ArgFlagsTy Flags,
3720 SelectionDAG &DAG, const SDLoc &dl) {
3721 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3722
3723 return DAG.getMemcpy(
3724 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3725 /*isVolatile*/ false, /*AlwaysInline=*/true,
3726 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3727}
3728
3729/// Return true if the calling convention is one that we can guarantee TCO for.
3730static bool canGuaranteeTCO(CallingConv::ID CC) {
3731 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3732 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3733 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3734}
3735
3736/// Return true if we might ever do TCO for calls with this calling convention.
3737static bool mayTailCallThisCC(CallingConv::ID CC) {
3738 switch (CC) {
3739 // C calling conventions:
3740 case CallingConv::C:
3741 case CallingConv::Win64:
3742 case CallingConv::X86_64_SysV:
3743 // Callee pop conventions:
3744 case CallingConv::X86_ThisCall:
3745 case CallingConv::X86_StdCall:
3746 case CallingConv::X86_VectorCall:
3747 case CallingConv::X86_FastCall:
3748 // Swift:
3749 case CallingConv::Swift:
3750 return true;
3751 default:
3752 return canGuaranteeTCO(CC);
3753 }
3754}
3755
3756/// Return true if the function is being made into a tailcall target by
3757/// changing its ABI.
3758static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3759 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3760 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3761}
3762
3763bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3764 if (!CI->isTailCall())
3765 return false;
3766
3767 CallingConv::ID CalleeCC = CI->getCallingConv();
3768 if (!mayTailCallThisCC(CalleeCC))
3769 return false;
3770
3771 return true;
3772}
3773
3774SDValue
3775X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3776 const SmallVectorImpl<ISD::InputArg> &Ins,
3777 const SDLoc &dl, SelectionDAG &DAG,
3778 const CCValAssign &VA,
3779 MachineFrameInfo &MFI, unsigned i) const {
3780 // Create the nodes corresponding to a load from this parameter slot.
3781 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3782 bool AlwaysUseMutable = shouldGuaranteeTCO(
3783 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3784 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3785 EVT ValVT;
3786 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3787
3788 // If the value is passed by pointer, we have the address passed instead of
3789 // the value itself. There is no need to extend if the mask value and its
3790 // location share the same size.
3791 bool ExtendedInMem =
3792 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3793 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3794
3795 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3796 ValVT = VA.getLocVT();
3797 else
3798 ValVT = VA.getValVT();
3799
3800 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3801 // changed with more analysis.
3802 // In the case of tail call optimization, mark all arguments mutable, since
3803 // they could be overwritten by the lowering of arguments for a tail call.
3804 if (Flags.isByVal()) {
3805 unsigned Bytes = Flags.getByValSize();
3806 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3807
3808 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3809 // can be improved with deeper analysis.
3810 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3811 /*isAliased=*/true);
3812 return DAG.getFrameIndex(FI, PtrVT);
3813 }
3814
3815 EVT ArgVT = Ins[i].ArgVT;
3816
3817 // If this is a vector that has been split into multiple parts, and the
3818 // scalar size of the parts doesn't match the vector element size, then we
3819 // can't elide the copy. The parts will have padding between them instead of
3820 // being packed like a vector.
3821 bool ScalarizedAndExtendedVector =
3822 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3823 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3824
3825 // This is an argument in memory. We might be able to perform copy elision:
3826 // if the argument is passed directly in memory without any extension, the
3827 // copy can be elided. Large vector types, for example, may instead be passed
3828 // indirectly by pointer.
3829 if (Flags.isCopyElisionCandidate() &&
3830 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3831 !ScalarizedAndExtendedVector) {
3832 SDValue PartAddr;
3833 if (Ins[i].PartOffset == 0) {
3834 // If this is a one-part value or the first part of a multi-part value,
3835 // create a stack object for the entire argument value type and return a
3836 // load from our portion of it. This assumes that if the first part of an
3837 // argument is in memory, the rest will also be in memory.
3838 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3839 /*IsImmutable=*/false);
3840 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3841 return DAG.getLoad(
3842 ValVT, dl, Chain, PartAddr,
3843 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3844 } else {
3845 // This is not the first piece of an argument in memory. See if there is
3846 // already a fixed stack object including this offset. If so, assume it
3847 // was created by the PartOffset == 0 branch above and create a load from
3848 // the appropriate offset into it.
3849 int64_t PartBegin = VA.getLocMemOffset();
3850 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3851 int FI = MFI.getObjectIndexBegin();
3852 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3853 int64_t ObjBegin = MFI.getObjectOffset(FI);
3854 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3855 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3856 break;
3857 }
3858 if (MFI.isFixedObjectIndex(FI)) {
3859 SDValue Addr =
3860 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3861 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3862 return DAG.getLoad(
3863 ValVT, dl, Chain, Addr,
3864 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3865 Ins[i].PartOffset));
3866 }
3867 }
3868 }
3869
3870 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3871 VA.getLocMemOffset(), isImmutable);
3872
3873 // Set SExt or ZExt flag.
3874 if (VA.getLocInfo() == CCValAssign::ZExt) {
3875 MFI.setObjectZExt(FI, true);
3876 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3877 MFI.setObjectSExt(FI, true);
3878 }
3879
3880 MaybeAlign Alignment;
3881 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3882 ValVT != MVT::f80)
3883 Alignment = MaybeAlign(4);
3884 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3885 SDValue Val = DAG.getLoad(
3886 ValVT, dl, Chain, FIN,
3887 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3888 Alignment);
3889 return ExtendedInMem
3890 ? (VA.getValVT().isVector()
3891 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3892 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3893 : Val;
3894}
3895
3896// FIXME: Get this from tablegen.
3897static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3898 const X86Subtarget &Subtarget) {
3899 assert(Subtarget.is64Bit());
3900
3901 if (Subtarget.isCallingConvWin64(CallConv)) {
3902 static const MCPhysReg GPR64ArgRegsWin64[] = {
3903 X86::RCX, X86::RDX, X86::R8, X86::R9
3904 };
3905 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3906 }
3907
3908 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3909 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3910 };
3911 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3912}
3913
3914// FIXME: Get this from tablegen.
3915static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3916 CallingConv::ID CallConv,
3917 const X86Subtarget &Subtarget) {
3918 assert(Subtarget.is64Bit());
3919 if (Subtarget.isCallingConvWin64(CallConv)) {
3920 // The XMM registers which might contain var arg parameters are shadowed
3921 // in their paired GPRs, so we only need to save the GPRs to their home
3922 // slots.
3923 // TODO: __vectorcall will change this.
3924 return std::nullopt;
3925 }
3926
3927 bool isSoftFloat = Subtarget.useSoftFloat();
3928 if (isSoftFloat || !Subtarget.hasSSE1())
3929 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3930 // registers.
3931 return std::nullopt;
3932
3933 static const MCPhysReg XMMArgRegs64Bit[] = {
3934 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3935 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3936 };
3937 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3938}
3939
3940#ifndef NDEBUG
3941static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3942 return llvm::is_sorted(
3943 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3944 return A.getValNo() < B.getValNo();
3945 });
3946}
3947#endif
3948
3949namespace {
3950/// This is a helper class for lowering variadic function parameters.
3951class VarArgsLoweringHelper {
3952public:
3953 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3954 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3955 CallingConv::ID CallConv, CCState &CCInfo)
3956 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3957 TheMachineFunction(DAG.getMachineFunction()),
3958 TheFunction(TheMachineFunction.getFunction()),
3959 FrameInfo(TheMachineFunction.getFrameInfo()),
3960 FrameLowering(*Subtarget.getFrameLowering()),
3961 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3962 CCInfo(CCInfo) {}
3963
3964 // Lower variadic function parameters.
3965 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3966
3967private:
3968 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3969
3970 void forwardMustTailParameters(SDValue &Chain);
3971
3972 bool is64Bit() const { return Subtarget.is64Bit(); }
3973 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3974
3975 X86MachineFunctionInfo *FuncInfo;
3976 const SDLoc &DL;
3977 SelectionDAG &DAG;
3978 const X86Subtarget &Subtarget;
3979 MachineFunction &TheMachineFunction;
3980 const Function &TheFunction;
3981 MachineFrameInfo &FrameInfo;
3982 const TargetFrameLowering &FrameLowering;
3983 const TargetLowering &TargLowering;
3984 CallingConv::ID CallConv;
3985 CCState &CCInfo;
3986};
3987} // namespace
3988
3989void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3990 SDValue &Chain, unsigned StackSize) {
3991 // If the function takes a variable number of arguments, make a frame index
3992 // for the start of the first vararg value... for expansion of llvm.va_start.
3993 // We can skip this if there are no va_start calls.
3994 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3995 CallConv != CallingConv::X86_ThisCall)) {
3996 FuncInfo->setVarArgsFrameIndex(
3997 FrameInfo.CreateFixedObject(1, StackSize, true));
3998 }
3999
4000 // 64-bit calling conventions support varargs and register parameters, so we
4001 // have to do extra work to spill them in the prologue.
4002 if (is64Bit()) {
4003 // Find the first unallocated argument registers.
4004 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
4005 ArrayRef<MCPhysReg> ArgXMMs =
4006 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
4007 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
4008 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
4009
4010 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
4011 "SSE register cannot be used when SSE is disabled!");
4012
4013 if (isWin64()) {
4014 // Get to the caller-allocated home save location. Add 8 to account
4015 // for the return address.
4016 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4017 FuncInfo->setRegSaveFrameIndex(
4018 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4019 // Fix up the vararg frame index to point at the shadow area (4 x i64).
4020 if (NumIntRegs < 4)
4021 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4022 } else {
4023 // For X86-64, if there are vararg parameters that are passed via
4024 // registers, then we must store them to their spots on the stack so
4025 // they may be loaded by dereferencing the result of va_next.
4026 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4027 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4028 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4029 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4030 }
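
The arithmetic above matches the SysV x86-64 va_list layout: the register save area holds all six argument GPRs plus eight XMM registers (6*8 + 8*16 = 176 bytes), gp_offset starts past the integer registers already consumed by named arguments, and fp_offset starts past the whole GPR block plus any XMM registers already consumed. A small worked sketch with illustrative counts, not code from this file:

#include <cstdio>

int main() {
  // Assume two named integer arguments and one named FP argument were
  // already passed in registers before the variadic part begins.
  const unsigned NumGPRs = 6, NumXMMs = 8;
  const unsigned NumIntRegs = 2, NumXMMRegs = 1;

  unsigned RegSaveAreaSize = NumGPRs * 8 + NumXMMs * 16; // 48 + 128 = 176
  unsigned GPOffset = NumIntRegs * 8;                    // 16
  unsigned FPOffset = NumGPRs * 8 + NumXMMRegs * 16;     // 48 + 16 = 64

  std::printf("save area %u, gp_offset %u, fp_offset %u\n",
              RegSaveAreaSize, GPOffset, FPOffset);      // 176, 16, 64
  return 0;
}
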
4031
4032 SmallVector<SDValue, 6>
4033 LiveGPRs; // SDValues for GPR registers holding live-in argument values
4034 SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for XMM registers
4035 // holding live-in argument values
4036 SDValue ALVal; // if applicable, the SDValue for the %al register
4037
4038 // Gather all the live in physical registers.
4039 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4040 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4041 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4042 }
4043 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4044 if (!AvailableXmms.empty()) {
4045 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4046 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4047 for (MCPhysReg Reg : AvailableXmms) {
4048 // FastRegisterAllocator spills virtual registers at basic
4049 // block boundaries. That leads to uses of XMM registers
4050 // outside of the check for %al. Pass physical registers to
4051 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4052 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4053 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4054 }
4055 }
4056
4057 // Store the integer parameter registers.
4058 SmallVector<SDValue, 8> MemOps;
4059 SDValue RSFIN =
4060 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4061 TargLowering.getPointerTy(DAG.getDataLayout()));
4062 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4063 for (SDValue Val : LiveGPRs) {
4064 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4065 TargLowering.getPointerTy(DAG.getDataLayout()),
4066 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4067 SDValue Store =
4068 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4069 MachinePointerInfo::getFixedStack(
4070 DAG.getMachineFunction(),
4071 FuncInfo->getRegSaveFrameIndex(), Offset));
4072 MemOps.push_back(Store);
4073 Offset += 8;
4074 }
4075
4076 // Now store the XMM (fp + vector) parameter registers.
4077 if (!LiveXMMRegs.empty()) {
4078 SmallVector<SDValue, 12> SaveXMMOps;
4079 SaveXMMOps.push_back(Chain);
4080 SaveXMMOps.push_back(ALVal);
4081 SaveXMMOps.push_back(RSFIN);
4082 SaveXMMOps.push_back(
4083 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4084 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4085 MachineMemOperand *StoreMMO =
4086 DAG.getMachineFunction().getMachineMemOperand(
4087 MachinePointerInfo::getFixedStack(
4088 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4089 Offset),
4090 MachineMemOperand::MOStore, 128, Align(16));
4091 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4092 DL, DAG.getVTList(MVT::Other),
4093 SaveXMMOps, MVT::i8, StoreMMO));
4094 }
4095
4096 if (!MemOps.empty())
4097 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4098 }
4099}
4100
4101void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4102 // Find the largest legal vector type.
4103 MVT VecVT = MVT::Other;
4104 // FIXME: Only some x86_32 calling conventions support AVX512.
4105 if (Subtarget.useAVX512Regs() &&
4106 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4107 CallConv == CallingConv::Intel_OCL_BI)))
4108 VecVT = MVT::v16f32;
4109 else if (Subtarget.hasAVX())
4110 VecVT = MVT::v8f32;
4111 else if (Subtarget.hasSSE2())
4112 VecVT = MVT::v4f32;
4113
4114 // We forward some GPRs and some vector types.
4115 SmallVector<MVT, 2> RegParmTypes;
4116 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4117 RegParmTypes.push_back(IntVT);
4118 if (VecVT != MVT::Other)
4119 RegParmTypes.push_back(VecVT);
4120
4121 // Compute the set of forwarded registers. The rest are scratch.
4122 SmallVectorImpl<ForwardedRegister> &Forwards =
4123 FuncInfo->getForwardedMustTailRegParms();
4124 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4125
4126 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4127 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4128 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4129 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4130 }
4131
4132 // Copy all forwards from physical to virtual registers.
4133 for (ForwardedRegister &FR : Forwards) {
4134 // FIXME: Can we use a less constrained schedule?
4135 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4136 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4137 TargLowering.getRegClassFor(FR.VT));
4138 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4139 }
4140}
4141
4142void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4143 unsigned StackSize) {
4144 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4145 // If necessary, it will be set to the correct value later.
4146 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4147 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4148
4149 if (FrameInfo.hasVAStart())
4150 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4151
4152 if (FrameInfo.hasMustTailInVarArgFunc())
4153 forwardMustTailParameters(Chain);
4154}
4155
4156SDValue X86TargetLowering::LowerFormalArguments(
4157 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4158 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4159 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4160 MachineFunction &MF = DAG.getMachineFunction();
4161 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4162
4163 const Function &F = MF.getFunction();
4164 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4165 F.getName() == "main")
4166 FuncInfo->setForceFramePointer(true);
4167
4168 MachineFrameInfo &MFI = MF.getFrameInfo();
4169 bool Is64Bit = Subtarget.is64Bit();
4170 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4171
4172 assert(
4173 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4174 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4175
4176 // Assign locations to all of the incoming arguments.
4177 SmallVector<CCValAssign, 16> ArgLocs;
4178 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4179
4180 // Allocate shadow area for Win64.
4181 if (IsWin64)
4182 CCInfo.AllocateStack(32, Align(8));
4183
4184 CCInfo.AnalyzeArguments(Ins, CC_X86);
4185
4186 // In vectorcall calling convention a second pass is required for the HVA
4187 // types.
4188 if (CallingConv::X86_VectorCall == CallConv) {
4189 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4190 }
4191
4192 // The next loop assumes that the locations are in the same order as the
4193 // input arguments.
4194 assert(isSortedByValueNo(ArgLocs) &&
4195 "Argument Location list must be sorted before lowering");
4196
4197 SDValue ArgValue;
4198 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4199 ++I, ++InsIndex) {
4200 assert(InsIndex < Ins.size() && "Invalid Ins index");
4201 CCValAssign &VA = ArgLocs[I];
4202
4203 if (VA.isRegLoc()) {
4204 EVT RegVT = VA.getLocVT();
4205 if (VA.needsCustom()) {
4206 assert(
4207 VA.getValVT() == MVT::v64i1 &&
4208 "Currently the only custom case is when we split v64i1 to 2 regs");
4209
4210 // v64i1 values, in regcall calling convention, that are
4211 // compiled to 32 bit arch, are split up into two registers.
4212 ArgValue =
4213 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4214 } else {
4215 const TargetRegisterClass *RC;
4216 if (RegVT == MVT::i8)
4217 RC = &X86::GR8RegClass;
4218 else if (RegVT == MVT::i16)
4219 RC = &X86::GR16RegClass;
4220 else if (RegVT == MVT::i32)
4221 RC = &X86::GR32RegClass;
4222 else if (Is64Bit && RegVT == MVT::i64)
4223 RC = &X86::GR64RegClass;
4224 else if (RegVT == MVT::f16)
4225 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4226 else if (RegVT == MVT::f32)
4227 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4228 else if (RegVT == MVT::f64)
4229 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4230 else if (RegVT == MVT::f80)
4231 RC = &X86::RFP80RegClass;
4232 else if (RegVT == MVT::f128)
4233 RC = &X86::VR128RegClass;
4234 else if (RegVT.is512BitVector())
4235 RC = &X86::VR512RegClass;
4236 else if (RegVT.is256BitVector())
4237 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4238 else if (RegVT.is128BitVector())
4239 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4240 else if (RegVT == MVT::x86mmx)
4241 RC = &X86::VR64RegClass;
4242 else if (RegVT == MVT::v1i1)
4243 RC = &X86::VK1RegClass;
4244 else if (RegVT == MVT::v8i1)
4245 RC = &X86::VK8RegClass;
4246 else if (RegVT == MVT::v16i1)
4247 RC = &X86::VK16RegClass;
4248 else if (RegVT == MVT::v32i1)
4249 RC = &X86::VK32RegClass;
4250 else if (RegVT == MVT::v64i1)
4251 RC = &X86::VK64RegClass;
4252 else
4253 llvm_unreachable("Unknown argument type!");
4254
4255 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4256 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4257 }
4258
4259 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4260 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4261 // right size.
4262 if (VA.getLocInfo() == CCValAssign::SExt)
4263 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4264 DAG.getValueType(VA.getValVT()));
4265 else if (VA.getLocInfo() == CCValAssign::ZExt)
4266 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4267 DAG.getValueType(VA.getValVT()));
4268 else if (VA.getLocInfo() == CCValAssign::BCvt)
4269 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4270
4271 if (VA.isExtInLoc()) {
4272 // Handle MMX values passed in XMM regs.
4273 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4274 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4275 else if (VA.getValVT().isVector() &&
4276 VA.getValVT().getScalarType() == MVT::i1 &&
4277 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4278 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4279 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4280 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4281 } else
4282 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4283 }
4284 } else {
4285 assert(VA.isMemLoc());
4286 ArgValue =
4287 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4288 }
4289
4290 // If the value is passed via a pointer, do a load.
4291 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4292 ArgValue =
4293 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4294
4295 InVals.push_back(ArgValue);
4296 }
4297
4298 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4299 if (Ins[I].Flags.isSwiftAsync()) {
4300 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4301 if (Subtarget.is64Bit())
4302 X86FI->setHasSwiftAsyncContext(true);
4303 else {
4304 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4305 X86FI->setSwiftAsyncContextFrameIdx(FI);
4306 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4307 DAG.getFrameIndex(FI, MVT::i32),
4308 MachinePointerInfo::getFixedStack(MF, FI));
4309 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4310 }
4311 }
4312
4313 // Swift calling convention does not require we copy the sret argument
4314 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4315 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4316 continue;
4317
4318 // All x86 ABIs require that for returning structs by value we copy the
4319 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4320 // the argument into a virtual register so that we can access it from the
4321 // return points.
4322 if (Ins[I].Flags.isSRet()) {
4323 assert(!FuncInfo->getSRetReturnReg() &&
4324 "SRet return has already been set");
4325 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4326 Register Reg =
4327 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4328 FuncInfo->setSRetReturnReg(Reg);
4329 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4330 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4331 break;
4332 }
4333 }
4334
4335 unsigned StackSize = CCInfo.getNextStackOffset();
4336 // Align stack specially for tail calls.
4337 if (shouldGuaranteeTCO(CallConv,
4338 MF.getTarget().Options.GuaranteedTailCallOpt))
4339 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4340
4341 if (IsVarArg)
4342 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4343 .lowerVarArgsParameters(Chain, StackSize);
4344
4345 // Some CCs need callee pop.
4346 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4347 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4348 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4349 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4350 // X86 interrupts must pop the error code (and the alignment padding) if
4351 // present.
4352 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4353 } else {
4354 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4355 // If this is an sret function, the return should pop the hidden pointer.
4356 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4357 FuncInfo->setBytesToPopOnReturn(4);
4358 }
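
StackSize here is the byte count of named stack arguments, so for a 32-bit stdcall callee taking two i32 parameters BytesToPopOnReturn becomes 8 and the return is emitted as "ret $8"; a plain cdecl callee pops nothing, and a 32-bit non-MSVC sret function pops only the 4-byte hidden pointer. A hedged sketch of a declaration that takes the callee-pop path, illustrative only:

// On 32-bit x86, a stdcall callee cleans up its own stack arguments:
// two i32 parameters -> BytesToPopOnReturn == 8 ("ret $8").
#if defined(__i386__)
__attribute__((stdcall)) int addPair(int A, int B) { return A + B; }
#else
int addPair(int A, int B) { return A + B; } // stdcall only matters on 32-bit
#endif
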
4359
4360 if (!Is64Bit) {
4361 // RegSaveFrameIndex is X86-64 only.
4362 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4363 }
4364
4365 FuncInfo->setArgumentStackSize(StackSize);
4366
4367 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4368 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4369 if (Personality == EHPersonality::CoreCLR) {
4370 assert(Is64Bit);
4371 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4372 // that we'd prefer this slot be allocated towards the bottom of the frame
4373 // (i.e. near the stack pointer after allocating the frame). Every
4374 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4375 // offset from the bottom of this and each funclet's frame must be the
4376 // same, so the size of funclets' (mostly empty) frames is dictated by
4377 // how far this slot is from the bottom (since they allocate just enough
4378 // space to accommodate holding this slot at the correct offset).
4379 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4380 EHInfo->PSPSymFrameIdx = PSPSymFI;
4381 }
4382 }
4383
4384 if (shouldDisableArgRegFromCSR(CallConv) ||
4385 F.hasFnAttribute("no_caller_saved_registers")) {
4386 MachineRegisterInfo &MRI = MF.getRegInfo();
4387 for (std::pair<Register, Register> Pair : MRI.liveins())
4388 MRI.disableCalleeSavedRegister(Pair.first);
4389 }
4390
4391 return Chain;
4392}
4393
4394SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4395 SDValue Arg, const SDLoc &dl,
4396 SelectionDAG &DAG,
4397 const CCValAssign &VA,
4398 ISD::ArgFlagsTy Flags,
4399 bool isByVal) const {
4400 unsigned LocMemOffset = VA.getLocMemOffset();
4401 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4402 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4403 StackPtr, PtrOff);
4404 if (isByVal)
4405 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4406
4407 MaybeAlign Alignment;
4408 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4409 Arg.getSimpleValueType() != MVT::f80)
4410 Alignment = MaybeAlign(4);
4411 return DAG.getStore(
4412 Chain, dl, Arg, PtrOff,
4413 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4414 Alignment);
4415}
4416
4417/// Emit a load of the return address if tail call
4418/// optimization is performed and it is required.
4419SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4420 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4421 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4422 // Adjust the Return address stack slot.
4423 EVT VT = getPointerTy(DAG.getDataLayout());
4424 OutRetAddr = getReturnAddressFrameIndex(DAG);
4425
4426 // Load the "old" Return address.
4427 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4428 return SDValue(OutRetAddr.getNode(), 1);
4429}
4430
4431/// Emit a store of the return address if tail call
4432/// optimization is performed and it is required (FPDiff!=0).
4433static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4434 SDValue Chain, SDValue RetAddrFrIdx,
4435 EVT PtrVT, unsigned SlotSize,
4436 int FPDiff, const SDLoc &dl) {
4437 // Store the return address to the appropriate stack slot.
4438 if (!FPDiff) return Chain;
4439 // Calculate the new stack slot for the return address.
4440 int NewReturnAddrFI =
4441 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4442 false);
4443 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4444 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4445 MachinePointerInfo::getFixedStack(
4446 DAG.getMachineFunction(), NewReturnAddrFI));
4447 return Chain;
4448}
4449
4450/// Returns a vector_shuffle mask for a movs{s|d} or movd
4451/// operation of the specified width.
4452static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4453 SDValue V2) {
4454 unsigned NumElems = VT.getVectorNumElements();
4455 SmallVector<int, 8> Mask;
4456 Mask.push_back(NumElems);
4457 for (unsigned i = 1; i != NumElems; ++i)
4458 Mask.push_back(i);
4459 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4460}
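
For a 4-lane type the mask built above is {4, 1, 2, 3}: shuffle indices NumElems and higher address V2, so lane 0 is taken from V2 while lanes 1..3 keep V1, which is the movss/movsd "replace the low element, keep the high lanes" pattern. A standalone sketch of the same mask construction using plain integers, with no SelectionDAG involved:

#include <cassert>
#include <vector>

// Build the MOVL-style shuffle mask for a vector with NumElems lanes:
// lane 0 comes from the second operand, lanes 1..N-1 from the first.
static std::vector<int> getMOVLMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);    // lane 0 <- V2[0]
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);         // lane i <- V1[i]
  return Mask;
}

int main() {
  assert(getMOVLMask(4) == (std::vector<int>{4, 1, 2, 3}));
  return 0;
}
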
4461
4462SDValue
4463X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4464 SmallVectorImpl<SDValue> &InVals) const {
4465 SelectionDAG &DAG = CLI.DAG;
4466 SDLoc &dl = CLI.DL;
4467 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4468 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4469 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4470 SDValue Chain = CLI.Chain;
4471 SDValue Callee = CLI.Callee;
4472 CallingConv::ID CallConv = CLI.CallConv;
4473 bool &isTailCall = CLI.IsTailCall;
4474 bool isVarArg = CLI.IsVarArg;
4475 const auto *CB = CLI.CB;
4476
4477 MachineFunction &MF = DAG.getMachineFunction();
4478 bool Is64Bit = Subtarget.is64Bit();
4479 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4480 bool IsSibcall = false;
4481 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4482 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4483 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4484 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4485 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4486 CB->hasFnAttr("no_caller_saved_registers"));
4487 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4488 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4489 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4490 const Module *M = MF.getMMI().getModule();
4491 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4492
4493 MachineFunction::CallSiteInfo CSInfo;
4494 if (CallConv == CallingConv::X86_INTR)
4495 report_fatal_error("X86 interrupts may not be called directly");
4496
4497 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4498 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4499 // If we are using a GOT, disable tail calls to external symbols with
4500 // default visibility. Tail calling such a symbol requires using a GOT
4501 // relocation, which forces early binding of the symbol. This breaks code
4502 // that requires lazy function symbol resolution. Using musttail or
4503 // GuaranteedTailCallOpt will override this.
4504 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4505 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4506 G->getGlobal()->hasDefaultVisibility()))
4507 isTailCall = false;
4508 }
4509
4510 if (isTailCall && !IsMustTail) {
4511 // Check if it's really possible to do a tail call.
4512 isTailCall = IsEligibleForTailCallOptimization(
4513 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4514 Ins, DAG);
4515
4516 // Sibcalls are automatically detected tailcalls which do not require
4517 // ABI changes.
4518 if (!IsGuaranteeTCO && isTailCall)
4519 IsSibcall = true;
4520
4521 if (isTailCall)
4522 ++NumTailCalls;
4523 }
4524
4525 if (IsMustTail && !isTailCall)
4526 report_fatal_error("failed to perform tail call elimination on a call "
4527 "site marked musttail");
4528
4529 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4530 "Var args not supported with calling convention fastcc, ghc or hipe");
4531
4532 // Analyze operands of the call, assigning locations to each operand.
4533 SmallVector<CCValAssign, 16> ArgLocs;
4534 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4535
4536 // Allocate shadow area for Win64.
4537 if (IsWin64)
4538 CCInfo.AllocateStack(32, Align(8));
4539
4540 CCInfo.AnalyzeArguments(Outs, CC_X86);
4541
4542 // In vectorcall calling convention a second pass is required for the HVA
4543 // types.
4544 if (CallingConv::X86_VectorCall == CallConv) {
4545 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4546 }
4547
4548 // Get a count of how many bytes are to be pushed on the stack.
4549 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4550 if (IsSibcall)
4551 // This is a sibcall. The memory operands are available in the caller's
4552 // own caller's stack.
4553 NumBytes = 0;
4554 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4555 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4556
4557 int FPDiff = 0;
4558 if (isTailCall &&
4559 shouldGuaranteeTCO(CallConv,
4560 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4561 // Lower arguments at fp - stackoffset + fpdiff.
4562 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4563
4564 FPDiff = NumBytesCallerPushed - NumBytes;
4565
4566 // Set the delta of movement of the return address stack slot, but only
4567 // record it if the movement is larger (FPDiff is smaller) than before.
4568 if (FPDiff < X86Info->getTCReturnAddrDelta())
4569 X86Info->setTCReturnAddrDelta(FPDiff);
4570 }
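
FPDiff is the difference between the argument bytes this function was entered with (and will pop on return) and the bytes the tail callee needs, so a callee that needs a larger argument area produces a negative FPDiff and the return address slot has to move down. A small worked sketch with illustrative sizes:

#include <cstdio>

int main() {
  // Guaranteed-TCO tail call bookkeeping, with made-up sizes.
  int NumBytesCallerPushed = 16; // bytes this function was entered with
  int NumBytes = 32;             // bytes the tail callee needs
  int SlotSize = 8;              // return-address slot size on x86-64

  int FPDiff = NumBytesCallerPushed - NumBytes; // -16
  // EmitTailCallStoreRetAddr (defined earlier in this file) re-creates the
  // return-address slot as a fixed object at offset FPDiff - SlotSize.
  std::printf("FPDiff = %d, new ret-addr slot offset = %d\n",
              FPDiff, FPDiff - SlotSize);       // -16 and -24
  return 0;
}
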
4571
4572 unsigned NumBytesToPush = NumBytes;
4573 unsigned NumBytesToPop = NumBytes;
4574
4575 // If we have an inalloca argument, all stack space has already been
4576 // allocated for us and is right at the top of the stack. We don't support
4577 // multiple arguments passed in memory when using inalloca.
4578 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4579 NumBytesToPush = 0;
4580 if (!ArgLocs.back().isMemLoc())
4581 report_fatal_error("cannot use inalloca attribute on a register "
4582 "parameter");
4583 if (ArgLocs.back().getLocMemOffset() != 0)
4584 report_fatal_error("any parameter with the inalloca attribute must be "
4585 "the only memory argument");
4586 } else if (CLI.IsPreallocated) {
4587 assert(ArgLocs.back().isMemLoc() &&
4588 "cannot use preallocated attribute on a register "
4589 "parameter");
4590 SmallVector<size_t, 4> PreallocatedOffsets;
4591 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4592 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4593 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4594 }
4595 }
4596 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4597 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4598 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4599 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4600 NumBytesToPush = 0;
4601 }
4602
4603 if (!IsSibcall && !IsMustTail)
4604 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4605 NumBytes - NumBytesToPush, dl);
4606
4607 SDValue RetAddrFrIdx;
4608 // Load return address for tail calls.
4609 if (isTailCall && FPDiff)
4610 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4611 Is64Bit, FPDiff, dl);
4612
4613 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4614 SmallVector<SDValue, 8> MemOpChains;
4615 SDValue StackPtr;
4616
4617 // The next loop assumes that the locations are in the same order as the
4618 // input arguments.
4619 assert(isSortedByValueNo(ArgLocs) &&
4620 "Argument Location list must be sorted before lowering");
4621
4622 // Walk the register/memloc assignments, inserting copies/loads. In the case
4623 // of tail call optimization, arguments are handled later.
4624 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4625 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4626 ++I, ++OutIndex) {
4627 assert(OutIndex < Outs.size() && "Invalid Out index");
4628 // Skip inalloca/preallocated arguments, they have already been written.
4629 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4630 if (Flags.isInAlloca() || Flags.isPreallocated())
4631 continue;
4632
4633 CCValAssign &VA = ArgLocs[I];
4634 EVT RegVT = VA.getLocVT();
4635 SDValue Arg = OutVals[OutIndex];
4636 bool isByVal = Flags.isByVal();
4637
4638 // Promote the value if needed.
4639 switch (VA.getLocInfo()) {
4640 default: llvm_unreachable("Unknown loc info!");
4641 case CCValAssign::Full: break;
4642 case CCValAssign::SExt:
4643 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4644 break;
4645 case CCValAssign::ZExt:
4646 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4647 break;
4648 case CCValAssign::AExt:
4649 if (Arg.getValueType().isVector() &&
4650 Arg.getValueType().getVectorElementType() == MVT::i1)
4651 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4652 else if (RegVT.is128BitVector()) {
4653 // Special case: passing MMX values in XMM registers.
4654 Arg = DAG.getBitcast(MVT::i64, Arg);
4655 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4656 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4657 } else
4658 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4659 break;
4660 case CCValAssign::BCvt:
4661 Arg = DAG.getBitcast(RegVT, Arg);
4662 break;
4663 case CCValAssign::Indirect: {
4664 if (isByVal) {
4665 // Memcpy the argument to a temporary stack slot to prevent
4666 // the caller from seeing any modifications the callee may make
4667 // as guaranteed by the `byval` attribute.
4668 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4669 Flags.getByValSize(),
4670 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4671 SDValue StackSlot =
4672 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4673 Chain =
4674 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4675 // From now on treat this as a regular pointer
4676 Arg = StackSlot;
4677 isByVal = false;
4678 } else {
4679 // Store the argument.
4680 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4681 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4682 Chain = DAG.getStore(
4683 Chain, dl, Arg, SpillSlot,
4684 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4685 Arg = SpillSlot;
4686 }
4687 break;
4688 }
4689 }
4690
4691 if (VA.needsCustom()) {
4692 assert(VA.getValVT() == MVT::v64i1 &&
4693 "Currently the only custom case is when we split v64i1 to 2 regs");
4694 // Split v64i1 value into two registers
4695 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4696 } else if (VA.isRegLoc()) {
4697 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4698 const TargetOptions &Options = DAG.getTarget().Options;
4699 if (Options.EmitCallSiteInfo)
4700 CSInfo.emplace_back(VA.getLocReg(), I);
4701 if (isVarArg && IsWin64) {
4702 // The Win64 ABI requires an argument XMM reg to be copied to the
4703 // corresponding shadow reg if the callee is a varargs function.
4704 Register ShadowReg;
4705 switch (VA.getLocReg()) {
4706 case X86::XMM0: ShadowReg = X86::RCX; break;
4707 case X86::XMM1: ShadowReg = X86::RDX; break;
4708 case X86::XMM2: ShadowReg = X86::R8; break;
4709 case X86::XMM3: ShadowReg = X86::R9; break;
4710 }
4711 if (ShadowReg)
4712 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4713 }
4714 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4715 assert(VA.isMemLoc());
4716 if (!StackPtr.getNode())
4717 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4718 getPointerTy(DAG.getDataLayout()));
4719 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4720 dl, DAG, VA, Flags, isByVal));
4721 }
4722 }
4723
4724 if (!MemOpChains.empty())
4725 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4726
4727 if (Subtarget.isPICStyleGOT()) {
4728 // ELF / PIC requires the GOT pointer to be in the EBX register before
4729 // function calls via the PLT (except for regcall).
4730 if (!isTailCall) {
4731 // An indirect call with the RegCall calling convention may use up all the
4732 // general-purpose registers, so it is not suitable to bind the EBX register
4733 // to the GOT address; just let the register allocator handle it.
4734 if (CallConv != CallingConv::X86_RegCall)
4735 RegsToPass.push_back(std::make_pair(
4736 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4737 getPointerTy(DAG.getDataLayout()))));
4738 } else {
4739 // If we are tail calling and generating PIC/GOT style code, load the
4740 // address of the callee into ECX. The value in ecx is used as target of
4741 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4742 // for tail calls on PIC/GOT architectures. Normally we would just put the
4743 // address of GOT into ebx and then call target@PLT. But for tail calls
4744 // ebx would be restored (since ebx is callee saved) before jumping to the
4745 // target@PLT.
4746
4747 // Note: The actual moving to ECX is done further down.
4748 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4749 if (G && !G->getGlobal()->hasLocalLinkage() &&
4750 G->getGlobal()->hasDefaultVisibility())
4751 Callee = LowerGlobalAddress(Callee, DAG);
4752 else if (isa<ExternalSymbolSDNode>(Callee))
4753 Callee = LowerExternalSymbol(Callee, DAG);
4754 }
4755 }
4756
4757 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4758 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4759 // From AMD64 ABI document:
4760 // For calls that may call functions that use varargs or stdargs
4761 // (prototype-less calls or calls to functions containing ellipsis (...) in
4762 // the declaration) %al is used as a hidden argument to specify the number
4763 // of SSE registers used. The contents of %al do not need to match exactly
4764 // the number of registers, but must be an upper bound on the number of SSE
4765 // registers used and is in the range 0 - 8 inclusive.
4766
4767 // Count the number of XMM registers allocated.
4768 static const MCPhysReg XMMArgRegs[] = {
4769 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4770 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4771 };
4772 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4773 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4774 && "SSE registers cannot be used when SSE is disabled");
4775 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4776 DAG.getConstant(NumXMMRegs, dl,
4777 MVT::i8)));
4778 }
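
On SysV x86-64 this shows up as a move into %al right before calling a variadic function (often "movl $1, %eax", or zeroing %eax when no vector registers carry arguments); the value only has to be an upper bound, between 0 and 8, on the SSE registers actually used. A hedged sketch of a call that needs %al = 1, using standard printf rather than code from this file:

#include <cstdio>

int main() {
  // One double argument travels in %xmm0, so the compiler typically
  // materializes %al = 1 (an upper bound on the SSE registers used)
  // before the variadic call.
  double Pi = 3.14159;
  std::printf("pi ~= %f\n", Pi);
  return 0;
}
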
4779
4780 if (isVarArg && IsMustTail) {
4781 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4782 for (const auto &F : Forwards) {
4783 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4784 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4785 }
4786 }
4787
4788 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4789 // don't need this because the eligibility check rejects calls that require
4790 // shuffling arguments passed in memory.
4791 if (!IsSibcall && isTailCall) {
4792 // Force all the incoming stack arguments to be loaded from the stack
4793 // before any new outgoing arguments are stored to the stack, because the
4794 // outgoing stack slots may alias the incoming argument stack slots, and
4795 // the alias isn't otherwise explicit. This is slightly more conservative
4796 // than necessary, because it means that each store effectively depends
4797 // on every argument instead of just those arguments it would clobber.
4798 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4799
4800 SmallVector<SDValue, 8> MemOpChains2;
4801 SDValue FIN;
4802 int FI = 0;
4803 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4804 ++I, ++OutsIndex) {
4805 CCValAssign &VA = ArgLocs[I];
4806
4807 if (VA.isRegLoc()) {
4808 if (VA.needsCustom()) {
4809        assert((CallConv == CallingConv::X86_RegCall) &&
4810               "Expecting custom case only in regcall calling convention");
4811        // This means that we are in a special case where one argument was
4812        // passed through two register locations - skip the next location.
4813 ++I;
4814 }
4815
4816 continue;
4817 }
4818
4819      assert(VA.isMemLoc());
4820 SDValue Arg = OutVals[OutsIndex];
4821 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4822 // Skip inalloca/preallocated arguments. They don't require any work.
4823 if (Flags.isInAlloca() || Flags.isPreallocated())
4824 continue;
4825 // Create frame index.
4826 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4827 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4828 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4829 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4830
4831 if (Flags.isByVal()) {
4832 // Copy relative to framepointer.
4833 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4834 if (!StackPtr.getNode())
4835 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4836 getPointerTy(DAG.getDataLayout()));
4837 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4838 StackPtr, Source);
4839
4840 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4841 ArgChain,
4842 Flags, DAG, dl));
4843 } else {
4844 // Store relative to framepointer.
4845 MemOpChains2.push_back(DAG.getStore(
4846 ArgChain, dl, Arg, FIN,
4847 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4848 }
4849 }
4850
4851 if (!MemOpChains2.empty())
4852 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4853
4854 // Store the return address to the appropriate stack slot.
4855 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4856 getPointerTy(DAG.getDataLayout()),
4857 RegInfo->getSlotSize(), FPDiff, dl);
4858 }
4859
4860 // Build a sequence of copy-to-reg nodes chained together with token chain
4861 // and glue operands which copy the outgoing args into registers.
4862 SDValue InGlue;
4863 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4864 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4865 RegsToPass[i].second, InGlue);
4866 InGlue = Chain.getValue(1);
4867 }
4868
4869 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4870    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4871 // In the 64-bit large code model, we have to make all calls
4872 // through a register, since the call instruction's 32-bit
4873 // pc-relative offset may not be large enough to hold the whole
4874 // address.
4875 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4876 Callee->getOpcode() == ISD::ExternalSymbol) {
4877 // Lower direct calls to global addresses and external symbols. Setting
4878 // ForCall to true here has the effect of removing WrapperRIP when possible
4879 // to allow direct calls to be selected without first materializing the
4880 // address into a register.
4881 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4882 } else if (Subtarget.isTarget64BitILP32() &&
4883 Callee.getValueType() == MVT::i32) {
4884 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4885 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4886 }
4887
4888 // Returns a chain & a glue for retval copy to use.
4889 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4890 SmallVector<SDValue, 8> Ops;
4891
4892 if (!IsSibcall && isTailCall && !IsMustTail) {
4893 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4894 InGlue = Chain.getValue(1);
4895 }
4896
4897 Ops.push_back(Chain);
4898 Ops.push_back(Callee);
4899
4900 if (isTailCall)
4901 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4902
4903 // Add argument registers to the end of the list so that they are known live
4904 // into the call.
4905 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4906 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4907 RegsToPass[i].second.getValueType()));
4908
4909 // Add a register mask operand representing the call-preserved registers.
4910 const uint32_t *Mask = [&]() {
4911 auto AdaptedCC = CallConv;
4912 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4913 // use X86_INTR calling convention because it has the same CSR mask
4914 // (same preserved registers).
4915 if (HasNCSR)
4916 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4917    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4918 // to use the CSR_NoRegs_RegMask.
4919 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4920 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4921 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4922 }();
4923  assert(Mask && "Missing call preserved mask for calling convention");
4924
4925 // If this is an invoke in a 32-bit function using a funclet-based
4926 // personality, assume the function clobbers all registers. If an exception
4927 // is thrown, the runtime will not restore CSRs.
4928 // FIXME: Model this more precisely so that we can register allocate across
4929 // the normal edge and spill and fill across the exceptional edge.
4930 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4931 const Function &CallerFn = MF.getFunction();
4932 EHPersonality Pers =
4933 CallerFn.hasPersonalityFn()
4934 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4935 : EHPersonality::Unknown;
4936 if (isFuncletEHPersonality(Pers))
4937 Mask = RegInfo->getNoPreservedMask();
4938 }
4939
4940 // Define a new register mask from the existing mask.
4941 uint32_t *RegMask = nullptr;
4942
4943 // In some calling conventions we need to remove the used physical registers
4944 // from the reg mask. Create a new RegMask for such calling conventions.
4945 // RegMask for calling conventions that disable only return registers (e.g.
4946 // preserve_most) will be modified later in LowerCallResult.
4947 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4948 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4949 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4950
4951 // Allocate a new Reg Mask and copy Mask.
4952 RegMask = MF.allocateRegMask();
4953 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4954 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4955
4956 // Make sure all sub registers of the argument registers are reset
4957 // in the RegMask.
4958 if (ShouldDisableArgRegs) {
4959 for (auto const &RegPair : RegsToPass)
4960 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
4961 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
4962 }
4963
4964 // Create the RegMask Operand according to our updated mask.
4965 Ops.push_back(DAG.getRegisterMask(RegMask));
4966 } else {
4967 // Create the RegMask Operand according to the static mask.
4968 Ops.push_back(DAG.getRegisterMask(Mask));
4969 }
4970
4971 if (InGlue.getNode())
4972 Ops.push_back(InGlue);
4973
4974 if (isTailCall) {
4975 // We used to do:
4976 //// If this is the first return lowered for this function, add the regs
4977 //// to the liveout set for the function.
4978 // This isn't right, although it's probably harmless on x86; liveouts
4979 // should be computed from returns not tail calls. Consider a void
4980 // function making a tail call to a function returning int.
4981 MF.getFrameInfo().setHasTailCall();
4982 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4983
4984 if (IsCFICall)
4985 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4986
4987 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4988 return Ret;
4989 }
4990
4991 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4992 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4993 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4994 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4995 // expanded to the call, directly followed by a special marker sequence and
4996 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4997    assert(!isTailCall &&
4998           "tail calls cannot be marked with clang.arc.attachedcall");
4999    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
5000
5001 // Add a target global address for the retainRV/claimRV runtime function
5002 // just before the call target.
5003 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
5004 auto PtrVT = getPointerTy(DAG.getDataLayout());
5005 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
5006 Ops.insert(Ops.begin() + 1, GA);
5007 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
5008 } else {
5009 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
5010 }
5011
5012 if (IsCFICall)
5013 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5014
5015 InGlue = Chain.getValue(1);
5016 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5017 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5018
5019 // Save heapallocsite metadata.
5020 if (CLI.CB)
5021 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5022 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5023
5024 // Create the CALLSEQ_END node.
5025 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5026 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5027 DAG.getTarget().Options.GuaranteedTailCallOpt))
5028 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5029 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5030 // If this call passes a struct-return pointer, the callee
5031 // pops that struct pointer.
5032 NumBytesForCalleeToPop = 4;
5033
5034 // Returns a glue for retval copy to use.
5035 if (!IsSibcall) {
5036 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5037 InGlue, dl);
5038 InGlue = Chain.getValue(1);
5039 }
5040
5041 // Handle result values, copying them out of physregs into vregs that we
5042 // return.
5043 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5044 InVals, RegMask);
5045}
5046
5047//===----------------------------------------------------------------------===//
5048// Fast Calling Convention (tail call) implementation
5049//===----------------------------------------------------------------------===//
5050
5051 // Like stdcall (the callee cleans up the arguments), except that ECX is
5052 // reserved for storing the tail-called function address. Only 2 registers are
5053 // free for argument passing (inreg). Tail call optimization is performed
5054 // provided:
5055 // * tailcallopt is enabled
5056 // * caller/callee are fastcc
5057 // On the X86_64 architecture with GOT-style position-independent code, only
5058 // local (within-module) calls are supported at the moment.
5059 // To keep the stack aligned according to the platform ABI, the function
5060 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5061 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
5062 // for example.) If a tail-called callee has more arguments than the caller,
5063 // the caller needs to make sure that there is room to move the RETADDR to.
5064 // This is achieved by reserving an area the size of the argument delta right
5065 // after the original RETADDR, but before the saved frame pointer or the
5066 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
5067// stack layout:
5068// arg1
5069// arg2
5070// RETADDR
5071// [ new RETADDR
5072// move area ]
5073// (possible EBP)
5074// ESI
5075// EDI
5076// local1 ..
5077
5078 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
5079 /// requirement.
5080unsigned
5081X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5082 SelectionDAG &DAG) const {
5083 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5084 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5085  assert(StackSize % SlotSize == 0 &&
5086         "StackSize must be a multiple of SlotSize");
5087 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5088}
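
A minimal standalone sketch of the rounding formula above, assuming a 32-bit target (SlotSize == 4) and a 16-byte stack alignment; alignedArgStackSize and its local alignTo are illustrative names, not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                        uint64_t StackAlignment) {
      // alignTo(x, a) rounds x up to the next multiple of a.
      auto alignTo = [](uint64_t X, uint64_t A) { return (X + A - 1) / A * A; };
      return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
    }

    int main() {
      // With SlotSize == 4 and 16-byte alignment the result is always of the
      // form 16n + 12, matching the doc comment above.
      assert(alignedArgStackSize(0, 4, 16) == 12);
      assert(alignedArgStackSize(16, 4, 16) == 28);
      assert(alignedArgStackSize(20, 4, 16) == 28);
      return 0;
    }
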
5089
5090/// Return true if the given stack call argument is already available in the
5091/// same position (relatively) of the caller's incoming argument stack.
5092static
5093bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5094 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5095 const X86InstrInfo *TII, const CCValAssign &VA) {
5096 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5097
5098 for (;;) {
5099 // Look through nodes that don't alter the bits of the incoming value.
5100 unsigned Op = Arg.getOpcode();
5101 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5102 Arg = Arg.getOperand(0);
5103 continue;
5104 }
5105 if (Op == ISD::TRUNCATE) {
5106 const SDValue &TruncInput = Arg.getOperand(0);
5107 if (TruncInput.getOpcode() == ISD::AssertZext &&
5108 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5109 Arg.getValueType()) {
5110 Arg = TruncInput.getOperand(0);
5111 continue;
5112 }
5113 }
5114 break;
5115 }
5116
5117  int FI = INT_MAX;
5118 if (Arg.getOpcode() == ISD::CopyFromReg) {
5119 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5120 if (!VR.isVirtual())
5121 return false;
5122 MachineInstr *Def = MRI->getVRegDef(VR);
5123 if (!Def)
5124 return false;
5125 if (!Flags.isByVal()) {
5126 if (!TII->isLoadFromStackSlot(*Def, FI))
5127 return false;
5128 } else {
5129 unsigned Opcode = Def->getOpcode();
5130 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5131 Opcode == X86::LEA64_32r) &&
5132 Def->getOperand(1).isFI()) {
5133 FI = Def->getOperand(1).getIndex();
5134 Bytes = Flags.getByValSize();
5135 } else
5136 return false;
5137 }
5138 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5139 if (Flags.isByVal())
5140 // ByVal argument is passed in as a pointer but it's now being
5141 // dereferenced. e.g.
5142 // define @foo(%struct.X* %A) {
5143 // tail call @bar(%struct.X* byval %A)
5144 // }
5145 return false;
5146 SDValue Ptr = Ld->getBasePtr();
5147 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5148 if (!FINode)
5149 return false;
5150 FI = FINode->getIndex();
5151 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5152 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5153 FI = FINode->getIndex();
5154 Bytes = Flags.getByValSize();
5155 } else
5156 return false;
5157
5158  assert(FI != INT_MAX);
5159 if (!MFI.isFixedObjectIndex(FI))
5160 return false;
5161
5162 if (Offset != MFI.getObjectOffset(FI))
5163 return false;
5164
5165 // If this is not byval, check that the argument stack object is immutable.
5166 // inalloca and argument copy elision can create mutable argument stack
5167 // objects. Byval objects can be mutated, but a byval call intends to pass the
5168 // mutated memory.
5169 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5170 return false;
5171
5172 if (VA.getLocVT().getFixedSizeInBits() >
5173 Arg.getValueSizeInBits().getFixedValue()) {
5174 // If the argument location is wider than the argument type, check that any
5175 // extension flags match.
5176 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5177 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5178 return false;
5179 }
5180 }
5181
5182 return Bytes == MFI.getObjectSize(FI);
5183}
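
A reduced sketch of the idea behind MatchingStackOffset, written over a hypothetical FixedStackObject record rather than the real MachineFrameInfo/SelectionDAG types: an outgoing stack argument needs no copy only if it already lives in a caller fixed stack object at the same offset, with the same size, and (for non-byval arguments) the object is immutable.

    #include <cassert>
    #include <cstdint>

    struct FixedStackObject {   // hypothetical stand-in for MachineFrameInfo data
      int64_t Offset;
      uint64_t SizeInBytes;
      bool IsImmutable;
    };

    static bool argAlreadyInPlace(const FixedStackObject &Obj, int64_t WantedOffset,
                                  uint64_t ArgBytes, bool IsByVal) {
      if (Obj.Offset != WantedOffset)
        return false;
      if (!IsByVal && !Obj.IsImmutable)   // mutable incoming slots may be clobbered
        return false;
      return Obj.SizeInBytes == ArgBytes; // size must match exactly
    }

    int main() {
      FixedStackObject Incoming{/*Offset=*/8, /*SizeInBytes=*/4, /*IsImmutable=*/true};
      assert(argAlreadyInPlace(Incoming, 8, 4, /*IsByVal=*/false));   // reusable as-is
      assert(!argAlreadyInPlace(Incoming, 12, 4, /*IsByVal=*/false)); // different slot
      return 0;
    }
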
5184
5185/// Check whether the call is eligible for tail call optimization. Targets
5186/// that want to do tail call optimization should implement this function.
5187bool X86TargetLowering::IsEligibleForTailCallOptimization(
5188 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5189 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5190 const SmallVectorImpl<SDValue> &OutVals,
5191 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5192 if (!mayTailCallThisCC(CalleeCC))
5193 return false;
5194
5195 // If -tailcallopt is specified, make fastcc functions tail-callable.
5196 MachineFunction &MF = DAG.getMachineFunction();
5197 const Function &CallerF = MF.getFunction();
5198
5199 // If the function return type is x86_fp80 and the callee return type is not,
5200 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5201 // perform a tailcall optimization here.
5202 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5203 return false;
5204
5205 CallingConv::ID CallerCC = CallerF.getCallingConv();
5206 bool CCMatch = CallerCC == CalleeCC;
5207 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5208 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5209 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5210 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5211
5212 // Win64 functions have extra shadow space for argument homing. Don't do the
5213 // sibcall if the caller and callee have mismatched expectations for this
5214 // space.
5215 if (IsCalleeWin64 != IsCallerWin64)
5216 return false;
5217
5218 if (IsGuaranteeTCO) {
5219 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5220 return true;
5221 return false;
5222 }
5223
5224 // Look for obvious safe cases to perform tail call optimization that do not
5225 // require ABI changes. This is what gcc calls sibcall.
5226
5227 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5228 // emit a special epilogue.
5229 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5230 if (RegInfo->hasStackRealignment(MF))
5231 return false;
5232
5233 // Also avoid sibcall optimization if we're an sret return fn and the callee
5234 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5235 // insufficient.
5236 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5237 // For a compatible tail call the callee must return our sret pointer. So it
5238 // needs to be (a) an sret function itself and (b) we pass our sret as its
5239 // sret. Condition #b is harder to determine.
5240 return false;
5241 } else if (IsCalleePopSRet)
5242 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5243 // expect that.
5244 return false;
5245
5246 // Do not sibcall optimize vararg calls unless all arguments are passed via
5247 // registers.
5248 LLVMContext &C = *DAG.getContext();
5249 if (isVarArg && !Outs.empty()) {
5250 // Optimizing for varargs on Win64 is unlikely to be safe without
5251 // additional testing.
5252 if (IsCalleeWin64 || IsCallerWin64)
5253 return false;
5254
5255 SmallVector<CCValAssign, 16> ArgLocs;
5256 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5257
5258 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5259 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5260 if (!ArgLocs[i].isRegLoc())
5261 return false;
5262 }
5263
5264 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5265 // stack. Therefore, if it's not used by the call it is not safe to optimize
5266 // this into a sibcall.
5267 bool Unused = false;
5268 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5269 if (!Ins[i].Used) {
5270 Unused = true;
5271 break;
5272 }
5273 }
5274 if (Unused) {
5275 SmallVector<CCValAssign, 16> RVLocs;
5276 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5277 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5278 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5279 CCValAssign &VA = RVLocs[i];
5280 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5281 return false;
5282 }
5283 }
5284
5285 // Check that the call results are passed in the same way.
5286 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5287 RetCC_X86, RetCC_X86))
5288 return false;
5289 // The callee has to preserve all registers the caller needs to preserve.
5290 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5291 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5292 if (!CCMatch) {
5293 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5294 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5295 return false;
5296 }
5297
5298 unsigned StackArgsSize = 0;
5299
5300 // If the callee takes no arguments then go on to check the results of the
5301 // call.
5302 if (!Outs.empty()) {
5303 // Check if stack adjustment is needed. For now, do not do this if any
5304 // argument is passed on the stack.
5305 SmallVector<CCValAssign, 16> ArgLocs;
5306 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5307
5308 // Allocate shadow area for Win64
5309 if (IsCalleeWin64)
5310 CCInfo.AllocateStack(32, Align(8));
5311
5312 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5313 StackArgsSize = CCInfo.getNextStackOffset();
5314
5315 if (CCInfo.getNextStackOffset()) {
5316 // Check if the arguments are already laid out in the right way as
5317 // the caller's fixed stack objects.
5318 MachineFrameInfo &MFI = MF.getFrameInfo();
5319 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5320 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5321 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5322 CCValAssign &VA = ArgLocs[i];
5323 SDValue Arg = OutVals[i];
5324 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5325 if (VA.getLocInfo() == CCValAssign::Indirect)
5326 return false;
5327 if (!VA.isRegLoc()) {
5328 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5329 MFI, MRI, TII, VA))
5330 return false;
5331 }
5332 }
5333 }
5334
5335 bool PositionIndependent = isPositionIndependent();
5336 // If the tailcall address may be in a register, then make sure it's
5337 // possible to register allocate for it. In 32-bit, the call address can
5338 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5339 // callee-saved registers are restored. These happen to be the same
5340 // registers used to pass 'inreg' arguments so watch out for those.
5341 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5342 !isa<ExternalSymbolSDNode>(Callee)) ||
5343 PositionIndependent)) {
5344 unsigned NumInRegs = 0;
5345 // In PIC we need an extra register to formulate the address computation
5346 // for the callee.
5347 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5348
5349 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5350 CCValAssign &VA = ArgLocs[i];
5351 if (!VA.isRegLoc())
5352 continue;
5353 Register Reg = VA.getLocReg();
5354 switch (Reg) {
5355 default: break;
5356 case X86::EAX: case X86::EDX: case X86::ECX:
5357 if (++NumInRegs == MaxInRegs)
5358 return false;
5359 break;
5360 }
5361 }
5362 }
5363
5364 const MachineRegisterInfo &MRI = MF.getRegInfo();
5365 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5366 return false;
5367 }
5368
5369 bool CalleeWillPop =
5370 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5371 MF.getTarget().Options.GuaranteedTailCallOpt);
5372
5373 if (unsigned BytesToPop =
5374 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5375 // If we have bytes to pop, the callee must pop them.
5376 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5377 if (!CalleePopMatches)
5378 return false;
5379 } else if (CalleeWillPop && StackArgsSize > 0) {
5380 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5381 return false;
5382 }
5383
5384 return true;
5385}
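
A reduced sketch of the final callee-pop compatibility check above, with illustrative parameter names rather than the LLVM ones: if the caller must pop bytes on return, the callee must pop exactly that many; if the caller pops nothing, a callee-pop callee must not pop anything either.

    #include <cassert>

    static bool calleePopCompatible(bool CalleeWillPop, unsigned BytesCallerPops,
                                    unsigned StackArgsSize) {
      if (BytesCallerPops != 0)                       // caller must pop on return
        return CalleeWillPop && BytesCallerPops == StackArgsSize;
      return !(CalleeWillPop && StackArgsSize > 0);   // otherwise callee must pop nothing
    }

    int main() {
      assert(calleePopCompatible(true, 8, 8));    // matching callee-pop amounts
      assert(!calleePopCompatible(false, 8, 8));  // caller pops, callee doesn't
      assert(!calleePopCompatible(true, 0, 4));   // callee would pop unexpectedly
      return 0;
    }
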
5386
5387FastISel *
5388X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5389 const TargetLibraryInfo *libInfo) const {
5390 return X86::createFastISel(funcInfo, libInfo);
5391}
5392
5393//===----------------------------------------------------------------------===//
5394// Other Lowering Hooks
5395//===----------------------------------------------------------------------===//
5396
5397bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5398 bool AssumeSingleUse) {
5399 if (!AssumeSingleUse && !Op.hasOneUse())
5400 return false;
5401 if (!ISD::isNormalLoad(Op.getNode()))
5402 return false;
5403
5404 // If this is an unaligned vector, make sure the target supports folding it.
5405 auto *Ld = cast<LoadSDNode>(Op.getNode());
5406 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5407 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5408 return false;
5409
5410 // TODO: If this is a non-temporal load and the target has an instruction
5411 // for it, it should not be folded. See "useNonTemporalLoad()".
5412
5413 return true;
5414}
5415
5416bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5417 const X86Subtarget &Subtarget,
5418 bool AssumeSingleUse) {
5419  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5420 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5421 return false;
5422
5423  // We cannot replace a wide volatile load with a broadcast-from-memory,
5424 // because that would narrow the load, which isn't legal for volatiles.
5425 auto *Ld = cast<LoadSDNode>(Op.getNode());
5426 return !Ld->isVolatile() ||
5427 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5428}
5429
5430bool X86::mayFoldIntoStore(SDValue Op) {
5431 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5432}
5433
5434bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5435 if (Op.hasOneUse()) {
5436 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5437 return (ISD::ZERO_EXTEND == Opcode);
5438 }
5439 return false;
5440}
5441
5442static bool isTargetShuffle(unsigned Opcode) {
5443 switch(Opcode) {
5444 default: return false;
5445 case X86ISD::BLENDI:
5446 case X86ISD::PSHUFB:
5447 case X86ISD::PSHUFD:
5448 case X86ISD::PSHUFHW:
5449 case X86ISD::PSHUFLW:
5450 case X86ISD::SHUFP:
5451 case X86ISD::INSERTPS:
5452 case X86ISD::EXTRQI:
5453 case X86ISD::INSERTQI:
5454 case X86ISD::VALIGN:
5455 case X86ISD::PALIGNR:
5456 case X86ISD::VSHLDQ:
5457 case X86ISD::VSRLDQ:
5458 case X86ISD::MOVLHPS:
5459 case X86ISD::MOVHLPS:
5460 case X86ISD::MOVSHDUP:
5461 case X86ISD::MOVSLDUP:
5462 case X86ISD::MOVDDUP:
5463 case X86ISD::MOVSS:
5464 case X86ISD::MOVSD:
5465 case X86ISD::MOVSH:
5466 case X86ISD::UNPCKL:
5467 case X86ISD::UNPCKH:
5468 case X86ISD::VBROADCAST:
5469 case X86ISD::VPERMILPI:
5470 case X86ISD::VPERMILPV:
5471 case X86ISD::VPERM2X128:
5472 case X86ISD::SHUF128:
5473 case X86ISD::VPERMIL2:
5474 case X86ISD::VPERMI:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 case X86ISD::VZEXT_MOVL:
5479 return true;
5480 }
5481}
5482
5483static bool isTargetShuffleVariableMask(unsigned Opcode) {
5484 switch (Opcode) {
5485 default: return false;
5486 // Target Shuffles.
5487 case X86ISD::PSHUFB:
5488 case X86ISD::VPERMILPV:
5489 case X86ISD::VPERMIL2:
5490 case X86ISD::VPPERM:
5491 case X86ISD::VPERMV:
5492 case X86ISD::VPERMV3:
5493 return true;
5494 // 'Faux' Target Shuffles.
5495 case ISD::OR:
5496 case ISD::AND:
5497 case X86ISD::ANDNP:
5498 return true;
5499 }
5500}
5501
5502SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5503 MachineFunction &MF = DAG.getMachineFunction();
5504 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5505 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5506 int ReturnAddrIndex = FuncInfo->getRAIndex();
5507
5508 if (ReturnAddrIndex == 0) {
5509 // Set up a frame object for the return address.
5510 unsigned SlotSize = RegInfo->getSlotSize();
5511 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5512 -(int64_t)SlotSize,
5513 false);
5514 FuncInfo->setRAIndex(ReturnAddrIndex);
5515 }
5516
5517 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5518}
5519
5520bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5521 bool hasSymbolicDisplacement) {
5522 // Offset should fit into 32 bit immediate field.
5523 if (!isInt<32>(Offset))
5524 return false;
5525
5526 // If we don't have a symbolic displacement - we don't have any extra
5527 // restrictions.
5528 if (!hasSymbolicDisplacement)
5529 return true;
5530
5531 // FIXME: Some tweaks might be needed for medium code model.
5532 if (M != CodeModel::Small && M != CodeModel::Kernel)
5533 return false;
5534
5535  // For the small code model we assume that the latest object is 16MB before the
5536  // end of the 31-bit boundary. We may also accept pretty large negative constants,
5537  // knowing that all objects are in the positive half of the address space.
5538 if (M == CodeModel::Small && Offset < 16*1024*1024)
5539 return true;
5540
5541  // For the kernel code model we know that all objects reside in the negative half
5542  // of the 32-bit address space. We must not accept negative offsets, since they
5543  // may be just out of range, while we may accept pretty large positive ones.
5544 if (M == CodeModel::Kernel && Offset >= 0)
5545 return true;
5546
5547 return false;
5548}
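
A standalone sketch with worked values for the two code-model rules above, assuming a symbolic displacement is present; isInt<32> is re-implemented locally and the helper names and numbers are illustrative, not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    static bool fitsSmallModel(int64_t Offset) {
      bool IsInt32 = Offset >= INT32_MIN && Offset <= INT32_MAX;
      return IsInt32 && Offset < 16 * 1024 * 1024;   // small code model rule
    }

    static bool fitsKernelModel(int64_t Offset) {
      bool IsInt32 = Offset >= INT32_MIN && Offset <= INT32_MAX;
      return IsInt32 && Offset >= 0;                 // kernel code model rule
    }

    int main() {
      assert(fitsSmallModel(15 * 1024 * 1024));      // below the 16MB guard band
      assert(!fitsSmallModel(32 * 1024 * 1024));     // at or above the 16MB limit
      assert(fitsKernelModel(0) && !fitsKernelModel(-8));
      return 0;
    }
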
5549
5550/// Determines whether the callee is required to pop its own arguments.
5551/// Callee pop is necessary to support tail calls.
5552bool X86::isCalleePop(CallingConv::ID CallingConv,
5553 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5554 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5555 // can guarantee TCO.
5556 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5557 return true;
5558
5559 switch (CallingConv) {
5560 default:
5561 return false;
5562 case CallingConv::X86_StdCall:
5563 case CallingConv::X86_FastCall:
5564 case CallingConv::X86_ThisCall:
5565 case CallingConv::X86_VectorCall:
5566 return !is64Bit;
5567 }
5568}
5569
5570 /// Return true if the condition is a signed comparison operation.
5571static bool isX86CCSigned(unsigned X86CC) {
5572 switch (X86CC) {
5573 default:
5574    llvm_unreachable("Invalid integer condition!");
5575 case X86::COND_E:
5576 case X86::COND_NE:
5577 case X86::COND_B:
5578 case X86::COND_A:
5579 case X86::COND_BE:
5580 case X86::COND_AE:
5581 return false;
5582 case X86::COND_G:
5583 case X86::COND_GE:
5584 case X86::COND_L:
5585 case X86::COND_LE:
5586 return true;
5587 }
5588}
5589
5590static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5591 switch (SetCCOpcode) {
5592  default: llvm_unreachable("Invalid integer condition!");
5593 case ISD::SETEQ: return X86::COND_E;
5594 case ISD::SETGT: return X86::COND_G;
5595 case ISD::SETGE: return X86::COND_GE;
5596 case ISD::SETLT: return X86::COND_L;
5597 case ISD::SETLE: return X86::COND_LE;
5598 case ISD::SETNE: return X86::COND_NE;
5599 case ISD::SETULT: return X86::COND_B;
5600 case ISD::SETUGT: return X86::COND_A;
5601 case ISD::SETULE: return X86::COND_BE;
5602 case ISD::SETUGE: return X86::COND_AE;
5603 }
5604}
5605
5606/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5607/// condition code, returning the condition code and the LHS/RHS of the
5608/// comparison to make.
5609static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5610 bool isFP, SDValue &LHS, SDValue &RHS,
5611 SelectionDAG &DAG) {
5612 if (!isFP) {
5613 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5614 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5615 // X > -1 -> X == 0, jump !sign.
5616 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5617 return X86::COND_NS;
5618 }
5619 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5620 // X < 0 -> X == 0, jump on sign.
5621 return X86::COND_S;
5622 }
5623 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5624 // X >= 0 -> X == 0, jump on !sign.
5625 return X86::COND_NS;
5626 }
5627 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5628 // X < 1 -> X <= 0
5629 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5630 return X86::COND_LE;
5631 }
5632 }
5633
5634 return TranslateIntegerX86CC(SetCCOpcode);
5635 }
5636
5637 // First determine if it is required or is profitable to flip the operands.
5638
5639 // If LHS is a foldable load, but RHS is not, flip the condition.
5640 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5641 !ISD::isNON_EXTLoad(RHS.getNode())) {
5642 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5643 std::swap(LHS, RHS);
5644 }
5645
5646 switch (SetCCOpcode) {
5647 default: break;
5648 case ISD::SETOLT:
5649 case ISD::SETOLE:
5650 case ISD::SETUGT:
5651 case ISD::SETUGE:
5652 std::swap(LHS, RHS);
5653 break;
5654 }
5655
5656 // On a floating point condition, the flags are set as follows:
5657 // ZF PF CF op
5658 // 0 | 0 | 0 | X > Y
5659 // 0 | 0 | 1 | X < Y
5660 // 1 | 0 | 0 | X == Y
5661 // 1 | 1 | 1 | unordered
5662 switch (SetCCOpcode) {
5663  default: llvm_unreachable("Condcode should be pre-legalized away");
5664 case ISD::SETUEQ:
5665 case ISD::SETEQ: return X86::COND_E;
5666 case ISD::SETOLT: // flipped
5667 case ISD::SETOGT:
5668 case ISD::SETGT: return X86::COND_A;
5669 case ISD::SETOLE: // flipped
5670 case ISD::SETOGE:
5671 case ISD::SETGE: return X86::COND_AE;
5672 case ISD::SETUGT: // flipped
5673 case ISD::SETULT:
5674 case ISD::SETLT: return X86::COND_B;
5675 case ISD::SETUGE: // flipped
5676 case ISD::SETULE:
5677 case ISD::SETLE: return X86::COND_BE;
5678 case ISD::SETONE:
5679 case ISD::SETNE: return X86::COND_NE;
5680 case ISD::SETUO: return X86::COND_P;
5681 case ISD::SETO: return X86::COND_NP;
5682 case ISD::SETOEQ:
5683 case ISD::SETUNE: return X86::COND_INVALID;
5684 }
5685}
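
A standalone restatement of the four scalar-integer peepholes above; SetCC, X86Cond and foldIntSetCC are illustrative names, not LLVM types:

    #include <cassert>
    #include <cstdint>

    enum class SetCC { GT, GE, LT, Other };
    enum class X86Cond { NS, S, LE, NotSpecial };

    // Returns the special-cased X86 condition for "x <cc> RHSConst" and rewrites
    // RHSConst to 0 where the lowering above does the same.
    static X86Cond foldIntSetCC(SetCC CC, int64_t &RHSConst) {
      if (CC == SetCC::GT && RHSConst == -1) { RHSConst = 0; return X86Cond::NS; } // x > -1
      if (CC == SetCC::LT && RHSConst == 0)  {               return X86Cond::S;  } // x < 0
      if (CC == SetCC::GE && RHSConst == 0)  {               return X86Cond::NS; } // x >= 0
      if (CC == SetCC::LT && RHSConst == 1)  { RHSConst = 0; return X86Cond::LE; } // x < 1 -> x <= 0
      return X86Cond::NotSpecial;
    }

    int main() {
      int64_t RHS = -1;
      assert(foldIntSetCC(SetCC::GT, RHS) == X86Cond::NS && RHS == 0);
      RHS = 1;
      assert(foldIntSetCC(SetCC::LT, RHS) == X86Cond::LE && RHS == 0);
      return 0;
    }
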
5686
5687/// Is there a floating point cmov for the specific X86 condition code?
5688 /// The current x86 ISA includes the following FP cmov instructions:
5689 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5690static bool hasFPCMov(unsigned X86CC) {
5691 switch (X86CC) {
5692 default:
5693 return false;
5694 case X86::COND_B:
5695 case X86::COND_BE:
5696 case X86::COND_E:
5697 case X86::COND_P:
5698 case X86::COND_A:
5699 case X86::COND_AE:
5700 case X86::COND_NE:
5701 case X86::COND_NP:
5702 return true;
5703 }
5704}
5705
5706static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5707 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5708 VT.is512BitVector();
5709}
5710
5711bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5712 const CallInst &I,
5713 MachineFunction &MF,
5714 unsigned Intrinsic) const {
5715 Info.flags = MachineMemOperand::MONone;
5716 Info.offset = 0;
5717
5718 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5719 if (!IntrData) {
5720 switch (Intrinsic) {
5721 case Intrinsic::x86_aesenc128kl:
5722 case Intrinsic::x86_aesdec128kl:
5723 Info.opc = ISD::INTRINSIC_W_CHAIN;
5724 Info.ptrVal = I.getArgOperand(1);
5725 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5726 Info.align = Align(1);
5727 Info.flags |= MachineMemOperand::MOLoad;
5728 return true;
5729 case Intrinsic::x86_aesenc256kl:
5730 case Intrinsic::x86_aesdec256kl:
5731 Info.opc = ISD::INTRINSIC_W_CHAIN;
5732 Info.ptrVal = I.getArgOperand(1);
5733 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5734 Info.align = Align(1);
5735 Info.flags |= MachineMemOperand::MOLoad;
5736 return true;
5737 case Intrinsic::x86_aesencwide128kl:
5738 case Intrinsic::x86_aesdecwide128kl:
5739 Info.opc = ISD::INTRINSIC_W_CHAIN;
5740 Info.ptrVal = I.getArgOperand(0);
5741 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5742 Info.align = Align(1);
5743 Info.flags |= MachineMemOperand::MOLoad;
5744 return true;
5745 case Intrinsic::x86_aesencwide256kl:
5746 case Intrinsic::x86_aesdecwide256kl:
5747 Info.opc = ISD::INTRINSIC_W_CHAIN;
5748 Info.ptrVal = I.getArgOperand(0);
5749 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5750 Info.align = Align(1);
5751 Info.flags |= MachineMemOperand::MOLoad;
5752 return true;
5753 case Intrinsic::x86_cmpccxadd32:
5754 case Intrinsic::x86_cmpccxadd64:
5755 case Intrinsic::x86_atomic_bts:
5756 case Intrinsic::x86_atomic_btc:
5757 case Intrinsic::x86_atomic_btr: {
5758 Info.opc = ISD::INTRINSIC_W_CHAIN;
5759 Info.ptrVal = I.getArgOperand(0);
5760 unsigned Size = I.getType()->getScalarSizeInBits();
5761 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5762 Info.align = Align(Size);
5763 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5764 MachineMemOperand::MOVolatile;
5765 return true;
5766 }
5767 case Intrinsic::x86_atomic_bts_rm:
5768 case Intrinsic::x86_atomic_btc_rm:
5769 case Intrinsic::x86_atomic_btr_rm: {
5770 Info.opc = ISD::INTRINSIC_W_CHAIN;
5771 Info.ptrVal = I.getArgOperand(0);
5772 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5773 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5774 Info.align = Align(Size);
5775 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5776 MachineMemOperand::MOVolatile;
5777 return true;
5778 }
5779 case Intrinsic::x86_aadd32:
5780 case Intrinsic::x86_aadd64:
5781 case Intrinsic::x86_aand32:
5782 case Intrinsic::x86_aand64:
5783 case Intrinsic::x86_aor32:
5784 case Intrinsic::x86_aor64:
5785 case Intrinsic::x86_axor32:
5786 case Intrinsic::x86_axor64:
5787 case Intrinsic::x86_atomic_add_cc:
5788 case Intrinsic::x86_atomic_sub_cc:
5789 case Intrinsic::x86_atomic_or_cc:
5790 case Intrinsic::x86_atomic_and_cc:
5791 case Intrinsic::x86_atomic_xor_cc: {
5792 Info.opc = ISD::INTRINSIC_W_CHAIN;
5793 Info.ptrVal = I.getArgOperand(0);
5794 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5795 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5796 Info.align = Align(Size);
5797 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5798 MachineMemOperand::MOVolatile;
5799 return true;
5800 }
5801 }
5802 return false;
5803 }
5804
5805 switch (IntrData->Type) {
5806 case TRUNCATE_TO_MEM_VI8:
5807 case TRUNCATE_TO_MEM_VI16:
5808 case TRUNCATE_TO_MEM_VI32: {
5809 Info.opc = ISD::INTRINSIC_VOID;
5810 Info.ptrVal = I.getArgOperand(0);
5811 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5812 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5813 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5814 ScalarVT = MVT::i8;
5815 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5816 ScalarVT = MVT::i16;
5817 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5818 ScalarVT = MVT::i32;
5819
5820 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5821 Info.align = Align(1);
5822 Info.flags |= MachineMemOperand::MOStore;
5823 break;
5824 }
5825 case GATHER:
5826 case GATHER_AVX2: {
5827 Info.opc = ISD::INTRINSIC_W_CHAIN;
5828 Info.ptrVal = nullptr;
5829 MVT DataVT = MVT::getVT(I.getType());
5830 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5831 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5832 IndexVT.getVectorNumElements());
5833 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5834 Info.align = Align(1);
5835 Info.flags |= MachineMemOperand::MOLoad;
5836 break;
5837 }
5838 case SCATTER: {
5839 Info.opc = ISD::INTRINSIC_VOID;
5840 Info.ptrVal = nullptr;
5841 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5842 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5843 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5844 IndexVT.getVectorNumElements());
5845 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5846 Info.align = Align(1);
5847 Info.flags |= MachineMemOperand::MOStore;
5848 break;
5849 }
5850 default:
5851 return false;
5852 }
5853
5854 return true;
5855}
5856
5857/// Returns true if the target can instruction select the
5858/// specified FP immediate natively. If false, the legalizer will
5859/// materialize the FP immediate as a load from a constant pool.
5860bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5861 bool ForCodeSize) const {
5862 for (const APFloat &FPImm : LegalFPImmediates)
5863 if (Imm.bitwiseIsEqual(FPImm))
5864 return true;
5865 return false;
5866}
5867
5868bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5869 ISD::LoadExtType ExtTy,
5870 EVT NewVT) const {
5871  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5872
5873 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5874  // relocation must target a movq or addq instruction: don't let the load shrink.
5875 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5876 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5877 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5878 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5879
5880 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5881 // those uses are extracted directly into a store, then the extract + store
5882 // can be store-folded. Therefore, it's probably not worth splitting the load.
5883 EVT VT = Load->getValueType(0);
5884 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5885 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5886 // Skip uses of the chain value. Result 0 of the node is the load value.
5887 if (UI.getUse().getResNo() != 0)
5888 continue;
5889
5890 // If this use is not an extract + store, it's probably worth splitting.
5891 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5892 UI->use_begin()->getOpcode() != ISD::STORE)
5893 return true;
5894 }
5895 // All non-chain uses are extract + store.
5896 return false;
5897 }
5898
5899 return true;
5900}
5901
5902/// Returns true if it is beneficial to convert a load of a constant
5903/// to just the constant itself.
5904bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5905 Type *Ty) const {
5906  assert(Ty->isIntegerTy());
5907
5908 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5909 if (BitSize == 0 || BitSize > 64)
5910 return false;
5911 return true;
5912}
5913
5914bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5915 // If we are using XMM registers in the ABI and the condition of the select is
5916 // a floating-point compare and we have blendv or conditional move, then it is
5917 // cheaper to select instead of doing a cross-register move and creating a
5918 // load that depends on the compare result.
5919 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5920 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5921}
5922
5923bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5924 // TODO: It might be a win to ease or lift this restriction, but the generic
5925 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5926 if (VT.isVector() && Subtarget.hasAVX512())
5927 return false;
5928
5929 return true;
5930}
5931
5932bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5933 SDValue C) const {
5934 // TODO: We handle scalars using custom code, but generic combining could make
5935 // that unnecessary.
5936 APInt MulC;
5937 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5938 return false;
5939
5940  // Find the type this will be legalized to. Otherwise we might prematurely
5941 // convert this to shl+add/sub and then still have to type legalize those ops.
5942 // Another choice would be to defer the decision for illegal types until
5943 // after type legalization. But constant splat vectors of i64 can't make it
5944 // through type legalization on 32-bit targets so we would need to special
5945 // case vXi64.
5946 while (getTypeAction(Context, VT) != TypeLegal)
5947 VT = getTypeToTransformTo(Context, VT);
5948
5949 // If vector multiply is legal, assume that's faster than shl + add/sub.
5950  // Multiply is a complex op with higher latency and lower throughput in
5951  // most implementations; sub-vXi32 vector multiplies are always fast,
5952  // vXi32 must not have a SlowMULLD implementation, and anything larger (vXi64)
5953  // is always going to be slow.
5954 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5955 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5956 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5957 return false;
5958
5959 // shl+add, shl+sub, shl+add+neg
5960 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5961 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5962}
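
Concrete instances of the "shl+add / shl+sub / shl+add+neg" decompositions the hook above is willing to trade a multiply for, checked with plain wrapping unsigned arithmetic (a standalone sketch, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 37;
      assert(x * 9u == (x << 3) + x);                            // MulC - 1 is a power of 2: shl+add
      assert(x * 7u == (x << 3) - x);                            // MulC + 1 is a power of 2: shl+sub
      assert(x * uint32_t(-7) == x - (x << 3));                  // 1 - MulC is a power of 2
      assert(x * uint32_t(-9) == uint32_t(0) - ((x << 3) + x));  // -(MulC + 1) is a power of 2: shl+add+neg
      return 0;
    }
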
5963
5964bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5965 unsigned Index) const {
5966 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5967 return false;
5968
5969 // Mask vectors support all subregister combinations and operations that
5970 // extract half of vector.
5971 if (ResVT.getVectorElementType() == MVT::i1)
5972 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5973 (Index == ResVT.getVectorNumElements()));
5974
5975 return (Index % ResVT.getVectorNumElements()) == 0;
5976}
5977
5978bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5979 unsigned Opc = VecOp.getOpcode();
5980
5981 // Assume target opcodes can't be scalarized.
5982 // TODO - do we have any exceptions?
5983 if (Opc >= ISD::BUILTIN_OP_END)
5984 return false;
5985
5986 // If the vector op is not supported, try to convert to scalar.
5987 EVT VecVT = VecOp.getValueType();
5988 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5989 return true;
5990
5991 // If the vector op is supported, but the scalar op is not, the transform may
5992 // not be worthwhile.
5993 EVT ScalarVT = VecVT.getScalarType();
5994 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5995}
5996
5997bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5998 bool) const {
5999 // TODO: Allow vectors?
6000 if (VT.isVector())
6001 return false;
6002 return VT.isSimple() || !isOperationExpand(Opcode, VT);
6003}
6004
6005bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
6006 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
6007 return Subtarget.hasBMI() ||
6008 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
6009}
6010
6011bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
6012 // Speculate ctlz only if we can directly use LZCNT.
6013 return Subtarget.hasLZCNT();
6014}
6015
6016bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6017 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6018 // expensive than a straight movsd. On the other hand, it's important to
6019 // shrink long double fp constant since fldt is very slow.
6020 return !Subtarget.hasSSE2() || VT == MVT::f80;
6021}
6022
6023bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6024 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6025 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6026}
6027
6028bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6029 const SelectionDAG &DAG,
6030 const MachineMemOperand &MMO) const {
6031 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6032 BitcastVT.getVectorElementType() == MVT::i1)
6033 return false;
6034
6035 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6036 return false;
6037
6038 // If both types are legal vectors, it's always ok to convert them.
6039 if (LoadVT.isVector() && BitcastVT.isVector() &&
6040 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6041 return true;
6042
6043 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6044}
6045
6046bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6047 const MachineFunction &MF) const {
6048  // Do not merge to float value size (128 bits) if no implicit
6049 // float attribute is set.
6050 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6051
6052 if (NoFloat) {
6053 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6054 return (MemVT.getSizeInBits() <= MaxIntSize);
6055 }
6056 // Make sure we don't merge greater than our preferred vector
6057 // width.
6058 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6059 return false;
6060
6061 return true;
6062}
6063
6064bool X86TargetLowering::isCtlzFast() const {
6065 return Subtarget.hasFastLZCNT();
6066}
6067
6068bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6069 const Instruction &AndI) const {
6070 return true;
6071}
6072
6073bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6074 EVT VT = Y.getValueType();
6075
6076 if (VT.isVector())
6077 return false;
6078
6079 if (!Subtarget.hasBMI())
6080 return false;
6081
6082 // There are only 32-bit and 64-bit forms for 'andn'.
6083 if (VT != MVT::i32 && VT != MVT::i64)
6084 return false;
6085
6086 return !isa<ConstantSDNode>(Y);
6087}
6088
6089bool X86TargetLowering::hasAndNot(SDValue Y) const {
6090 EVT VT = Y.getValueType();
6091
6092 if (!VT.isVector())
6093 return hasAndNotCompare(Y);
6094
6095 // Vector.
6096
6097 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6098 return false;
6099
6100 if (VT == MVT::v4i32)
6101 return true;
6102
6103 return Subtarget.hasSSE2();
6104}
6105
6106bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6107 return X.getValueType().isScalarInteger(); // 'bt'
6108}
6109
6110bool X86TargetLowering::
6111 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6112 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6113 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6114 SelectionDAG &DAG) const {
6115 // Does baseline recommend not to perform the fold by default?
6116 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6117 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6118 return false;
6119 // For scalars this transform is always beneficial.
6120 if (X.getValueType().isScalarInteger())
6121 return true;
6122 // If all the shift amounts are identical, then transform is beneficial even
6123 // with rudimentary SSE2 shifts.
6124 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6125 return true;
6126  // If we have AVX2 with its powerful shift operations, then it's also good.
6127 if (Subtarget.hasAVX2())
6128 return true;
6129 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6130 return NewShiftOpcode == ISD::SHL;
6131}
6132
6133bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6134 return N->getOpcode() != ISD::FP_EXTEND;
6135}
6136
6137bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6138 const SDNode *N, CombineLevel Level) const {
6139  assert(((N->getOpcode() == ISD::SHL &&
6140           N->getOperand(0).getOpcode() == ISD::SRL) ||
6141          (N->getOpcode() == ISD::SRL &&
6142           N->getOperand(0).getOpcode() == ISD::SHL)) &&
6143         "Expected shift-shift mask");
6144 // TODO: Should we always create i64 masks? Or only folded immediates?
6145 EVT VT = N->getValueType(0);
6146 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6147 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6148 // Only fold if the shift values are equal - so it folds to AND.
6149 // TODO - we should fold if either is a non-uniform vector but we don't do
6150 // the fold for non-splats yet.
6151 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6152 }
6153 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6154}
6155
6156bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6157 EVT VT = Y.getValueType();
6158
6159 // For vectors, we don't have a preference, but we probably want a mask.
6160 if (VT.isVector())
6161 return false;
6162
6163 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6164 if (VT == MVT::i64 && !Subtarget.is64Bit())
6165 return false;
6166
6167 return true;
6168}
6169
6170TargetLowering::ShiftLegalizationStrategy
6171X86TargetLowering::preferredShiftLegalizationStrategy(
6172 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6173 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6174 !Subtarget.isOSWindows())
6175 return ShiftLegalizationStrategy::LowerToLibcall;
6176 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6177 ExpansionFactor);
6178}
6179
6180bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6181 // Any legal vector type can be splatted more efficiently than
6182 // loading/spilling from memory.
6183 return isTypeLegal(VT);
6184}
6185
6186MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6187 MVT VT = MVT::getIntegerVT(NumBits);
6188 if (isTypeLegal(VT))
6189 return VT;
6190
6191 // PMOVMSKB can handle this.
6192 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6193 return MVT::v16i8;
6194
6195 // VPMOVMSKB can handle this.
6196 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6197 return MVT::v32i8;
6198
6199 // TODO: Allow 64-bit type for 32-bit target.
6200 // TODO: 512-bit types should be allowed, but make sure that those
6201 // cases are handled in combineVectorSizedSetCCEquality().
6202
6203 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6204}
6205
6206/// Val is the undef sentinel value or equal to the specified value.
6207static bool isUndefOrEqual(int Val, int CmpVal) {
6208 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6209}
6210
6211/// Return true if every element in Mask is the undef sentinel value or equal to
6212/// the specified value.
6213static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6214 return llvm::all_of(Mask, [CmpVal](int M) {
6215 return (M == SM_SentinelUndef) || (M == CmpVal);
6216 });
6217}
6218
6219/// Val is either the undef or zero sentinel value.
6220static bool isUndefOrZero(int Val) {
6221 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6222}
6223
6224/// Return true if every element in Mask, beginning from position Pos and ending
6225/// in Pos+Size, is the undef sentinel value.
6226static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6227 return llvm::all_of(Mask.slice(Pos, Size),
6228 [](int M) { return M == SM_SentinelUndef; });
6229}
6230
6231/// Return true if the mask creates a vector whose lower half is undefined.
6232static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6233 unsigned NumElts = Mask.size();
6234 return isUndefInRange(Mask, 0, NumElts / 2);
6235}
6236
6237/// Return true if the mask creates a vector whose upper half is undefined.
6238static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6239 unsigned NumElts = Mask.size();
6240 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6241}
6242
6243/// Return true if Val falls within the specified half-open range [Low, Hi).
6244static bool isInRange(int Val, int Low, int Hi) {
6245 return (Val >= Low && Val < Hi);
6246}
6247
6248/// Return true if the value of any element in Mask falls within the specified
6249/// half-open range [Low, Hi).
6250static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6251 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6252}
6253
6254/// Return true if the value of any element in Mask is the zero sentinel value.
6255static bool isAnyZero(ArrayRef<int> Mask) {
6256 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6257}
6258
6259/// Return true if the value of any element in Mask is the zero or undef
6260/// sentinel values.
6261static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6262 return llvm::any_of(Mask, [](int M) {
6263 return M == SM_SentinelZero || M == SM_SentinelUndef;
6264 });
6265}
6266
6267/// Return true if Val is undef or if its value falls within the
6268/// specified half-open range [Low, Hi).
6269static bool isUndefOrInRange(int Val, int Low, int Hi) {
6270 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6271}
6272
6273/// Return true if every element in Mask is undef or if its value
6274/// falls within the specified half-open range [Low, Hi).
6275static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6276 return llvm::all_of(
6277 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6278}
6279
6280/// Return true if Val is undef, zero or if its value falls within the
6281/// specified half-open range [Low, Hi).
6282static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6283 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6284}
6285
6286/// Return true if every element in Mask is undef, zero or if its value
6287/// falls within the specified half-open range [Low, Hi).
6288static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6289 return llvm::all_of(
6290 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6291}
6292
6293/// Return true if every element in Mask, beginning
6294/// from position Pos and ending in Pos + Size, falls within the specified
6295/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
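/// For example, with Pos = 0, Size = 4, Low = 4 and Step = 1, the mask
/// fragment <4, -1, 6, 7> matches (an illustrative case; -1 is the undef
/// sentinel used by these masks).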
6296static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6297 unsigned Size, int Low, int Step = 1) {
6298 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6299 if (!isUndefOrEqual(Mask[i], Low))
6300 return false;
6301 return true;
6302}
6303
6304/// Return true if every element in Mask, beginning
6305/// from position Pos and ending in Pos+Size, falls within the specified
6306/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
6307static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6308 unsigned Size, int Low,
6309 int Step = 1) {
6310 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6311 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6312 return false;
6313 return true;
6314}
6315
6316/// Return true if every element in Mask, beginning
6317/// from position Pos and ending in Pos+Size, is undef or is zero.
6318static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6319 unsigned Size) {
6320 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6321}
6322
6323/// Helper function to test whether a shuffle mask could be
6324/// simplified by widening the elements being shuffled.
6325///
6326/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6327/// leaves it in an unspecified state.
6328///
6329/// NOTE: This must handle normal vector shuffle masks and *target* vector
6330/// shuffle masks. The latter have the special property of a '-2' representing
6331/// a zeroed lane of a vector.
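/// For example, the mask <-1, 3, 4, 5> widens to <1, 2> (taking -1 as the
/// undef sentinel), while <0, 2, 4, 6> cannot be widened because no pair of
/// adjacent elements forms an aligned (even, even + 1) pair.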
6332static bool canWidenShuffleElements(ArrayRef<int> Mask,
6333 SmallVectorImpl<int> &WidenedMask) {
6334 WidenedMask.assign(Mask.size() / 2, 0);
6335 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6336 int M0 = Mask[i];
6337 int M1 = Mask[i + 1];
6338
6339 // If both elements are undef, it's trivial.
6340 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6341 WidenedMask[i / 2] = SM_SentinelUndef;
6342 continue;
6343 }
6344
6345 // Check for an undef mask and a mask value properly aligned to fit with
6346 // a pair of values. If we find such a case, use the non-undef mask's value.
6347 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6348 WidenedMask[i / 2] = M1 / 2;
6349 continue;
6350 }
6351 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6352 WidenedMask[i / 2] = M0 / 2;
6353 continue;
6354 }
6355
6356 // When zeroing, we need to spread the zeroing across both lanes to widen.
6357 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6358 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6359 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6360 WidenedMask[i / 2] = SM_SentinelZero;
6361 continue;
6362 }
6363 return false;
6364 }
6365
6366 // Finally check if the two mask values are adjacent and aligned with
6367 // a pair.
6368 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6369 WidenedMask[i / 2] = M0 / 2;
6370 continue;
6371 }
6372
6373 // Otherwise we can't safely widen the elements used in this shuffle.
6374 return false;
6375 }
6376 assert(WidenedMask.size() == Mask.size() / 2 &&
6377 "Incorrect size of mask after widening the elements!");
6378
6379 return true;
6380}
6381
6382static bool canWidenShuffleElements(ArrayRef<int> Mask,
6383 const APInt &Zeroable,
6384 bool V2IsZero,
6385 SmallVectorImpl<int> &WidenedMask) {
6386 // Create an alternative mask with info about zeroable elements.
6387 // Here we do not set undef elements as zeroable.
6388 SmallVector<int, 64> ZeroableMask(Mask);
6389 if (V2IsZero) {
6390 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6391 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6392 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6393 ZeroableMask[i] = SM_SentinelZero;
6394 }
6395 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6396}
6397
6398static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6399 SmallVector<int, 32> WidenedMask;
6400 return canWidenShuffleElements(Mask, WidenedMask);
6401}
6402
6403// Attempt to narrow/widen the shuffle mask until it matches the target number of
6404// elements.
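// For example, a 4-element mask <0, 1, 2, 3> scales to <0, 1> for
// NumDstElts == 2 and to <0, 1, 2, 3, 4, 5, 6, 7> for NumDstElts == 8
// (an illustrative sketch of the two directions).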
6405static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6406 SmallVectorImpl<int> &ScaledMask) {
6407 unsigned NumSrcElts = Mask.size();
6408 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6409 "Illegal shuffle scale factor");
6410
6411 // Narrowing is guaranteed to work.
6412 if (NumDstElts >= NumSrcElts) {
6413 int Scale = NumDstElts / NumSrcElts;
6414 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6415 return true;
6416 }
6417
6418 // We have to repeat the widening until we reach the target size, but we can
6419 // split out the first widening as it sets up ScaledMask for us.
6420 if (canWidenShuffleElements(Mask, ScaledMask)) {
6421 while (ScaledMask.size() > NumDstElts) {
6422 SmallVector<int, 16> WidenedMask;
6423 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6424 return false;
6425 ScaledMask = std::move(WidenedMask);
6426 }
6427 return true;
6428 }
6429
6430 return false;
6431}
6432
6433/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6434bool X86::isZeroNode(SDValue Elt) {
6435 return isNullConstant(Elt) || isNullFPConstant(Elt);
6436}
6437
6438// Build a vector of constants.
6439// Use an UNDEF node if MaskElt == -1.
6440// Split 64-bit constants in 32-bit mode.
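// For example, on a 32-bit target a v2i64 constant vector is emitted as a
// v4i32 build_vector and then bitcast back to v2i64 (illustrative sketch).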
6441static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6442 const SDLoc &dl, bool IsMask = false) {
6443
6444 SmallVector<SDValue, 32> Ops;
6445 bool Split = false;
6446
6447 MVT ConstVecVT = VT;
6448 unsigned NumElts = VT.getVectorNumElements();
6449 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6450 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6451 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6452 Split = true;
6453 }
6454
6455 MVT EltVT = ConstVecVT.getVectorElementType();
6456 for (unsigned i = 0; i < NumElts; ++i) {
6457 bool IsUndef = Values[i] < 0 && IsMask;
6458 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6459 DAG.getConstant(Values[i], dl, EltVT);
6460 Ops.push_back(OpNode);
6461 if (Split)
6462 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6463 DAG.getConstant(0, dl, EltVT));
6464 }
6465 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6466 if (Split)
6467 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6468 return ConstsNode;
6469}
6470
6471static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
6472 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6473 assert(Bits.size() == Undefs.getBitWidth() &&
6474 "Unequal constant and undef arrays");
6475 SmallVector<SDValue, 32> Ops;
6476 bool Split = false;
6477
6478 MVT ConstVecVT = VT;
6479 unsigned NumElts = VT.getVectorNumElements();
6480 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6481 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6482 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6483 Split = true;
6484 }
6485
6486 MVT EltVT = ConstVecVT.getVectorElementType();
6487 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6488 if (Undefs[i]) {
6489 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6490 continue;
6491 }
6492 const APInt &V = Bits[i];
6493 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6494 if (Split) {
6495 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6496 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6497 } else if (EltVT == MVT::f32) {
6498 APFloat FV(APFloat::IEEEsingle(), V);
6499 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6500 } else if (EltVT == MVT::f64) {
6501 APFloat FV(APFloat::IEEEdouble(), V);
6502 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6503 } else {
6504 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6505 }
6506 }
6507
6508 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6509 return DAG.getBitcast(VT, ConstsNode);
6510}
6511
6512static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
6513 SelectionDAG &DAG, const SDLoc &dl) {
6514 APInt Undefs = APInt::getZero(Bits.size());
6515 return getConstVector(Bits, Undefs, VT, DAG, dl);
6516}
6517
6518/// Returns a vector of specified type with all zero elements.
6519static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6520 SelectionDAG &DAG, const SDLoc &dl) {
6521 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6522 VT.getVectorElementType() == MVT::i1) &&
6523 "Unexpected vector type");
6524
6525 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6526 // type. This ensures they get CSE'd. But if the integer type is not
6527 // available, use a floating-point +0.0 instead.
6528 SDValue Vec;
6529 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6530 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6531 } else if (VT.isFloatingPoint()) {
6532 Vec = DAG.getConstantFP(+0.0, dl, VT);
6533 } else if (VT.getVectorElementType() == MVT::i1) {
6534 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6535 "Unexpected vector type");
6536 Vec = DAG.getConstant(0, dl, VT);
6537 } else {
6538 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6539 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6540 }
6541 return DAG.getBitcast(VT, Vec);
6542}
6543
6544// Helper to determine whether both ops are subvectors extracted from a
6545// single source. If commuting is allowed they don't have to be in order (Lo/Hi).
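// For example, given extract_subvector(X, 0) and extract_subvector(X, N) of a
// 2N-element source X, this returns X (and likewise for the swapped order when
// AllowCommute is set).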
6546static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6547 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6548 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6549 LHS.getValueType() != RHS.getValueType() ||
6550 LHS.getOperand(0) != RHS.getOperand(0))
6551 return SDValue();
6552
6553 SDValue Src = LHS.getOperand(0);
6554 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6555 return SDValue();
6556
6557 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6558 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6559 RHS.getConstantOperandAPInt(1) == NumElts) ||
6560 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6561 LHS.getConstantOperandAPInt(1) == NumElts))
6562 return Src;
6563
6564 return SDValue();
6565}
6566
6567static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6568 const SDLoc &dl, unsigned vectorWidth) {
6569 EVT VT = Vec.getValueType();
6570 EVT ElVT = VT.getVectorElementType();
6571 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6572 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6573 VT.getVectorNumElements() / Factor);
6574
6575 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6576 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6577 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6578
6579 // This is the index of the first element of the vectorWidth-bit chunk
6580 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
6581 IdxVal &= ~(ElemsPerChunk - 1);
6582
6583 // If the input is a buildvector just emit a smaller one.
6584 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6585 return DAG.getBuildVector(ResultVT, dl,
6586 Vec->ops().slice(IdxVal, ElemsPerChunk));
6587
6588 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6590}
6591
6592/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6593/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6594/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6595/// instructions or a simple subregister reference. Idx is an index in the
6596/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6597/// lowering EXTRACT_VECTOR_ELT operations easier.
6598static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6599 SelectionDAG &DAG, const SDLoc &dl) {
6600 assert((Vec.getValueType().is256BitVector() ||
6601 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6602 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6603}
6604
6605/// Generate a DAG to grab 256-bits from a 512-bit vector.
6606static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6607 SelectionDAG &DAG, const SDLoc &dl) {
6608 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6609 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6610}
6611
6612static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6613 SelectionDAG &DAG, const SDLoc &dl,
6614 unsigned vectorWidth) {
6615 assert((vectorWidth == 128 || vectorWidth == 256) &&
6616 "Unsupported vector width");
6617 // Inserting UNDEF just returns Result.
6618 if (Vec.isUndef())
6619 return Result;
6620 EVT VT = Vec.getValueType();
6621 EVT ElVT = VT.getVectorElementType();
6622 EVT ResultVT = Result.getValueType();
6623
6624 // Insert the relevant vectorWidth bits.
6625 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6626 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6627
6628 // This is the index of the first element of the vectorWidth-bit chunk
6629 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
6630 IdxVal &= ~(ElemsPerChunk - 1);
6631
6632 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6633 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6634}
6635
6636/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6637/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6638/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6639/// simple superregister reference. Idx is an index in the 128 bits
6640/// we want. It need not be aligned to a 128-bit boundary. That makes
6641/// lowering INSERT_VECTOR_ELT operations easier.
6642static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6643 SelectionDAG &DAG, const SDLoc &dl) {
6644 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6645 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6646}
6647
6648/// Widen a vector to a larger size with the same scalar type, with the new
6649/// elements either zero or undef.
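// For example, widening a v4i32 value to v8i32 inserts it at index 0 of an
// all-zero (or undef) v8i32 vector.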
6650static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6651 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6652 const SDLoc &dl) {
6653 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6654 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6655 "Unsupported vector widening type");
6656 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6657 : DAG.getUNDEF(VT);
6658 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6659 DAG.getIntPtrConstant(0, dl));
6660}
6661
6662/// Widen a vector to a larger size with the same scalar type, with the new
6663/// elements either zero or undef.
6664static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6665 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6666 const SDLoc &dl, unsigned WideSizeInBits) {
6667 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6668 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6669 "Unsupported vector widening type");
6670 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6671 MVT SVT = Vec.getSimpleValueType().getScalarType();
6672 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6673 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6674}
6675
6676// Helper function to collect subvector ops that are concatenated together,
6677// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6678// The subvectors in Ops are guaranteed to be the same type.
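// For example, for a 2N-element vector built as
// insert_subvector(insert_subvector(undef, X, 0), Y, N) with N-element
// subvectors X and Y, this collects {X, Y}; concat_vectors(X, Y) yields the
// same pair directly.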
6679static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6680 SelectionDAG &DAG) {
6681 assert(Ops.empty() && "Expected an empty ops vector");
6682
6683 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6684 Ops.append(N->op_begin(), N->op_end());
6685 return true;
6686 }
6687
6688 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6689 SDValue Src = N->getOperand(0);
6690 SDValue Sub = N->getOperand(1);
6691 const APInt &Idx = N->getConstantOperandAPInt(2);
6692 EVT VT = Src.getValueType();
6693 EVT SubVT = Sub.getValueType();
6694
6695 // TODO - Handle more general insert_subvector chains.
6696 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6697 // insert_subvector(undef, x, lo)
6698 if (Idx == 0 && Src.isUndef()) {
6699 Ops.push_back(Sub);
6700 Ops.push_back(DAG.getUNDEF(SubVT));
6701 return true;
6702 }
6703 if (Idx == (VT.getVectorNumElements() / 2)) {
6704 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6705 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6706 Src.getOperand(1).getValueType() == SubVT &&
6707 isNullConstant(Src.getOperand(2))) {
6708 Ops.push_back(Src.getOperand(1));
6709 Ops.push_back(Sub);
6710 return true;
6711 }
6712 // insert_subvector(x, extract_subvector(x, lo), hi)
6713 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6714 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6715 Ops.append(2, Sub);
6716 return true;
6717 }
6718 // insert_subvector(undef, x, hi)
6719 if (Src.isUndef()) {
6720 Ops.push_back(DAG.getUNDEF(SubVT));
6721 Ops.push_back(Sub);
6722 return true;
6723 }
6724 }
6725 }
6726 }
6727
6728 return false;
6729}
6730
6731static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6732 const SDLoc &dl) {
6733 EVT VT = Op.getValueType();
6734 unsigned NumElems = VT.getVectorNumElements();
6735 unsigned SizeInBits = VT.getSizeInBits();
6736 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6737 "Can't split odd sized vector");
6738
6739 // If this is a splat value (with no-undefs) then use the lower subvector,
6740 // which should be a free extraction.
6741 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6742 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6743 return std::make_pair(Lo, Lo);
6744
6745 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6746 return std::make_pair(Lo, Hi);
6747}
6748
6749/// Break an operation into 2 half-sized ops and then concatenate the results.
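/// For example, a v32i16 operation is lowered as two v16i16 operations whose
/// results are rejoined with concat_vectors.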
6750static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6751 unsigned NumOps = Op.getNumOperands();
6752 EVT VT = Op.getValueType();
6753 SDLoc dl(Op);
6754
6755 // Extract the LHS Lo/Hi vectors
6756 SmallVector<SDValue> LoOps(NumOps, SDValue());
6757 SmallVector<SDValue> HiOps(NumOps, SDValue());
6758 for (unsigned I = 0; I != NumOps; ++I) {
6759 SDValue SrcOp = Op.getOperand(I);
6760 if (!SrcOp.getValueType().isVector()) {
6761 LoOps[I] = HiOps[I] = SrcOp;
6762 continue;
6763 }
6764 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6765 }
6766
6767 EVT LoVT, HiVT;
6768 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6769 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6770 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6771 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6772}
6773
6774/// Break a unary integer operation into 2 half-sized ops and then
6775/// concatenate the result back.
6776static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6777 // Make sure we only try to split 256/512-bit types to avoid creating
6778 // narrow vectors.
6779 EVT VT = Op.getValueType();
6780 (void)VT;
6781 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6782 Op.getOperand(0).getValueType().is512BitVector()) &&
6783 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6784 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6785 VT.getVectorNumElements() &&
6786 "Unexpected VTs!");
6787 return splitVectorOp(Op, DAG);
6788}
6789
6790/// Break a binary integer operation into 2 half-sized ops and then
6791/// concatenate the result back.
6792static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6793 // Assert that all the types match.
6794 EVT VT = Op.getValueType();
6795 (void)VT;
6796 assert(Op.getOperand(0).getValueType() == VT &&
6797 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6798 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6799 return splitVectorOp(Op, DAG);
6800}
6801
6802// Helper for splitting the operands of an operation to a legal target size and
6803// applying a function on each part.
6804// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6805// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6806// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6807// The argument Builder is a function that will be applied on each split part:
6808// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
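// For example, a 512-bit VT runs as a single 512-bit op when 512-bit (BWI)
// registers are usable, as two 256-bit parts on AVX2, or as four 128-bit
// parts on plain SSE2 (a sketch of the splitting policy described above).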
6809template <typename F>
6810SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6811 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6812 F Builder, bool CheckBWI = true) {
6813 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6814 unsigned NumSubs = 1;
6815 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6816 (!CheckBWI && Subtarget.useAVX512Regs())) {
6817 if (VT.getSizeInBits() > 512) {
6818 NumSubs = VT.getSizeInBits() / 512;
6819 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6820 }
6821 } else if (Subtarget.hasAVX2()) {
6822 if (VT.getSizeInBits() > 256) {
6823 NumSubs = VT.getSizeInBits() / 256;
6824 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6825 }
6826 } else {
6827 if (VT.getSizeInBits() > 128) {
6828 NumSubs = VT.getSizeInBits() / 128;
6829 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6830 }
6831 }
6832
6833 if (NumSubs == 1)
6834 return Builder(DAG, DL, Ops);
6835
6836 SmallVector<SDValue, 4> Subs;
6837 for (unsigned i = 0; i != NumSubs; ++i) {
6838 SmallVector<SDValue, 2> SubOps;
6839 for (SDValue Op : Ops) {
6840 EVT OpVT = Op.getValueType();
6841 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6842 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6843 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6844 }
6845 Subs.push_back(Builder(DAG, DL, SubOps));
6846 }
6847 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6848}
6849
6850// Helper function that extends a non-512-bit vector op to 512 bits on non-VLX
6851// targets.
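// For example, a v8i32 opcode on an AVX512 target without VLX is widened to
// v16i32, executed as a 512-bit node, and the low 256 bits are extracted.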
6852static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6853 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6856 MVT SVT = VT.getScalarType();
6857
6858 // If we have a splatted 32/64-bit constant, splat it to DstTy to
6859 // encourage a foldable broadcast'd operand.
6860 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6861 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6862 // AVX512 broadcasts 32/64-bit operands.
6863 // TODO: Support float once getAVX512Node is used by fp-ops.
6864 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6865 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6866 return SDValue();
6867 // If we're not widening, don't bother if we're not bitcasting.
6868 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6869 return SDValue();
6870 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6871 APInt SplatValue, SplatUndef;
6872 unsigned SplatBitSize;
6873 bool HasAnyUndefs;
6874 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6875 HasAnyUndefs, OpEltSizeInBits) &&
6876 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6877 return DAG.getConstant(SplatValue, DL, DstVT);
6878 }
6879 return SDValue();
6880 };
6881
6882 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6883
6884 MVT DstVT = VT;
6885 if (Widen)
6886 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6887
6888 // Canonicalize src operands.
6889 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6890 for (SDValue &Op : SrcOps) {
6891 MVT OpVT = Op.getSimpleValueType();
6892 // Just pass through scalar operands.
6893 if (!OpVT.isVector())
6894 continue;
6895 assert(OpVT == VT && "Vector type mismatch");
6896
6897 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6898 Op = BroadcastOp;
6899 continue;
6900 }
6901
6902 // Just widen the subvector by inserting into an undef wide vector.
6903 if (Widen)
6904 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6905 }
6906
6907 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6908
6909 // Perform the 512-bit op then extract the bottom subvector.
6910 if (Widen)
6911 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6912 return Res;
6913}
6914
6915/// Insert i1-subvector to i1-vector.
6916static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6917 const X86Subtarget &Subtarget) {
6918
6919 SDLoc dl(Op);
6920 SDValue Vec = Op.getOperand(0);
6921 SDValue SubVec = Op.getOperand(1);
6922 SDValue Idx = Op.getOperand(2);
6923 unsigned IdxVal = Op.getConstantOperandVal(2);
6924
6925 // Inserting undef is a nop. We can just return the original vector.
6926 if (SubVec.isUndef())
6927 return Vec;
6928
6929 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6930 return Op;
6931
6932 MVT OpVT = Op.getSimpleValueType();
6933 unsigned NumElems = OpVT.getVectorNumElements();
6934 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6935
6936 // Extend to natively supported kshift.
6937 MVT WideOpVT = OpVT;
6938 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6939 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6940
6941 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6942 // if necessary.
6943 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6944 // May need to promote to a legal type.
6945 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6946 DAG.getConstant(0, dl, WideOpVT),
6947 SubVec, Idx);
6948 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6949 }
6950
6951 MVT SubVecVT = SubVec.getSimpleValueType();
6952 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6953 assert(IdxVal + SubVecNumElems <= NumElems &&
6954 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6955 "Unexpected index value in INSERT_SUBVECTOR");
6956
6957 SDValue Undef = DAG.getUNDEF(WideOpVT);
6958
6959 if (IdxVal == 0) {
6960 // Zero lower bits of the Vec
6961 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6962 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6963 ZeroIdx);
6964 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6965 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6966 // Merge them together; SubVec should be zero extended.
6967 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6968 DAG.getConstant(0, dl, WideOpVT),
6969 SubVec, ZeroIdx);
6970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6971 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6972 }
6973
6974 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6975 Undef, SubVec, ZeroIdx);
6976
6977 if (Vec.isUndef()) {
6978 assert(IdxVal != 0 && "Unexpected index");
6979 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6980 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6981 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6982 }
6983
6984 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6985 assert(IdxVal != 0 && "Unexpected index");
6986 // If upper elements of Vec are known undef, then just shift into place.
6987 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6988 [](SDValue V) { return V.isUndef(); })) {
6989 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6990 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6991 } else {
6992 NumElems = WideOpVT.getVectorNumElements();
6993 unsigned ShiftLeft = NumElems - SubVecNumElems;
6994 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6995 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6996 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6997 if (ShiftRight != 0)
6998 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6999 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7000 }
7001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7002 }
7003
7004 // Simple case when we put the subvector in the upper part.
7005 if (IdxVal + SubVecNumElems == NumElems) {
7006 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7007 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7008 if (SubVecNumElems * 2 == NumElems) {
7009 // Special case, use legal zero extending insert_subvector. This allows
7010 // isel to optimize when bits are known zero.
7011 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7013 DAG.getConstant(0, dl, WideOpVT),
7014 Vec, ZeroIdx);
7015 } else {
7016 // Otherwise use explicit shifts to zero the bits.
7017 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7018 Undef, Vec, ZeroIdx);
7019 NumElems = WideOpVT.getVectorNumElements();
7020 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7021 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7022 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7023 }
7024 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7025 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7026 }
7027
7028 // Inserting into the middle is more complicated.
7029
7030 NumElems = WideOpVT.getVectorNumElements();
7031
7032 // Widen the vector if needed.
7033 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7034
7035 unsigned ShiftLeft = NumElems - SubVecNumElems;
7036 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7037
7038 // Do an optimization for the most frequently used types.
7039 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7040 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7041 Mask0.flipAllBits();
7042 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7043 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7044 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7045 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7046 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7047 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7048 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7049 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7050
7051 // Reduce to original width if needed.
7052 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7053 }
7054
7055 // Clear the upper bits of the subvector and move it to its insert position.
7056 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7057 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7058 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7059 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7060
7061 // Isolate the bits below the insertion point.
7062 unsigned LowShift = NumElems - IdxVal;
7063 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7064 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7065 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7066 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7067
7068 // Isolate the bits after the last inserted bit.
7069 unsigned HighShift = IdxVal + SubVecNumElems;
7070 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7071 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7072 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7073 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7074
7075 // Now OR all 3 pieces together.
7076 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7077 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7078
7079 // Reduce to original width if needed.
7080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7081}
7082
7083static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7084 const SDLoc &dl) {
7085 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7086 EVT SubVT = V1.getValueType();
7087 EVT SubSVT = SubVT.getScalarType();
7088 unsigned SubNumElts = SubVT.getVectorNumElements();
7089 unsigned SubVectorWidth = SubVT.getSizeInBits();
7090 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7091 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7092 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7093}
7094
7095/// Returns a vector of specified type with all bits set.
7096/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7097/// Then bitcast to their original type, ensuring they get CSE'd.
7098static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7099 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 7100, __extension__
__PRETTY_FUNCTION__))
7100 "Expected a 128/256/512-bit vector type")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 7100, __extension__
__PRETTY_FUNCTION__))
;
7101
7102 APInt Ones = APInt::getAllOnes(32);
7103 unsigned NumElts = VT.getSizeInBits() / 32;
7104 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7105 return DAG.getBitcast(VT, Vec);
7106}
7107
7108static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7109 SDValue In, SelectionDAG &DAG) {
7110 EVT InVT = In.getValueType();
7111 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7112 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7113         ISD::ZERO_EXTEND == Opcode) &&
7114        "Unknown extension opcode");
7115
7116 // For 256-bit vectors, we only need the lower (128-bit) input half.
7117 // For 512-bit vectors, we only need the lower input half or quarter.
7118 if (InVT.getSizeInBits() > 128) {
7119 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7120        "Expected VTs to be the same size!");
7121 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7122 In = extractSubVector(In, 0, DAG, DL,
7123 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7124 InVT = In.getValueType();
7125 }
7126
7127 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7128 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7129
7130 return DAG.getNode(Opcode, DL, VT, In);
7131}
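The "lower half or quarter" selection above is pure width arithmetic; a minimal standalone restatement (plain integers, assuming matching total sizes as the assert requires; names are illustrative):

  #include <algorithm>
  #include <cstdio>

  // Only VTBits / Scale input bits (but never fewer than 128) feed the
  // extension, where Scale is the ratio of output to input element width.
  unsigned neededInputBits(unsigned VTBits, unsigned VTScalarBits,
                           unsigned InScalarBits) {
    unsigned Scale = VTScalarBits / InScalarBits;
    return std::max(128u, VTBits / Scale);
  }

  int main() {
    // 512-bit v16i32 extended from v64i8: Scale = 4 -> lower quarter (128 bits).
    std::printf("%u\n", neededInputBits(512, 32, 8));
    // 512-bit v8i64 extended from v16i32: Scale = 2 -> lower half (256 bits).
    std::printf("%u\n", neededInputBits(512, 64, 32));
    return 0;
  }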
7132
7133// Match (xor X, -1) -> X.
7134// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7135// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7136static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7137 V = peekThroughBitcasts(V);
7138 if (V.getOpcode() == ISD::XOR &&
7139 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7140 isAllOnesConstant(V.getOperand(1))))
7141 return V.getOperand(0);
7142 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7143 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7144 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7145 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7147 Not, V.getOperand(1));
7148 }
7149 }
7150 SmallVector<SDValue, 2> CatOps;
7151 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7152 for (SDValue &CatOp : CatOps) {
7153 SDValue NotCat = IsNOT(CatOp, DAG);
7154 if (!NotCat) return SDValue();
7155 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7156 }
7157 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7158 }
7159 return SDValue();
7160}
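Every fold in IsNOT rests on the scalar identity ~X == X ^ all-ones, which also distributes over concatenation; a tiny standalone check of those two facts (plain C++ integers, not DAG nodes):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0x12345678u;
    // xor with an all-ones constant is bitwise NOT ...
    assert((X ^ 0xFFFFFFFFu) == ~X);
    // ... and NOT distributes over concatenation: ~(hi:lo) == (~hi):(~lo),
    // which is why a concat of xor-with-all-ones operands folds to the NOT
    // of the concatenated sources.
    uint64_t Cat = (uint64_t(0xAAAAAAAAu) << 32) | 0x55555555u;
    uint64_t NotCat = (uint64_t(~0xAAAAAAAAu) << 32) | uint64_t(uint32_t(~0x55555555u));
    assert(~Cat == NotCat);
    return 0;
  }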
7161
7162void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7163 bool Lo, bool Unary) {
7164 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7165        "Illegal vector type to unpack");
7166 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7167 int NumElts = VT.getVectorNumElements();
7168 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7169 for (int i = 0; i < NumElts; ++i) {
7170 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7171 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7172 Pos += (Unary ? 0 : NumElts * (i % 2));
7173 Pos += (Lo ? 0 : NumEltsInLane / 2);
7174 Mask.push_back(Pos);
7175 }
7176}
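To make the index arithmetic above concrete, here is a standalone restatement of the same loop (illustrative only, no LLVM types); for a 128-bit vector of eight 16-bit elements the binary lo mask comes out as <0, 8, 1, 9, 2, 10, 3, 11>, i.e. the unpcklwd interleave:

  #include <cstdio>
  #include <vector>

  // Same per-element arithmetic as createUnpackShuffleMask, on plain ints.
  std::vector<int> unpackMask(int NumElts, int EltBits, bool Lo, bool Unary) {
    int NumEltsInLane = 128 / EltBits;
    std::vector<int> Mask;
    for (int i = 0; i < NumElts; ++i) {
      int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
      int Pos = (i % NumEltsInLane) / 2 + LaneStart;
      Pos += (Unary ? 0 : NumElts * (i % 2));
      Pos += (Lo ? 0 : NumEltsInLane / 2);
      Mask.push_back(Pos);
    }
    return Mask;
  }

  int main() {
    for (int M : unpackMask(/*NumElts=*/8, /*EltBits=*/16, /*Lo=*/true, /*Unary=*/false))
      std::printf("%d ", M);   // 0 8 1 9 2 10 3 11
    std::printf("\n");
    return 0;
  }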
7177
7178/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7179/// imposed by AVX and specific to the unary pattern. Example:
7180/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7181/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7182void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7183 bool Lo) {
7184 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7185 int NumElts = VT.getVectorNumElements();
7186 for (int i = 0; i < NumElts; ++i) {
7187 int Pos = i / 2;
7188 Pos += (Lo ? 0 : NumElts / 2);
7189 Mask.push_back(Pos);
7190 }
7191}
7192
7193// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7194static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7195 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7196 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7197 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7198 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7199 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7200 int M = Mask[I];
7201 if (M < 0)
7202 continue;
7203 SDValue V = (M < NumElts) ? V1 : V2;
7204 if (V.isUndef())
7205 continue;
7206 Ops[I] = V.getOperand(M % NumElts);
7207 }
7208 return DAG.getBuildVector(VT, dl, Ops);
7209 }
7210
7211 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7212}
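A sketch of the constant fold above on plain arrays (illustrative only): when both inputs are constant build vectors, the shuffle is just an element gather, so the helper can emit a new build vector instead of a shuffle node.

  #include <cstdio>
  #include <vector>

  // Mask entries below NumElts select from V1, the rest from V2; negative
  // entries stay "undef" (represented here as -1).
  std::vector<int> foldShuffle(const std::vector<int> &V1,
                               const std::vector<int> &V2,
                               const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    std::vector<int> Out(NumElts, -1);
    for (int I = 0; I != NumElts; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue;
      Out[I] = (M < NumElts) ? V1[M % NumElts] : V2[M % NumElts];
    }
    return Out;
  }

  int main() {
    std::vector<int> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
    for (int E : foldShuffle(V1, V2, {0, 4, 1, 5}))  // unpacklo -> 10 20 11 21
      std::printf("%d ", E);
    std::printf("\n");
    return 0;
  }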
7213
7214/// Returns a vector_shuffle node for an unpackl operation.
7215static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7216 SDValue V1, SDValue V2) {
7217 SmallVector<int, 8> Mask;
7218 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7219 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7220}
7221
7222/// Returns a vector_shuffle node for an unpackh operation.
7223static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7224 SDValue V1, SDValue V2) {
7225 SmallVector<int, 8> Mask;
7226 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7227 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7228}
7229
7230/// Returns a node that packs the LHS + RHS nodes together at half width.
7231/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7232/// TODO: Add subvector splitting if/when we have a need for it.
7233static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7234 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7235 bool PackHiHalf = false) {
7236 MVT OpVT = LHS.getSimpleValueType();
7237 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7238 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7239 assert(OpVT == RHS.getSimpleValueType() &&
7240        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7241        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7242        "Unexpected PACK operand types");
7243 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7244        "Unexpected PACK result type");
7245
7246 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7247 if (EltSizeInBits == 32) {
7248 SmallVector<int> PackMask;
7249 int Offset = PackHiHalf ? 1 : 0;
7250 int NumElts = VT.getVectorNumElements();
7251 for (int I = 0; I != NumElts; I += 4) {
7252 PackMask.push_back(I + Offset);
7253 PackMask.push_back(I + Offset + 2);
7254 PackMask.push_back(I + Offset + NumElts);
7255 PackMask.push_back(I + Offset + NumElts + 2);
7256 }
7257 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7258 DAG.getBitcast(VT, RHS), PackMask);
7259 }
7260
7261 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7262 if (!PackHiHalf) {
7263 if (UsePackUS &&
7264 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7265 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7266 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7267
7268 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7269 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7270 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7271 }
7272
7273 // Fallback to sign/zero extending the requested half and pack.
7274 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7275 if (UsePackUS) {
7276 if (PackHiHalf) {
7277 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7278 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7279 } else {
7280 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7281 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7282 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7283 };
7284 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7285 };
7286
7287 if (!PackHiHalf) {
7288 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7289 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7290 }
7291 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7292 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7293 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7294}
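The fallback at the end of getPack is easiest to see on a single element; a scalar sketch for one 32-bit element packed down to 16 bits (saturation ignored, exactly as the helper assumes; two's-complement narrowing and arithmetic shifts assumed, as on every target LLVM supports):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Elt = 0xDEADBEEFu;
    // PACKUS, low half:  mask with (1 << EltSizeInBits) - 1 first.
    assert((Elt & 0xFFFFu) == 0xBEEFu);
    // PACKUS, high half: logical shift right by the element width.
    assert((Elt >> 16) == 0xDEADu);
    // PACKSS, low half:  shift left then arithmetic shift right, so the low
    // half is sign-extended and PACKSS saturation reproduces it exactly.
    int32_t Sext = (int32_t)(Elt << 16) >> 16;
    assert((uint32_t)Sext == 0xFFFFBEEFu);
    return 0;
  }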
7295
7296 /// Return a vector_shuffle of the specified vector blended with a zero or undef vector.
7297/// This produces a shuffle where the low element of V2 is swizzled into the
7298/// zero/undef vector, landing at element Idx.
7299/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7300static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7301 bool IsZero,
7302 const X86Subtarget &Subtarget,
7303 SelectionDAG &DAG) {
7304 MVT VT = V2.getSimpleValueType();
7305 SDValue V1 = IsZero
7306 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7307 int NumElems = VT.getVectorNumElements();
7308 SmallVector<int, 16> MaskVec(NumElems);
7309 for (int i = 0; i != NumElems; ++i)
7310 // If this is the insertion idx, put the low elt of V2 here.
7311 MaskVec[i] = (i == Idx) ? NumElems : i;
7312 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7313}
7314
7315static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7316 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7317 Ptr.getOpcode() == X86ISD::WrapperRIP)
7318 Ptr = Ptr.getOperand(0);
7319
7320 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7321 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7322 return nullptr;
7323
7324 return CNode->getConstVal();
7325}
7326
7327static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7328 if (!Load || !ISD::isNormalLoad(Load))
7329 return nullptr;
7330 return getTargetConstantFromBasePtr(Load->getBasePtr());
7331}
7332
7333static const Constant *getTargetConstantFromNode(SDValue Op) {
7334 Op = peekThroughBitcasts(Op);
7335 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7336}
7337
7338const Constant *
7339X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7340 assert(LD && "Unexpected null LoadSDNode");
7341 return getTargetConstantFromNode(LD);
7342}
7343
7344// Extract raw constant bits from constant pools.
7345static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7346 APInt &UndefElts,
7347 SmallVectorImpl<APInt> &EltBits,
7348 bool AllowWholeUndefs = true,
7349 bool AllowPartialUndefs = true) {
7350 assert(EltBits.empty() && "Expected an empty EltBits vector");
7351
7352 Op = peekThroughBitcasts(Op);
7353
7354 EVT VT = Op.getValueType();
7355 unsigned SizeInBits = VT.getSizeInBits();
7356 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7357 unsigned NumElts = SizeInBits / EltSizeInBits;
7358
7359 // Bitcast a source array of element bits to the target size.
7360 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7361 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7362 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7363 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7364        "Constant bit sizes don't match");
7365
7366 // Don't split if we don't allow undef bits.
7367 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7368 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7369 return false;
7370
7371 // If we're already the right size, don't bother bitcasting.
7372 if (NumSrcElts == NumElts) {
7373 UndefElts = UndefSrcElts;
7374 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7375 return true;
7376 }
7377
7378 // Extract all the undef/constant element data and pack into single bitsets.
7379 APInt UndefBits(SizeInBits, 0);
7380 APInt MaskBits(SizeInBits, 0);
7381
7382 for (unsigned i = 0; i != NumSrcElts; ++i) {
7383 unsigned BitOffset = i * SrcEltSizeInBits;
7384 if (UndefSrcElts[i])
7385 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7386 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7387 }
7388
7389 // Split the undef/constant single bitset data into the target elements.
7390 UndefElts = APInt(NumElts, 0);
7391 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7392
7393 for (unsigned i = 0; i != NumElts; ++i) {
7394 unsigned BitOffset = i * EltSizeInBits;
7395 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7396
7397 // Only treat an element as UNDEF if all bits are UNDEF.
7398 if (UndefEltBits.isAllOnes()) {
7399 if (!AllowWholeUndefs)
7400 return false;
7401 UndefElts.setBit(i);
7402 continue;
7403 }
7404
7405 // If only some bits are UNDEF then treat them as zero (or bail if not
7406 // supported).
7407 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7408 return false;
7409
7410 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7411 }
7412 return true;
7413 };
7414
7415 // Collect constant bits and insert into mask/undef bit masks.
7416 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7417 unsigned UndefBitIndex) {
7418 if (!Cst)
7419 return false;
7420 if (isa<UndefValue>(Cst)) {
7421 Undefs.setBit(UndefBitIndex);
7422 return true;
7423 }
7424 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7425 Mask = CInt->getValue();
7426 return true;
7427 }
7428 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7429 Mask = CFP->getValueAPF().bitcastToAPInt();
7430 return true;
7431 }
7432 return false;
7433 };
7434
7435 // Handle UNDEFs.
7436 if (Op.isUndef()) {
7437 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7438 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7439 return CastBitData(UndefSrcElts, SrcEltBits);
7440 }
7441
7442 // Extract scalar constant bits.
7443 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7444 APInt UndefSrcElts = APInt::getZero(1);
7445 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7446 return CastBitData(UndefSrcElts, SrcEltBits);
7447 }
7448 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7449 APInt UndefSrcElts = APInt::getZero(1);
7450 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7451 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7452 return CastBitData(UndefSrcElts, SrcEltBits);
7453 }
7454
7455 // Extract constant bits from build vector.
7456 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7457 BitVector Undefs;
7458 SmallVector<APInt> SrcEltBits;
7459 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7460 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7461 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7462 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7463 if (Undefs[I])
7464 UndefSrcElts.setBit(I);
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467 }
7468
7469 // Extract constant bits from constant pool vector.
7470 if (auto *Cst = getTargetConstantFromNode(Op)) {
7471 Type *CstTy = Cst->getType();
7472 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7473 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7474 return false;
7475
7476 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7477 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7478
7479 APInt UndefSrcElts(NumSrcElts, 0);
7480 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7481 for (unsigned i = 0; i != NumSrcElts; ++i)
7482 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7483 UndefSrcElts, i))
7484 return false;
7485
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488
7489 // Extract constant bits from a broadcasted constant pool scalar.
7490 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7491 EltSizeInBits <= VT.getScalarSizeInBits()) {
7492 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7493 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7494 return false;
7495
7496 SDValue Ptr = MemIntr->getBasePtr();
7497 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7498 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7499 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7500
7501 APInt UndefSrcElts(NumSrcElts, 0);
7502 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7503 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7504 if (UndefSrcElts[0])
7505 UndefSrcElts.setBits(0, NumSrcElts);
7506 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7507 return CastBitData(UndefSrcElts, SrcEltBits);
7508 }
7509 }
7510 }
7511
7512 // Extract constant bits from a subvector broadcast.
7513 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7514 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7515 SDValue Ptr = MemIntr->getBasePtr();
7516 // The source constant may be larger than the subvector broadcast, so
7517 // ensure we extract the correct subvector constants.
7518 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7519 Type *CstTy = Cst->getType();
7520 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7521 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7522 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7523 (SizeInBits % SubVecSizeInBits) != 0)
7524 return false;
7525 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7526 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7527 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7528 APInt UndefSubElts(NumSubElts, 0);
7529 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7530 APInt(CstEltSizeInBits, 0));
7531 for (unsigned i = 0; i != NumSubElts; ++i) {
7532 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7533 UndefSubElts, i))
7534 return false;
7535 for (unsigned j = 1; j != NumSubVecs; ++j)
7536 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7537 }
7538 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7539 UndefSubElts);
7540 return CastBitData(UndefSubElts, SubEltBits);
7541 }
7542 }
7543
7544 // Extract a rematerialized scalar constant insertion.
7545 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7546 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7547 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7548 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7549 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7550
7551 APInt UndefSrcElts(NumSrcElts, 0);
7552 SmallVector<APInt, 64> SrcEltBits;
7553 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7554 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7555 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7556 return CastBitData(UndefSrcElts, SrcEltBits);
7557 }
7558
7559 // Insert constant bits from a base and sub vector sources.
7560 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7561 // If we bitcast to larger elements we might lose track of undefs - don't
7562 // allow any, to be safe.
7563 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7564 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7565
7566 APInt UndefSrcElts, UndefSubElts;
7567 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7568 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7569 UndefSubElts, EltSubBits,
7570 AllowWholeUndefs && AllowUndefs,
7571 AllowPartialUndefs && AllowUndefs) &&
7572 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7573 UndefSrcElts, EltSrcBits,
7574 AllowWholeUndefs && AllowUndefs,
7575 AllowPartialUndefs && AllowUndefs)) {
7576 unsigned BaseIdx = Op.getConstantOperandVal(2);
7577 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7578 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7579 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7580 return CastBitData(UndefSrcElts, EltSrcBits);
7581 }
7582 }
7583
7584 // Extract constant bits from a subvector's source.
7585 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7586 // TODO - support extract_subvector through bitcasts.
7587 if (EltSizeInBits != VT.getScalarSizeInBits())
7588 return false;
7589
7590 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7591 UndefElts, EltBits, AllowWholeUndefs,
7592 AllowPartialUndefs)) {
7593 EVT SrcVT = Op.getOperand(0).getValueType();
7594 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7595 unsigned NumSubElts = VT.getVectorNumElements();
7596 unsigned BaseIdx = Op.getConstantOperandVal(1);
7597 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7598 if ((BaseIdx + NumSubElts) != NumSrcElts)
7599 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7600 if (BaseIdx != 0)
7601 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7602 return true;
7603 }
7604 }
7605
7606 // Extract constant bits from shuffle node sources.
7607 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7608 // TODO - support shuffle through bitcasts.
7609 if (EltSizeInBits != VT.getScalarSizeInBits())
7610 return false;
7611
7612 ArrayRef<int> Mask = SVN->getMask();
7613 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7614 llvm::any_of(Mask, [](int M) { return M < 0; }))
7615 return false;
7616
7617 APInt UndefElts0, UndefElts1;
7618 SmallVector<APInt, 32> EltBits0, EltBits1;
7619 if (isAnyInRange(Mask, 0, NumElts) &&
7620 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7621 UndefElts0, EltBits0, AllowWholeUndefs,
7622 AllowPartialUndefs))
7623 return false;
7624 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7625 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7626 UndefElts1, EltBits1, AllowWholeUndefs,
7627 AllowPartialUndefs))
7628 return false;
7629
7630 UndefElts = APInt::getZero(NumElts);
7631 for (int i = 0; i != (int)NumElts; ++i) {
7632 int M = Mask[i];
7633 if (M < 0) {
7634 UndefElts.setBit(i);
7635 EltBits.push_back(APInt::getZero(EltSizeInBits));
7636 } else if (M < (int)NumElts) {
7637 if (UndefElts0[M])
7638 UndefElts.setBit(i);
7639 EltBits.push_back(EltBits0[M]);
7640 } else {
7641 if (UndefElts1[M - NumElts])
7642 UndefElts.setBit(i);
7643 EltBits.push_back(EltBits1[M - NumElts]);
7644 }
7645 }
7646 return true;
7647 }
7648
7649 return false;
7650}
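The heart of the CastBitData lambda above is just re-slicing one long bit string at a different element width; a tiny standalone illustration (plain 64-bit integers instead of APInt, total width and element widths below 64 bits assumed; names are illustrative):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Pack the source elements into one bit string at their bit offsets, then
  // re-read it at the destination element width.
  std::vector<uint64_t> recastBits(const std::vector<uint64_t> &Src,
                                   unsigned SrcBits, unsigned DstBits) {
    uint64_t All = 0;
    for (size_t i = 0; i != Src.size(); ++i)
      All |= (Src[i] & ((1ULL << SrcBits) - 1)) << (i * SrcBits);
    unsigned NumDst = (unsigned)Src.size() * SrcBits / DstBits;
    std::vector<uint64_t> Dst(NumDst);
    for (unsigned i = 0; i != NumDst; ++i)
      Dst[i] = (All >> (i * DstBits)) & ((1ULL << DstBits) - 1);
    return Dst;
  }

  int main() {
    // Two 16-bit constants {0x1234, 0xABCD} re-read as four 8-bit constants:
    for (uint64_t E : recastBits({0x1234, 0xABCD}, 16, 8))
      std::printf("0x%02llX ", (unsigned long long)E);  // 0x34 0x12 0xCD 0xAB
    std::printf("\n");
    return 0;
  }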
7651
7652namespace llvm {
7653namespace X86 {
7654bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7655 APInt UndefElts;
7656 SmallVector<APInt, 16> EltBits;
7657 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7658 UndefElts, EltBits, true,
7659 AllowPartialUndefs)) {
7660 int SplatIndex = -1;
7661 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7662 if (UndefElts[i])
7663 continue;
7664 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7665 SplatIndex = -1;
7666 break;
7667 }
7668 SplatIndex = i;
7669 }
7670 if (0 <= SplatIndex) {
7671 SplatVal = EltBits[SplatIndex];
7672 return true;
7673 }
7674 }
7675
7676 return false;
7677}
7678} // namespace X86
7679} // namespace llvm
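The splat test above skips undef lanes and only requires the remaining lanes to agree; a standalone sketch of that rule (std::optional stands in for undef; purely illustrative):

  #include <cstdio>
  #include <optional>
  #include <vector>

  std::optional<int> getConstantSplat(const std::vector<std::optional<int>> &Elts) {
    int SplatIndex = -1;
    for (int i = 0, e = (int)Elts.size(); i != e; ++i) {
      if (!Elts[i])
        continue;                        // undef lane: doesn't break the splat
      if (0 <= SplatIndex && *Elts[i] != *Elts[SplatIndex])
        return std::nullopt;             // two different defined lanes
      SplatIndex = i;
    }
    if (SplatIndex < 0)
      return std::nullopt;               // all lanes undef: no splat value
    return Elts[SplatIndex];
  }

  int main() {
    std::printf("%d\n", *getConstantSplat({7, std::nullopt, 7, 7}));      // 7
    std::printf("%d\n", (int)getConstantSplat({7, 8, 7, 7}).has_value()); // 0
    return 0;
  }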
7680
7681static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7682 unsigned MaskEltSizeInBits,
7683 SmallVectorImpl<uint64_t> &RawMask,
7684 APInt &UndefElts) {
7685 // Extract the raw target constant bits.
7686 SmallVector<APInt, 64> EltBits;
7687 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7688 EltBits, /* AllowWholeUndefs */ true,
7689 /* AllowPartialUndefs */ false))
7690 return false;
7691
7692 // Insert the extracted elements into the mask.
7693 for (const APInt &Elt : EltBits)
7694 RawMask.push_back(Elt.getZExtValue());
7695
7696 return true;
7697}
7698
7699/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7700/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7701/// Note: This ignores saturation, so inputs must be checked first.
7702static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7703 bool Unary, unsigned NumStages = 1) {
7704 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7705 unsigned NumElts = VT.getVectorNumElements();
7706 unsigned NumLanes = VT.getSizeInBits() / 128;
7707 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7708 unsigned Offset = Unary ? 0 : NumElts;
7709 unsigned Repetitions = 1u << (NumStages - 1);
7710 unsigned Increment = 1u << NumStages;
7711 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7712
7713 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7714 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7715 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7716 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7717 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7718 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7719 }
7720 }
7721}
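A worked example of the mask built above (same arithmetic restated on plain ints, purely illustrative): a single-stage, binary 128-bit pack of two v8i16 inputs into one v16i8, with the mask indexing the inputs reinterpreted as bytes, yields <0, 2, 4, ..., 14, 16, 18, ..., 30>.

  #include <cstdio>
  #include <vector>

  int main() {
    unsigned NumElts = 16, NumLanes = 1, NumEltsPerLane = 16;
    unsigned NumStages = 1, Offset = NumElts;          // binary pack
    unsigned Repetitions = 1u << (NumStages - 1);
    unsigned Increment = 1u << NumStages;
    std::vector<int> Mask;
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
      for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
          Mask.push_back(Elt + Lane * NumEltsPerLane);           // LHS low bytes
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
          Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);  // RHS low bytes
      }
    for (int M : Mask)
      std::printf("%d ", M);  // 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
    std::printf("\n");
    return 0;
  }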
7722
7723// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7724static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7725 APInt &DemandedLHS, APInt &DemandedRHS) {
7726 int NumLanes = VT.getSizeInBits() / 128;
7727 int NumElts = DemandedElts.getBitWidth();
7728 int NumInnerElts = NumElts / 2;
7729 int NumEltsPerLane = NumElts / NumLanes;
7730 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7731
7732 DemandedLHS = APInt::getZero(NumInnerElts);
7733 DemandedRHS = APInt::getZero(NumInnerElts);
7734
7735 // Map DemandedElts to the packed operands.
7736 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7737 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7738 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7739 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7740 if (DemandedElts[OuterIdx])
7741 DemandedLHS.setBit(InnerIdx);
7742 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7743 DemandedRHS.setBit(InnerIdx);
7744 }
7745 }
7746}
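A concrete mapping for the code above (single-lane 128-bit pack, e.g. a v16i8 result from two v8i16 operands; standalone sketch with bitmasks standing in for APInt): demanding output elements 3 and 11 demands element 3 of the LHS and element 3 of the RHS respectively.

  #include <cstdio>

  int main() {
    unsigned NumElts = 16, NumLanes = 1;
    unsigned NumInnerElts = NumElts / 2;
    unsigned NumEltsPerLane = NumElts / NumLanes;
    unsigned NumInnerEltsPerLane = NumInnerElts / NumLanes;
    unsigned DemandedElts = (1u << 3) | (1u << 11);  // output elts 3 and 11
    unsigned DemandedLHS = 0, DemandedRHS = 0;
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
      for (unsigned Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
        unsigned OuterIdx = Lane * NumEltsPerLane + Elt;
        unsigned InnerIdx = Lane * NumInnerEltsPerLane + Elt;
        if (DemandedElts & (1u << OuterIdx))
          DemandedLHS |= 1u << InnerIdx;
        if (DemandedElts & (1u << (OuterIdx + NumInnerEltsPerLane)))
          DemandedRHS |= 1u << InnerIdx;
      }
    std::printf("LHS=0x%X RHS=0x%X\n", DemandedLHS, DemandedRHS);  // LHS=0x8 RHS=0x8
    return 0;
  }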
7747
7748// Split the demanded elts of a HADD/HSUB node between its operands.
7749static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7750 APInt &DemandedLHS, APInt &DemandedRHS) {
7751 int NumLanes = VT.getSizeInBits() / 128;
7752 int NumElts = DemandedElts.getBitWidth();
7753 int NumEltsPerLane = NumElts / NumLanes;
7754 int HalfEltsPerLane = NumEltsPerLane / 2;
7755
7756 DemandedLHS = APInt::getZero(NumElts);
7757 DemandedRHS = APInt::getZero(NumElts);
7758
7759 // Map DemandedElts to the horizontal operands.
7760 for (int Idx = 0; Idx != NumElts; ++Idx) {
7761 if (!DemandedElts[Idx])
7762 continue;
7763 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7764 int LocalIdx = Idx % NumEltsPerLane;
7765 if (LocalIdx < HalfEltsPerLane) {
7766 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7767 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7768 } else {
7769 LocalIdx -= HalfEltsPerLane;
7770 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7771 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7772 }
7773 }
7774}
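A concrete mapping for the horizontal case (standalone sketch with bitmasks standing in for APInt): for a 256-bit v8i32 HADD, output element 5 is the sum of LHS elements 6 and 7, so only those two LHS bits become demanded.

  #include <cstdio>

  int main() {
    unsigned NumElts = 8, NumLanes = 2;
    unsigned NumEltsPerLane = NumElts / NumLanes;   // 4
    unsigned HalfEltsPerLane = NumEltsPerLane / 2;  // 2
    unsigned DemandedElts = 1u << 5;                // demand output element 5
    unsigned DemandedLHS = 0, DemandedRHS = 0;
    for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
      if (!(DemandedElts & (1u << Idx)))
        continue;
      unsigned LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
      unsigned LocalIdx = Idx % NumEltsPerLane;
      if (LocalIdx < HalfEltsPerLane) {
        DemandedLHS |= 3u << (LaneIdx + 2 * LocalIdx);   // the adjacent pair
      } else {
        LocalIdx -= HalfEltsPerLane;
        DemandedRHS |= 3u << (LaneIdx + 2 * LocalIdx);
      }
    }
    std::printf("LHS=0x%X RHS=0x%X\n", DemandedLHS, DemandedRHS);  // LHS=0xC0 RHS=0x0
    return 0;
  }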
7775
7776/// Calculates the shuffle mask corresponding to the target-specific opcode.
7777/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7778/// operands in \p Ops, and returns true.
7779/// Sets \p IsUnary to true if only one source is used. Note that this will set
7780/// IsUnary for shuffles which use a single input multiple times, and in those
7781/// cases it will adjust the mask to only have indices within that single input.
7782/// It is an error to call this with non-empty Mask/Ops vectors.
7783static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7784 SmallVectorImpl<SDValue> &Ops,
7785 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7786 unsigned NumElems = VT.getVectorNumElements();
7787 unsigned MaskEltSize = VT.getScalarSizeInBits();
7788 SmallVector<uint64_t, 32> RawMask;
7789 APInt RawUndefs;
7790 uint64_t ImmN;
7791
7792 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7793 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7794
7795 IsUnary = false;
7796 bool IsFakeUnary = false;
7797 switch (N->getOpcode()) {
7798 case X86ISD::BLENDI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7801 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7802 DecodeBLENDMask(NumElems, ImmN, Mask);
7803 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7804 break;
7805 case X86ISD::SHUFP:
7806 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7807 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7808 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7809 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7810 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7811 break;
7812 case X86ISD::INSERTPS:
7813 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7814 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7815 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7816 DecodeINSERTPSMask(ImmN, Mask);
7817 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7818 break;
7819 case X86ISD::EXTRQI:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7822 isa<ConstantSDNode>(N->getOperand(2))) {
7823 int BitLen = N->getConstantOperandVal(1);
7824 int BitIdx = N->getConstantOperandVal(2);
7825 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7826 IsUnary = true;
7827 }
7828 break;
7829 case X86ISD::INSERTQI:
7830 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7831 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7832 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7833 isa<ConstantSDNode>(N->getOperand(3))) {
7834 int BitLen = N->getConstantOperandVal(2);
7835 int BitIdx = N->getConstantOperandVal(3);
7836 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7837 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7838 }
7839 break;
7840 case X86ISD::UNPCKH:
7841 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7842 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7843 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7844 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7845 break;
7846 case X86ISD::UNPCKL:
7847 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7848 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7849 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 break;
7852 case X86ISD::MOVHLPS:
7853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7854 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7855 DecodeMOVHLPSMask(NumElems, Mask);
7856 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7857 break;
7858 case X86ISD::MOVLHPS:
7859 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7861 DecodeMOVLHPSMask(NumElems, Mask);
7862 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7863 break;
7864 case X86ISD::VALIGN:
7865 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7866        "Only 32-bit and 64-bit elements are supported!");
7867 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7868 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7869 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7870 DecodeVALIGNMask(NumElems, ImmN, Mask);
7871 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7872 Ops.push_back(N->getOperand(1));
7873 Ops.push_back(N->getOperand(0));
7874 break;
7875 case X86ISD::PALIGNR:
7876 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7877 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7878 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7879 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7880 DecodePALIGNRMask(NumElems, ImmN, Mask);
7881 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7882 Ops.push_back(N->getOperand(1));
7883 Ops.push_back(N->getOperand(0));
7884 break;
7885 case X86ISD::VSHLDQ:
7886 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7887 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7888 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7889 DecodePSLLDQMask(NumElems, ImmN, Mask);
7890 IsUnary = true;
7891 break;
7892 case X86ISD::VSRLDQ:
7893 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7894 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7895 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7896 DecodePSRLDQMask(NumElems, ImmN, Mask);
7897 IsUnary = true;
7898 break;
7899 case X86ISD::PSHUFD:
7900 case X86ISD::VPERMILPI:
7901 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7902 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7903 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7904 IsUnary = true;
7905 break;
7906 case X86ISD::PSHUFHW:
7907 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7908 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7909 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7910 IsUnary = true;
7911 break;
7912 case X86ISD::PSHUFLW:
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7915 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7916 IsUnary = true;
7917 break;
7918 case X86ISD::VZEXT_MOVL:
7919 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7920 DecodeZeroMoveLowMask(NumElems, Mask);
7921 IsUnary = true;
7922 break;
7923 case X86ISD::VBROADCAST:
7924 // We only decode broadcasts of same-sized vectors; peeking through to
7925 // extracted subvectors is likely to cause hasOneUse issues with
7926 // SimplifyDemandedBits etc.
7927 if (N->getOperand(0).getValueType() == VT) {
7928 DecodeVectorBroadcast(NumElems, Mask);
7929 IsUnary = true;
7930 break;
7931 }
7932 return false;
7933 case X86ISD::VPERMILPV: {
7934 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7935 IsUnary = true;
7936 SDValue MaskNode = N->getOperand(1);
7937 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7938 RawUndefs)) {
7939 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7940 break;
7941 }
7942 return false;
7943 }
7944 case X86ISD::PSHUFB: {
7945 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7946 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7947 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7948 IsUnary = true;
7949 SDValue MaskNode = N->getOperand(1);
7950 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7951 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7952 break;
7953 }
7954 return false;
7955 }
7956 case X86ISD::VPERMI:
7957 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 DecodeVPERMMask(NumElems, ImmN, Mask);
7960 IsUnary = true;
7961 break;
7962 case X86ISD::MOVSS:
7963 case X86ISD::MOVSD:
7964 case X86ISD::MOVSH:
7965 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7966 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7967 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7968 break;
7969 case X86ISD::VPERM2X128:
7970 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7971 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7972 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7973 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7974 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7975 break;
7976 case X86ISD::SHUF128:
7977 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7978 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7979 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7980 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7981 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7982 break;
7983 case X86ISD::MOVSLDUP:
7984 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7985 DecodeMOVSLDUPMask(NumElems, Mask);
7986 IsUnary = true;
7987 break;
7988 case X86ISD::MOVSHDUP:
7989 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7990 DecodeMOVSHDUPMask(NumElems, Mask);
7991 IsUnary = true;
7992 break;
7993 case X86ISD::MOVDDUP:
7994 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7995 DecodeMOVDDUPMask(NumElems, Mask);
7996 IsUnary = true;
7997 break;
7998 case X86ISD::VPERMIL2: {
7999 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8000 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8001 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8002 SDValue MaskNode = N->getOperand(2);
8003 SDValue CtrlNode = N->getOperand(3);
8004 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
8005 unsigned CtrlImm = CtrlOp->getZExtValue();
8006 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8007 RawUndefs)) {
8008 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
8009 Mask);
8010 break;
8011 }
8012 }
8013 return false;
8014 }
8015 case X86ISD::VPPERM: {
8016 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8017 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8018 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8019 SDValue MaskNode = N->getOperand(2);
8020 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8021 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8022 break;
8023 }
8024 return false;
8025 }
8026 case X86ISD::VPERMV: {
8027 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8028 IsUnary = true;
8029 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8030 Ops.push_back(N->getOperand(1));
8031 SDValue MaskNode = N->getOperand(0);
8032 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8033 RawUndefs)) {
8034 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8035 break;
8036 }
8037 return false;
8038 }
8039 case X86ISD::VPERMV3: {
8040 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8041 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8042 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8043 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8044 Ops.push_back(N->getOperand(0));
8045 Ops.push_back(N->getOperand(2));
8046 SDValue MaskNode = N->getOperand(1);
8047 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8048 RawUndefs)) {
8049 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8050 break;
8051 }
8052 return false;
8053 }
8054 default: llvm_unreachable("unknown target shuffle node");
8055 }
8056
8057 // Empty mask indicates the decode failed.
8058 if (Mask.empty())
8059 return false;
8060
8061 // Check if we're getting a shuffle mask with zero'd elements.
8062 if (!AllowSentinelZero && isAnyZero(Mask))
8063 return false;
8064
8065 // If we have a fake unary shuffle, the shuffle mask is spread across two
8066 // inputs that are actually the same node. Re-map the mask to always point
8067 // into the first input.
8068 if (IsFakeUnary)
8069 for (int &M : Mask)
8070 if (M >= (int)Mask.size())
8071 M -= Mask.size();
8072
8073 // If we didn't already add operands in the opcode-specific code, default to
8074 // adding 1 or 2 operands starting at 0.
8075 if (Ops.empty()) {
8076 Ops.push_back(N->getOperand(0));
8077 if (!IsUnary || IsFakeUnary)
8078 Ops.push_back(N->getOperand(1));
8079 }
8080
8081 return true;
8082}
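
A minimal standalone sketch of the fake-unary remapping performed at the end of getTargetShuffleMask above, assuming plain std::vector in place of SmallVectorImpl (this sketch is an illustration, not code from this file):

    // Sketch: when both shuffle operands are the same node, indices that point
    // into the "second" input (>= Mask.size()) are folded back into the first.
    #include <cstdio>
    #include <vector>

    static void remapFakeUnaryMask(std::vector<int> &Mask) {
      int Size = static_cast<int>(Mask.size());
      for (int &M : Mask)
        if (M >= Size)
          M -= Size; // element i of operand 1 == element i of operand 0
    }

    int main() {
      // v4 shuffle <0,5,2,7> with identical inputs becomes <0,1,2,3>.
      std::vector<int> Mask = {0, 5, 2, 7};
      remapFakeUnaryMask(Mask);
      for (int M : Mask)
        std::printf("%d ", M);
      std::printf("\n"); // prints: 0 1 2 3
      return 0;
    }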
8083
8084// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
8085static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8086 SmallVectorImpl<SDValue> &Ops,
8087 SmallVectorImpl<int> &Mask) {
8088 bool IsUnary;
8089 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8090}
8091
8092/// Compute whether each element of a shuffle is zeroable.
8093///
8094/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8095/// Either it is an undef element in the shuffle mask, the element of the input
8096/// referenced is undef, or the element of the input referenced is known to be
8097/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8098/// as many lanes with this technique as possible to simplify the remaining
8099/// shuffle.
8100static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8101 SDValue V1, SDValue V2,
8102 APInt &KnownUndef, APInt &KnownZero) {
8103 int Size = Mask.size();
8104 KnownUndef = KnownZero = APInt::getZero(Size);
8105
8106 V1 = peekThroughBitcasts(V1);
8107 V2 = peekThroughBitcasts(V2);
8108
8109 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8110 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8111
8112 int VectorSizeInBits = V1.getValueSizeInBits();
8113 int ScalarSizeInBits = VectorSizeInBits / Size;
8114 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8115
8116 for (int i = 0; i < Size; ++i) {
8117 int M = Mask[i];
8118 // Handle the easy cases.
8119 if (M < 0) {
8120 KnownUndef.setBit(i);
8121 continue;
8122 }
8123 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8124 KnownZero.setBit(i);
8125 continue;
8126 }
8127
8128 // Determine shuffle input and normalize the mask.
8129 SDValue V = M < Size ? V1 : V2;
8130 M %= Size;
8131
8132 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8133 if (V.getOpcode() != ISD::BUILD_VECTOR)
8134 continue;
8135
8136 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8137 // the (larger) source element must be UNDEF/ZERO.
8138 if ((Size % V.getNumOperands()) == 0) {
8139 int Scale = Size / V->getNumOperands();
8140 SDValue Op = V.getOperand(M / Scale);
8141 if (Op.isUndef())
8142 KnownUndef.setBit(i);
8143 if (X86::isZeroNode(Op))
8144 KnownZero.setBit(i);
8145 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8146 APInt Val = Cst->getAPIntValue();
8147 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8148 if (Val == 0)
8149 KnownZero.setBit(i);
8150 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8151 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8152 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8153 if (Val == 0)
8154 KnownZero.setBit(i);
8155 }
8156 continue;
8157 }
8158
8159 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8160 // elements must be UNDEF or ZERO.
8161 if ((V.getNumOperands() % Size) == 0) {
8162 int Scale = V->getNumOperands() / Size;
8163 bool AllUndef = true;
8164 bool AllZero = true;
8165 for (int j = 0; j < Scale; ++j) {
8166 SDValue Op = V.getOperand((M * Scale) + j);
8167 AllUndef &= Op.isUndef();
8168 AllZero &= X86::isZeroNode(Op);
8169 }
8170 if (AllUndef)
8171 KnownUndef.setBit(i);
8172 if (AllZero)
8173 KnownZero.setBit(i);
8174 continue;
8175 }
8176 }
8177}
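
A simplified standalone sketch of the zeroable-element idea in computeZeroableShuffleElements above, with the SelectionDAG inputs replaced by plain per-lane integer values (an assumption of this sketch; the real code inspects BUILD_VECTOR operands and handles bitcast scaling):

    #include <cstdio>
    #include <vector>

    struct Zeroable { std::vector<bool> KnownUndef, KnownZero; };

    // Mask entries < 0 are undef; V1/V2 give the value of each input lane.
    static Zeroable computeZeroable(const std::vector<int> &Mask,
                                    const std::vector<int> &V1,
                                    const std::vector<int> &V2) {
      int Size = static_cast<int>(Mask.size());
      Zeroable Z{std::vector<bool>(Size), std::vector<bool>(Size)};
      for (int i = 0; i < Size; ++i) {
        int M = Mask[i];
        if (M < 0) { Z.KnownUndef[i] = true; continue; } // undef mask entry
        int Val = (M < Size) ? V1[M] : V2[M - Size];     // referenced lane
        if (Val == 0)
          Z.KnownZero[i] = true;                         // known-zero lane
      }
      return Z;
    }

    int main() {
      // Shuffle <0,-1,5,2> of V1={1,0,3,4}, V2={0,0,7,8}:
      // lane 1 is undef, lane 2 reads V2[1]==0, lane 3 reads V1[2]==3.
      Zeroable Z = computeZeroable({0, -1, 5, 2}, {1, 0, 3, 4}, {0, 0, 7, 8});
      for (size_t i = 0; i < 4; ++i)
        std::printf("lane %zu: undef=%d zero=%d\n", i, (int)Z.KnownUndef[i],
                    (int)Z.KnownZero[i]);
      return 0;
    }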
8178
8179/// Decode a target shuffle mask and inputs and see if any values are
8180/// known to be undef or zero from their inputs.
8181/// Returns true if the target shuffle mask was decoded.
8182/// FIXME: Merge this with computeZeroableShuffleElements?
8183static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8184 SmallVectorImpl<SDValue> &Ops,
8185 APInt &KnownUndef, APInt &KnownZero) {
8186 bool IsUnary;
8187 if (!isTargetShuffle(N.getOpcode()))
8188 return false;
8189
8190 MVT VT = N.getSimpleValueType();
8191 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8192 return false;
8193
8194 int Size = Mask.size();
8195 SDValue V1 = Ops[0];
8196 SDValue V2 = IsUnary ? V1 : Ops[1];
8197 KnownUndef = KnownZero = APInt::getZero(Size);
8198
8199 V1 = peekThroughBitcasts(V1);
8200 V2 = peekThroughBitcasts(V2);
8201
8202 assert((VT.getSizeInBits() % Size) == 0 &&
8203        "Illegal split of shuffle value type");
8204 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8205
8206 // Extract known constant input data.
8207 APInt UndefSrcElts[2];
8208 SmallVector<APInt, 32> SrcEltBits[2];
8209 bool IsSrcConstant[2] = {
8210 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8211 SrcEltBits[0], true, false),
8212 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8213 SrcEltBits[1], true, false)};
8214
8215 for (int i = 0; i < Size; ++i) {
8216 int M = Mask[i];
8217
8218 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8219 if (M < 0) {
8220 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8221 if (SM_SentinelUndef == M)
8222 KnownUndef.setBit(i);
8223 if (SM_SentinelZero == M)
8224 KnownZero.setBit(i);
8225 continue;
8226 }
8227
8228 // Determine shuffle input and normalize the mask.
8229 unsigned SrcIdx = M / Size;
8230 SDValue V = M < Size ? V1 : V2;
8231 M %= Size;
8232
8233 // We are referencing an UNDEF input.
8234 if (V.isUndef()) {
8235 KnownUndef.setBit(i);
8236 continue;
8237 }
8238
8239 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8240 // TODO: We currently only set UNDEF for integer types - floats use the same
8241 // registers as vectors and many of the scalar folded loads rely on the
8242 // SCALAR_TO_VECTOR pattern.
8243 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8244 (Size % V.getValueType().getVectorNumElements()) == 0) {
8245 int Scale = Size / V.getValueType().getVectorNumElements();
8246 int Idx = M / Scale;
8247 if (Idx != 0 && !VT.isFloatingPoint())
8248 KnownUndef.setBit(i);
8249 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8250 KnownZero.setBit(i);
8251 continue;
8252 }
8253
8254 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8255 // base vectors.
8256 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8257 SDValue Vec = V.getOperand(0);
8258 int NumVecElts = Vec.getValueType().getVectorNumElements();
8259 if (Vec.isUndef() && Size == NumVecElts) {
8260 int Idx = V.getConstantOperandVal(2);
8261 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8262 if (M < Idx || (Idx + NumSubElts) <= M)
8263 KnownUndef.setBit(i);
8264 }
8265 continue;
8266 }
8267
8268 // Attempt to extract from the source's constant bits.
8269 if (IsSrcConstant[SrcIdx]) {
8270 if (UndefSrcElts[SrcIdx][M])
8271 KnownUndef.setBit(i);
8272 else if (SrcEltBits[SrcIdx][M] == 0)
8273 KnownZero.setBit(i);
8274 }
8275 }
8276
8277 assert(VT.getVectorNumElements() == (unsigned)Size &&
8278        "Different mask size from vector size!");
8279 return true;
8280}
8281
8282// Replace target shuffle mask elements with known undef/zero sentinels.
8283static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8284 const APInt &KnownUndef,
8285 const APInt &KnownZero,
8286 bool ResolveKnownZeros= true) {
8287 unsigned NumElts = Mask.size();
8288 assert(KnownUndef.getBitWidth() == NumElts &&
8289        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8290
8291 for (unsigned i = 0; i != NumElts; ++i) {
8292 if (KnownUndef[i])
8293 Mask[i] = SM_SentinelUndef;
8294 else if (ResolveKnownZeros && KnownZero[i])
8295 Mask[i] = SM_SentinelZero;
8296 }
8297}
8298
8299// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8300static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8301 APInt &KnownUndef,
8302 APInt &KnownZero) {
8303 unsigned NumElts = Mask.size();
8304 KnownUndef = KnownZero = APInt::getZero(NumElts);
8305
8306 for (unsigned i = 0; i != NumElts; ++i) {
8307 int M = Mask[i];
8308 if (SM_SentinelUndef == M)
8309 KnownUndef.setBit(i);
8310 if (SM_SentinelZero == M)
8311 KnownZero.setBit(i);
8312 }
8313}
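
A standalone sketch showing that the two helpers above carry the same information in two forms, using -1/-2 as stand-ins for SM_SentinelUndef/SM_SentinelZero and std::bitset in place of APInt (assumptions of this sketch):

    #include <bitset>
    #include <cstdio>
    #include <vector>

    constexpr int SentinelUndef = -1, SentinelZero = -2;

    int main() {
      std::vector<int> Mask = {3, SentinelUndef, SentinelZero, 0};

      // Mask sentinels -> bitmasks (resolveZeroablesFromTargetShuffle direction).
      std::bitset<4> KnownUndef, KnownZero;
      for (unsigned i = 0; i != Mask.size(); ++i) {
        if (Mask[i] == SentinelUndef) KnownUndef.set(i);
        if (Mask[i] == SentinelZero)  KnownZero.set(i);
      }

      // Bitmasks -> mask sentinels (resolveTargetShuffleFromZeroables direction).
      std::vector<int> Rebuilt = {3, 1, 2, 0};
      for (unsigned i = 0; i != Rebuilt.size(); ++i) {
        if (KnownUndef[i]) Rebuilt[i] = SentinelUndef;
        else if (KnownZero[i]) Rebuilt[i] = SentinelZero;
      }

      std::printf("undef=%s zero=%s rebuilt={%d,%d,%d,%d}\n",
                  KnownUndef.to_string().c_str(), KnownZero.to_string().c_str(),
                  Rebuilt[0], Rebuilt[1], Rebuilt[2], Rebuilt[3]);
      return 0;
    }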
8314
8315// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8316static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8317 SDValue Cond, bool IsBLENDV = false) {
8318 EVT CondVT = Cond.getValueType();
8319 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8320 unsigned NumElts = CondVT.getVectorNumElements();
8321
8322 APInt UndefElts;
8323 SmallVector<APInt, 32> EltBits;
8324 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8325 true, false))
8326 return false;
8327
8328 Mask.resize(NumElts, SM_SentinelUndef);
8329
8330 for (int i = 0; i != (int)NumElts; ++i) {
8331 Mask[i] = i;
8332 // Arbitrarily choose from the 2nd operand if the select condition element
8333 // is undef.
8334 // TODO: Can we do better by matching patterns such as even/odd?
8335 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8336 (IsBLENDV && EltBits[i].isNonNegative()))
8337 Mask[i] += NumElts;
8338 }
8339
8340 return true;
8341}
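
A standalone sketch of the VSELECT-condition-to-blend-mask conversion above, with the constant condition modelled as optional integers (nullopt standing in for an undef lane; this mirrors the non-BLENDV path and is an illustration only):

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Condition lanes: nonzero = take operand 0, zero = take operand 1.
    static std::vector<int>
    blendMaskFromCondition(const std::vector<std::optional<int>> &Cond) {
      unsigned NumElts = Cond.size();
      std::vector<int> Mask(NumElts);
      for (unsigned i = 0; i != NumElts; ++i) {
        Mask[i] = (int)i;
        // Undef condition lanes arbitrarily pick the second operand, mirroring
        // the code above.
        if (!Cond[i] || *Cond[i] == 0)
          Mask[i] += NumElts;
      }
      return Mask;
    }

    int main() {
      // cond = <1, 0, undef, 1> over 4 elements -> blend mask <0, 5, 6, 3>.
      std::vector<std::optional<int>> Cond = {1, 0, std::nullopt, 1};
      for (int M : blendMaskFromCondition(Cond))
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }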
8342
8343// Forward declaration (for getFauxShuffleMask recursive check).
8344static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8345 SmallVectorImpl<SDValue> &Inputs,
8346 SmallVectorImpl<int> &Mask,
8347 const SelectionDAG &DAG, unsigned Depth,
8348 bool ResolveKnownElts);
8349
8350// Attempt to decode ops that could be represented as a shuffle mask.
8351 // The decoded shuffle mask may contain a different number of elements than
8352 // the destination value type.
8353// TODO: Merge into getTargetShuffleInputs()
8354static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8355 SmallVectorImpl<int> &Mask,
8356 SmallVectorImpl<SDValue> &Ops,
8357 const SelectionDAG &DAG, unsigned Depth,
8358 bool ResolveKnownElts) {
8359 Mask.clear();
8360 Ops.clear();
8361
8362 MVT VT = N.getSimpleValueType();
8363 unsigned NumElts = VT.getVectorNumElements();
8364 unsigned NumSizeInBits = VT.getSizeInBits();
8365 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8366 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8367 return false;
8368 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8369 unsigned NumSizeInBytes = NumSizeInBits / 8;
8370 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8371
8372 unsigned Opcode = N.getOpcode();
8373 switch (Opcode) {
8374 case ISD::VECTOR_SHUFFLE: {
8375 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8376 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8377 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8378 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8379 Ops.push_back(N.getOperand(0));
8380 Ops.push_back(N.getOperand(1));
8381 return true;
8382 }
8383 return false;
8384 }
8385 case ISD::AND:
8386 case X86ISD::ANDNP: {
8387 // Attempt to decode as a per-byte mask.
8388 APInt UndefElts;
8389 SmallVector<APInt, 32> EltBits;
8390 SDValue N0 = N.getOperand(0);
8391 SDValue N1 = N.getOperand(1);
8392 bool IsAndN = (X86ISD::ANDNP == Opcode);
8393 uint64_t ZeroMask = IsAndN ? 255 : 0;
8394 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8395 return false;
8396 // We can't assume an undef src element gives an undef dst - the other src
8397 // might be zero.
8398 if (!UndefElts.isZero())
8399 return false;
8400 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8401 const APInt &ByteBits = EltBits[i];
8402 if (ByteBits != 0 && ByteBits != 255)
8403 return false;
8404 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8405 }
8406 Ops.push_back(IsAndN ? N1 : N0);
8407 return true;
8408 }
8409 case ISD::OR: {
8410 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8411 // is a valid shuffle index.
8412 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8413 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8414 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8415 return false;
8416
8417 SmallVector<int, 64> SrcMask0, SrcMask1;
8418 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8419 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8420 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8421 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8422 Depth + 1, true) ||
8423 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8424 Depth + 1, true))
8425 return false;
8426
8427 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8428 SmallVector<int, 64> Mask0, Mask1;
8429 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8430 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8431 for (int i = 0; i != (int)MaskSize; ++i) {
8432 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8433 // loops converting between OR and BLEND shuffles due to
8434 // canWidenShuffleElements merging away undef elements, meaning we
8435 // fail to recognise the OR as the undef element isn't known zero.
8436 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8437 Mask.push_back(SM_SentinelZero);
8438 else if (Mask1[i] == SM_SentinelZero)
8439 Mask.push_back(i);
8440 else if (Mask0[i] == SM_SentinelZero)
8441 Mask.push_back(i + MaskSize);
8442 else
8443 return false;
8444 }
8445 Ops.push_back(N0);
8446 Ops.push_back(N1);
8447 return true;
8448 }
8449 case ISD::INSERT_SUBVECTOR: {
8450 SDValue Src = N.getOperand(0);
8451 SDValue Sub = N.getOperand(1);
8452 EVT SubVT = Sub.getValueType();
8453 unsigned NumSubElts = SubVT.getVectorNumElements();
8454 if (!N->isOnlyUserOf(Sub.getNode()))
8455 return false;
8456 uint64_t InsertIdx = N.getConstantOperandVal(2);
8457 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8458 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8459 Sub.getOperand(0).getValueType() == VT) {
8460 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8461 for (int i = 0; i != (int)NumElts; ++i)
8462 Mask.push_back(i);
8463 for (int i = 0; i != (int)NumSubElts; ++i)
8464 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8465 Ops.push_back(Src);
8466 Ops.push_back(Sub.getOperand(0));
8467 return true;
8468 }
8469 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8470 SmallVector<int, 64> SubMask;
8471 SmallVector<SDValue, 2> SubInputs;
8472 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8473 EVT SubSrcVT = SubSrc.getValueType();
8474 if (!SubSrcVT.isVector())
8475 return false;
8476
8477 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8478 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8479 Depth + 1, ResolveKnownElts))
8480 return false;
8481
8482 // Subvector shuffle inputs must not be larger than the subvector.
8483 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8484 return SubVT.getFixedSizeInBits() <
8485 SubInput.getValueSizeInBits().getFixedValue();
8486 }))
8487 return false;
8488
8489 if (SubMask.size() != NumSubElts) {
8490 assert(((SubMask.size() % NumSubElts) == 0 ||
8491         (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8492 if ((NumSubElts % SubMask.size()) == 0) {
8493 int Scale = NumSubElts / SubMask.size();
8494 SmallVector<int,64> ScaledSubMask;
8495 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8496 SubMask = ScaledSubMask;
8497 } else {
8498 int Scale = SubMask.size() / NumSubElts;
8499 NumSubElts = SubMask.size();
8500 NumElts *= Scale;
8501 InsertIdx *= Scale;
8502 }
8503 }
8504 Ops.push_back(Src);
8505 Ops.append(SubInputs.begin(), SubInputs.end());
8506 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8507 Mask.append(NumElts, SM_SentinelZero);
8508 else
8509 for (int i = 0; i != (int)NumElts; ++i)
8510 Mask.push_back(i);
8511 for (int i = 0; i != (int)NumSubElts; ++i) {
8512 int M = SubMask[i];
8513 if (0 <= M) {
8514 int InputIdx = M / NumSubElts;
8515 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8516 }
8517 Mask[i + InsertIdx] = M;
8518 }
8519 return true;
8520 }
8521 case X86ISD::PINSRB:
8522 case X86ISD::PINSRW:
8523 case ISD::SCALAR_TO_VECTOR:
8524 case ISD::INSERT_VECTOR_ELT: {
8525 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8526 // vector, for matching src/dst vector types.
8527 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8528
8529 unsigned DstIdx = 0;
8530 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8531 // Check we have an in-range constant insertion index.
8532 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8533 N.getConstantOperandAPInt(2).uge(NumElts))
8534 return false;
8535 DstIdx = N.getConstantOperandVal(2);
8536
8537 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8538 if (X86::isZeroNode(Scl)) {
8539 Ops.push_back(N.getOperand(0));
8540 for (unsigned i = 0; i != NumElts; ++i)
8541 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8542 return true;
8543 }
8544 }
8545
8546 // Peek through trunc/aext/zext.
8547 // TODO: aext shouldn't require SM_SentinelZero padding.
8548 // TODO: handle shift of scalars.
8549 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8550 while (Scl.getOpcode() == ISD::TRUNCATE ||
8551 Scl.getOpcode() == ISD::ANY_EXTEND ||
8552 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8553 Scl = Scl.getOperand(0);
8554 MinBitsPerElt =
8555 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8556 }
8557 if ((MinBitsPerElt % 8) != 0)
8558 return false;
8559
8560 // Attempt to find the source vector the scalar was extracted from.
8561 SDValue SrcExtract;
8562 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8563 Scl.getOpcode() == X86ISD::PEXTRW ||
8564 Scl.getOpcode() == X86ISD::PEXTRB) &&
8565 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8566 SrcExtract = Scl;
8567 }
8568 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8569 return false;
8570
8571 SDValue SrcVec = SrcExtract.getOperand(0);
8572 EVT SrcVT = SrcVec.getValueType();
8573 if (!SrcVT.getScalarType().isByteSized())
8574 return false;
8575 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8576 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8577 unsigned DstByte = DstIdx * NumBytesPerElt;
8578 MinBitsPerElt =
8579 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8580
8581 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8582 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8583 Ops.push_back(SrcVec);
8584 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8585 } else {
8586 Ops.push_back(SrcVec);
8587 Ops.push_back(N.getOperand(0));
8588 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8589 Mask.push_back(NumSizeInBytes + i);
8590 }
8591
8592 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8593 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8594 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8595 Mask[DstByte + i] = SrcByte + i;
8596 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8597 Mask[DstByte + i] = SM_SentinelZero;
8598 return true;
8599 }
8600 case X86ISD::PACKSS:
8601 case X86ISD::PACKUS: {
8602 SDValue N0 = N.getOperand(0);
8603 SDValue N1 = N.getOperand(1);
8604 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8605        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8606        "Unexpected input value type");
8607
8608 APInt EltsLHS, EltsRHS;
8609 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8610
8611 // If we know input saturation won't happen (or we don't care for particular
8612 // lanes), we can treat this as a truncation shuffle.
8613 bool Offset0 = false, Offset1 = false;
8614 if (Opcode == X86ISD::PACKSS) {
8615 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8616 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8617 (!(N1.isUndef() || EltsRHS.isZero()) &&
8618 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8619 return false;
8620 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8621 // PACKSS then it was likely being used for sign-extension for a
8622 // truncation, so just peek through and adjust the mask accordingly.
8623 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8624 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8625 Offset0 = true;
8626 N0 = N0.getOperand(0);
8627 }
8628 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8629 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8630 Offset1 = true;
8631 N1 = N1.getOperand(0);
8632 }
8633 } else {
8634 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8635 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8636 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8637 (!(N1.isUndef() || EltsRHS.isZero()) &&
8638 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8639 return false;
8640 }
8641
8642 bool IsUnary = (N0 == N1);
8643
8644 Ops.push_back(N0);
8645 if (!IsUnary)
8646 Ops.push_back(N1);
8647
8648 createPackShuffleMask(VT, Mask, IsUnary);
8649
8650 if (Offset0 || Offset1) {
8651 for (int &M : Mask)
8652 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8653 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8654 ++M;
8655 }
8656 return true;
8657 }
8658 case ISD::VSELECT:
8659 case X86ISD::BLENDV: {
8660 SDValue Cond = N.getOperand(0);
8661 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8662 Ops.push_back(N.getOperand(1));
8663 Ops.push_back(N.getOperand(2));
8664 return true;
8665 }
8666 return false;
8667 }
8668 case X86ISD::VTRUNC: {
8669 SDValue Src = N.getOperand(0);
8670 EVT SrcVT = Src.getValueType();
8671 // Truncated source must be a simple vector.
8672 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8673 (SrcVT.getScalarSizeInBits() % 8) != 0)
8674 return false;
8675 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8676 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8677 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8678 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8679 for (unsigned i = 0; i != NumSrcElts; ++i)
8680 Mask.push_back(i * Scale);
8681 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8682 Ops.push_back(Src);
8683 return true;
8684 }
8685 case X86ISD::VSHLI:
8686 case X86ISD::VSRLI: {
8687 uint64_t ShiftVal = N.getConstantOperandVal(1);
8688 // Out of range bit shifts are guaranteed to be zero.
8689 if (NumBitsPerElt <= ShiftVal) {
8690 Mask.append(NumElts, SM_SentinelZero);
8691 return true;
8692 }
8693
8694 // We can only decode 'whole byte' bit shifts as shuffles.
8695 if ((ShiftVal % 8) != 0)
8696 break;
8697
8698 uint64_t ByteShift = ShiftVal / 8;
8699 Ops.push_back(N.getOperand(0));
8700
8701 // Clear mask to all zeros and insert the shifted byte indices.
8702 Mask.append(NumSizeInBytes, SM_SentinelZero);
8703
8704 if (X86ISD::VSHLI == Opcode) {
8705 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8706 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8707 Mask[i + j] = i + j - ByteShift;
8708 } else {
8709 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8710 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8711 Mask[i + j - ByteShift] = i + j;
8712 }
8713 return true;
8714 }
8715 case X86ISD::VROTLI:
8716 case X86ISD::VROTRI: {
8717 // We can only decode 'whole byte' bit rotates as shuffles.
8718 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8719 if ((RotateVal % 8) != 0)
8720 return false;
8721 Ops.push_back(N.getOperand(0));
8722 int Offset = RotateVal / 8;
8723 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8724 for (int i = 0; i != (int)NumElts; ++i) {
8725 int BaseIdx = i * NumBytesPerElt;
8726 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8727 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8728 }
8729 }
8730 return true;
8731 }
8732 case X86ISD::VBROADCAST: {
8733 SDValue Src = N.getOperand(0);
8734 if (!Src.getSimpleValueType().isVector()) {
8735 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8736 !isNullConstant(Src.getOperand(1)) ||
8737 Src.getOperand(0).getValueType().getScalarType() !=
8738 VT.getScalarType())
8739 return false;
8740 Src = Src.getOperand(0);
8741 }
8742 Ops.push_back(Src);
8743 Mask.append(NumElts, 0);
8744 return true;
8745 }
8746 case ISD::ZERO_EXTEND:
8747 case ISD::ANY_EXTEND:
8748 case ISD::ZERO_EXTEND_VECTOR_INREG:
8749 case ISD::ANY_EXTEND_VECTOR_INREG: {
8750 SDValue Src = N.getOperand(0);
8751 EVT SrcVT = Src.getValueType();
8752
8753 // Extended source must be a simple vector.
8754 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8755 (SrcVT.getScalarSizeInBits() % 8) != 0)
8756 return false;
8757
8758 bool IsAnyExtend =
8759 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8760 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8761 IsAnyExtend, Mask);
8762 Ops.push_back(Src);
8763 return true;
8764 }
8765 }
8766
8767 return false;
8768}
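
A standalone sketch of just the whole-byte shift decoding from the X86ISD::VSHLI/VSRLI case above, using -2 as a stand-in for SM_SentinelZero (an illustration under those assumptions, not the file's own code):

    // A per-element left shift by a whole number of bytes is the same as a
    // byte shuffle where the low ByteShift bytes of each element become zero.
    #include <cstdio>
    #include <vector>

    constexpr int SentinelZero = -2;

    static std::vector<int> decodeByteShl(unsigned NumBytesPerElt,
                                          unsigned NumElts, unsigned ByteShift) {
      std::vector<int> Mask(NumBytesPerElt * NumElts, SentinelZero);
      for (unsigned i = 0; i != NumBytesPerElt * NumElts; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = (int)(i + j - ByteShift); // shifted-in byte index
      return Mask;
    }

    int main() {
      // v2i32 shifted left by 8 bits (1 byte): each 4-byte lane keeps bytes
      // 0..2 moved up by one, and byte 0 of each lane is known zero.
      for (int M : decodeByteShl(/*NumBytesPerElt=*/4, /*NumElts=*/2,
                                 /*ByteShift=*/1))
        std::printf("%d ", M);
      std::printf("\n"); // prints: -2 0 1 2 -2 4 5 6
      return 0;
    }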
8769
8770/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8771static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8772 SmallVectorImpl<int> &Mask) {
8773 int MaskWidth = Mask.size();
8774 SmallVector<SDValue, 16> UsedInputs;
8775 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8776 int lo = UsedInputs.size() * MaskWidth;
8777 int hi = lo + MaskWidth;
8778
8779 // Strip UNDEF input usage.
8780 if (Inputs[i].isUndef())
8781 for (int &M : Mask)
8782 if ((lo <= M) && (M < hi))
8783 M = SM_SentinelUndef;
8784
8785 // Check for unused inputs.
8786 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8787 for (int &M : Mask)
8788 if (lo <= M)
8789 M -= MaskWidth;
8790 continue;
8791 }
8792
8793 // Check for repeated inputs.
8794 bool IsRepeat = false;
8795 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8796 if (UsedInputs[j] != Inputs[i])
8797 continue;
8798 for (int &M : Mask)
8799 if (lo <= M)
8800 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8801 IsRepeat = true;
8802 break;
8803 }
8804 if (IsRepeat)
8805 continue;
8806
8807 UsedInputs.push_back(Inputs[i]);
8808 }
8809 Inputs = UsedInputs;
8810}
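
A standalone sketch of the unused/repeated-input resolution above, with inputs modelled as strings rather than SDValues and the undef-input stripping omitted for brevity (assumptions of this sketch):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    static void resolveInputs(std::vector<std::string> &Inputs,
                              std::vector<int> &Mask) {
      int MaskWidth = (int)Mask.size();
      std::vector<std::string> Used;
      for (const std::string &In : Inputs) {
        int lo = (int)Used.size() * MaskWidth, hi = lo + MaskWidth;
        // Drop inputs no mask element refers to, shifting later indices down.
        if (std::none_of(Mask.begin(), Mask.end(),
                         [lo, hi](int M) { return lo <= M && M < hi; })) {
          for (int &M : Mask)
            if (M >= lo)
              M -= MaskWidth;
          continue;
        }
        // Merge a repeated input into its first occurrence.
        auto It = std::find(Used.begin(), Used.end(), In);
        if (It != Used.end()) {
          int j = (int)(It - Used.begin());
          for (int &M : Mask)
            if (M >= lo)
              M = (M < hi) ? (M - lo) + j * MaskWidth : M - MaskWidth;
          continue;
        }
        Used.push_back(In);
      }
      Inputs = Used;
    }

    int main() {
      // Inputs A, B, A with a v2 mask <0,4>: B is unused and dropped, the
      // repeated A is merged, leaving a single input with mask <0,0>.
      std::vector<std::string> Inputs = {"A", "B", "A"};
      std::vector<int> Mask = {0, 4};
      resolveInputs(Inputs, Mask);
      std::printf("inputs=%zu mask={%d,%d}\n", Inputs.size(), Mask[0], Mask[1]);
      return 0; // prints: inputs=1 mask={0,0}
    }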
8811
8812/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8813/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8814/// Returns true if the target shuffle mask was decoded.
8815static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8816 SmallVectorImpl<SDValue> &Inputs,
8817 SmallVectorImpl<int> &Mask,
8818 APInt &KnownUndef, APInt &KnownZero,
8819 const SelectionDAG &DAG, unsigned Depth,
8820 bool ResolveKnownElts) {
8821 if (Depth >= SelectionDAG::MaxRecursionDepth)
8822 return false; // Limit search depth.
8823
8824 EVT VT = Op.getValueType();
8825 if (!VT.isSimple() || !VT.isVector())
8826 return false;
8827
8828 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8829 if (ResolveKnownElts)
8830 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8831 return true;
8832 }
8833 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8834 ResolveKnownElts)) {
8835 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8836 return true;
8837 }
8838 return false;
8839}
8840
8841static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8842 SmallVectorImpl<SDValue> &Inputs,
8843 SmallVectorImpl<int> &Mask,
8844 const SelectionDAG &DAG, unsigned Depth,
8845 bool ResolveKnownElts) {
8846 APInt KnownUndef, KnownZero;
8847 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8848 KnownZero, DAG, Depth, ResolveKnownElts);
8849}
8850
8851static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8852 SmallVectorImpl<int> &Mask,
8853 const SelectionDAG &DAG, unsigned Depth = 0,
8854 bool ResolveKnownElts = true) {
8855 EVT VT = Op.getValueType();
8856 if (!VT.isSimple() || !VT.isVector())
8857 return false;
8858
8859 unsigned NumElts = Op.getValueType().getVectorNumElements();
8860 APInt DemandedElts = APInt::getAllOnes(NumElts);
8861 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8862 ResolveKnownElts);
8863}
8864
8865// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8866static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8867 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8868 SelectionDAG &DAG) {
8869 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8870         Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8871        "Unknown broadcast load type");
8872
8873 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8874 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8875 return SDValue();
8876
8877 SDValue Ptr =
8878 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8879 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8880 SDValue Ops[] = {Mem->getChain(), Ptr};
8881 SDValue BcstLd = DAG.getMemIntrinsicNode(
8882 Opcode, DL, Tys, Ops, MemVT,
8883 DAG.getMachineFunction().getMachineMemOperand(
8884 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8885 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8886 return BcstLd;
8887}
8888
8889/// Returns the scalar element that will make up the i'th
8890/// element of the result of the vector shuffle.
8891static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8892 SelectionDAG &DAG, unsigned Depth) {
8893 if (Depth >= SelectionDAG::MaxRecursionDepth)
8894 return SDValue(); // Limit search depth.
8895
8896 EVT VT = Op.getValueType();
8897 unsigned Opcode = Op.getOpcode();
8898 unsigned NumElems = VT.getVectorNumElements();
8899
8900 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8901 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8902 int Elt = SV->getMaskElt(Index);
8903
8904 if (Elt < 0)
8905 return DAG.getUNDEF(VT.getVectorElementType());
8906
8907 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8908 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8909 }
8910
8911 // Recurse into target specific vector shuffles to find scalars.
8912 if (isTargetShuffle(Opcode)) {
8913 MVT ShufVT = VT.getSimpleVT();
8914 MVT ShufSVT = ShufVT.getVectorElementType();
8915 int NumElems = (int)ShufVT.getVectorNumElements();
8916 SmallVector<int, 16> ShuffleMask;
8917 SmallVector<SDValue, 16> ShuffleOps;
8918 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8919 ShuffleMask))
8920 return SDValue();
8921
8922 int Elt = ShuffleMask[Index];
8923 if (Elt == SM_SentinelZero)
8924 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8925 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8926 if (Elt == SM_SentinelUndef)
8927 return DAG.getUNDEF(ShufSVT);
8928
8929 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8930 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8931 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into insert_subvector base/sub vector to find scalars.
8935 if (Opcode == ISD::INSERT_SUBVECTOR) {
8936 SDValue Vec = Op.getOperand(0);
8937 SDValue Sub = Op.getOperand(1);
8938 uint64_t SubIdx = Op.getConstantOperandVal(2);
8939 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8940
8941 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8942 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8943 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8944 }
8945
8946 // Recurse into concat_vectors sub vector to find scalars.
8947 if (Opcode == ISD::CONCAT_VECTORS) {
8948 EVT SubVT = Op.getOperand(0).getValueType();
8949 unsigned NumSubElts = SubVT.getVectorNumElements();
8950 uint64_t SubIdx = Index / NumSubElts;
8951 uint64_t SubElt = Index % NumSubElts;
8952 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8953 }
8954
8955 // Recurse into extract_subvector src vector to find scalars.
8956 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8957 SDValue Src = Op.getOperand(0);
8958 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8959 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8960 }
8961
8962 // We only peek through bitcasts of the same vector width.
8963 if (Opcode == ISD::BITCAST) {
8964 SDValue Src = Op.getOperand(0);
8965 EVT SrcVT = Src.getValueType();
8966 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8967 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8968 return SDValue();
8969 }
8970
8971 // Actual nodes that may contain scalar elements
8972
8973 // For insert_vector_elt - either return the index matching scalar or recurse
8974 // into the base vector.
8975 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8976 isa<ConstantSDNode>(Op.getOperand(2))) {
8977 if (Op.getConstantOperandAPInt(2) == Index)
8978 return Op.getOperand(1);
8979 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8980 }
8981
8982 if (Opcode == ISD::SCALAR_TO_VECTOR)
8983 return (Index == 0) ? Op.getOperand(0)
8984 : DAG.getUNDEF(VT.getVectorElementType());
8985
8986 if (Opcode == ISD::BUILD_VECTOR)
8987 return Op.getOperand(Index);
8988
8989 return SDValue();
8990}
8991
8992// Use PINSRB/PINSRW/PINSRD to create a build vector.
8993static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8994 unsigned NumNonZero, unsigned NumZero,
8995 SelectionDAG &DAG,
8996 const X86Subtarget &Subtarget) {
8997 MVT VT = Op.getSimpleValueType();
8998 unsigned NumElts = VT.getVectorNumElements();
8999 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
9000         ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
9001        "Illegal vector insertion");
9002
9003 SDLoc dl(Op);
9004 SDValue V;
9005 bool First = true;
9006
9007 for (unsigned i = 0; i < NumElts; ++i) {
9008 bool IsNonZero = NonZeroMask[i];
9009 if (!IsNonZero)
9010 continue;
9011
9012 // If the build vector contains zeros, or our first insertion is not at
9013 // index 0, insert into a zero vector to break any register dependency;
9014 // otherwise use SCALAR_TO_VECTOR.
9015 if (First) {
9016 First = false;
9017 if (NumZero || 0 != i)
9018 V = getZeroVector(VT, Subtarget, DAG, dl);
9019 else {
9020 assert(0 == i && "Expected insertion into zero-index");
9021 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9022 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9023 V = DAG.getBitcast(VT, V);
9024 continue;
9025 }
9026 }
9027 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9028 DAG.getIntPtrConstant(i, dl));
9029 }
9030
9031 return V;
9032}
9033
9034/// Custom lower build_vector of v16i8.
9035static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9036 unsigned NumNonZero, unsigned NumZero,
9037 SelectionDAG &DAG,
9038 const X86Subtarget &Subtarget) {
9039 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9040 return SDValue();
9041
9042 // SSE4.1 - use PINSRB to insert each byte directly.
9043 if (Subtarget.hasSSE41())
9044 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9045 Subtarget);
9046
9047 SDLoc dl(Op);
9048 SDValue V;
9049
9050 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9051 for (unsigned i = 0; i < 16; i += 2) {
9052 bool ThisIsNonZero = NonZeroMask[i];
9053 bool NextIsNonZero = NonZeroMask[i + 1];
9054 if (!ThisIsNonZero && !NextIsNonZero)
9055 continue;
9056
9057 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
9058 SDValue Elt;
9059 if (ThisIsNonZero) {
9060 if (NumZero || NextIsNonZero)
9061 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9062 else
9063 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9064 }
9065
9066 if (NextIsNonZero) {
9067 SDValue NextElt = Op.getOperand(i + 1);
9068 if (i == 0 && NumZero)
9069 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9070 else
9071 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9072 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9073 DAG.getConstant(8, dl, MVT::i8));
9074 if (ThisIsNonZero)
9075 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9076 else
9077 Elt = NextElt;
9078 }
9079
9080 // If our first insertion is not the first index or zeros are needed, then
9081 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9082 // elements undefined).
9083 if (!V) {
9084 if (i != 0 || NumZero)
9085 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9086 else {
9087 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9088 V = DAG.getBitcast(MVT::v8i16, V);
9089 continue;
9090 }
9091 }
9092 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9093 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9094 DAG.getIntPtrConstant(i / 2, dl));
9095 }
9096
9097 return DAG.getBitcast(MVT::v16i8, V);
9098}
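
A tiny sketch of the byte-pair packing used by the pre-SSE4.1 path above, where two adjacent i8 build_vector elements are combined into one 16-bit PINSRW lane (low byte | high byte << 8); this is only an illustration of the arithmetic:

    #include <cstdint>
    #include <cstdio>

    static uint16_t packBytePair(uint8_t Lo, uint8_t Hi) {
      // The even-index byte lands in the low half, the odd-index byte is
      // shifted into the high half, matching the SHL-by-8 and OR above.
      return (uint16_t)(Lo | ((uint32_t)Hi << 8));
    }

    int main() {
      // Bytes 0x34 (even index) and 0x12 (odd index) form the lane 0x1234.
      std::printf("0x%04x\n", packBytePair(0x34, 0x12)); // prints: 0x1234
      return 0;
    }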
9099
9100/// Custom lower build_vector of v8i16.
9101static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9102 unsigned NumNonZero, unsigned NumZero,
9103 SelectionDAG &DAG,
9104 const X86Subtarget &Subtarget) {
9105 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9106 return SDValue();
9107
9108 // Use PINSRW to insert each byte directly.
9109 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9110 Subtarget);
9111}
9112
9113/// Custom lower build_vector of v4i32 or v4f32.
9114static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9115 const X86Subtarget &Subtarget) {
9116 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9117 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9118 // Because we're creating a less complicated build vector here, we may enable
9119 // further folding of the MOVDDUP via shuffle transforms.
9120 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9121 Op.getOperand(0) == Op.getOperand(2) &&
9122 Op.getOperand(1) == Op.getOperand(3) &&
9123 Op.getOperand(0) != Op.getOperand(1)) {
9124 SDLoc DL(Op);
9125 MVT VT = Op.getSimpleValueType();
9126 MVT EltVT = VT.getVectorElementType();
9127 // Create a new build vector with the first 2 elements followed by undef
9128 // padding, bitcast to v2f64, duplicate, and bitcast back.
9129 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9130 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9131 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9132 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9133 return DAG.getBitcast(VT, Dup);
9134 }
9135
9136 // Find all zeroable elements.
9137 std::bitset<4> Zeroable, Undefs;
9138 for (int i = 0; i < 4; ++i) {
9139 SDValue Elt = Op.getOperand(i);
9140 Undefs[i] = Elt.isUndef();
9141 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9142 }
9143 assert(Zeroable.size() - Zeroable.count() > 1 &&
9144        "We expect at least two non-zero elements!");
9145
9146 // We only know how to deal with build_vector nodes where elements are either
9147 // zeroable or extract_vector_elt with constant index.
9148 SDValue FirstNonZero;
9149 unsigned FirstNonZeroIdx;
9150 for (unsigned i = 0; i < 4; ++i) {
9151 if (Zeroable[i])
9152 continue;
9153 SDValue Elt = Op.getOperand(i);
9154 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9155 !isa<ConstantSDNode>(Elt.getOperand(1)))
9156 return SDValue();
9157 // Make sure that this node is extracting from a 128-bit vector.
9158 MVT VT = Elt.getOperand(0).getSimpleValueType();
9159 if (!VT.is128BitVector())
9160 return SDValue();
9161 if (!FirstNonZero.getNode()) {
9162 FirstNonZero = Elt;
9163 FirstNonZeroIdx = i;
9164 }
9165 }
9166
9167 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9168 SDValue V1 = FirstNonZero.getOperand(0);
9169 MVT VT = V1.getSimpleValueType();
9170
9171 // See if this build_vector can be lowered as a blend with zero.
9172 SDValue Elt;
9173 unsigned EltMaskIdx, EltIdx;
9174 int Mask[4];
9175 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9176 if (Zeroable[EltIdx]) {
9177 // The zero vector will be on the right hand side.
9178 Mask[EltIdx] = EltIdx+4;
9179 continue;
9180 }
9181
9182 Elt = Op->getOperand(EltIdx);
9183 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9184 EltMaskIdx = Elt.getConstantOperandVal(1);
9185 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9186 break;
9187 Mask[EltIdx] = EltIdx;
9188 }
9189
9190 if (EltIdx == 4) {
9191 // Let the shuffle legalizer deal with blend operations.
9192 SDValue VZeroOrUndef = (Zeroable == Undefs)
9193 ? DAG.getUNDEF(VT)
9194 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9195 if (V1.getSimpleValueType() != VT)
9196 V1 = DAG.getBitcast(VT, V1);
9197 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9198 }
9199
9200 // See if we can lower this build_vector to a INSERTPS.
9201 if (!Subtarget.hasSSE41())
9202 return SDValue();
9203
9204 SDValue V2 = Elt.getOperand(0);
9205 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9206 V1 = SDValue();
9207
9208 bool CanFold = true;
9209 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9210 if (Zeroable[i])
9211 continue;
9212
9213 SDValue Current = Op->getOperand(i);
9214 SDValue SrcVector = Current->getOperand(0);
9215 if (!V1.getNode())
9216 V1 = SrcVector;
9217 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9218 }
9219
9220 if (!CanFold)
9221 return SDValue();
9222
9223 assert(V1.getNode() && "Expected at least two non-zero elements!");
9224 if (V1.getSimpleValueType() != MVT::v4f32)
9225 V1 = DAG.getBitcast(MVT::v4f32, V1);
9226 if (V2.getSimpleValueType() != MVT::v4f32)
9227 V2 = DAG.getBitcast(MVT::v4f32, V2);
9228
9229 // Ok, we can emit an INSERTPS instruction.
9230 unsigned ZMask = Zeroable.to_ulong();
9231
9232 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9233 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9234 SDLoc DL(Op);
9235 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9236 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9237 return DAG.getBitcast(VT, Result);
9238}
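For readers unpacking the immediate built at line 9232, the following standalone sketch spells out the bit layout (source lane in bits [7:6], destination lane in bits [5:4], zero mask in bits [3:0]); the helper name makeInsertPSImm and the lane values are invented for illustration and are not part of X86ISelLowering.cpp.

#include <cassert>

// Illustrative sketch of the INSERTPS immediate layout used above; the helper
// name and the sample lane values are made up for this example.
static unsigned makeInsertPSImm(unsigned SrcLane, unsigned DstLane, unsigned ZMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZMask <= 0xF && "lanes are 2 bits, mask is 4 bits");
  return (SrcLane << 6) | (DstLane << 4) | ZMask; // mirrors EltMaskIdx << 6 | EltIdx << 4 | ZMask
}

int main() {
  // Copy source lane 2 into destination lane 1 and zero lane 3: 0b10011000 == 0x98.
  assert(makeInsertPSImm(2, 1, 0x8) == 0x98);
  return 0;
}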
9239
9240/// Return a vector logical shift node.
9241static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9242 SelectionDAG &DAG, const TargetLowering &TLI,
9243 const SDLoc &dl) {
9244 assert(VT.is128BitVector() && "Unknown type for VShift");
9245 MVT ShVT = MVT::v16i8;
9246 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9247 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9248 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9249 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9250 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9251}
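Since VSHLDQ/VSRLDQ shift whole bytes of a v16i8, the helper above divides the bit count by 8; a minimal standalone sketch of that conversion (the function name vectorShiftBytes and the sample values are invented) is:

#include <cassert>

// Illustrative only: bit shift amount -> byte shift amount for the whole-register
// byte shifts emitted above.
static unsigned vectorShiftBytes(unsigned NumBits) {
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  return NumBits / 8;
}

int main() {
  assert(vectorShiftBytes(64) == 8); // shifting by 64 bits moves 8 bytes
  assert(vectorShiftBytes(32) == 4);
  return 0;
}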
9252
9253static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9254 SelectionDAG &DAG) {
9255
9256 // Check if the scalar load can be widened into a vector load. And if
9257 // the address is "base + cst" see if the cst can be "absorbed" into
9258 // the shuffle mask.
9259 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9260 SDValue Ptr = LD->getBasePtr();
9261 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9262 return SDValue();
9263 EVT PVT = LD->getValueType(0);
9264 if (PVT != MVT::i32 && PVT != MVT::f32)
9265 return SDValue();
9266
9267 int FI = -1;
9268 int64_t Offset = 0;
9269 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9270 FI = FINode->getIndex();
9271 Offset = 0;
9272 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9273 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9274 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9275 Offset = Ptr.getConstantOperandVal(1);
9276 Ptr = Ptr.getOperand(0);
9277 } else {
9278 return SDValue();
9279 }
9280
9281 // FIXME: 256-bit vector instructions don't require a strict alignment,
9282 // improve this code to support it better.
9283 Align RequiredAlign(VT.getSizeInBits() / 8);
9284 SDValue Chain = LD->getChain();
9285 // Make sure the stack object alignment is at least 16 or 32.
9286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9287 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9288 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9289 if (MFI.isFixedObjectIndex(FI)) {
9290 // Can't change the alignment. FIXME: It's possible to compute
9291 // the exact stack offset and reference FI + adjust offset instead.
9292 // If someone *really* cares about this. That's the way to implement it.
9293 return SDValue();
9294 } else {
9295 MFI.setObjectAlignment(FI, RequiredAlign);
9296 }
9297 }
9298
9299 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9300 // Ptr + (Offset & ~15).
9301 if (Offset < 0)
9302 return SDValue();
9303 if ((Offset % RequiredAlign.value()) & 3)
9304 return SDValue();
9305 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9306 if (StartOffset) {
9307 SDLoc DL(Ptr);
9308 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9309 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9310 }
9311
9312 int EltNo = (Offset - StartOffset) >> 2;
9313 unsigned NumElems = VT.getVectorNumElements();
9314
9315 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9316 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9317 LD->getPointerInfo().getWithOffset(StartOffset));
9318
9319 SmallVector<int, 8> Mask(NumElems, EltNo);
9320
9321 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9322 }
9323
9324 return SDValue();
9325}
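The offset handling above rounds the constant offset down to the required alignment and turns the remainder into a splat element index; the following self-contained sketch reproduces just that arithmetic, assuming a 16-byte alignment and 4-byte elements (the names SplitOffset and splitOffset are invented for this example):

#include <cassert>
#include <cstdint>

// Illustrative only: split a byte offset into an aligned start offset plus a
// 4-byte element index, mirroring "Offset & ~15" and "(Offset - StartOffset) >> 2".
struct SplitOffset { int64_t StartOffset; int EltNo; };

static SplitOffset splitOffset(int64_t Offset, int64_t RequiredAlign) {
  assert(Offset >= 0 && ((Offset % RequiredAlign) & 3) == 0);
  int64_t StartOffset = Offset & ~(RequiredAlign - 1);
  return {StartOffset, int((Offset - StartOffset) >> 2)};
}

int main() {
  SplitOffset S = splitOffset(20, 16);          // scalar load from "base + 20", 16-byte aligned vector
  assert(S.StartOffset == 16 && S.EltNo == 1);  // widened load at base+16, splat element 1
  return 0;
}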
9326
9327 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9328static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9329 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9330 auto *BaseLd = cast<LoadSDNode>(Elt);
9331 if (!BaseLd->isSimple())
9332 return false;
9333 Ld = BaseLd;
9334 ByteOffset = 0;
9335 return true;
9336 }
9337
9338 switch (Elt.getOpcode()) {
9339 case ISD::BITCAST:
9340 case ISD::TRUNCATE:
9341 case ISD::SCALAR_TO_VECTOR:
9342 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9343 case ISD::SRL:
9344 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9345 uint64_t Amt = AmtC->getZExtValue();
9346 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9347 ByteOffset += Amt / 8;
9348 return true;
9349 }
9350 }
9351 break;
9352 case ISD::EXTRACT_VECTOR_ELT:
9353 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9354 SDValue Src = Elt.getOperand(0);
9355 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9356 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9357 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9358 findEltLoadSrc(Src, Ld, ByteOffset)) {
9359 uint64_t Idx = IdxC->getZExtValue();
9360 ByteOffset += Idx * (SrcSizeInBits / 8);
9361 return true;
9362 }
9363 }
9364 break;
9365 }
9366
9367 return false;
9368}
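findEltLoadSrc accumulates a byte offset while looking through SRL-by-constant and EXTRACT_VECTOR_ELT nodes; the arithmetic alone, on made-up values rather than real SDNodes, looks like this:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative only, with invented values. Suppose an element is produced as
  //   (trunc (srl (extract_vector_elt v2i64 %load, 1), 16))
  // The extract of 64-bit lane 1 contributes 1 * (64 / 8) = 8 bytes and the shift
  // right by 16 bits contributes 16 / 8 = 2 bytes, so the element starts 10 bytes
  // into the underlying load.
  int64_t ByteOffset = 0;
  const uint64_t SrcEltBits = 64, ExtractIdx = 1, ShiftAmt = 16;
  ByteOffset += ExtractIdx * (SrcEltBits / 8); // EXTRACT_VECTOR_ELT contribution
  ByteOffset += ShiftAmt / 8;                  // SRL-by-constant contribution
  assert(ByteOffset == 10);
  return 0;
}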
9369
9370/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9371/// elements can be replaced by a single large load which has the same value as
9372/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9373///
9374/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9375static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9376 const SDLoc &DL, SelectionDAG &DAG,
9377 const X86Subtarget &Subtarget,
9378 bool IsAfterLegalize) {
9379 if ((VT.getScalarSizeInBits() % 8) != 0)
9380 return SDValue();
9381
9382 unsigned NumElems = Elts.size();
9383
9384 int LastLoadedElt = -1;
9385 APInt LoadMask = APInt::getZero(NumElems);
9386 APInt ZeroMask = APInt::getZero(NumElems);
9387 APInt UndefMask = APInt::getZero(NumElems);
9388
9389 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9390 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9391
9392 // For each element in the initializer, see if we've found a load, zero or an
9393 // undef.
9394 for (unsigned i = 0; i < NumElems; ++i) {
9395 SDValue Elt = peekThroughBitcasts(Elts[i]);
9396 if (!Elt.getNode())
9397 return SDValue();
9398 if (Elt.isUndef()) {
9399 UndefMask.setBit(i);
9400 continue;
9401 }
9402 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9403 ZeroMask.setBit(i);
9404 continue;
9405 }
9406
9407 // Each loaded element must be the correct fractional portion of the
9408 // requested vector load.
9409 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9410 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9411 return SDValue();
9412
9413 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9414 return SDValue();
9415 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9416 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9417 return SDValue();
9418
9419 LoadMask.setBit(i);
9420 LastLoadedElt = i;
9421 }
9422 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9423 NumElems &&
9424 "Incomplete element masks");
9425
9426 // Handle Special Cases - all undef or undef/zero.
9427 if (UndefMask.popcount() == NumElems)
9428 return DAG.getUNDEF(VT);
9429 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9430 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9431 : DAG.getConstantFP(0.0, DL, VT);
9432
9433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9434 int FirstLoadedElt = LoadMask.countr_zero();
9435 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9436 EVT EltBaseVT = EltBase.getValueType();
9437 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9438 "Register/Memory size mismatch");
9439 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9440 assert(LDBase && "Did not find base load for merging consecutive loads");
9441 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9442 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9443 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9444 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9445 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9446
9447 // TODO: Support offsetting the base load.
9448 if (ByteOffsets[FirstLoadedElt] != 0)
9449 return SDValue();
9450
9451 // Check to see if the element's load is consecutive to the base load
9452 // or offset from a previous (already checked) load.
9453 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9454 LoadSDNode *Ld = Loads[EltIdx];
9455 int64_t ByteOffset = ByteOffsets[EltIdx];
9456 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9457 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9458 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9459 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9460 }
9461 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9462 EltIdx - FirstLoadedElt);
9463 };
9464
9465 // Consecutive loads can contain UNDEFs but not ZERO elements.
9466 // Consecutive loads with UNDEF and ZERO elements require an
9467 // additional shuffle stage to clear the ZERO elements.
9468 bool IsConsecutiveLoad = true;
9469 bool IsConsecutiveLoadWithZeros = true;
9470 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9471 if (LoadMask[i]) {
9472 if (!CheckConsecutiveLoad(LDBase, i)) {
9473 IsConsecutiveLoad = false;
9474 IsConsecutiveLoadWithZeros = false;
9475 break;
9476 }
9477 } else if (ZeroMask[i]) {
9478 IsConsecutiveLoad = false;
9479 }
9480 }
9481
9482 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9483 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9484 assert(LDBase->isSimple() &&
9485 "Cannot merge volatile or atomic loads.");
9486 SDValue NewLd =
9487 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9488 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9489 MMOFlags);
9490 for (auto *LD : Loads)
9491 if (LD)
9492 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9493 return NewLd;
9494 };
9495
9496 // Check if the base load is entirely dereferenceable.
9497 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9498 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9499
9500 // LOAD - all consecutive load/undefs (must start/end with a load or be
9501 // entirely dereferenceable). If we have found an entire vector of loads and
9502 // undefs, then return a large load of the entire vector width starting at the
9503 // base pointer. If the vector contains zeros, then attempt to shuffle those
9504 // elements.
9505 if (FirstLoadedElt == 0 &&
9506 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9507 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9508 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9509 return SDValue();
9510
9511 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9512 // will lower to regular temporal loads and use the cache.
9513 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9514 VT.is256BitVector() && !Subtarget.hasInt256())
9515 return SDValue();
9516
9517 if (NumElems == 1)
9518 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9519
9520 if (!ZeroMask)
9521 return CreateLoad(VT, LDBase);
9522
9523 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9524 // vector and a zero vector to clear out the zero elements.
9525 if (!IsAfterLegalize && VT.isVector()) {
9526 unsigned NumMaskElts = VT.getVectorNumElements();
9527 if ((NumMaskElts % NumElems) == 0) {
9528 unsigned Scale = NumMaskElts / NumElems;
9529 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9530 for (unsigned i = 0; i < NumElems; ++i) {
9531 if (UndefMask[i])
9532 continue;
9533 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9534 for (unsigned j = 0; j != Scale; ++j)
9535 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9536 }
9537 SDValue V = CreateLoad(VT, LDBase);
9538 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9539 : DAG.getConstantFP(0.0, DL, VT);
9540 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9541 }
9542 }
9543 }
9544
9545 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9546 if (VT.is256BitVector() || VT.is512BitVector()) {
9547 unsigned HalfNumElems = NumElems / 2;
9548 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9549 EVT HalfVT =
9550 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9551 SDValue HalfLD =
9552 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9553 DAG, Subtarget, IsAfterLegalize);
9554 if (HalfLD)
9555 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9556 HalfLD, DAG.getIntPtrConstant(0, DL));
9557 }
9558 }
9559
9560 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9561 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9562 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9563 LoadSizeInBits == 64) &&
9564 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9565 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9566 : MVT::getIntegerVT(LoadSizeInBits);
9567 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9568 // Allow v4f32 on SSE1 only targets.
9569 // FIXME: Add more isel patterns so we can just use VT directly.
9570 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9571 VecVT = MVT::v4f32;
9572 if (TLI.isTypeLegal(VecVT)) {
9573 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9574 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9575 SDValue ResNode = DAG.getMemIntrinsicNode(
9576 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9577 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9578 for (auto *LD : Loads)
9579 if (LD)
9580 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9581 return DAG.getBitcast(VT, ResNode);
9582 }
9583 }
9584
9585 // BROADCAST - match the smallest possible repetition pattern, load that
9586 // scalar/subvector element and then broadcast to the entire vector.
9587 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9588 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9589 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9590 unsigned RepeatSize = SubElems * BaseSizeInBits;
9591 unsigned ScalarSize = std::min(RepeatSize, 64u);
9592 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9593 continue;
9594
9595 // Don't attempt a 1:N subvector broadcast - it should be caught by
9596 // combineConcatVectorOps, else will cause infinite loops.
9597 if (RepeatSize > ScalarSize && SubElems == 1)
9598 continue;
9599
9600 bool Match = true;
9601 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9602 for (unsigned i = 0; i != NumElems && Match; ++i) {
9603 if (!LoadMask[i])
9604 continue;
9605 SDValue Elt = peekThroughBitcasts(Elts[i]);
9606 if (RepeatedLoads[i % SubElems].isUndef())
9607 RepeatedLoads[i % SubElems] = Elt;
9608 else
9609 Match &= (RepeatedLoads[i % SubElems] == Elt);
9610 }
9611
9612 // We must have loads at both ends of the repetition.
9613 Match &= !RepeatedLoads.front().isUndef();
9614 Match &= !RepeatedLoads.back().isUndef();
9615 if (!Match)
9616 continue;
9617
9618 EVT RepeatVT =
9619 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9620 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9621 : EVT::getFloatingPointVT(ScalarSize);
9622 if (RepeatSize > ScalarSize)
9623 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9624 RepeatSize / ScalarSize);
9625 EVT BroadcastVT =
9626 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9627 VT.getSizeInBits() / ScalarSize);
9628 if (TLI.isTypeLegal(BroadcastVT)) {
9629 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9630 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9631 SDValue Broadcast = RepeatLoad;
9632 if (RepeatSize > ScalarSize) {
9633 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9634 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9635 } else {
9636 if (!Subtarget.hasAVX2() &&
9637 !X86::mayFoldLoadIntoBroadcastFromMem(
9638 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9639 Subtarget,
9640 /*AssumeSingleUse=*/true))
9641 return SDValue();
9642 Broadcast =
9643 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9644 }
9645 return DAG.getBitcast(VT, Broadcast);
9646 }
9647 }
9648 }
9649 }
9650
9651 return SDValue();
9652}
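Inside the consecutive-load lowering above, zero elements are cleared with a shuffle whose mask scales each source element to Scale mask slots and points zero lanes at a zero vector; a standalone sketch of that mask construction with assumed element counts is:

#include <cassert>
#include <vector>

int main() {
  // Illustrative only, with assumed counts: 4 source elements mapped onto an 8-slot
  // shuffle mask (Scale = 2). Elements 0 and 1 were loaded, element 2 is zero,
  // element 3 is undef; zero lanes are redirected to the second shuffle operand.
  const unsigned NumElems = 4, NumMaskElts = 8, Scale = NumMaskElts / NumElems;
  const bool Zero[NumElems]  = {false, false, true,  false};
  const bool Undef[NumElems] = {false, false, false, true};
  std::vector<int> ClearMask(NumMaskElts, -1);            // -1 == undef mask element
  for (unsigned i = 0; i < NumElems; ++i) {
    if (Undef[i])
      continue;
    const int Offset = Zero[i] ? NumMaskElts : 0;         // second operand holds the zeros
    for (unsigned j = 0; j != Scale; ++j)
      ClearMask[i * Scale + j] = i * Scale + j + Offset;
  }
  const std::vector<int> Expected = {0, 1, 2, 3, 12, 13, -1, -1};
  assert(ClearMask == Expected);
  return 0;
}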
9653
9654 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
9655 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9656 // are consecutive, non-overlapping, and in the right order.
9657static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9658 SelectionDAG &DAG,
9659 const X86Subtarget &Subtarget,
9660 bool IsAfterLegalize) {
9661 SmallVector<SDValue, 64> Elts;
9662 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9663 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9664 Elts.push_back(Elt);
9665 continue;
9666 }
9667 return SDValue();
9668 }
9669 assert(Elts.size() == VT.getVectorNumElements());
9670 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9671 IsAfterLegalize);
9672}
9673
9674static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9675 unsigned SplatBitSize, LLVMContext &C) {
9676 unsigned ScalarSize = VT.getScalarSizeInBits();
9677 unsigned NumElm = SplatBitSize / ScalarSize;
9678
9679 SmallVector<Constant *, 32> ConstantVec;
9680 for (unsigned i = 0; i < NumElm; i++) {
9681 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9682 Constant *Const;
9683 if (VT.isFloatingPoint()) {
9684 if (ScalarSize == 16) {
9685 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9686 } else if (ScalarSize == 32) {
9687 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9688 } else {
9689 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9690 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9691 }
9692 } else
9693 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9694 ConstantVec.push_back(Const);
9695 }
9696 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9697}
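getConstantVector slices the wide splat constant into ScalarSize-bit chunks, lowest bits first; a plain-integer sketch of the same slicing, with made-up sizes and values, is:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Illustrative only: slice a 64-bit repeated constant into two 32-bit scalars,
  // mirroring SplatValue.extractBits(ScalarSize, ScalarSize * i).
  const uint64_t SplatValue = 0x0000000100000002ULL;
  const unsigned ScalarSize = 32, NumElm = 64 / ScalarSize;
  std::vector<uint32_t> Elements;
  for (unsigned i = 0; i < NumElm; ++i)
    Elements.push_back(uint32_t(SplatValue >> (ScalarSize * i))); // lowest bits go to element 0
  assert(Elements[0] == 2 && Elements[1] == 1);
  return 0;
}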
9698
9699static bool isFoldableUseOfShuffle(SDNode *N) {
9700 for (auto *U : N->uses()) {
9701 unsigned Opc = U->getOpcode();
9702 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9703 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9704 return false;
9705 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9706 return false;
9707 if (isTargetShuffle(Opc))
9708 return true;
9709 if (Opc == ISD::BITCAST) // Ignore bitcasts
9710 return isFoldableUseOfShuffle(U);
9711 if (N->hasOneUse()) {
9712 // TODO: There may be some general way to know if an SDNode can
9713 // be folded. We currently only know whether an MI is foldable.
9714 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9715 return false;
9716 return true;
9717 }
9718 }
9719 return false;
9720}
9721
9722/// Attempt to use the vbroadcast instruction to generate a splat value
9723/// from a splat BUILD_VECTOR which uses:
9724/// a. A single scalar load, or a constant.
9725/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9726///
9727/// The VBROADCAST node is returned when a pattern is found,
9728/// or SDValue() otherwise.
9729static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9730 const X86Subtarget &Subtarget,
9731 SelectionDAG &DAG) {
9732 // VBROADCAST requires AVX.
9733 // TODO: Splats could be generated for non-AVX CPUs using SSE
9734 // instructions, but there's less potential gain for only 128-bit vectors.
9735 if (!Subtarget.hasAVX())
9736 return SDValue();
9737
9738 MVT VT = BVOp->getSimpleValueType(0);
9739 unsigned NumElts = VT.getVectorNumElements();
9740 SDLoc dl(BVOp);
9741
9742 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9743 "Unsupported vector type for broadcast.");
9744
9745 // See if the build vector is a repeating sequence of scalars (inc. splat).
9746 SDValue Ld;
9747 BitVector UndefElements;
9748 SmallVector<SDValue, 16> Sequence;
9749 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9750 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9751 if (Sequence.size() == 1)
9752 Ld = Sequence[0];
9753 }
9754
9755 // Attempt to use VBROADCASTM
9756 // From this pattern:
9757 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9758 // b. t1 = (build_vector t0 t0)
9759 //
9760 // Create (VBROADCASTM v2i1 X)
9761 if (!Sequence.empty() && Subtarget.hasCDI()) {
9762 // If not a splat, are the upper sequence values zeroable?
9763 unsigned SeqLen = Sequence.size();
9764 bool UpperZeroOrUndef =
9765 SeqLen == 1 ||
9766 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9767 return !V || V.isUndef() || isNullConstant(V);
9768 });
9769 SDValue Op0 = Sequence[0];
9770 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9771 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9772 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9773 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9774 ? Op0.getOperand(0)
9775 : Op0.getOperand(0).getOperand(0);
9776 MVT MaskVT = BOperand.getSimpleValueType();
9777 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9778 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9779 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9780 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9781 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9782 unsigned Scale = 512 / VT.getSizeInBits();
9783 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9784 }
9785 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9786 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9787 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9788 return DAG.getBitcast(VT, Bcst);
9789 }
9790 }
9791 }
9792
9793 unsigned NumUndefElts = UndefElements.count();
9794 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9795 APInt SplatValue, Undef;
9796 unsigned SplatBitSize;
9797 bool HasUndef;
9798 // Check if this is a repeated constant pattern suitable for broadcasting.
9799 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9800 SplatBitSize > VT.getScalarSizeInBits() &&
9801 SplatBitSize < VT.getSizeInBits()) {
9802 // Avoid replacing with broadcast when it's a use of a shuffle
9803 // instruction to preserve the present custom lowering of shuffles.
9804 if (isFoldableUseOfShuffle(BVOp))
9805 return SDValue();
9806 // replace BUILD_VECTOR with broadcast of the repeated constants.
9807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9808 LLVMContext *Ctx = DAG.getContext();
9809 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9810 if (Subtarget.hasAVX()) {
9811 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9812 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9813 // Splatted value can fit in one INTEGER constant in constant pool.
9814 // Load the constant and broadcast it.
9815 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9816 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9817 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9818 SDValue CP = DAG.getConstantPool(C, PVT);
9819 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9820
9821 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9822 SDVTList Tys =
9823 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9824 SDValue Ops[] = {DAG.getEntryNode(), CP};
9825 MachinePointerInfo MPI =
9826 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9827 SDValue Brdcst = DAG.getMemIntrinsicNode(
9828 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9829 MachineMemOperand::MOLoad);
9830 return DAG.getBitcast(VT, Brdcst);
9831 }
9832 if (SplatBitSize > 64) {
9833 // Load the vector of constants and broadcast it.
9834 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9835 *Ctx);
9836 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9837 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9838 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9839 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9841 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9842 MachinePointerInfo MPI =
9843 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9844 return DAG.getMemIntrinsicNode(
9845 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9846 MachineMemOperand::MOLoad);
9847 }
9848 }
9849 }
9850
9851 // If we are moving a scalar into a vector (Ld must be set and all elements
9852 // but 1 are undef) and that operation is not obviously supported by
9853 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9854 // That's better than general shuffling and may eliminate a load to GPR and
9855 // move from scalar to vector register.
9856 if (!Ld || NumElts - NumUndefElts != 1)
9857 return SDValue();
9858 unsigned ScalarSize = Ld.getValueSizeInBits();
9859 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9860 return SDValue();
9861 }
9862
9863 bool ConstSplatVal =
9864 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9865 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9866
9867 // TODO: Handle broadcasts of non-constant sequences.
9868
9869 // Make sure that all of the users of a non-constant load are from the
9870 // BUILD_VECTOR node.
9871 // FIXME: Is the use count needed for non-constant, non-load case?
9872 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9873 return SDValue();
9874
9875 unsigned ScalarSize = Ld.getValueSizeInBits();
9876 bool IsGE256 = (VT.getSizeInBits() >= 256);
9877
9878 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9879 // instruction to save 8 or more bytes of constant pool data.
9880 // TODO: If multiple splats are generated to load the same constant,
9881 // it may be detrimental to overall size. There needs to be a way to detect
9882 // that condition to know if this is truly a size win.
9883 bool OptForSize = DAG.shouldOptForSize();
9884
9885 // Handle broadcasting a single constant scalar from the constant pool
9886 // into a vector.
9887 // On Sandybridge (no AVX2), it is still better to load a constant vector
9888 // from the constant pool and not to broadcast it from a scalar.
9889 // But override that restriction when optimizing for size.
9890 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9891 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9892 EVT CVT = Ld.getValueType();
9893 assert(!CVT.isVector() && "Must not broadcast a vector type");
9894
9895 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9896 // For size optimization, also splat v2f64 and v2i64, and for size opt
9897 // with AVX2, also splat i8 and i16.
9898 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9899 if (ScalarSize == 32 ||
9900 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9901 CVT == MVT::f16 ||
9902 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9903 const Constant *C = nullptr;
9904 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9905 C = CI->getConstantIntValue();
9906 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9907 C = CF->getConstantFPValue();
9908
9909 assert(C && "Invalid constant type");
9910
9911 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9912 SDValue CP =
9913 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9914 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9915
9916 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9917 SDValue Ops[] = {DAG.getEntryNode(), CP};
9918 MachinePointerInfo MPI =
9919 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9920 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9921 MPI, Alignment, MachineMemOperand::MOLoad);
9922 }
9923 }
9924
9925 // Handle AVX2 in-register broadcasts.
9926 if (!IsLoad && Subtarget.hasInt256() &&
9927 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9928 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9929
9930 // The scalar source must be a normal load.
9931 if (!IsLoad)
9932 return SDValue();
9933
9934 // Make sure the non-chain result is only used by this build vector.
9935 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9936 return SDValue();
9937
9938 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9939 (Subtarget.hasVLX() && ScalarSize == 64)) {
9940 auto *LN = cast<LoadSDNode>(Ld);
9941 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9942 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9943 SDValue BCast =
9944 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9945 LN->getMemoryVT(), LN->getMemOperand());
9946 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9947 return BCast;
9948 }
9949
9950 // The integer check is needed for the 64-bit-into-128-bit case so it doesn't
9951 // match double, since there is no vbroadcastsd xmm.
9952 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9953 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9954 auto *LN = cast<LoadSDNode>(Ld);
9955 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9956 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9957 SDValue BCast =
9958 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9959 LN->getMemoryVT(), LN->getMemOperand());
9960 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9961 return BCast;
9962 }
9963
9964 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9965 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9966
9967 // Unsupported broadcast.
9968 return SDValue();
9969}
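When the repeated-constant path above broadcasts from the constant pool, the destination vector is treated as Repeat copies of the SplatBitSize-wide constant; a tiny sketch of that size arithmetic, with assumed sizes, is:

#include <cassert>

int main() {
  // Illustrative only, with assumed sizes: a 256-bit build_vector whose bits repeat
  // every 64 bits is loaded as one i64 constant and broadcast
  // VT.getSizeInBits() / SplatBitSize = 4 times (i.e. as a v4i64), then bitcast back.
  const unsigned VTSizeInBits = 256, SplatBitSize = 64;
  const unsigned Repeat = VTSizeInBits / SplatBitSize;
  assert(Repeat == 4);
  return 0;
}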
9970
9971/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9972/// underlying vector and index.
9973///
9974/// Modifies \p ExtractedFromVec to the real vector and returns the real
9975/// index.
9976static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9977 SDValue ExtIdx) {
9978 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9979 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9980 return Idx;
9981
9982 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9983 // lowered this:
9984 // (extract_vector_elt (v8f32 %1), Constant<6>)
9985 // to:
9986 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9987 // (extract_subvector (v8f32 %0), Constant<4>),
9988 // undef)
9989 // Constant<0>)
9990 // In this case the vector is the extract_subvector expression and the index
9991 // is 2, as specified by the shuffle.
9992 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9993 SDValue ShuffleVec = SVOp->getOperand(0);
9994 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9995 assert(ShuffleVecVT.getVectorElementType() ==
9996 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9997
9998 int ShuffleIdx = SVOp->getMaskElt(Idx);
9999 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
10000 ExtractedFromVec = ShuffleVec;
10001 return ShuffleIdx;
10002 }
10003 return Idx;
10004}
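The helper above looks through one vector_shuffle level to find the real source lane; a standalone sketch of the index rewrite on a made-up mask is:

#include <cassert>
#include <vector>

int main() {
  // Illustrative only, with an invented mask: extracting lane 0 of
  //   vector_shuffle<2,u,u,u>(Sub, undef)
  // really reads lane 2 of Sub, so the index is rewritten from 0 to 2.
  const std::vector<int> ShuffleMask = {2, -1, -1, -1};
  const int NumSrcElts = 4;
  int Idx = 0;                                    // constant index used by the extract
  const int ShuffleIdx = ShuffleMask[Idx];
  if (ShuffleIdx >= 0 && ShuffleIdx < NumSrcElts) // in-range: look through the shuffle
    Idx = ShuffleIdx;
  assert(Idx == 2);
  return 0;
}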
10005
10006static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
10007 MVT VT = Op.getSimpleValueType();
10008
10009 // Skip if insert_vec_elt is not supported.
10010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10011 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
10012 return SDValue();
10013
10014 SDLoc DL(Op);
10015 unsigned NumElems = Op.getNumOperands();
10016
10017 SDValue VecIn1;
10018 SDValue VecIn2;
10019 SmallVector<unsigned, 4> InsertIndices;
10020 SmallVector<int, 8> Mask(NumElems, -1);
10021
10022 for (unsigned i = 0; i != NumElems; ++i) {
10023 unsigned Opc = Op.getOperand(i).getOpcode();
10024
10025 if (Opc == ISD::UNDEF)
10026 continue;
10027
10028 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10029 // Quit if more than 1 elements need inserting.
10030 if (InsertIndices.size() > 1)
10031 return SDValue();
10032
10033 InsertIndices.push_back(i);
10034 continue;
10035 }
10036
10037 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10038 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10039
10040 // Quit if non-constant index.
10041 if (!isa<ConstantSDNode>(ExtIdx))
10042 return SDValue();
10043 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10044
10045 // Quit if extracted from vector of different type.
10046 if (ExtractedFromVec.getValueType() != VT)
10047 return SDValue();
10048
10049 if (!VecIn1.getNode())
10050 VecIn1 = ExtractedFromVec;
10051 else if (VecIn1 != ExtractedFromVec) {
10052 if (!VecIn2.getNode())
10053 VecIn2 = ExtractedFromVec;
10054 else if (VecIn2 != ExtractedFromVec)
10055 // Quit if more than 2 vectors to shuffle
10056 return SDValue();
10057 }
10058
10059 if (ExtractedFromVec == VecIn1)
10060 Mask[i] = Idx;
10061 else if (ExtractedFromVec == VecIn2)
10062 Mask[i] = Idx + NumElems;
10063 }
10064
10065 if (!VecIn1.getNode())
10066 return SDValue();
10067
10068 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10069 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10070
10071 for (unsigned Idx : InsertIndices)
10072 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10073 DAG.getIntPtrConstant(Idx, DL));
10074
10075 return NV;
10076}
10077
10078// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10079static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10080 const X86Subtarget &Subtarget) {
10081 MVT VT = Op.getSimpleValueType();
10082 MVT IVT = VT.changeVectorElementTypeToInteger();
10083 SmallVector<SDValue, 16> NewOps;
10084 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10085 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10086 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10087 return DAG.getBitcast(VT, Res);
10088}
10089
10090// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10091static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10092 const X86Subtarget &Subtarget) {
10093
10094 MVT VT = Op.getSimpleValueType();
10095 assert((VT.getVectorElementType() == MVT::i1) &&
10096 "Unexpected type in LowerBUILD_VECTORvXi1!");
10097
10098 SDLoc dl(Op);
10099 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10100 ISD::isBuildVectorAllOnes(Op.getNode()))
10101 return Op;
10102
10103 uint64_t Immediate = 0;
10104 SmallVector<unsigned, 16> NonConstIdx;
10105 bool IsSplat = true;
10106 bool HasConstElts = false;
10107 int SplatIdx = -1;
10108 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10109 SDValue In = Op.getOperand(idx);
10110 if (In.isUndef())
10111 continue;
10112 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10113 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10114 HasConstElts = true;
10115 } else {
10116 NonConstIdx.push_back(idx);
10117 }
10118 if (SplatIdx < 0)
10119 SplatIdx = idx;
10120 else if (In != Op.getOperand(SplatIdx))
10121 IsSplat = false;
10122 }
10123
10124 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10125 if (IsSplat) {
10126 // The build_vector allows the scalar element to be larger than the vector
10127 // element type. We need to mask it to use as a condition unless we know
10128 // the upper bits are zero.
10129 // FIXME: Use computeKnownBits instead of checking specific opcode?
10130 SDValue Cond = Op.getOperand(SplatIdx);
10131 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10132 if (Cond.getOpcode() != ISD::SETCC)
10133 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10134 DAG.getConstant(1, dl, MVT::i8));
10135
10136 // Perform the select in the scalar domain so we can use cmov.
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10139 DAG.getAllOnesConstant(dl, MVT::i32),
10140 DAG.getConstant(0, dl, MVT::i32));
10141 Select = DAG.getBitcast(MVT::v32i1, Select);
10142 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10146 DAG.getAllOnesConstant(dl, ImmVT),
10147 DAG.getConstant(0, dl, ImmVT));
10148 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10149 Select = DAG.getBitcast(VecVT, Select);
10150 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10151 DAG.getIntPtrConstant(0, dl));
10152 }
10153 }
10154
10155 // insert elements one by one
10156 SDValue DstVec;
10157 if (HasConstElts) {
10158 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10159 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10160 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10161 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10162 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10163 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10164 } else {
10165 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10166 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10167 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10168 DstVec = DAG.getBitcast(VecVT, Imm);
10169 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10170 DAG.getIntPtrConstant(0, dl));
10171 }
10172 } else
10173 DstVec = DAG.getUNDEF(VT);
10174
10175 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10176 unsigned InsertIdx = NonConstIdx[i];
10177 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10178 Op.getOperand(InsertIdx),
10179 DAG.getIntPtrConstant(InsertIdx, dl));
10180 }
10181 return DstVec;
10182}
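In the constant path above, each constant i1 element contributes one bit of Immediate at its element index; a self-contained sketch of that packing for an assumed 8-element input is:

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative only: the constant bits of build_vector <1,0,1,1,0,0,0,1> packed
  // into an immediate, one bit per element index (values are made up).
  const uint64_t Elts[8] = {1, 0, 1, 1, 0, 0, 0, 1};
  uint64_t Immediate = 0;
  for (uint64_t idx = 0; idx < 8; ++idx)
    Immediate |= (Elts[idx] & 0x1) << idx;   // mirrors (InC->getZExtValue() & 0x1) << idx
  assert(Immediate == 0x8D);                 // 0b10001101
  return 0;
}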
10183
10184 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10185 switch (Opcode) {
10186 case X86ISD::PACKSS:
10187 case X86ISD::PACKUS:
10188 case X86ISD::FHADD:
10189 case X86ISD::FHSUB:
10190 case X86ISD::HADD:
10191 case X86ISD::HSUB:
10192 return true;
10193 }
10194 return false;
10195}
10196
10197/// This is a helper function of LowerToHorizontalOp().
10198 /// This function checks that the input build_vector \p N implements a
10199/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10200/// may not match the layout of an x86 256-bit horizontal instruction.
10201/// In other words, if this returns true, then some extraction/insertion will
10202/// be required to produce a valid horizontal instruction.
10203///
10204/// Parameter \p Opcode defines the kind of horizontal operation to match.
10205/// For example, if \p Opcode is equal to ISD::ADD, then this function
10206/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10207/// is equal to ISD::SUB, then this function checks if this is a horizontal
10208/// arithmetic sub.
10209///
10210/// This function only analyzes elements of \p N whose indices are
10211/// in range [BaseIdx, LastIdx).
10212///
10213/// TODO: This function was originally used to match both real and fake partial
10214/// horizontal operations, but the index-matching logic is incorrect for that.
10215/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10216/// code because it is only used for partial h-op matching now?
10217static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10218 SelectionDAG &DAG,
10219 unsigned BaseIdx, unsigned LastIdx,
10220 SDValue &V0, SDValue &V1) {
10221 EVT VT = N->getValueType(0);
10222 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10223 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10224 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10225 "Invalid Vector in input!");
10226
10227 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10228 bool CanFold = true;
10229 unsigned ExpectedVExtractIdx = BaseIdx;
10230 unsigned NumElts = LastIdx - BaseIdx;
10231 V0 = DAG.getUNDEF(VT);
10232 V1 = DAG.getUNDEF(VT);
10233
10234 // Check if N implements a horizontal binop.
10235 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10236 SDValue Op = N->getOperand(i + BaseIdx);
10237
10238 // Skip UNDEFs.
10239 if (Op->isUndef()) {
10240 // Update the expected vector extract index.
10241 if (i * 2 == NumElts)
10242 ExpectedVExtractIdx = BaseIdx;
10243 ExpectedVExtractIdx += 2;
10244 continue;
10245 }
10246
10247 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10248
10249 if (!CanFold)
10250 break;
10251
10252 SDValue Op0 = Op.getOperand(0);
10253 SDValue Op1 = Op.getOperand(1);
10254
10255 // Try to match the following pattern:
10256 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10257 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10258 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10259 Op0.getOperand(0) == Op1.getOperand(0) &&
10260 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10261 isa<ConstantSDNode>(Op1.getOperand(1)));
10262 if (!CanFold)
10263 break;
10264
10265 unsigned I0 = Op0.getConstantOperandVal(1);
10266 unsigned I1 = Op1.getConstantOperandVal(1);
10267
10268 if (i * 2 < NumElts) {
10269 if (V0.isUndef()) {
10270 V0 = Op0.getOperand(0);
10271 if (V0.getValueType() != VT)
10272 return false;
10273 }
10274 } else {
10275 if (V1.isUndef()) {
10276 V1 = Op0.getOperand(0);
10277 if (V1.getValueType() != VT)
10278 return false;
10279 }
10280 if (i * 2 == NumElts)
10281 ExpectedVExtractIdx = BaseIdx;
10282 }
10283
10284 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10285 if (I0 == ExpectedVExtractIdx)
10286 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10287 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10288 // Try to match the following dag sequence:
10289 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10290 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10291 } else
10292 CanFold = false;
10293
10294 ExpectedVExtractIdx += 2;
10295 }
10296
10297 return CanFold;
10298}
10299
10300/// Emit a sequence of two 128-bit horizontal add/sub followed by
10301/// a concat_vector.
10302///
10303/// This is a helper function of LowerToHorizontalOp().
10304/// This function expects two 256-bit vectors called V0 and V1.
10305/// At first, each vector is split into two separate 128-bit vectors.
10306/// Then, the resulting 128-bit vectors are used to implement two
10307/// horizontal binary operations.
10308///
10309/// The kind of horizontal binary operation is defined by \p X86Opcode.
10310///
10311 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
10312 /// the two new horizontal binops.
10313/// When Mode is set, the first horizontal binop dag node would take as input
10314/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10315/// horizontal binop dag node would take as input the lower 128-bit of V1
10316/// and the upper 128-bit of V1.
10317/// Example:
10318/// HADD V0_LO, V0_HI
10319/// HADD V1_LO, V1_HI
10320///
10321/// Otherwise, the first horizontal binop dag node takes as input the lower
10322/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10323/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10324/// Example:
10325/// HADD V0_LO, V1_LO
10326/// HADD V0_HI, V1_HI
10327///
10328/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10329/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10330/// the upper 128-bits of the result.
10331static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10332 const SDLoc &DL, SelectionDAG &DAG,
10333 unsigned X86Opcode, bool Mode,
10334 bool isUndefLO, bool isUndefHI) {
10335 MVT VT = V0.getSimpleValueType();
10336 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10337 "Invalid nodes in input!");
10338
10339 unsigned NumElts = VT.getVectorNumElements();
10340 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10341 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10342 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10343 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10344 MVT NewVT = V0_LO.getSimpleValueType();
10345
10346 SDValue LO = DAG.getUNDEF(NewVT);
10347 SDValue HI = DAG.getUNDEF(NewVT);
10348
10349 if (Mode) {
10350 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10351 if (!isUndefLO && !V0->isUndef())
10352 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10353 if (!isUndefHI && !V1->isUndef())
10354 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10355 } else {
10356 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10357 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10358 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10359
10360 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10361 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10362 }
10363
10364 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10365}
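For reference while reading the splitting logic above, a 128-bit integer horizontal add pairs adjacent lanes of each input; a short standalone sketch of that lane rule (plain ints, illustrative values only) is:

#include <cassert>

int main() {
  // Illustrative only: HADD(A, B) for 4 x i32 yields
  //   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }.
  const int A[4] = {1, 2, 3, 4};
  const int B[4] = {10, 20, 30, 40};
  const int R[4] = {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
  assert(R[0] == 3 && R[1] == 7 && R[2] == 30 && R[3] == 70);
  return 0;
}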
10366
10367 /// Returns true iff \p BV builds a vector whose result is equivalent to
10368 /// the result of an ADDSUB/SUBADD operation.
10369/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10370/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10371/// \p Opnd0 and \p Opnd1.
10372static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10373 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10374 SDValue &Opnd0, SDValue &Opnd1,
10375 unsigned &NumExtracts,
10376 bool &IsSubAdd) {
10377
10378 MVT VT = BV->getSimpleValueType(0);
10379 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10380 return false;
10381
10382 unsigned NumElts = VT.getVectorNumElements();
10383 SDValue InVec0 = DAG.getUNDEF(VT);
10384 SDValue InVec1 = DAG.getUNDEF(VT);
10385
10386 NumExtracts = 0;
10387
10388 // Odd-numbered elements in the input build vector are obtained from
10389 // adding/subtracting two integer/float elements.
10390 // Even-numbered elements in the input build vector are obtained from
10391 // subtracting/adding two integer/float elements.
10392 unsigned Opc[2] = {0, 0};
10393 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10394 SDValue Op = BV->getOperand(i);
10395
10396 // Skip 'undef' values.
10397 unsigned Opcode = Op.getOpcode();
10398 if (Opcode == ISD::UNDEF)
10399 continue;
10400
10401 // Early exit if we found an unexpected opcode.
10402 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10403 return false;
10404
10405 SDValue Op0 = Op.getOperand(0);
10406 SDValue Op1 = Op.getOperand(1);
10407
10408 // Try to match the following pattern:
10409 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10410 // Early exit if we cannot match that sequence.
10411 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10412 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10413 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10414 Op0.getOperand(1) != Op1.getOperand(1))
10415 return false;
10416
10417 unsigned I0 = Op0.getConstantOperandVal(1);
10418 if (I0 != i)
10419 return false;
10420
10421 // We found a valid add/sub node; make sure it's the same opcode as previous
10422 // elements for this parity.
10423 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10424 return false;
10425 Opc[i % 2] = Opcode;
10426
10427 // Update InVec0 and InVec1.
10428 if (InVec0.isUndef()) {
10429 InVec0 = Op0.getOperand(0);
10430 if (InVec0.getSimpleValueType() != VT)
10431 return false;
10432 }
10433 if (InVec1.isUndef()) {
10434 InVec1 = Op1.getOperand(0);
10435 if (InVec1.getSimpleValueType() != VT)
10436 return false;
10437 }
10438
10439 // Make sure that the operands of each add/sub node always
10440 // come from the same pair of vectors.
10441 if (InVec0 != Op0.getOperand(0)) {
10442 if (Opcode == ISD::FSUB)
10443 return false;
10444
10445 // FADD is commutable. Try to commute the operands
10446 // and then test again.
10447 std::swap(Op0, Op1);
10448 if (InVec0 != Op0.getOperand(0))
10449 return false;
10450 }
10451
10452 if (InVec1 != Op1.getOperand(0))
10453 return false;
10454
10455 // Increment the number of extractions done.
10456 ++NumExtracts;
10457 }
10458
10459 // Ensure we have found an opcode for both parities and that they are
10460 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10461 // inputs are undef.
10462 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10463 InVec0.isUndef() || InVec1.isUndef())
10464 return false;
10465
10466 IsSubAdd = Opc[0] == ISD::FADD;
10467
10468 Opnd0 = InVec0;
10469 Opnd1 = InVec1;
10470 return true;
10471}
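// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: a minimal model of the parity rule enforced above. Element i of the
// build_vector must use Opc[i % 2], both parities must be seen, the two
// opcodes must differ, and Opc[0] == FADD selects the SUBADD form (mirroring
// 'IsSubAdd = Opc[0] == ISD::FADD'). The enum and helper names are invented
// for the sketch.
enum SketchOpc { SK_FADD, SK_FSUB, SK_NONE };
// Returns 0 = no match, 1 = ADDSUB, 2 = SUBADD.
constexpr int classifyAddSubSketch(const SketchOpc *Ops, int NumElts) {
  SketchOpc Opc[2] = {SK_NONE, SK_NONE};
  for (int i = 0; i != NumElts; ++i) {
    if (Ops[i] != SK_FADD && Ops[i] != SK_FSUB)
      return 0;                                   // unexpected opcode
    if (Opc[i % 2] != SK_NONE && Opc[i % 2] != Ops[i])
      return 0;                                   // parity opcode mismatch
    Opc[i % 2] = Ops[i];
  }
  if (Opc[0] == SK_NONE || Opc[1] == SK_NONE || Opc[0] == Opc[1])
    return 0;
  return Opc[0] == SK_FADD ? 2 : 1;
}
constexpr SketchOpc AddSubPattern[4] = {SK_FSUB, SK_FADD, SK_FSUB, SK_FADD};
constexpr SketchOpc SubAddPattern[4] = {SK_FADD, SK_FSUB, SK_FADD, SK_FSUB};
static_assert(classifyAddSubSketch(AddSubPattern, 4) == 1, "even lanes FSUB -> ADDSUB");
static_assert(classifyAddSubSketch(SubAddPattern, 4) == 2, "even lanes FADD -> SUBADD");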
10472
10473/// Returns true if it is possible to fold a MUL and an idiom that has already
10474/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10475/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10476/// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0, \p Opnd1 and \p Opnd2.
10477///
10478/// Prior to calling this function it should be known that there is some
10479/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10480/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10481/// before replacement of such SDNode with ADDSUB operation. Thus the number
10482/// of \p Opnd0 uses is expected to be equal to 2.
10483/// For example, this function may be called for the following IR:
10484/// %AB = fmul fast <2 x double> %A, %B
10485/// %Sub = fsub fast <2 x double> %AB, %C
10486/// %Add = fadd fast <2 x double> %AB, %C
10487/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10488/// <2 x i32> <i32 0, i32 3>
10489/// There is a def for %Addsub here, which potentially can be replaced by
10490/// X86ISD::ADDSUB operation:
10491/// %Addsub = X86ISD::ADDSUB %AB, %C
10492/// and such ADDSUB can further be replaced with FMADDSUB:
10493/// %Addsub = FMADDSUB %A, %B, %C.
10494///
10495/// The main reason why this method is called before the replacement of the
10496/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10497/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10498/// FMADDSUB is.
10499static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10500 SelectionDAG &DAG,
10501 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10502 unsigned ExpectedUses) {
10503 if (Opnd0.getOpcode() != ISD::FMUL ||
10504 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10505 return false;
10506
10507 // FIXME: These checks must match the similar ones in
10508 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10509 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10510 // or MUL + ADDSUB to FMADDSUB.
10511 const TargetOptions &Options = DAG.getTarget().Options;
10512 bool AllowFusion =
10513 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10514 if (!AllowFusion)
10515 return false;
10516
10517 Opnd2 = Opnd1;
10518 Opnd1 = Opnd0.getOperand(1);
10519 Opnd0 = Opnd0.getOperand(0);
10520
10521 return true;
10522}
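// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scalar semantics of the operand rewiring above. After ADDSUB(A*B, C)
// is re-expressed as FMADDSUB(A, B, C), even lanes compute A*B - C and odd
// lanes compute A*B + C, the same lane parity used for X86ISD::ADDSUB by the
// blend mask built in lowerToAddSubOrFMAddSub below. The helper name is
// invented for the sketch.
static void fmaddsubReferenceSketch(const double *A, const double *B,
                                    const double *C, double *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = (i % 2 == 0) ? A[i] * B[i] - C[i]   // even lane: multiply-subtract
                          : A[i] * B[i] + C[i];  // odd lane: multiply-add
}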
10523
10524/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
10525/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
10526/// X86ISD::FMSUBADD node accordingly.
10527static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10528 const X86Subtarget &Subtarget,
10529 SelectionDAG &DAG) {
10530 SDValue Opnd0, Opnd1;
10531 unsigned NumExtracts;
10532 bool IsSubAdd;
10533 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10534 IsSubAdd))
10535 return SDValue();
10536
10537 MVT VT = BV->getSimpleValueType(0);
10538 SDLoc DL(BV);
10539
10540 // Try to generate X86ISD::FMADDSUB node here.
10541 SDValue Opnd2;
10542 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10543 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10544 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10545 }
10546
10547 // We only support ADDSUB.
10548 if (IsSubAdd)
10549 return SDValue();
10550
10551 // There are no known X86 targets with 512-bit ADDSUB instructions!
10552 // Convert to blend(fsub,fadd).
10553 if (VT.is512BitVector()) {
10554 SmallVector<int> Mask;
10555 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10556 Mask.push_back(I);
10557 Mask.push_back(I + E + 1);
10558 }
10559 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10560 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10561 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10562 }
10563
10564 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10565}
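// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the shuffle mask built above for the 512-bit blend(fsub,fadd) fallback.
// Even result lanes take lane I of the FSUB value and odd result lanes take
// lane I+1 of the FADD value (shuffle operand 1, whose lanes are numbered
// NumElts..2*NumElts-1). The helper name is invented for the sketch; Mask must
// hold NumElts entries.
static void addsubBlendMaskSketch(int NumElts, int *Mask) {
  for (int I = 0; I != NumElts; I += 2) {
    Mask[I] = I;                   // even lane <- Sub
    Mask[I + 1] = I + NumElts + 1; // odd lane  <- Add
  }
  // e.g. v16f32: {0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31}
}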
10566
10567static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10568 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10569 // Initialize outputs to known values.
10570 MVT VT = BV->getSimpleValueType(0);
10571 HOpcode = ISD::DELETED_NODE;
10572 V0 = DAG.getUNDEF(VT);
10573 V1 = DAG.getUNDEF(VT);
10574
10575 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10576 // half of the result is calculated independently from the 128-bit halves of
10577 // the inputs, so that makes the index-checking logic below more complicated.
10578 unsigned NumElts = VT.getVectorNumElements();
10579 unsigned GenericOpcode = ISD::DELETED_NODE;
10580 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10581 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10582 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10583 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10584 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10585 // Ignore undef elements.
10586 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10587 if (Op.isUndef())
10588 continue;
10589
10590 // If there's an opcode mismatch, we're done.
10591 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10592 return false;
10593
10594 // Initialize horizontal opcode.
10595 if (HOpcode == ISD::DELETED_NODE) {
10596 GenericOpcode = Op.getOpcode();
10597 switch (GenericOpcode) {
10598 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10599 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10600 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10601 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10602 default: return false;
10603 }
10604 }
10605
10606 SDValue Op0 = Op.getOperand(0);
10607 SDValue Op1 = Op.getOperand(1);
10608 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10609 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10610 Op0.getOperand(0) != Op1.getOperand(0) ||
10611 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10612 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10613 return false;
10614
10615 // The source vector is chosen based on which 64-bit half of the
10616 // destination vector is being calculated.
10617 if (j < NumEltsIn64Bits) {
10618 if (V0.isUndef())
10619 V0 = Op0.getOperand(0);
10620 } else {
10621 if (V1.isUndef())
10622 V1 = Op0.getOperand(0);
10623 }
10624
10625 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10626 if (SourceVec != Op0.getOperand(0))
10627 return false;
10628
10629 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10630 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10631 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10632 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10633 (j % NumEltsIn64Bits) * 2;
10634 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10635 continue;
10636
10637 // If this is not a commutative op, this does not match.
10638 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10639 return false;
10640
10641 // Addition is commutative, so try swapping the extract indexes.
10642 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10643 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10644 continue;
10645
10646 // Extract indexes do not match horizontal requirement.
10647 return false;
10648 }
10649 }
10650 // We matched. Opcode and operands are returned by reference as arguments.
10651 return true;
10652}
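// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: the index arithmetic checked above. For output element (i, j), i.e.
// 128-bit chunk i and element j within that chunk, the two extracts must read
// adjacent source elements starting at
// i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2. The helper name is
// invented for the sketch.
constexpr unsigned hopExpectedIndexSketch(unsigned i, unsigned j,
                                          unsigned NumEltsIn128Bits) {
  return i * NumEltsIn128Bits + (j % (NumEltsIn128Bits / 2)) * 2;
}
// v8f32 HADD produces {A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7}:
static_assert(hopExpectedIndexSketch(0, 0, 4) == 0, "lane 0 reads A0/A1");
static_assert(hopExpectedIndexSketch(0, 2, 4) == 0, "lane 2 reads B0/B1");
static_assert(hopExpectedIndexSketch(1, 1, 4) == 6, "lane 5 reads A6/A7");
static_assert(hopExpectedIndexSketch(1, 3, 4) == 6, "lane 7 reads B6/B7");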
10653
10654static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10655 SelectionDAG &DAG, unsigned HOpcode,
10656 SDValue V0, SDValue V1) {
10657 // If either input vector is not the same size as the build vector,
10658 // extract/insert the low bits to the correct size.
10659 // This is free (examples: zmm --> xmm, xmm --> ymm).
10660 MVT VT = BV->getSimpleValueType(0);
10661 unsigned Width = VT.getSizeInBits();
10662 if (V0.getValueSizeInBits() > Width)
10663 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10664 else if (V0.getValueSizeInBits() < Width)
10665 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10666
10667 if (V1.getValueSizeInBits() > Width)
10668 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10669 else if (V1.getValueSizeInBits() < Width)
10670 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10671
10672 unsigned NumElts = VT.getVectorNumElements();
10673 APInt DemandedElts = APInt::getAllOnes(NumElts);
10674 for (unsigned i = 0; i != NumElts; ++i)
10675 if (BV->getOperand(i).isUndef())
10676 DemandedElts.clearBit(i);
10677
10678 // If we don't need the upper xmm, then perform as a xmm hop.
10679 unsigned HalfNumElts = NumElts / 2;
10680 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10681 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10682 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10683 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10684 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10685 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10686 }
10687
10688 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10689}
10690
10691/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10692static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10693 const X86Subtarget &Subtarget,
10694 SelectionDAG &DAG) {
10695 // We need at least 2 non-undef elements to make this worthwhile by default.
10696 unsigned NumNonUndefs =
10697 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10698 if (NumNonUndefs < 2)
10699 return SDValue();
10700
10701 // There are 4 sets of horizontal math operations distinguished by type:
10702 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10703 // subtarget feature. Try to match those "native" patterns first.
10704 MVT VT = BV->getSimpleValueType(0);
10705 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10706 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10707 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10708 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10709 unsigned HOpcode;
10710 SDValue V0, V1;
10711 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10712 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10713 }
10714
10715 // Try harder to match 256-bit ops by using extract/concat.
10716 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10717 return SDValue();
10718
10719 // Count the number of UNDEF operands in the input build_vector.
10720 unsigned NumElts = VT.getVectorNumElements();
10721 unsigned Half = NumElts / 2;
10722 unsigned NumUndefsLO = 0;
10723 unsigned NumUndefsHI = 0;
10724 for (unsigned i = 0, e = Half; i != e; ++i)
10725 if (BV->getOperand(i)->isUndef())
10726 NumUndefsLO++;
10727
10728 for (unsigned i = Half, e = NumElts; i != e; ++i)
10729 if (BV->getOperand(i)->isUndef())
10730 NumUndefsHI++;
10731
10732 SDLoc DL(BV);
10733 SDValue InVec0, InVec1;
10734 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10735 SDValue InVec2, InVec3;
10736 unsigned X86Opcode;
10737 bool CanFold = true;
10738
10739 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10740 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10741 InVec3) &&
10742 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10743 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10744 X86Opcode = X86ISD::HADD;
10745 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10746 InVec1) &&
10747 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10748 InVec3) &&
10749 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10750 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10751 X86Opcode = X86ISD::HSUB;
10752 else
10753 CanFold = false;
10754
10755 if (CanFold) {
10756 // Do not try to expand this build_vector into a pair of horizontal
10757 // add/sub if we can emit a pair of scalar add/sub.
10758 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10759 return SDValue();
10760
10761 // Convert this build_vector into a pair of horizontal binops followed by
10762 // a concat vector. We must adjust the outputs from the partial horizontal
10763 // matching calls above to account for undefined vector halves.
10764 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10765 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10766 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10767 bool isUndefLO = NumUndefsLO == Half;
10768 bool isUndefHI = NumUndefsHI == Half;
10769 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10770 isUndefHI);
10771 }
10772 }
10773
10774 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10775 VT == MVT::v16i16) {
10776 unsigned X86Opcode;
10777 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10778 X86Opcode = X86ISD::HADD;
10779 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10780 InVec1))
10781 X86Opcode = X86ISD::HSUB;
10782 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10783 InVec1))
10784 X86Opcode = X86ISD::FHADD;
10785 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10786 InVec1))
10787 X86Opcode = X86ISD::FHSUB;
10788 else
10789 return SDValue();
10790
10791 // Don't try to expand this build_vector into a pair of horizontal add/sub
10792 // if we can simply emit a pair of scalar add/sub.
10793 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10794 return SDValue();
10795
10796 // Convert this build_vector into two horizontal add/sub followed by
10797 // a concat vector.
10798 bool isUndefLO = NumUndefsLO == Half;
10799 bool isUndefHI = NumUndefsHI == Half;
10800 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10801 isUndefLO, isUndefHI);
10802 }
10803
10804 return SDValue();
10805}
10806
10807static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10808 SelectionDAG &DAG);
10809
10810/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10811/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
10812/// just apply the bit operation to the vectors.
10813/// NOTE: It's not in our interest to start making a general purpose vectorizer
10814/// from this, but enough scalar bit operations are created by the later
10815/// legalization + scalarization stages to need basic support.
10816static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10817 const X86Subtarget &Subtarget,
10818 SelectionDAG &DAG) {
10819 SDLoc DL(Op);
10820 MVT VT = Op->getSimpleValueType(0);
10821 unsigned NumElems = VT.getVectorNumElements();
10822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10823
10824 // Check that all elements have the same opcode.
10825 // TODO: Should we allow UNDEFS and if so how many?
10826 unsigned Opcode = Op->getOperand(0).getOpcode();
10827 for (unsigned i = 1; i < NumElems; ++i)
10828 if (Opcode != Op->getOperand(i).getOpcode())
10829 return SDValue();
10830
10831 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10832 bool IsShift = false;
10833 switch (Opcode) {
10834 default:
10835 return SDValue();
10836 case ISD::SHL:
10837 case ISD::SRL:
10838 case ISD::SRA:
10839 IsShift = true;
10840 break;
10841 case ISD::AND:
10842 case ISD::XOR:
10843 case ISD::OR:
10844 // Don't do this if the buildvector is a splat - we'd replace one
10845 // constant with an entire vector.
10846 if (Op->getSplatValue())
10847 return SDValue();
10848 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10849 return SDValue();
10850 break;
10851 }
10852
10853 SmallVector<SDValue, 4> LHSElts, RHSElts;
10854 for (SDValue Elt : Op->ops()) {
10855 SDValue LHS = Elt.getOperand(0);
10856 SDValue RHS = Elt.getOperand(1);
10857
10858 // We expect the canonicalized RHS operand to be the constant.
10859 if (!isa<ConstantSDNode>(RHS))
10860 return SDValue();
10861
10862 // Extend shift amounts.
10863 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10864 if (!IsShift)
10865 return SDValue();
10866 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10867 }
10868
10869 LHSElts.push_back(LHS);
10870 RHSElts.push_back(RHS);
10871 }
10872
10873 // Limit to shifts by uniform immediates.
10874 // TODO: Only accept vXi8/vXi64 special cases?
10875 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10876 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10877 return SDValue();
10878
10879 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10880 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10881 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10882
10883 if (!IsShift)
10884 return Res;
10885
10886 // Immediately lower the shift to ensure the constant build vector doesn't
10887 // get converted to a constant pool before the shift is lowered.
10888 return LowerShift(Res, Subtarget, DAG);
10889}
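// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the rewrite performed above on a concrete case. A build_vector of
// per-element shifts by the same immediate, e.g.
//   (build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3)),
// becomes one vector shift of (build_vector a, b, c, d) by a splat of 3; the
// scalar reference below shows the two forms agree lane by lane. The helper
// name is invented for the sketch.
static void buildVectorShlReferenceSketch(const unsigned *Lhs, unsigned Amt,
                                          unsigned *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = Lhs[i] << Amt; // same result per lane or as one vector op
}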
10890
10891/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10892/// functionality to do this, so it's all zeros, all ones, or some derivation
10893/// that is cheap to calculate.
10894static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10895 const X86Subtarget &Subtarget) {
10896 SDLoc DL(Op);
10897 MVT VT = Op.getSimpleValueType();
10898
10899 // Vectors containing all zeros can be matched by pxor and xorps.
10900 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10901 return Op;
10902
10903 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10904 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10905 // vpcmpeqd on 256-bit vectors.
10906 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10907 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10908 return Op;
10909
10910 return getOnesVector(VT, DAG, DL);
10911 }
10912
10913 return SDValue();
10914}
10915
10916/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10917/// from a vector of source values and a vector of extraction indices.
10918/// The vectors might be manipulated to match the type of the permute op.
10919static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10920 SDLoc &DL, SelectionDAG &DAG,
10921 const X86Subtarget &Subtarget) {
10922 MVT ShuffleVT = VT;
10923 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10924 unsigned NumElts = VT.getVectorNumElements();
10925 unsigned SizeInBits = VT.getSizeInBits();
10926
10927 // Adjust IndicesVec to match VT size.
10928 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10929        "Illegal variable permute mask size");
10930 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10931 // Narrow/widen the indices vector to the correct size.
10932 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10933 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10934 NumElts * VT.getScalarSizeInBits());
10935 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10936 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10937 SDLoc(IndicesVec), SizeInBits);
10938 // Zero-extend the index elements within the vector.
10939 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10940 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10941 IndicesVT, IndicesVec);
10942 }
10943 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10944
10945 // Handle a SrcVec whose size doesn't match VT.
10946 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10947 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10948 // Handle larger SrcVec by treating it as a larger permute.
10949 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10950 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10951 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10952 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10953 Subtarget, DAG, SDLoc(IndicesVec));
10954 SDValue NewSrcVec =
10955 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10956 if (NewSrcVec)
10957 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10958 return SDValue();
10959 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10960 // Widen smaller SrcVec to match VT.
10961 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10962 } else
10963 return SDValue();
10964 }
10965
10966 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10967 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10968 EVT SrcVT = Idx.getValueType();
10969 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10970 uint64_t IndexScale = 0;
10971 uint64_t IndexOffset = 0;
10972
10973 // If we're scaling a smaller permute op, then we need to repeat the
10974 // indices, scaling and offsetting them as well.
10975 // e.g. v4i32 -> v16i8 (Scale = 4)
10976 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10977 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10978 for (uint64_t i = 0; i != Scale; ++i) {
10979 IndexScale |= Scale << (i * NumDstBits);
10980 IndexOffset |= i << (i * NumDstBits);
10981 }
10982
10983 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10984 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10985 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10986 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10987 return Idx;
10988 };
10989
10990 unsigned Opcode = 0;
10991 switch (VT.SimpleTy) {
10992 default:
10993 break;
10994 case MVT::v16i8:
10995 if (Subtarget.hasSSSE3())
10996 Opcode = X86ISD::PSHUFB;
10997 break;
10998 case MVT::v8i16:
10999 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11000 Opcode = X86ISD::VPERMV;
11001 else if (Subtarget.hasSSSE3()) {
11002 Opcode = X86ISD::PSHUFB;
11003 ShuffleVT = MVT::v16i8;
11004 }
11005 break;
11006 case MVT::v4f32:
11007 case MVT::v4i32:
11008 if (Subtarget.hasAVX()) {
11009 Opcode = X86ISD::VPERMILPV;
11010 ShuffleVT = MVT::v4f32;
11011 } else if (Subtarget.hasSSSE3()) {
11012 Opcode = X86ISD::PSHUFB;
11013 ShuffleVT = MVT::v16i8;
11014 }
11015 break;
11016 case MVT::v2f64:
11017 case MVT::v2i64:
11018 if (Subtarget.hasAVX()) {
11019 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
11020 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11021 Opcode = X86ISD::VPERMILPV;
11022 ShuffleVT = MVT::v2f64;
11023 } else if (Subtarget.hasSSE41()) {
11024 // SSE41 can compare v2i64 - select between indices 0 and 1.
11025 return DAG.getSelectCC(
11026 DL, IndicesVec,
11027 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11028 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11029 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11030 ISD::CondCode::SETEQ);
11031 }
11032 break;
11033 case MVT::v32i8:
11034 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11035 Opcode = X86ISD::VPERMV;
11036 else if (Subtarget.hasXOP()) {
11037 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11038 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11039 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11040 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11041 return DAG.getNode(
11042 ISD::CONCAT_VECTORS, DL, VT,
11043 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11044 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11045 } else if (Subtarget.hasAVX()) {
11046 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11047 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11048 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11049 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11050 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11051 ArrayRef<SDValue> Ops) {
11052 // Permute Lo and Hi and then select based on index range.
11053 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11054 // care about bit[7] as it's just an index vector.
11055 SDValue Idx = Ops[2];
11056 EVT VT = Idx.getValueType();
11057 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11058 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11059 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11060 ISD::CondCode::SETGT);
11061 };
11062 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11063 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11064 PSHUFBBuilder);
11065 }
11066 break;
11067 case MVT::v16i16:
11068 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11069 Opcode = X86ISD::VPERMV;
11070 else if (Subtarget.hasAVX()) {
11071 // Scale to v32i8 and perform as v32i8.
11072 IndicesVec = ScaleIndices(IndicesVec, 2);
11073 return DAG.getBitcast(
11074 VT, createVariablePermute(
11075 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11076 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11077 }
11078 break;
11079 case MVT::v8f32:
11080 case MVT::v8i32:
11081 if (Subtarget.hasAVX2())
11082 Opcode = X86ISD::VPERMV;
11083 else if (Subtarget.hasAVX()) {
11084 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11085 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11086 {0, 1, 2, 3, 0, 1, 2, 3});
11087 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11088 {4, 5, 6, 7, 4, 5, 6, 7});
11089 if (Subtarget.hasXOP())
11090 return DAG.getBitcast(
11091 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11092 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11093 // Permute Lo and Hi and then select based on index range.
11094 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11095 SDValue Res = DAG.getSelectCC(
11096 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11097 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11098 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11099 ISD::CondCode::SETGT);
11100 return DAG.getBitcast(VT, Res);
11101 }
11102 break;
11103 case MVT::v4i64:
11104 case MVT::v4f64:
11105 if (Subtarget.hasAVX512()) {
11106 if (!Subtarget.hasVLX()) {
11107 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11108 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11109 SDLoc(SrcVec));
11110 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11111 DAG, SDLoc(IndicesVec));
11112 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11113 DAG, Subtarget);
11114 return extract256BitVector(Res, 0, DAG, DL);
11115 }
11116 Opcode = X86ISD::VPERMV;
11117 } else if (Subtarget.hasAVX()) {
11118 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11119 SDValue LoLo =
11120 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11121 SDValue HiHi =
11122 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11123 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11124 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11125 if (Subtarget.hasXOP())
11126 return DAG.getBitcast(
11127 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11128 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11129 // Permute Lo and Hi and then select based on index range.
11130 // This works as VPERMILPD only uses index bit[1] to permute elements.
11131 SDValue Res = DAG.getSelectCC(
11132 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11133 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11134 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11135 ISD::CondCode::SETGT);
11136 return DAG.getBitcast(VT, Res);
11137 }
11138 break;
11139 case MVT::v64i8:
11140 if (Subtarget.hasVBMI())
11141 Opcode = X86ISD::VPERMV;
11142 break;
11143 case MVT::v32i16:
11144 if (Subtarget.hasBWI())
11145 Opcode = X86ISD::VPERMV;
11146 break;
11147 case MVT::v16f32:
11148 case MVT::v16i32:
11149 case MVT::v8f64:
11150 case MVT::v8i64:
11151 if (Subtarget.hasAVX512())
11152 Opcode = X86ISD::VPERMV;
11153 break;
11154 }
11155 if (!Opcode)
11156 return SDValue();
11157
11158 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11159        (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11160        "Illegal variable permute shuffle type");
11161
11162 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11163 if (Scale > 1)
11164 IndicesVec = ScaleIndices(IndicesVec, Scale);
11165
11166 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11167 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11168
11169 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11170 SDValue Res = Opcode == X86ISD::VPERMV
11171 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11172 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11173 return DAG.getBitcast(VT, Res);
11174}
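// Illustrative standalone sketch (not part of this file), assuming only plain
// C++17: the splat constants built by the ScaleIndices lambda above. For the
// v4i32 -> v16i8 case (Scale = 4, NumDstBits = 8) they reproduce the values
// quoted in its comment. The helper names are invented for the sketch.
constexpr unsigned long long indexScaleSketch(unsigned long long Scale,
                                              unsigned NumDstBits) {
  unsigned long long R = 0;
  for (unsigned long long i = 0; i != Scale; ++i)
    R |= Scale << (i * NumDstBits); // repeat Scale in every narrow element
  return R;
}
constexpr unsigned long long indexOffsetSketch(unsigned long long Scale,
                                               unsigned NumDstBits) {
  unsigned long long R = 0;
  for (unsigned long long i = 0; i != Scale; ++i)
    R |= i << (i * NumDstBits);     // 0,1,2,... in successive narrow elements
  return R;
}
static_assert(indexScaleSketch(4, 8) == 0x04040404ULL,
              "IndexScale = 4 << 24 | 4 << 16 | 4 << 8 | 4");
static_assert(indexOffsetSketch(4, 8) == 0x03020100ULL,
              "IndexOffset = 3 << 24 | 2 << 16 | 1 << 8 | 0");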
11175
11176// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11177// reasoned to be a permutation of a vector by indices in a non-constant vector.
11178// (build_vector (extract_elt V, (extract_elt I, 0)),
11179// (extract_elt V, (extract_elt I, 1)),
11180// ...
11181// ->
11182// (vpermv I, V)
11183//
11184// TODO: Handle undefs
11185// TODO: Utilize pshufb and zero mask blending to support more efficient
11186// construction of vectors with constant-0 elements.
11187static SDValue
11188LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11189 const X86Subtarget &Subtarget) {
11190 SDValue SrcVec, IndicesVec;
11191 // Check for a match of the permute source vector and permute index elements.
11192 // This is done by checking that the i-th build_vector operand is of the form:
11193 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11194 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11195 SDValue Op = V.getOperand(Idx);
11196 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11197 return SDValue();
11198
11199 // If this is the first extract encountered in V, set the source vector,
11200 // otherwise verify the extract is from the previously defined source
11201 // vector.
11202 if (!SrcVec)
11203 SrcVec = Op.getOperand(0);
11204 else if (SrcVec != Op.getOperand(0))
11205 return SDValue();
11206 SDValue ExtractedIndex = Op->getOperand(1);
11207 // Peek through extends.
11208 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11209 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11210 ExtractedIndex = ExtractedIndex.getOperand(0);
11211 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11212 return SDValue();
11213
11214 // If this is the first extract from the index vector candidate, set the
11215 // indices vector, otherwise verify the extract is from the previously
11216 // defined indices vector.
11217 if (!IndicesVec)
11218 IndicesVec = ExtractedIndex.getOperand(0);
11219 else if (IndicesVec != ExtractedIndex.getOperand(0))
11220 return SDValue();
11221
11222 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11223 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11224 return SDValue();
11225 }
11226
11227 SDLoc DL(V);
11228 MVT VT = V.getSimpleValueType();
11229 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11230}
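// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scalar meaning of the pattern matched above. Once every element is
// (extract_elt SrcVec, (extract_elt IndicesVec, i)), the whole build_vector is
// a gather of SrcVec through IndicesVec, i.e. a single variable permute. The
// helper name is invented for the sketch.
static void variablePermuteReferenceSketch(const int *Src, const int *Indices,
                                           int *Out, int NumElts) {
  for (int i = 0; i != NumElts; ++i)
    Out[i] = Src[Indices[i]]; // result element i comes from Src[Indices[i]]
}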
11231
11232SDValue
11233X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11234 SDLoc dl(Op);
11235
11236 MVT VT = Op.getSimpleValueType();
11237 MVT EltVT = VT.getVectorElementType();
11238 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11239 unsigned NumElems = Op.getNumOperands();
11240
11241 // Generate vectors for predicate vectors.
11242 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11243 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11244
11245 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11246 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11247
11248 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11249 return VectorConstant;
11250
11251 unsigned EVTBits = EltVT.getSizeInBits();
11252 APInt UndefMask = APInt::getZero(NumElems);
11253 APInt FrozenUndefMask = APInt::getZero(NumElems);
11254 APInt ZeroMask = APInt::getZero(NumElems);
11255 APInt NonZeroMask = APInt::getZero(NumElems);
11256 bool IsAllConstants = true;
11257 bool OneUseFrozenUndefs = true;
11258 SmallSet<SDValue, 8> Values;
11259 unsigned NumConstants = NumElems;
11260 for (unsigned i = 0; i < NumElems; ++i) {
11261 SDValue Elt = Op.getOperand(i);
11262 if (Elt.isUndef()) {
11263 UndefMask.setBit(i);
11264 continue;
11265 }
11266 if (ISD::isFreezeUndef(Elt.getNode())) {
11267 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
11268 FrozenUndefMask.setBit(i);
11269 continue;
11270 }
11271 Values.insert(Elt);
11272 if (!isIntOrFPConstant(Elt)) {
11273 IsAllConstants = false;
11274 NumConstants--;
11275 }
11276 if (X86::isZeroNode(Elt)) {
11277 ZeroMask.setBit(i);
11278 } else {
11279 NonZeroMask.setBit(i);
11280 }
11281 }
11282
11283 // All undef vector. Return an UNDEF.
11284 if (UndefMask.isAllOnes())
11285 return DAG.getUNDEF(VT);
11286
11287 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
11288 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
11289 return DAG.getFreeze(DAG.getUNDEF(VT));
11290
11291 // All undef/freeze(undef)/zero vector. Return a zero vector.
11292 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
11293 return getZeroVector(VT, Subtarget, DAG, dl);
11294
11295 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11296 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11297 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11298 // and blend the FREEZE-UNDEF operands back in.
11299 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11300 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11301 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11302 SmallVector<int, 16> BlendMask(NumElems, -1);
11303 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11304 for (unsigned i = 0; i < NumElems; ++i) {
11305 if (UndefMask[i]) {
11306 BlendMask[i] = -1;
11307 continue;
11308 }
11309 BlendMask[i] = i;
11310 if (!FrozenUndefMask[i])
11311 Elts[i] = Op.getOperand(i);
11312 else
11313 BlendMask[i] += NumElems;
11314 }
11315 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11316 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11317 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11318 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11319 }
11320
11321 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11322
11323 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
11324 // be better off lowering to a smaller build vector and padding with
11325 // undef/zero.
11326 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11327 !isFoldableUseOfShuffle(BV)) {
11328 unsigned UpperElems = NumElems / 2;
11329 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
11330 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11331 if (NumUpperUndefsOrZeros >= UpperElems) {
11332 if (VT.is512BitVector() &&
11333 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11334 UpperElems = NumElems - (NumElems / 4);
11335 // If freeze(undef) is in any upper elements, force to zero.
11336 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11337 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11338 SDValue NewBV =
11339 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11340 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11341 }
11342 }
11343
11344 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11345 return AddSub;
11346 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11347 return HorizontalOp;
11348 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11349 return Broadcast;
11350 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11351 return BitOp;
11352
11353 unsigned NumZero = ZeroMask.popcount();
11354 unsigned NumNonZero = NonZeroMask.popcount();
11355
11356 // If we are inserting one variable into a vector of non-zero constants, try
11357 // to avoid loading each constant element as a scalar. Load the constants as a
11358 // vector and then insert the variable scalar element. If insertion is not
11359 // supported, fall back to a shuffle to get the scalar blended with the
11360 // constants. Insertion into a zero vector is handled as a special-case
11361 // somewhere below here.
11362 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11363 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11364 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11365 // Create an all-constant vector. The variable element in the old
11366 // build vector is replaced by undef in the constant vector. Save the
11367 // variable scalar element and its index for use in the insertelement.
11368 LLVMContext &Context = *DAG.getContext();
11369 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11370 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11371 SDValue VarElt;
11372 SDValue InsIndex;
11373 for (unsigned i = 0; i != NumElems; ++i) {
11374 SDValue Elt = Op.getOperand(i);
11375 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11376 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11377 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11378 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11379 else if (!Elt.isUndef()) {
11380       assert(!VarElt.getNode() && !InsIndex.getNode() &&
11381              "Expected one variable element in this vector");
11382 VarElt = Elt;
11383 InsIndex = DAG.getVectorIdxConstant(i, dl);
11384 }
11385 }
11386 Constant *CV = ConstantVector::get(ConstVecOps);
11387 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11388
11389 // The constants we just created may not be legal (e.g., floating point). We
11390 // must lower the vector right here because we cannot guarantee that we'll
11391 // legalize it before loading it. This is also why we could not just create
11392 // a new build vector here. If the build vector contains illegal constants,
11393 // it could get split back up into a series of insert elements.
11394 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11395 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11396 MachineFunction &MF = DAG.getMachineFunction();
11397 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11398 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11399 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11400 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11401 if (InsertC < NumEltsInLow128Bits)
11402 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11403
11404 // There's no good way to insert into the high elements of a >128-bit
11405 // vector, so use shuffles to avoid an extract/insert sequence.
11406 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11407 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11408 SmallVector<int, 8> ShuffleMask;
11409 unsigned NumElts = VT.getVectorNumElements();
11410 for (unsigned i = 0; i != NumElts; ++i)
11411 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11412 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11413 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11414 }
11415
11416 // Special case for single non-zero, non-undef, element.
11417 if (NumNonZero == 1) {
11418 unsigned Idx = NonZeroMask.countr_zero();
11419 SDValue Item = Op.getOperand(Idx);
11420
11421 // If we have a constant or non-constant insertion into the low element of
11422 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11423 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11424 // depending on what the source datatype is.
11425 if (Idx == 0) {
11426 if (NumZero == 0)
11427 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11428
11429 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11430 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11431 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11432       assert((VT.is128BitVector() || VT.is256BitVector() ||
11433               VT.is512BitVector()) &&
11434              "Expected an SSE value type!");
11435 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11436 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11437 // zero vector.
11438 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11439 }
11440
11441 // We can't directly insert an i8 or i16 into a vector, so zero extend
11442 // it to i32 first.
11443 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11444 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11445 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11446 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11447 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11448 return DAG.getBitcast(VT, Item);
11449 }
11450 }
11451
11452 // Is it a vector logical left shift?
11453 if (NumElems == 2 && Idx == 1 &&
11454 X86::isZeroNode(Op.getOperand(0)) &&
11455 !X86::isZeroNode(Op.getOperand(1))) {
11456 unsigned NumBits = VT.getSizeInBits();
11457 return getVShift(true, VT,
11458 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11459 VT, Op.getOperand(1)),
11460 NumBits/2, DAG, *this, dl);
11461 }
11462
11463 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11464 return SDValue();
11465
11466 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11467 // is a non-constant being inserted into an element other than the low one,
11468 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11469 // movd/movss) to move this into the low element, then shuffle it into
11470 // place.
11471 if (EVTBits == 32) {
11472 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11473 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11474 }
11475 }
11476
11477 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11478 if (Values.size() == 1) {
11479 if (EVTBits == 32) {
11480 // Instead of a shuffle like this:
11481 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11482 // Check if it's possible to issue this instead.
11483 // shuffle (vload ptr), undef, <1, 1, 1, 1>
11484 unsigned Idx = NonZeroMask.countr_zero();
11485 SDValue Item = Op.getOperand(Idx);
11486 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11487 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11488 }
11489 return SDValue();
11490 }
11491
11492 // A vector full of immediates; various special cases are already
11493 // handled, so this is best done with a single constant-pool load.
11494 if (IsAllConstants)
11495 return SDValue();
11496
11497 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11498 return V;
11499
11500 // See if we can use a vector load to get all of the elements.
11501 {
11502 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11503 if (SDValue LD =
11504 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11505 return LD;
11506 }
11507
11508 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11509 // build_vector and broadcast it.
11510 // TODO: We could probably generalize this more.
11511 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11512 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11513 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11514 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11515 // Make sure all the even/odd operands match.
11516 for (unsigned i = 2; i != NumElems; ++i)
11517 if (Ops[i % 2] != Op.getOperand(i))
11518 return false;
11519 return true;
11520 };
11521 if (CanSplat(Op, NumElems, Ops)) {
11522 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11523 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11524 // Create a new build vector and cast to v2i64/v2f64.
11525 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11526 DAG.getBuildVector(NarrowVT, dl, Ops));
11527 // Broadcast from v2i64/v2f64 and cast to final VT.
11528 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11529 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11530 NewBV));
11531 }
11532 }
11533
11534 // For AVX-length vectors, build the individual 128-bit pieces and use
11535 // shuffles to put them in place.
11536 if (VT.getSizeInBits() > 128) {
11537 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11538
11539 // Build both the lower and upper subvector.
11540 SDValue Lower =
11541 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11542 SDValue Upper = DAG.getBuildVector(
11543 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11544
11545 // Recreate the wider vector with the lower and upper part.
11546 return concatSubVectors(Lower, Upper, DAG, dl);
11547 }
11548
11549 // Let legalizer expand 2-wide build_vectors.
11550 if (EVTBits == 64) {
11551 if (NumNonZero == 1) {
11552 // One half is zero or undef.
11553 unsigned Idx = NonZeroMask.countr_zero();
11554 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11555 Op.getOperand(Idx));
11556 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11557 }
11558 return SDValue();
11559 }
11560
11561 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11562 if (EVTBits == 8 && NumElems == 16)
11563 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11564 DAG, Subtarget))
11565 return V;
11566
11567 if (EltVT == MVT::i16 && NumElems == 8)
11568 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11569 DAG, Subtarget))
11570 return V;
11571
11572 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11573 if (EVTBits == 32 && NumElems == 4)
11574 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11575 return V;
11576
11577 // If element VT is == 32 bits, turn it into a number of shuffles.
11578 if (NumElems == 4 && NumZero > 0) {
11579 SmallVector<SDValue, 8> Ops(NumElems);
11580 for (unsigned i = 0; i < 4; ++i) {
11581 bool isZero = !NonZeroMask[i];
11582 if (isZero)
11583 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11584 else
11585 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11586 }
11587
11588 for (unsigned i = 0; i < 2; ++i) {
11589 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11590       default: llvm_unreachable("Unexpected NonZero count");
11591 case 0:
11592 Ops[i] = Ops[i*2]; // Must be a zero vector.
11593 break;
11594 case 1:
11595 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11596 break;
11597 case 2:
11598 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11599 break;
11600 case 3:
11601 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11602 break;
11603 }
11604 }
11605
11606 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11607 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11608 int MaskVec[] = {
11609 Reverse1 ? 1 : 0,
11610 Reverse1 ? 0 : 1,
11611 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11612 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11613 };
11614 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11615 }
11616
11617 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11618
11619 // Check for a build vector from mostly shuffle plus few inserting.
11620 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11621 return Sh;
11622
11623 // For SSE 4.1, use insertps to put the high elements into the low element.
11624 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11625 SDValue Result;
11626 if (!Op.getOperand(0).isUndef())
11627 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11628 else
11629 Result = DAG.getUNDEF(VT);
11630
11631 for (unsigned i = 1; i < NumElems; ++i) {
11632 if (Op.getOperand(i).isUndef()) continue;
11633 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11634 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11635 }
11636 return Result;
11637 }
11638
11639 // Otherwise, expand into a number of unpckl*; start by extending each of
11640 // our (non-undef) elements to the full vector width with the element in the
11641 // bottom slot of the vector (which generates no code for SSE).
11642 SmallVector<SDValue, 8> Ops(NumElems);
11643 for (unsigned i = 0; i < NumElems; ++i) {
11644 if (!Op.getOperand(i).isUndef())
11645 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11646 else
11647 Ops[i] = DAG.getUNDEF(VT);
11648 }
11649
11650 // Next, we iteratively mix elements, e.g. for v4f32:
11651 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11652 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11653 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11654 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11655 // Generate scaled UNPCKL shuffle mask.
11656 SmallVector<int, 16> Mask;
11657 for(unsigned i = 0; i != Scale; ++i)
11658 Mask.push_back(i);
11659 for (unsigned i = 0; i != Scale; ++i)
11660 Mask.push_back(NumElems+i);
11661 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11662
11663 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11664 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11665 }
11666 return Ops[0];
11667}
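// Illustrative standalone sketch (not part of this file), assuming only plain
// C++: the scaled UNPCKL masks used in the final expansion above, with
// SM_SentinelUndef written as -1. For NumElems == 4 the two rounds use
// {0, 4, -1, -1} (Scale = 1) and then {0, 1, 4, 5} (Scale = 2), matching the
// v4f32 unpcklps/unpcklpd sequence from the comment. The helper name is
// invented for the sketch; Mask must hold NumElems entries.
static void scaledUnpcklMaskSketch(int NumElems, int Scale, int *Mask) {
  int Pos = 0;
  for (int i = 0; i != Scale; ++i)
    Mask[Pos++] = i;            // low elements of the first operand
  for (int i = 0; i != Scale; ++i)
    Mask[Pos++] = NumElems + i; // low elements of the second operand
  while (Pos != NumElems)
    Mask[Pos++] = -1;           // remaining lanes are undef
}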
11668
11669// 256-bit AVX can use the vinsertf128 instruction
11670// to create 256-bit vectors from two other 128-bit ones.
11671// TODO: Detect subvector broadcast here instead of DAG combine?
11672static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11673 const X86Subtarget &Subtarget) {
11674 SDLoc dl(Op);
11675 MVT ResVT = Op.getSimpleValueType();
11676
11677   assert((ResVT.is256BitVector() ||
11678           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11679
11680 unsigned NumOperands = Op.getNumOperands();
11681 unsigned NumFreezeUndef = 0;
11682 unsigned NumZero = 0;
11683 unsigned NumNonZero = 0;
11684 unsigned NonZeros = 0;
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 SDValue SubVec = Op.getOperand(i);
11687 if (SubVec.isUndef())
11688 continue;
11689 if (ISD::isFreezeUndef(SubVec.getNode())) {
11690 // If the freeze(undef) has multiple uses then we must fold to zero.
11691 if (SubVec.hasOneUse())
11692 ++NumFreezeUndef;
11693 else
11694 ++NumZero;
11695 }
11696 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11697 ++NumZero;
11698 else {
11699 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11700 NonZeros |= 1 << i;
11701 ++NumNonZero;
11702 }
11703 }
11704
11705 // If we have more than 2 non-zeros, build each half separately.
11706 if (NumNonZero > 2) {
11707 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11708 ArrayRef<SDUse> Ops = Op->ops();
11709 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11710 Ops.slice(0, NumOperands/2));
11711 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11712 Ops.slice(NumOperands/2));
11713 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11714 }
11715
11716 // Otherwise, build it up through insert_subvectors.
11717 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11718 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11719 : DAG.getUNDEF(ResVT));
11720
11721 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11722 unsigned NumSubElems = SubVT.getVectorNumElements();
11723 for (unsigned i = 0; i != NumOperands; ++i) {
11724 if ((NonZeros & (1 << i)) == 0)
11725 continue;
11726
11727 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11728 Op.getOperand(i),
11729 DAG.getIntPtrConstant(i * NumSubElems, dl));
11730 }
11731
11732 return Vec;
11733}
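To make the NonZeros bookkeeping above concrete, here is a minimal standalone C++ sketch (not LLVM code; the names and plain-integer inputs are hypothetical): each non-zero operand sets bit i after an assert keeps the shift in range, and more than two set bits is exactly the case the lowering splits into halves.

#include <cassert>
#include <climits>
#include <cstdio>
#include <vector>

int main() {
  // 1 = non-zero subvector, 0 = zero/undef subvector (hypothetical input).
  std::vector<int> SubVecIsNonZero = {1, 0, 1, 1};
  unsigned NonZeros = 0, NumNonZero = 0;
  for (unsigned i = 0; i != SubVecIsNonZero.size(); ++i) {
    if (!SubVecIsNonZero[i])
      continue;
    assert(i < sizeof(NonZeros) * CHAR_BIT); // keep the shift in range
    NonZeros |= 1u << i;
    ++NumNonZero;
  }
  // With three non-zero operands the lowering above would recurse on halves.
  std::printf("NonZeros mask = 0x%x, NumNonZero = %u\n", NonZeros, NumNonZero);
  assert(NonZeros == 0xD && NumNonZero == 3);
}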
11734
11735// Returns true if the given node is a type promotion (by concatenating i1
11736// zeros) of the result of a node that already zeros all upper bits of
11737// k-register.
11738// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11739static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11740 const X86Subtarget &Subtarget,
11741 SelectionDAG & DAG) {
11742 SDLoc dl(Op);
11743 MVT ResVT = Op.getSimpleValueType();
11744 unsigned NumOperands = Op.getNumOperands();
11745
11746 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11747 "Unexpected number of operands in CONCAT_VECTORS");
11748
11749 uint64_t Zeros = 0;
11750 uint64_t NonZeros = 0;
11751 for (unsigned i = 0; i != NumOperands; ++i) {
11752 SDValue SubVec = Op.getOperand(i);
11753 if (SubVec.isUndef())
11754 continue;
11755 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11756 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11757 Zeros |= (uint64_t)1 << i;
11758 else
11759 NonZeros |= (uint64_t)1 << i;
11760 }
11761
11762 unsigned NumElems = ResVT.getVectorNumElements();
11763
11764 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11765 // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11766 // insert_subvector will give us two kshifts.
11767 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11768 Log2_64(NonZeros) != NumOperands - 1) {
11769 MVT ShiftVT = ResVT;
11770 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11771 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11772 unsigned Idx = Log2_64(NonZeros);
11773 SDValue SubVec = Op.getOperand(Idx);
11774 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11775 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11776 DAG.getUNDEF(ShiftVT), SubVec,
11777 DAG.getIntPtrConstant(0, dl));
11778 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11779 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11780 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11781 DAG.getIntPtrConstant(0, dl));
11782 }
11783
11784 // If there are zero or one non-zeros we can handle this very simply.
11785 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11786 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11787 if (!NonZeros)
11788 return Vec;
11789 unsigned Idx = Log2_64(NonZeros);
11790 SDValue SubVec = Op.getOperand(Idx);
11791 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11792 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11793 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11794 }
11795
11796 if (NumOperands > 2) {
11797 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11798 ArrayRef<SDUse> Ops = Op->ops();
11799 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11800 Ops.slice(0, NumOperands/2));
11801 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11802 Ops.slice(NumOperands/2));
11803 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11804 }
11805
11806 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11807
11808 if (ResVT.getVectorNumElements() >= 16)
11809 return Op; // The operation is legal with KUNPCK
11810
11811 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11812 DAG.getUNDEF(ResVT), Op.getOperand(0),
11813 DAG.getIntPtrConstant(0, dl));
11814 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11815 DAG.getIntPtrConstant(NumElems/2, dl));
11816}
11817
11818static SDValue LowerCONCAT_VECTORS(SDValue Op,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 MVT VT = Op.getSimpleValueType();
11822 if (VT.getVectorElementType() == MVT::i1)
11823 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11824
11825 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11826 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11827 Op.getNumOperands() == 4)));
11828
11829 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11830 // from two other 128-bit ones.
11831
11832 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11833 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11834}
11835
11836//===----------------------------------------------------------------------===//
11837// Vector shuffle lowering
11838//
11839// This is an experimental code path for lowering vector shuffles on x86. It is
11840// designed to handle arbitrary vector shuffles and blends, gracefully
11841// degrading performance as necessary. It works hard to recognize idiomatic
11842// shuffles and lower them to optimal instruction patterns without leaving
11843// a framework that allows reasonably efficient handling of all vector shuffle
11844// patterns.
11845//===----------------------------------------------------------------------===//
11846
11847/// Tiny helper function to identify a no-op mask.
11848///
11849/// This is a somewhat boring predicate function. It checks whether the mask
11850/// array input, which is assumed to be a single-input shuffle mask of the kind
11851/// used by the X86 shuffle instructions (not a fully general
11852/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11853/// in-place shuffle are 'no-op's.
11854static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11855 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11856 assert(Mask[i] >= -1 && "Out of bound mask element!");
11857 if (Mask[i] >= 0 && Mask[i] != i)
11858 return false;
11859 }
11860 return true;
11861}
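A small self-contained sketch of the same predicate, assuming a plain int mask with -1 for undef (the helper name is made up for illustration): an in-place mask with undefs is a no-op, any displaced element is not.

#include <cassert>
#include <vector>

static bool isNoopMask(const std::vector<int> &Mask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  return true;
}

int main() {
  assert(isNoopMask({0, -1, 2, 3}));  // in-place with an undef element
  assert(!isNoopMask({1, 0, 2, 3}));  // elements 0 and 1 are swapped
}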
11862
11863/// Test whether there are elements crossing LaneSizeInBits lanes in this
11864/// shuffle mask.
11865///
11866/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11867/// and we routinely test for these.
11868static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11869 unsigned ScalarSizeInBits,
11870 ArrayRef<int> Mask) {
11871 assert(LaneSizeInBits && ScalarSizeInBits &&
11872 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11873 "Illegal shuffle lane size");
11874 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11875 int Size = Mask.size();
11876 for (int i = 0; i < Size; ++i)
11877 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11878 return true;
11879 return false;
11880}
11881
11882/// Test whether there are elements crossing 128-bit lanes in this
11883/// shuffle mask.
11884static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11885 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11886}
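For illustration, a hedged standalone sketch of the lane-crossing test with 32-bit elements in 128-bit lanes (helper name is hypothetical): the v8f32 lane swap below crosses lanes, while an in-lane reverse does not.

#include <cassert>
#include <vector>

static bool crossesLanes(unsigned LaneBits, unsigned EltBits,
                         const std::vector<int> &Mask) {
  int LaneSize = LaneBits / EltBits;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // v8f32: swapping the two 128-bit halves crosses lanes...
  assert(crossesLanes(128, 32, {4, 5, 6, 7, 0, 1, 2, 3}));
  // ...but reversing elements within each half does not.
  assert(!crossesLanes(128, 32, {3, 2, 1, 0, 7, 6, 5, 4}));
}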
11887
11888/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11889/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11890/// better support 'repeated mask + lane permute' style shuffles.
11891static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11892 unsigned ScalarSizeInBits,
11893 ArrayRef<int> Mask) {
11894 assert(LaneSizeInBits && ScalarSizeInBits &&
11895 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11896 "Illegal shuffle lane size");
11897 int NumElts = Mask.size();
11898 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11899 int NumLanes = NumElts / NumEltsPerLane;
11900 if (NumLanes > 1) {
11901 for (int i = 0; i != NumLanes; ++i) {
11902 int SrcLane = -1;
11903 for (int j = 0; j != NumEltsPerLane; ++j) {
11904 int M = Mask[(i * NumEltsPerLane) + j];
11905 if (M < 0)
11906 continue;
11907 int Lane = (M % NumElts) / NumEltsPerLane;
11908 if (SrcLane >= 0 && SrcLane != Lane)
11909 return true;
11910 SrcLane = Lane;
11911 }
11912 }
11913 }
11914 return false;
11915}
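A quick contrast with isLaneCrossingShuffleMask, again as a standalone sketch with a made-up helper name: the lane-swap mask crosses lanes element-by-element, yet each destination lane reads from a single source lane, so it is not "multi-lane"; a mask that mixes both source lanes inside one destination lane is.

#include <cassert>
#include <vector>

static bool isMultiLane(unsigned LaneBits, unsigned EltBits,
                        const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int EltsPerLane = LaneBits / EltBits;
  int NumLanes = NumElts / EltsPerLane;
  for (int i = 0; i < NumLanes; ++i) {
    int SrcLane = -1;
    for (int j = 0; j < EltsPerLane; ++j) {
      int M = Mask[i * EltsPerLane + j];
      if (M < 0)
        continue;
      int Lane = (M % NumElts) / EltsPerLane;
      if (SrcLane >= 0 && SrcLane != Lane)
        return true;
      SrcLane = Lane;
    }
  }
  return false;
}

int main() {
  // Lane swap: crosses lanes, but each destination lane uses one source lane.
  assert(!isMultiLane(128, 32, {4, 5, 6, 7, 0, 1, 2, 3}));
  // Destination lane 0 mixes source lanes 0 and 1.
  assert(isMultiLane(128, 32, {0, 4, 1, 5, -1, -1, -1, -1}));
}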
11916
11917/// Test whether a shuffle mask is equivalent within each sub-lane.
11918///
11919/// This checks a shuffle mask to see if it is performing the same
11920/// lane-relative shuffle in each sub-lane. This trivially implies
11921/// that it is also not lane-crossing. It may however involve a blend from the
11922/// same lane of a second vector.
11923///
11924/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11925/// non-trivial to compute in the face of undef lanes. The representation is
11926/// suitable for use with existing 128-bit shuffles as entries from the second
11927/// vector have been remapped to [LaneSize, 2*LaneSize).
11928static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11929 ArrayRef<int> Mask,
11930 SmallVectorImpl<int> &RepeatedMask) {
11931 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11932 RepeatedMask.assign(LaneSize, -1);
11933 int Size = Mask.size();
11934 for (int i = 0; i < Size; ++i) {
11935 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11936 if (Mask[i] < 0)
11937 continue;
11938 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11939 // This entry crosses lanes, so there is no way to model this shuffle.
11940 return false;
11941
11942 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11943 // Adjust second vector indices to start at LaneSize instead of Size.
11944 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11945 : Mask[i] % LaneSize + LaneSize;
11946 if (RepeatedMask[i % LaneSize] < 0)
11947 // This is the first non-undef entry in this slot of a 128-bit lane.
11948 RepeatedMask[i % LaneSize] = LocalM;
11949 else if (RepeatedMask[i % LaneSize] != LocalM)
11950 // Found a mismatch with the repeated mask.
11951 return false;
11952 }
11953 return true;
11954}
11955
11956/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11957static bool
11958is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11959 SmallVectorImpl<int> &RepeatedMask) {
11960 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11961}
11962
11963static bool
11964is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11965 SmallVector<int, 32> RepeatedMask;
11966 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11967}
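To make the repeated-mask representation concrete, a hedged standalone sketch for 32-bit elements in 128-bit lanes (helper name invented for the example): the two-input v8f32 mask {0,1,8,9,4,5,12,13} repeats per lane as {0,1,4,5}, with second-input entries remapped to [LaneSize, 2*LaneSize).

#include <cassert>
#include <vector>

// Returns true and fills Repeated if Mask repeats in every 128-bit lane.
static bool repeatedMask(unsigned EltBits, const std::vector<int> &Mask,
                         std::vector<int> &Repeated) {
  int LaneSize = 128 / EltBits;
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  assert(repeatedMask(32, {0, 1, 8, 9, 4, 5, 12, 13}, Repeated));
  assert((Repeated == std::vector<int>{0, 1, 4, 5}));
}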
11968
11969/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11970static bool
11971is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11972 SmallVectorImpl<int> &RepeatedMask) {
11973 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11974}
11975
11976/// Test whether a target shuffle mask is equivalent within each sub-lane.
11977/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11978static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11979 unsigned EltSizeInBits,
11980 ArrayRef<int> Mask,
11981 SmallVectorImpl<int> &RepeatedMask) {
11982 int LaneSize = LaneSizeInBits / EltSizeInBits;
11983 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11984 int Size = Mask.size();
11985 for (int i = 0; i < Size; ++i) {
11986 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11987 if (Mask[i] == SM_SentinelUndef)
11988 continue;
11989 if (Mask[i] == SM_SentinelZero) {
11990 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11991 return false;
11992 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11993 continue;
11994 }
11995 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11996 // This entry crosses lanes, so there is no way to model this shuffle.
11997 return false;
11998
11999 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
12000 // later vector indices to start at multiples of LaneSize instead of Size.
12001 int LaneM = Mask[i] / Size;
12002 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
12003 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
12004 // This is the first non-undef entry in this slot of a 128-bit lane.
12005 RepeatedMask[i % LaneSize] = LocalM;
12006 else if (RepeatedMask[i % LaneSize] != LocalM)
12007 // Found a mismatch with the repeated mask.
12008 return false;
12009 }
12010 return true;
12011}
12012
12013/// Test whether a target shuffle mask is equivalent within each sub-lane.
12014/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
12015static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
12016 ArrayRef<int> Mask,
12017 SmallVectorImpl<int> &RepeatedMask) {
12018 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
12019 Mask, RepeatedMask);
12020}
12021
12022/// Checks whether the vector elements referenced by two shuffle masks are
12023/// equivalent.
12024static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
12025 int Idx, int ExpectedIdx) {
12026 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
12027 ExpectedIdx < MaskSize && "Out of range element index");
12028 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
12029 return false;
12030
12031 switch (Op.getOpcode()) {
12032 case ISD::BUILD_VECTOR:
12033 // If the values are build vectors, we can look through them to find
12034 // equivalent inputs that make the shuffles equivalent.
12035 // TODO: Handle MaskSize != Op.getNumOperands()?
12036 if (MaskSize == (int)Op.getNumOperands() &&
12037 MaskSize == (int)ExpectedOp.getNumOperands())
12038 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12039 break;
12040 case X86ISD::VBROADCAST:
12041 case X86ISD::VBROADCAST_LOAD:
12042 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12043 return (Op == ExpectedOp &&
12044 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12045 case X86ISD::HADD:
12046 case X86ISD::HSUB:
12047 case X86ISD::FHADD:
12048 case X86ISD::FHSUB:
12049 case X86ISD::PACKSS:
12050 case X86ISD::PACKUS:
12051 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12052 // TODO: Handle MaskSize != NumElts?
12053 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12054 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12055 MVT VT = Op.getSimpleValueType();
12056 int NumElts = VT.getVectorNumElements();
12057 if (MaskSize == NumElts) {
12058 int NumLanes = VT.getSizeInBits() / 128;
12059 int NumEltsPerLane = NumElts / NumLanes;
12060 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12061 bool SameLane =
12062 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12063 bool SameElt =
12064 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12065 return SameLane && SameElt;
12066 }
12067 }
12068 break;
12069 }
12070
12071 return false;
12072}
12073
12074/// Checks whether a shuffle mask is equivalent to an explicit list of
12075/// arguments.
12076///
12077/// This is a fast way to test a shuffle mask against a fixed pattern:
12078///
12079 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12080///
12081/// It returns true if the mask is exactly as wide as the argument list, and
12082/// each element of the mask is either -1 (signifying undef) or the value given
12083/// in the argument.
12084static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12085 SDValue V1 = SDValue(),
12086 SDValue V2 = SDValue()) {
12087 int Size = Mask.size();
12088 if (Size != (int)ExpectedMask.size())
12089 return false;
12090
12091 for (int i = 0; i < Size; ++i) {
12092 assert(Mask[i] >= -1 && "Out of bound mask element!");
12093 int MaskIdx = Mask[i];
12094 int ExpectedIdx = ExpectedMask[i];
12095 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12096 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12097 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12098 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12099 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12100 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12101 return false;
12102 }
12103 }
12104 return true;
12105}
12106
12107/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12108///
12109/// The masks must be exactly the same width.
12110///
12111/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12112/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12113///
12114/// SM_SentinelZero is accepted as a valid negative index but must match in
12115/// both, or via a known bits test.
12116static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12117 ArrayRef<int> ExpectedMask,
12118 const SelectionDAG &DAG,
12119 SDValue V1 = SDValue(),
12120 SDValue V2 = SDValue()) {
12121 int Size = Mask.size();
12122 if (Size != (int)ExpectedMask.size())
12123 return false;
12124 assert(llvm::all_of(ExpectedMask,
12125 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12126 "Illegal target shuffle mask");
12127
12128 // Check for out-of-range target shuffle mask indices.
12129 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12130 return false;
12131
12132 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12133 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12134 V1 = SDValue();
12135 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12136 V2 = SDValue();
12137
12138 APInt ZeroV1 = APInt::getZero(Size);
12139 APInt ZeroV2 = APInt::getZero(Size);
12140
12141 for (int i = 0; i < Size; ++i) {
12142 int MaskIdx = Mask[i];
12143 int ExpectedIdx = ExpectedMask[i];
12144 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12145 continue;
12146 if (MaskIdx == SM_SentinelZero) {
12147 // If we need this expected index to be a zero element, then update the
12148 // relevant zero mask and perform the known bits at the end to minimize
12149 // repeated computes.
12150 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12151 if (ExpectedV &&
12152 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12153 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12154 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12155 ZeroMask.setBit(BitIdx);
12156 continue;
12157 }
12158 }
12159 if (MaskIdx >= 0) {
12160 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12161 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12162 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12163 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12164 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12165 continue;
12166 }
12167 return false;
12168 }
12169 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12170 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12171}
12172
12173// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12174// instructions.
12175static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12176 const SelectionDAG &DAG) {
12177 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12178 return false;
12179
12180 SmallVector<int, 8> Unpcklwd;
12181 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12182 /* Unary = */ false);
12183 SmallVector<int, 8> Unpckhwd;
12184 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12185 /* Unary = */ false);
12186 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12187 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12188 return IsUnpackwdMask;
12189}
12190
12191static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12192 const SelectionDAG &DAG) {
12193 // Create 128-bit vector type based on mask size.
12194 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12195 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12196
12197 // We can't assume a canonical shuffle mask, so try the commuted version too.
12198 SmallVector<int, 4> CommutedMask(Mask);
12199 ShuffleVectorSDNode::commuteMask(CommutedMask);
12200
12201 // Match any of unary/binary or low/high.
12202 for (unsigned i = 0; i != 4; ++i) {
12203 SmallVector<int, 16> UnpackMask;
12204 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12205 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12206 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12207 return true;
12208 }
12209 return false;
12210}
12211
12212/// Return true if a shuffle mask chooses elements identically in its top and
12213/// bottom halves. For example, any splat mask has the same top and bottom
12214/// halves. If an element is undefined in only one half of the mask, the halves
12215/// are not considered identical.
12216static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12217 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12218 unsigned HalfSize = Mask.size() / 2;
12219 for (unsigned i = 0; i != HalfSize; ++i) {
12220 if (Mask[i] != Mask[i + HalfSize])
12221 return false;
12222 }
12223 return true;
12224}
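A tiny standalone sketch of the identical-halves check, with -1 meaning undef and a hypothetical helper name: a splat qualifies, while an undef present in only one half does not.

#include <cassert>
#include <vector>

static bool identicalHalves(const std::vector<int> &Mask) {
  unsigned Half = Mask.size() / 2;
  for (unsigned i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(identicalHalves({0, 0, 0, 0}));    // splat
  assert(!identicalHalves({0, 1, 0, -1}));  // undef in one half only
}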
12225
12226/// Get a 4-lane 8-bit shuffle immediate for a mask.
12227///
12228/// This helper function produces an 8-bit shuffle immediate corresponding to
12229/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12230/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12231/// example.
12232///
12233/// NB: We rely heavily on "undef" masks preserving the input lane.
12234static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12235 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12236 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12237 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12238 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12239 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12240
12241 // If the mask only uses one non-undef element, then fully 'splat' it to
12242 // improve later broadcast matching.
12243 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12244 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12245
12246 int FirstElt = Mask[FirstIndex];
12247 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12248 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12249
12250 unsigned Imm = 0;
12251 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12252 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12253 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12254 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12255 return Imm;
12256}
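A worked example of the 2-bits-per-lane immediate encoding, as a minimal sketch that skips the splat special case above and treats undef as "keep the input lane" (the helper name is made up): the identity mask encodes as 0xE4 and the full reverse as 0x1B.

#include <array>
#include <cassert>

static unsigned v4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    unsigned M = Mask[i] < 0 ? i : (unsigned)Mask[i]; // undef keeps lane i
    Imm |= M << (2 * i);
  }
  return Imm;
}

int main() {
  assert(v4ShuffleImm({0, 1, 2, 3}) == 0xE4);   // identity
  assert(v4ShuffleImm({3, 2, 1, 0}) == 0x1B);   // full reverse
  assert(v4ShuffleImm({2, -1, -1, 3}) == 0xE6); // undefs preserve lanes 1 and 2
}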
12257
12258static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12259 SelectionDAG &DAG) {
12260 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12261}
12262
12263 // The shuffle result is as follows:
12264 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
12265 // Each element of Zeroable corresponds to a particular element of Mask,
12266 // as described in the computeZeroableShuffleElements function.
12267 //
12268 // The function looks for a sub-mask whose nonzero elements are in
12269 // increasing order. If such a sub-mask exists, the function returns true.
12270static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12271 ArrayRef<int> Mask, const EVT &VectorType,
12272 bool &IsZeroSideLeft) {
12273 int NextElement = -1;
12274 // Check if the Mask's nonzero elements are in increasing order.
12275 for (int i = 0, e = Mask.size(); i < e; i++) {
12276 // Checks if the mask's zeros elements are built from only zeros.
12277 assert(Mask[i] >= -1 && "Out of bound mask element!");
12278 if (Mask[i] < 0)
12279 return false;
12280 if (Zeroable[i])
12281 continue;
12282 // Find the lowest non zero element
12283 if (NextElement < 0) {
12284 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12285 IsZeroSideLeft = NextElement != 0;
12286 }
12287 // Exit if the mask's non zero elements are not in increasing order.
12288 if (NextElement != Mask[i])
12289 return false;
12290 NextElement++;
12291 }
12292 return true;
12293}
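As a hedged illustration of the pattern this predicate accepts (the VEXPAND shape used by lowerShuffleToEXPAND below): consecutive elements of one source interleaved with zeroable slots, in increasing order. The names in this standalone sketch are hypothetical.

#include <cassert>
#include <vector>

// Zeroable[i] == true means element i is known to be zero.
static bool nonZeroInOrder(const std::vector<bool> &Zeroable,
                           const std::vector<int> &Mask, int NumElts,
                           bool &ZeroSideLeft) {
  int Next = -1;
  for (size_t i = 0; i < Mask.size(); ++i) {
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    if (Next < 0) {
      Next = Mask[i] != 0 ? NumElts : 0;
      ZeroSideLeft = Next != 0;
    }
    if (Next != Mask[i])
      return false;
    ++Next;
  }
  return true;
}

int main() {
  bool ZeroSideLeft = false;
  // v4i32: zeros in slots 0 and 2, V2 elements 4 and 5 expanded into slots 1 and 3.
  assert(nonZeroInOrder({true, false, true, false}, {0, 4, 0, 5}, 4,
                        ZeroSideLeft));
  assert(ZeroSideLeft);
}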
12294
12295/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12296static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12297 ArrayRef<int> Mask, SDValue V1,
12298 SDValue V2, const APInt &Zeroable,
12299 const X86Subtarget &Subtarget,
12300 SelectionDAG &DAG) {
12301 int Size = Mask.size();
12302 int LaneSize = 128 / VT.getScalarSizeInBits();
12303 const int NumBytes = VT.getSizeInBits() / 8;
12304 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12305
12306 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12307 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12308 (Subtarget.hasBWI() && VT.is512BitVector()));
12309
12310 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12311 // Sign bit set in i8 mask means zero element.
12312 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12313
12314 SDValue V;
12315 for (int i = 0; i < NumBytes; ++i) {
12316 int M = Mask[i / NumEltBytes];
12317 if (M < 0) {
12318 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12319 continue;
12320 }
12321 if (Zeroable[i / NumEltBytes]) {
12322 PSHUFBMask[i] = ZeroMask;
12323 continue;
12324 }
12325
12326 // We can only use a single input of V1 or V2.
12327 SDValue SrcV = (M >= Size ? V2 : V1);
12328 if (V && V != SrcV)
12329 return SDValue();
12330 V = SrcV;
12331 M %= Size;
12332
12333 // PSHUFB can't cross lanes, ensure this doesn't happen.
12334 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12335 return SDValue();
12336
12337 M = M % LaneSize;
12338 M = M * NumEltBytes + (i % NumEltBytes);
12339 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12340 }
12341 assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12341, __extension__
__PRETTY_FUNCTION__))
;
12342
12343 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12344 return DAG.getBitcast(
12345 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12346 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12347}
12348
12349static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12350 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12351 const SDLoc &dl);
12352
12353// X86 has dedicated shuffle that can be lowered to VEXPAND
12354static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12355 const APInt &Zeroable,
12356 ArrayRef<int> Mask, SDValue &V1,
12357 SDValue &V2, SelectionDAG &DAG,
12358 const X86Subtarget &Subtarget) {
12359 bool IsLeftZeroSide = true;
12360 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12361 IsLeftZeroSide))
12362 return SDValue();
12363 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12364 MVT IntegerType =
12365 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12366 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12367 unsigned NumElts = VT.getVectorNumElements();
12368 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12369 "Unexpected number of vector elements");
12370 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12371 Subtarget, DAG, DL);
12372 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12373 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12374 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12375}
12376
12377static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12378 unsigned &UnpackOpcode, bool IsUnary,
12379 ArrayRef<int> TargetMask, const SDLoc &DL,
12380 SelectionDAG &DAG,
12381 const X86Subtarget &Subtarget) {
12382 int NumElts = VT.getVectorNumElements();
12383
12384 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12385 for (int i = 0; i != NumElts; i += 2) {
12386 int M1 = TargetMask[i + 0];
12387 int M2 = TargetMask[i + 1];
12388 Undef1 &= (SM_SentinelUndef == M1);
12389 Undef2 &= (SM_SentinelUndef == M2);
12390 Zero1 &= isUndefOrZero(M1);
12391 Zero2 &= isUndefOrZero(M2);
12392 }
12393 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12394 "Zeroable shuffle detected");
12395
12396 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12397 SmallVector<int, 64> Unpckl, Unpckh;
12398 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12399 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12400 (IsUnary ? V1 : V2))) {
12401 UnpackOpcode = X86ISD::UNPCKL;
12402 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12403 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12404 return true;
12405 }
12406
12407 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12408 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12409 (IsUnary ? V1 : V2))) {
12410 UnpackOpcode = X86ISD::UNPCKH;
12411 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12412 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12413 return true;
12414 }
12415
12416 // If a unary shuffle, attempt to match it as an unpack lo/hi with zero.
12417 if (IsUnary && (Zero1 || Zero2)) {
12418 // Don't bother if we can blend instead.
12419 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12420 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12421 return false;
12422
12423 bool MatchLo = true, MatchHi = true;
12424 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12425 int M = TargetMask[i];
12426
12427 // Ignore if the input is known to be zero or the index is undef.
12428 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12429 (M == SM_SentinelUndef))
12430 continue;
12431
12432 MatchLo &= (M == Unpckl[i]);
12433 MatchHi &= (M == Unpckh[i]);
12434 }
12435
12436 if (MatchLo || MatchHi) {
12437 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12438 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12439 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12440 return true;
12441 }
12442 }
12443
12444 // If a binary shuffle, commute and try again.
12445 if (!IsUnary) {
12446 ShuffleVectorSDNode::commuteMask(Unpckl);
12447 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12448 UnpackOpcode = X86ISD::UNPCKL;
12449 std::swap(V1, V2);
12450 return true;
12451 }
12452
12453 ShuffleVectorSDNode::commuteMask(Unpckh);
12454 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12455 UnpackOpcode = X86ISD::UNPCKH;
12456 std::swap(V1, V2);
12457 return true;
12458 }
12459 }
12460
12461 return false;
12462}
12463
12464// X86 has dedicated unpack instructions that can handle specific blend
12465// operations: UNPCKH and UNPCKL.
12466static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12467 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12468 SelectionDAG &DAG) {
12469 SmallVector<int, 8> Unpckl;
12470 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12471 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12472 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12473
12474 SmallVector<int, 8> Unpckh;
12475 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12476 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12477 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12478
12479 // Commute and try again.
12480 ShuffleVectorSDNode::commuteMask(Unpckl);
12481 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12482 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12483
12484 ShuffleVectorSDNode::commuteMask(Unpckh);
12485 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12486 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12487
12488 return SDValue();
12489}
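For reference, a standalone sketch of the lo/hi unpack mask shapes being matched (this mirrors the shape only, not the createUnpackShuffleMask helper itself): with four elements per 128-bit lane the interleave masks are {0,4,1,5} for UNPCKL and {2,6,3,7} for UNPCKH.

#include <cassert>
#include <vector>

// Build the binary unpack lo/hi mask for NumEltsPerLane * NumLanes elements;
// like the AVX instructions, the interleave happens within each 128-bit lane.
static std::vector<int> unpackMask(int NumEltsPerLane, int NumLanes, bool Lo) {
  std::vector<int> Mask;
  int NumElts = NumEltsPerLane * NumLanes;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Base = Lane * NumEltsPerLane + (Lo ? 0 : NumEltsPerLane / 2);
    for (int i = 0; i != NumEltsPerLane / 2; ++i) {
      Mask.push_back(Base + i);           // element from the first input
      Mask.push_back(Base + i + NumElts); // element from the second input
    }
  }
  return Mask;
}

int main() {
  // v4i32 (one 128-bit lane of four elements):
  assert((unpackMask(4, 1, true) == std::vector<int>{0, 4, 1, 5}));
  assert((unpackMask(4, 1, false) == std::vector<int>{2, 6, 3, 7}));
  // v8i32 (two lanes): the unpack stays within each 128-bit lane.
  assert((unpackMask(4, 2, true) ==
          std::vector<int>{0, 8, 1, 9, 4, 12, 5, 13}));
}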
12490
12491/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12492/// followed by unpack 256-bit.
12493static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12494 ArrayRef<int> Mask, SDValue V1,
12495 SDValue V2, SelectionDAG &DAG) {
12496 SmallVector<int, 32> Unpckl, Unpckh;
12497 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12498 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12499
12500 unsigned UnpackOpcode;
12501 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12502 UnpackOpcode = X86ISD::UNPCKL;
12503 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12504 UnpackOpcode = X86ISD::UNPCKH;
12505 else
12506 return SDValue();
12507
12508 // This is a "natural" unpack operation (rather than the 128-bit sectored
12509 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12510 // input in order to use the x86 instruction.
12511 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12512 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12513 V1 = DAG.getBitcast(VT, V1);
12514 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12515}
12516
12517// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12518// source into the lower elements and zeroing the upper elements.
12519static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12520 ArrayRef<int> Mask, const APInt &Zeroable,
12521 const X86Subtarget &Subtarget) {
12522 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12523 return false;
12524
12525 unsigned NumElts = Mask.size();
12526 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12527 unsigned MaxScale = 64 / EltSizeInBits;
12528
12529 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12530 unsigned SrcEltBits = EltSizeInBits * Scale;
12531 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12532 continue;
12533 unsigned NumSrcElts = NumElts / Scale;
12534 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12535 continue;
12536 unsigned UpperElts = NumElts - NumSrcElts;
12537 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12538 continue;
12539 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12540 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12541 DstVT = MVT::getIntegerVT(EltSizeInBits);
12542 if ((NumSrcElts * EltSizeInBits) >= 128) {
12543 // ISD::TRUNCATE
12544 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12545 } else {
12546 // X86ISD::VTRUNC
12547 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12548 }
12549 return true;
12550 }
12551
12552 return false;
12553}
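To illustrate the mask shape this matcher accepts, a small hedged sketch with invented names: for a v8i16 result and Scale = 2, the lower four mask elements must follow the strided sequence 0,2,4,6 and the upper four must be zeroable.

#include <cassert>
#include <vector>

// Check Mask[0..NumSrcElts) == {0, Scale, 2*Scale, ...}, allowing -1 (undef),
// and require every element at or above NumSrcElts to be zeroable.
static bool matchesTruncShape(const std::vector<int> &Mask,
                              const std::vector<bool> &Zeroable,
                              unsigned Scale) {
  unsigned NumElts = Mask.size();
  unsigned NumSrcElts = NumElts / Scale;
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * Scale))
      return false;
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 2, 4, 6, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                true,  true,  true,  true};
  assert(matchesTruncShape(Mask, Zeroable, 2)); // v8i16 result of a v4i32 truncate
  assert(!matchesTruncShape({0, 1, 2, 3, -1, -1, -1, -1}, Zeroable, 2));
}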
12554
12555// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12556// element padding to the final DstVT.
12557static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12558 const X86Subtarget &Subtarget,
12559 SelectionDAG &DAG, bool ZeroUppers) {
12560 MVT SrcVT = Src.getSimpleValueType();
12561 MVT DstSVT = DstVT.getScalarType();
12562 unsigned NumDstElts = DstVT.getVectorNumElements();
12563 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12564 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12565
12566 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12567 return SDValue();
12568
12569 // Perform a direct ISD::TRUNCATE if possible.
12570 if (NumSrcElts == NumDstElts)
12571 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12572
12573 if (NumSrcElts > NumDstElts) {
12574 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12575 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12576 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12577 }
12578
12579 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12580 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12581 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12582 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12583 DstVT.getSizeInBits());
12584 }
12585
12586 // Non-VLX targets must truncate from a 512-bit type, so we need to
12587 // widen, truncate and then possibly extract the original subvector.
12588 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12589 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12590 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12591 }
12592
12593 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12594 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12595 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12596 if (DstVT != TruncVT)
12597 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12598 DstVT.getSizeInBits());
12599 return Trunc;
12600}
12601
12602// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12603//
12604// An example is the following:
12605//
12606// t0: ch = EntryToken
12607// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12608// t25: v4i32 = truncate t2
12609// t41: v8i16 = bitcast t25
12610// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12611// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12612// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12613// t18: v2i64 = bitcast t51
12614//
12615 // One can just use a single vpmovdw instruction; without avx512vl we need to
12616 // use the zmm variant and extract the lower subvector, padding with zeroes.
12617// TODO: Merge with lowerShuffleAsVTRUNC.
12618static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12619 SDValue V2, ArrayRef<int> Mask,
12620 const APInt &Zeroable,
12621 const X86Subtarget &Subtarget,
12622 SelectionDAG &DAG) {
12623 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12624 if (!Subtarget.hasAVX512())
12625 return SDValue();
12626
12627 unsigned NumElts = VT.getVectorNumElements();
12628 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12629 unsigned MaxScale = 64 / EltSizeInBits;
12630 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12631 unsigned SrcEltBits = EltSizeInBits * Scale;
12632 unsigned NumSrcElts = NumElts / Scale;
12633 unsigned UpperElts = NumElts - NumSrcElts;
12634 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12635 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12636 continue;
12637
12638 // Attempt to find a matching source truncation, but as a fall back VLX
12639 // cases can use the VPMOV directly.
12640 SDValue Src = peekThroughBitcasts(V1);
12641 if (Src.getOpcode() == ISD::TRUNCATE &&
12642 Src.getScalarValueSizeInBits() == SrcEltBits) {
12643 Src = Src.getOperand(0);
12644 } else if (Subtarget.hasVLX()) {
12645 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12646 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12647 Src = DAG.getBitcast(SrcVT, Src);
12648 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12649 if (Scale == 2 &&
12650 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12651 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12652 return SDValue();
12653 } else
12654 return SDValue();
12655
12656 // VPMOVWB is only available with avx512bw.
12657 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12658 return SDValue();
12659
12660 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12661 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12662 }
12663
12664 return SDValue();
12665}
12666
12667// Attempt to match binary shuffle patterns as a truncate.
12668static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12669 SDValue V2, ArrayRef<int> Mask,
12670 const APInt &Zeroable,
12671 const X86Subtarget &Subtarget,
12672 SelectionDAG &DAG) {
12673 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12674 "Unexpected VTRUNC type");
12675 if (!Subtarget.hasAVX512())
12676 return SDValue();
12677
12678 unsigned NumElts = VT.getVectorNumElements();
12679 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12680 unsigned MaxScale = 64 / EltSizeInBits;
12681 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12682 // TODO: Support non-BWI VPMOVWB truncations?
12683 unsigned SrcEltBits = EltSizeInBits * Scale;
12684 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12685 continue;
12686
12687 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12688 // Bail if the V2 elements are undef.
12689 unsigned NumHalfSrcElts = NumElts / Scale;
12690 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12691 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12692 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12693 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12694 continue;
12695
12696 // The elements beyond the truncation must be undef/zero.
12697 unsigned UpperElts = NumElts - NumSrcElts;
12698 if (UpperElts > 0 &&
12699 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12700 continue;
12701 bool UndefUppers =
12702 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12703
12704 // For offset truncations, ensure that the concat is cheap.
12705 if (Offset) {
12706 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12707 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12708 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12709 return Lo.getOperand(0) == Hi.getOperand(0);
12710 if (ISD::isNormalLoad(Lo.getNode()) &&
12711 ISD::isNormalLoad(Hi.getNode())) {
12712 auto *LDLo = cast<LoadSDNode>(Lo);
12713 auto *LDHi = cast<LoadSDNode>(Hi);
12714 return DAG.areNonVolatileConsecutiveLoads(
12715 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12716 }
12717 return false;
12718 };
12719 if (!IsCheapConcat(V1, V2))
12720 continue;
12721 }
12722
12723 // As we're using both sources, we need to concat them together
12724 // and truncate from the double-sized src.
12725 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12726 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12727
12728 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12729 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12730 Src = DAG.getBitcast(SrcVT, Src);
12731
12732 // Shift the offset'd elements into place for the truncation.
12733 // TODO: Use getTargetVShiftByConstNode.
12734 if (Offset)
12735 Src = DAG.getNode(
12736 X86ISD::VSRLI, DL, SrcVT, Src,
12737 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12738
12739 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12740 }
12741 }
12742
12743 return SDValue();
12744}
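
To make the mask pattern above concrete: the lowering looks for a mask that takes every Scale'th element starting at Offset, with everything past NumSrcElts undef or zero. A minimal stand-alone sketch of that check follows; matchesTruncPattern is a hypothetical name, a simplified stand-in for isSequentialOrUndefInRange, with -1 used for undef.

#include <vector>

// Illustrative only, not LLVM code: does Mask take elements
// Offset, Offset+Scale, Offset+2*Scale, ... (undefs allowed)?
static bool matchesTruncPattern(const std::vector<int> &Mask,
                                unsigned NumSrcElts, unsigned Offset,
                                unsigned Scale) {
  for (unsigned i = 0; i != NumSrcElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef matches anything
    if ((unsigned)M != Offset + i * Scale)
      return false;
  }
  return true;
}

// e.g. Mask = {1, 3, 5, 7} matches Offset = 1, Scale = 2, which is why the
// lowering above shifts the concatenated source right by Offset * EltSizeInBits
// before truncating.
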
12745
12746/// Check whether a compaction lowering can be done by dropping even/odd
12747/// elements and compute how many times even/odd elements must be dropped.
12748///
12749/// This handles shuffles which take every Nth element where N is a power of
12750/// two. Example shuffle masks:
12751///
12752/// (even)
12753/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12754/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12755/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12756/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12757/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12758/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12759///
12760/// (odd)
12761/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12762/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12763///
12764/// Any of these lanes can of course be undef.
12765///
12766/// This routine only supports N <= 3.
12767 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
12768/// for larger N.
12769///
12770/// \returns N above, or the number of times even/odd elements must be dropped
12771/// if there is such a number. Otherwise returns zero.
12772static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12773 bool IsSingleInput) {
12774 // The modulus for the shuffle vector entries is based on whether this is
12775 // a single input or not.
12776 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12777 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12778 "We should only be called with masks with a power-of-2 size!");
12779
12780 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12781 int Offset = MatchEven ? 0 : 1;
12782
12783 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12784 // and 2^3 simultaneously. This is because we may have ambiguity with
12785 // partially undef inputs.
12786 bool ViableForN[3] = {true, true, true};
12787
12788 for (int i = 0, e = Mask.size(); i < e; ++i) {
12789 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12790 // want.
12791 if (Mask[i] < 0)
12792 continue;
12793
12794 bool IsAnyViable = false;
12795 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12796 if (ViableForN[j]) {
12797 uint64_t N = j + 1;
12798
12799 // The shuffle mask must be equal to (i * 2^N) % M.
12800 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12801 IsAnyViable = true;
12802 else
12803 ViableForN[j] = false;
12804 }
12805 // Early exit if we exhaust the possible powers of two.
12806 if (!IsAnyViable)
12807 break;
12808 }
12809
12810 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12811 if (ViableForN[j])
12812 return j + 1;
12813
12814 // Return 0 as there is no viable power of two.
12815 return 0;
12816}
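
A minimal constexpr sketch of the `(i * 2^N) % M` relation documented above, checked against the even N = 1 single-input example mask; the helper name dropCheck is hypothetical, not from the source.

#include <cstdint>

// Illustrative only: mask element i must equal ((i << N) & (Modulus - 1)) + Offset,
// where Offset is 0 for the even variant and 1 for the odd variant.
constexpr bool dropCheck(const int *Mask, int Size, int ShuffleModulus, int N,
                         int Offset) {
  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((uint64_t)(Mask[i] - Offset) != (((uint64_t)i << N) & ModMask))
      return false;
  }
  return true;
}

constexpr int EvenN1[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                            0, 2, 4, 6, 8, 10, 12, 14};
static_assert(dropCheck(EvenN1, 16, 16, 1, 0), "even, N = 1, single input");
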
12817
12818// X86 has dedicated pack instructions that can handle specific truncation
12819// operations: PACKSS and PACKUS.
12820// Checks for compaction shuffle masks if MaxStages > 1.
12821// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12822static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12823 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12824 const SelectionDAG &DAG,
12825 const X86Subtarget &Subtarget,
12826 unsigned MaxStages = 1) {
12827 unsigned NumElts = VT.getVectorNumElements();
12828 unsigned BitSize = VT.getScalarSizeInBits();
12829 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12830 "Illegal maximum compaction");
12831
12832 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12833 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12834 unsigned NumPackedBits = NumSrcBits - BitSize;
12835 N1 = peekThroughBitcasts(N1);
12836 N2 = peekThroughBitcasts(N2);
12837 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12838 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12839 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12840 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12841 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12842 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12843 return false;
12844 if (Subtarget.hasSSE41() || BitSize == 8) {
12845 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12846 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12847 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12848 V1 = N1;
12849 V2 = N2;
12850 SrcVT = PackVT;
12851 PackOpcode = X86ISD::PACKUS;
12852 return true;
12853 }
12854 }
12855 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12856 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12857 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12858 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12859 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12860 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12861 V1 = N1;
12862 V2 = N2;
12863 SrcVT = PackVT;
12864 PackOpcode = X86ISD::PACKSS;
12865 return true;
12866 }
12867 return false;
12868 };
12869
12870 // Attempt to match against wider and wider compaction patterns.
12871 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12872 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12873 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12874
12875 // Try binary shuffle.
12876 SmallVector<int, 32> BinaryMask;
12877 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12878 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12879 if (MatchPACK(V1, V2, PackVT))
12880 return true;
12881
12882 // Try unary shuffle.
12883 SmallVector<int, 32> UnaryMask;
12884 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12885 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12886 if (MatchPACK(V1, V1, PackVT))
12887 return true;
12888 }
12889
12890 return false;
12891}
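
The MatchPACK predicate above accepts inputs for which the saturating pack behaves like a plain truncation: PACKUS needs the upper bits known zero (the MaskedValueIsZero test), PACKSS needs more sign bits than get packed away (the ComputeNumSignBits test). A scalar model of that reasoning for i32 -> i16, with hypothetical helper names:

#include <algorithm>
#include <cstdint>

// Illustrative only: scalar semantics of PACKSSDW / PACKUSDW on one element.
static int16_t packss32(int32_t V) {
  return (int16_t)std::min<int32_t>(std::max<int32_t>(V, -32768), 32767);
}
static uint16_t packus32(int32_t V) {
  return (uint16_t)std::min<int32_t>(std::max<int32_t>(V, 0), 65535);
}

// If V has at least 17 sign bits (ComputeNumSignBits > NumPackedBits), then
// packss32(V) is just the truncated value; if the top 16 bits are known zero
// (MaskedValueIsZero), the same holds for packus32(V).
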
12892
12893static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12894 SDValue V1, SDValue V2, SelectionDAG &DAG,
12895 const X86Subtarget &Subtarget) {
12896 MVT PackVT;
12897 unsigned PackOpcode;
12898 unsigned SizeBits = VT.getSizeInBits();
12899 unsigned EltBits = VT.getScalarSizeInBits();
12900 unsigned MaxStages = Log2_32(64 / EltBits);
12901 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12902 Subtarget, MaxStages))
12903 return SDValue();
12904
12905 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12906 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12907
12908 // Don't lower multi-stage packs on AVX512, truncation is better.
12909 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12910 return SDValue();
12911
12912 // Pack to the largest type possible:
12913 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12914 unsigned MaxPackBits = 16;
12915 if (CurrentEltBits > 16 &&
12916 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12917 MaxPackBits = 32;
12918
12919 // Repeatedly pack down to the target size.
12920 SDValue Res;
12921 for (unsigned i = 0; i != NumStages; ++i) {
12922 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12923 unsigned NumSrcElts = SizeBits / SrcEltBits;
12924 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12925 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12926 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12927 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12928 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12929 DAG.getBitcast(SrcVT, V2));
12930 V1 = V2 = Res;
12931 CurrentEltBits /= 2;
12932 }
12933 assert(Res && Res.getValueType() == VT &&
12934 "Failed to lower compaction shuffle");
12935 return Res;
12936}
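
Each pack stage halves the element width, so the NumStages used above is log2(source element bits / destination element bits); packing i32 elements down to i8 takes two stages (32 -> 16 -> 8). A tiny hypothetical sketch:

// Illustrative only: number of PACK stages needed to narrow SrcBits to DstBits.
constexpr unsigned packStages(unsigned SrcBits, unsigned DstBits) {
  unsigned Stages = 0;
  while (SrcBits > DstBits) {
    SrcBits /= 2;
    ++Stages;
  }
  return Stages;
}
static_assert(packStages(32, 8) == 2, "i32 -> i16 -> i8");
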
12937
12938/// Try to emit a bitmask instruction for a shuffle.
12939///
12940/// This handles cases where we can model a blend exactly as a bitmask due to
12941/// one of the inputs being zeroable.
12942static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12943 SDValue V2, ArrayRef<int> Mask,
12944 const APInt &Zeroable,
12945 const X86Subtarget &Subtarget,
12946 SelectionDAG &DAG) {
12947 MVT MaskVT = VT;
12948 MVT EltVT = VT.getVectorElementType();
12949 SDValue Zero, AllOnes;
12950 // Use f64 if i64 isn't legal.
12951 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12952 EltVT = MVT::f64;
12953 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12954 }
12955
12956 MVT LogicVT = VT;
12957 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12958 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12959 APFloat AllOnesValue =
12960 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12961 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12962 LogicVT =
12963 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12964 } else {
12965 Zero = DAG.getConstant(0, DL, EltVT);
12966 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 }
12968
12969 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12970 SDValue V;
12971 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12972 if (Zeroable[i])
12973 continue;
12974 if (Mask[i] % Size != i)
12975 return SDValue(); // Not a blend.
12976 if (!V)
12977 V = Mask[i] < Size ? V1 : V2;
12978 else if (V != (Mask[i] < Size ? V1 : V2))
12979 return SDValue(); // Can only let one input through the mask.
12980
12981 VMaskOps[i] = AllOnes;
12982 }
12983 if (!V)
12984 return SDValue(); // No non-zeroable elements!
12985
12986 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12987 VMask = DAG.getBitcast(LogicVT, VMask);
12988 V = DAG.getBitcast(LogicVT, V);
12989 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12990 return DAG.getBitcast(VT, And);
12991}
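
The effect of the bitmask lowering is simply an AND with a vector that is all-ones where an element is kept and zero where the shuffle result is zeroable. A scalar model, assuming 32-bit lanes; bitMaskBlend is a hypothetical name:

#include <cstdint>
#include <vector>

// Illustrative only: AND-mask form of a blend with zero.
static std::vector<uint32_t> bitMaskBlend(const std::vector<uint32_t> &V,
                                          const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> R(V.size());
  for (size_t i = 0; i != V.size(); ++i) {
    uint32_t M = Zeroable[i] ? 0u : ~0u; // all-ones lets the element through
    R[i] = V[i] & M;
  }
  return R;
}

// e.g. V = {1, 2, 3, 4}, Zeroable = {false, true, false, true} -> {1, 0, 3, 0}.
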
12992
12993/// Try to emit a blend instruction for a shuffle using bit math.
12994///
12995/// This is used as a fallback approach when first class blend instructions are
12996/// unavailable. Currently it is only suitable for integer vectors, but could
12997/// be generalized for floating point vectors if desirable.
12998static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12999 SDValue V2, ArrayRef<int> Mask,
13000 SelectionDAG &DAG) {
13001 assert(VT.isInteger() && "Only supports integer vector types!");
13002 MVT EltVT = VT.getVectorElementType();
13003 SDValue Zero = DAG.getConstant(0, DL, EltVT);
13004 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
13005 SmallVector<SDValue, 16> MaskOps;
13006 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13007 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
13008 return SDValue(); // Shuffled input!
13009 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
13010 }
13011
13012 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
13013 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
13014 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
13015 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13016}
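
The bit-math blend above is the classic (V1 & M) | (~M & V2) identity, with M all-ones for elements taken from V1. A per-element scalar sketch under the same assumption (hypothetical helper, 32-bit lanes):

#include <cstdint>
#include <vector>

// Illustrative only: AND + ANDNP + OR blend, one element at a time.
static std::vector<uint32_t> bitBlend(const std::vector<uint32_t> &V1,
                                      const std::vector<uint32_t> &V2,
                                      const std::vector<bool> &FromV1) {
  std::vector<uint32_t> R(V1.size());
  for (size_t i = 0; i != V1.size(); ++i) {
    uint32_t M = FromV1[i] ? ~0u : 0u;
    R[i] = (V1[i] & M) | (~M & V2[i]);
  }
  return R;
}
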
13017
13018static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
13019 SDValue PreservedSrc,
13020 const X86Subtarget &Subtarget,
13021 SelectionDAG &DAG);
13022
13023static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
13024 MutableArrayRef<int> Mask,
13025 const APInt &Zeroable, bool &ForceV1Zero,
13026 bool &ForceV2Zero, uint64_t &BlendMask) {
13027 bool V1IsZeroOrUndef =
13028 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
13029 bool V2IsZeroOrUndef =
13030 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
13031
13032 BlendMask = 0;
13033 ForceV1Zero = false, ForceV2Zero = false;
13034 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
13035
13036 int NumElts = Mask.size();
13037 int NumLanes = VT.getSizeInBits() / 128;
13038 int NumEltsPerLane = NumElts / NumLanes;
13039 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13040
13041 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13042 // then ensure the blend mask part for that lane just references that input.
13043 bool ForceWholeLaneMasks =
13044 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13045
13046 // Attempt to generate the binary blend mask. If an input is zero then
13047 // we can use any lane.
13048 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13049 // Keep track of the inputs used per lane.
13050 bool LaneV1InUse = false;
13051 bool LaneV2InUse = false;
13052 uint64_t LaneBlendMask = 0;
13053 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13054 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13055 int M = Mask[Elt];
13056 if (M == SM_SentinelUndef)
13057 continue;
13058 if (M == Elt || (0 <= M && M < NumElts &&
13059 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13060 Mask[Elt] = Elt;
13061 LaneV1InUse = true;
13062 continue;
13063 }
13064 if (M == (Elt + NumElts) ||
13065 (NumElts <= M &&
13066 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13067 LaneBlendMask |= 1ull << LaneElt;
13068 Mask[Elt] = Elt + NumElts;
13069 LaneV2InUse = true;
13070 continue;
13071 }
13072 if (Zeroable[Elt]) {
13073 if (V1IsZeroOrUndef) {
13074 ForceV1Zero = true;
13075 Mask[Elt] = Elt;
13076 LaneV1InUse = true;
13077 continue;
13078 }
13079 if (V2IsZeroOrUndef) {
13080 ForceV2Zero = true;
13081 LaneBlendMask |= 1ull << LaneElt;
13082 Mask[Elt] = Elt + NumElts;
13083 LaneV2InUse = true;
13084 continue;
13085 }
13086 }
13087 return false;
13088 }
13089
13090 // If we only used V2 then splat the lane blend mask to avoid any demanded
13091 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13092 // blend mask bit).
13093 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13094 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13095
13096 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13097 }
13098 return true;
13099}
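
Stripped of the zeroable handling and the whole-lane splat above, the blend immediate is one bit per element, set when that element is taken from V2. A simplified hypothetical sketch plus a worked v4 example:

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative only: derive a BLENDI-style immediate from a mask that is
// already a blend (element i must come from position i of V1 or of V2).
static std::optional<uint64_t> simpleBlendMask(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  uint64_t Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;                 // undef or from V1: bit stays clear
    if (M == i + NumElts)
      Imm |= 1ull << i;         // from V2: set the bit
    else
      return std::nullopt;      // not a blend
  }
  return Imm;
}

// e.g. a v4i32 mask {0, 5, 2, 7} yields the immediate 0b1010.
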
13100
13101static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13102 int Scale) {
13103 uint64_t ScaledMask = 0;
13104 for (int i = 0; i != Size; ++i)
13105 if (BlendMask & (1ull << i))
13106 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13107 return ScaledMask;
13108}
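
For reference, scaling just replicates each selected bit Scale times, e.g. when a vXi16 blend mask is re-expressed on bytes. A self-checking copy of the arithmetic above:

#include <cstdint>

// Illustrative only: same math as scaleVectorShuffleBlendMask.
constexpr uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

// A 4-element mask 0b0101 scaled by 2 (i16 lanes on bytes) becomes 0b00110011.
static_assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011, "");
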
13109
13110/// Try to emit a blend instruction for a shuffle.
13111///
13112/// This doesn't do any checks for the availability of instructions for blending
13113/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13114/// be matched in the backend with the type given. What it does check for is
13115/// that the shuffle mask is a blend, or convertible into a blend with zero.
13116static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13117 SDValue V2, ArrayRef<int> Original,
13118 const APInt &Zeroable,
13119 const X86Subtarget &Subtarget,
13120 SelectionDAG &DAG) {
13121 uint64_t BlendMask = 0;
13122 bool ForceV1Zero = false, ForceV2Zero = false;
13123 SmallVector<int, 64> Mask(Original);
13124 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13125 BlendMask))
13126 return SDValue();
13127
13128 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13129 if (ForceV1Zero)
13130 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13131 if (ForceV2Zero)
13132 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13133
13134 unsigned NumElts = VT.getVectorNumElements();
13135
13136 switch (VT.SimpleTy) {
13137 case MVT::v4i64:
13138 case MVT::v8i32:
13139 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13140 [[fallthrough]];
13141 case MVT::v4f64:
13142 case MVT::v8f32:
13143 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13144 [[fallthrough]];
13145 case MVT::v2f64:
13146 case MVT::v2i64:
13147 case MVT::v4f32:
13148 case MVT::v4i32:
13149 case MVT::v8i16:
13150 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13151 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13152 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13153 case MVT::v16i16: {
13154 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13155 SmallVector<int, 8> RepeatedMask;
13156 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13157 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13158 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13159 BlendMask = 0;
13160 for (int i = 0; i < 8; ++i)
13161 if (RepeatedMask[i] >= 8)
13162 BlendMask |= 1ull << i;
13163 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13164 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13165 }
13166 // Use PBLENDW for lower/upper lanes and then blend lanes.
13167 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13168 // merge to VSELECT where useful.
13169 uint64_t LoMask = BlendMask & 0xFF;
13170 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13171 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13172 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13173 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13174 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13175 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13176 return DAG.getVectorShuffle(
13177 MVT::v16i16, DL, Lo, Hi,
13178 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13179 }
13180 [[fallthrough]];
13181 }
13182 case MVT::v32i8:
13183 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13184 [[fallthrough]];
13185 case MVT::v16i8: {
13186 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13187
13188 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13189 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13190 Subtarget, DAG))
13191 return Masked;
13192
13193 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13194 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13195 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13196 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13197 }
13198
13199 // If we have VPTERNLOG, we can use that as a bit blend.
13200 if (Subtarget.hasVLX())
13201 if (SDValue BitBlend =
13202 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13203 return BitBlend;
13204
13205 // Scale the blend by the number of bytes per element.
13206 int Scale = VT.getScalarSizeInBits() / 8;
13207
13208 // This form of blend is always done on bytes. Compute the byte vector
13209 // type.
13210 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13211
13212 // x86 allows load folding with blendvb from the 2nd source operand. But
13213 // we are still using LLVM select here (see comment below), so that's V1.
13214 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13215 // allow that load-folding possibility.
13216 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13217 ShuffleVectorSDNode::commuteMask(Mask);
13218 std::swap(V1, V2);
13219 }
13220
13221 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13222 // mix of LLVM's code generator and the x86 backend. We tell the code
13223 // generator that boolean values in the elements of an x86 vector register
13224 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13225 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13226 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13227 // of the element (the remaining are ignored) and 0 in that high bit would
13228 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13229 // the LLVM model for boolean values in vector elements gets the relevant
13230 // bit set, it is set backwards and over constrained relative to x86's
13231 // actual model.
13232 SmallVector<SDValue, 32> VSELECTMask;
13233 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13234 for (int j = 0; j < Scale; ++j)
13235 VSELECTMask.push_back(
13236 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13237 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13238 MVT::i8));
13239
13240 V1 = DAG.getBitcast(BlendVT, V1);
13241 V2 = DAG.getBitcast(BlendVT, V2);
13242 return DAG.getBitcast(
13243 VT,
13244 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13245 V1, V2));
13246 }
13247 case MVT::v16f32:
13248 case MVT::v8f64:
13249 case MVT::v8i64:
13250 case MVT::v16i32:
13251 case MVT::v32i16:
13252 case MVT::v64i8: {
13253 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13254 bool OptForSize = DAG.shouldOptForSize();
13255 if (!OptForSize) {
13256 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13257 Subtarget, DAG))
13258 return Masked;
13259 }
13260
13261 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13262 // masked move.
13263 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13264 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13265 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13266 }
13267 default:
13268 llvm_unreachable("Not a supported integer vector type!");
13269 }
13270}
13271
13272/// Try to lower as a blend of elements from two inputs followed by
13273/// a single-input permutation.
13274///
13275/// This matches the pattern where we can blend elements from two inputs and
13276/// then reduce the shuffle to a single-input permutation.
13277static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13278 SDValue V1, SDValue V2,
13279 ArrayRef<int> Mask,
13280 SelectionDAG &DAG,
13281 bool ImmBlends = false) {
13282 // We build up the blend mask while checking whether a blend is a viable way
13283 // to reduce the shuffle.
13284 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13285 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13286
13287 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13288 if (Mask[i] < 0)
13289 continue;
13290
13291 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13292
13293 if (BlendMask[Mask[i] % Size] < 0)
13294 BlendMask[Mask[i] % Size] = Mask[i];
13295 else if (BlendMask[Mask[i] % Size] != Mask[i])
13296 return SDValue(); // Can't blend in the needed input!
13297
13298 PermuteMask[i] = Mask[i] % Size;
13299 }
13300
13301 // If only immediate blends, then bail if the blend mask can't be widened to
13302 // i16.
13303 unsigned EltSize = VT.getScalarSizeInBits();
13304 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13305 return SDValue();
13306
13307 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13308 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13309}
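
The decomposition above, on its own, splits one two-input mask into a blend (which source owns each slot) followed by a single-input permute. A hypothetical sketch with a worked v4 example (indices 0..3 refer to V1, 4..7 to V2):

#include <optional>
#include <utility>
#include <vector>

// Illustrative only: mirrors the loop in lowerShuffleAsBlendAndPermute.
static std::optional<std::pair<std::vector<int>, std::vector<int>>>
splitBlendPermute(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  std::vector<int> Blend(Size, -1), Permute(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i])
      return std::nullopt;      // both inputs demanded in the same slot
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }
  return std::make_pair(Blend, Permute);
}

// e.g. Mask = {5, 0, 7, 2} gives Blend = {0, 5, 2, 7}, Permute = {1, 0, 3, 2}.
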
13310
13311/// Try to lower as an unpack of elements from two inputs followed by
13312/// a single-input permutation.
13313///
13314/// This matches the pattern where we can unpack elements from two inputs and
13315/// then reduce the shuffle to a single-input (wider) permutation.
13316static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13317 SDValue V1, SDValue V2,
13318 ArrayRef<int> Mask,
13319 SelectionDAG &DAG) {
13320 int NumElts = Mask.size();
13321 int NumLanes = VT.getSizeInBits() / 128;
13322 int NumLaneElts = NumElts / NumLanes;
13323 int NumHalfLaneElts = NumLaneElts / 2;
13324
13325 bool MatchLo = true, MatchHi = true;
13326 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13327
13328 // Determine UNPCKL/UNPCKH type and operand order.
13329 for (int Elt = 0; Elt != NumElts; ++Elt) {
13330 int M = Mask[Elt];
13331 if (M < 0)
13332 continue;
13333
13334 // Normalize the mask value depending on whether it's V1 or V2.
13335 int NormM = M;
13336 SDValue &Op = Ops[Elt & 1];
13337 if (M < NumElts && (Op.isUndef() || Op == V1))
13338 Op = V1;
13339 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13340 Op = V2;
13341 NormM -= NumElts;
13342 } else
13343 return SDValue();
13344
13345 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13346 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13347 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13348 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13349 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13350 if (MatchLoAnyLane || MatchHiAnyLane) {
13351 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13352 "Failed to match UNPCKLO/UNPCKHI");
13353 break;
13354 }
13355 }
13356 MatchLo &= MatchLoAnyLane;
13357 MatchHi &= MatchHiAnyLane;
13358 if (!MatchLo && !MatchHi)
13359 return SDValue();
13360 }
13361 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13362
13363 // Element indices have changed after unpacking. Calculate permute mask
13364 // so that they will be put back to the position as dictated by the
13365 // original shuffle mask indices.
13366 SmallVector<int, 32> PermuteMask(NumElts, -1);
13367 for (int Elt = 0; Elt != NumElts; ++Elt) {
13368 int M = Mask[Elt];
13369 if (M < 0)
13370 continue;
13371 int NormM = M;
13372 if (NumElts <= M)
13373 NormM -= NumElts;
13374 bool IsFirstOp = M < NumElts;
13375 int BaseMaskElt =
13376 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
13377 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
13378 PermuteMask[Elt] = BaseMaskElt;
13379 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
13380 PermuteMask[Elt] = BaseMaskElt + 1;
13381 assert(PermuteMask[Elt] != -1 &&
13382 "Input mask element is defined but failed to assign permute mask");
13383 }
13384
13385 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13386 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13387 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13388}
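
For readers less familiar with the unpack instructions: UNPCKL/UNPCKH interleave the low (respectively high) halves of each 128-bit lane of the two operands. A scalar model of a single 4-element lane (illustrative only):

#include <array>

// unpckLane(A, B, true)  == {A[0], B[0], A[1], B[1]}   (UNPCKL)
// unpckLane(A, B, false) == {A[2], B[2], A[3], B[3]}   (UNPCKH)
template <typename T>
static std::array<T, 4> unpckLane(const std::array<T, 4> &A,
                                  const std::array<T, 4> &B, bool Lo) {
  unsigned Base = Lo ? 0 : 2;
  return {A[Base], B[Base], A[Base + 1], B[Base + 1]};
}
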
13389
13390/// Try to lower a shuffle as a permute of the inputs followed by an
13391/// UNPCK instruction.
13392///
13393/// This specifically targets cases where we end up with alternating between
13394/// the two inputs, and so can permute them into something that feeds a single
13395/// UNPCK instruction. Note that this routine only targets integer vectors
13396/// because for floating point vectors we have a generalized SHUFPS lowering
13397/// strategy that handles everything that doesn't *exactly* match an unpack,
13398/// making this clever lowering unnecessary.
13399static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13400 SDValue V1, SDValue V2,
13401 ArrayRef<int> Mask,
13402 const X86Subtarget &Subtarget,
13403 SelectionDAG &DAG) {
13404 int Size = Mask.size();
13405 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13406
13407 // This routine only supports 128-bit integer dual input vectors.
13408 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13409 return SDValue();
13410
13411 int NumLoInputs =
13412 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13413 int NumHiInputs =
13414 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13415
13416 bool UnpackLo = NumLoInputs >= NumHiInputs;
13417
13418 auto TryUnpack = [&](int ScalarSize, int Scale) {
13419 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13420 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13421
13422 for (int i = 0; i < Size; ++i) {
13423 if (Mask[i] < 0)
13424 continue;
13425
13426 // Each element of the unpack contains Scale elements from this mask.
13427 int UnpackIdx = i / Scale;
13428
13429 // We only handle the case where V1 feeds the first slots of the unpack.
13430 // We rely on canonicalization to ensure this is the case.
13431 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13432 return SDValue();
13433
13434 // Setup the mask for this input. The indexing is tricky as we have to
13435 // handle the unpack stride.
13436 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13437 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13438 Mask[i] % Size;
13439 }
13440
13441 // If we will have to shuffle both inputs to use the unpack, check whether
13442 // we can just unpack first and shuffle the result. If so, skip this unpack.
13443 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13444 !isNoopShuffleMask(V2Mask))
13445 return SDValue();
13446
13447 // Shuffle the inputs into place.
13448 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13449 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13450
13451 // Cast the inputs to the type we will use to unpack them.
13452 MVT UnpackVT =
13453 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13454 V1 = DAG.getBitcast(UnpackVT, V1);
13455 V2 = DAG.getBitcast(UnpackVT, V2);
13456
13457 // Unpack the inputs and cast the result back to the desired type.
13458 return DAG.getBitcast(
13459 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13460 UnpackVT, V1, V2));
13461 };
13462
13463 // We try each unpack from the largest to the smallest to try and find one
13464 // that fits this mask.
13465 int OrigScalarSize = VT.getScalarSizeInBits();
13466 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13467 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13468 return Unpack;
13469
13470 // If we're shuffling with a zero vector then we're better off not doing
13471 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13472 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13473 ISD::isBuildVectorAllZeros(V2.getNode()))
13474 return SDValue();
13475
13476 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13477 // initial unpack.
13478 if (NumLoInputs == 0 || NumHiInputs == 0) {
13479 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13480 "We have to have *some* inputs!");
13481 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13482
13483 // FIXME: We could consider the total complexity of the permute of each
13484 // possible unpacking. Or at the least we should consider how many
13485 // half-crossings are created.
13486 // FIXME: We could consider commuting the unpacks.
13487
13488 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13489 for (int i = 0; i < Size; ++i) {
13490 if (Mask[i] < 0)
13491 continue;
13492
13493 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13494
13495 PermMask[i] =
13496 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13497 }
13498 return DAG.getVectorShuffle(
13499 VT, DL,
13500 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13501 V1, V2),
13502 DAG.getUNDEF(VT), PermMask);
13503 }
13504
13505 return SDValue();
13506}
13507
13508/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13509/// permuting the elements of the result in place.
13510static SDValue lowerShuffleAsByteRotateAndPermute(
13511 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13512 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13513 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13514 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13515 (VT.is512BitVector() && !Subtarget.hasBWI()))
13516 return SDValue();
13517
13518 // We don't currently support lane crossing permutes.
13519 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13520 return SDValue();
13521
13522 int Scale = VT.getScalarSizeInBits() / 8;
13523 int NumLanes = VT.getSizeInBits() / 128;
13524 int NumElts = VT.getVectorNumElements();
13525 int NumEltsPerLane = NumElts / NumLanes;
13526
13527 // Determine range of mask elts.
13528 bool Blend1 = true;
13529 bool Blend2 = true;
13530 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13531 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13532 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13533 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13534 int M = Mask[Lane + Elt];
13535 if (M < 0)
13536 continue;
13537 if (M < NumElts) {
13538 Blend1 &= (M == (Lane + Elt));
13539 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13540 M = M % NumEltsPerLane;
13541 Range1.first = std::min(Range1.first, M);
13542 Range1.second = std::max(Range1.second, M);
13543 } else {
13544 M -= NumElts;
13545 Blend2 &= (M == (Lane + Elt));
13546 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13547 M = M % NumEltsPerLane;
13548 Range2.first = std::min(Range2.first, M);
13549 Range2.second = std::max(Range2.second, M);
13550 }
13551 }
13552 }
13553
13554 // Bail if we don't need both elements.
13555 // TODO - it might be worth doing this for unary shuffles if the permute
13556 // can be widened.
13557 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13558 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13559 return SDValue();
13560
13561 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13562 return SDValue();
13563
13564 // Rotate the 2 ops so we can access both ranges, then permute the result.
13565 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13566 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13567 SDValue Rotate = DAG.getBitcast(
13568 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13569 DAG.getBitcast(ByteVT, Lo),
13570 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13571 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13572 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13573 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13574 int M = Mask[Lane + Elt];
13575 if (M < 0)
13576 continue;
13577 if (M < NumElts)
13578 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13579 else
13580 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13581 }
13582 }
13583 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13584 };
13585
13586 // Check if the ranges are small enough to rotate from either direction.
13587 if (Range2.second < Range1.first)
13588 return RotateAndPermute(V1, V2, Range1.first, 0);
13589 if (Range1.second < Range2.first)
13590 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13591 return SDValue();
13592}
13593
13594static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13595 return isUndefOrEqual(Mask, 0);
13596}
13597
13598static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13599 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13600}
13601
13602/// Check if the Mask consists of the same element repeated multiple times.
13603static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13604 size_t NumUndefs = 0;
13605 std::optional<int> UniqueElt;
13606 for (int Elt : Mask) {
13607 if (Elt == SM_SentinelUndef) {
13608 NumUndefs++;
13609 continue;
13610 }
13611 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13612 return false;
13613 UniqueElt = Elt;
13614 }
13615 // Make sure the element is repeated enough times by checking the number of
13616 // undefs is small.
13617 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13618}
13619
13620/// Generic routine to decompose a shuffle and blend into independent
13621/// blends and permutes.
13622///
13623/// This matches the extremely common pattern for handling combined
13624/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13625/// operations. It will try to pick the best arrangement of shuffles and
13626/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13627static SDValue lowerShuffleAsDecomposedShuffleMerge(
13628 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13629 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13630 int NumElts = Mask.size();
13631 int NumLanes = VT.getSizeInBits() / 128;
13632 int NumEltsPerLane = NumElts / NumLanes;
13633
13634 // Shuffle the input elements into the desired positions in V1 and V2 and
13635 // unpack/blend them together.
13636 bool IsAlternating = true;
13637 SmallVector<int, 32> V1Mask(NumElts, -1);
13638 SmallVector<int, 32> V2Mask(NumElts, -1);
13639 SmallVector<int, 32> FinalMask(NumElts, -1);
13640 for (int i = 0; i < NumElts; ++i) {
13641 int M = Mask[i];
13642 if (M >= 0 && M < NumElts) {
13643 V1Mask[i] = M;
13644 FinalMask[i] = i;
13645 IsAlternating &= (i & 1) == 0;
13646 } else if (M >= NumElts) {
13647 V2Mask[i] = M - NumElts;
13648 FinalMask[i] = i + NumElts;
13649 IsAlternating &= (i & 1) == 1;
13650 }
13651 }
13652
13653 // If we effectively only demand the 0'th element of \p Input, and not only
13654 // as 0'th element, then broadcast said input,
13655 // and change \p InputMask to be a no-op (identity) mask.
13656 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13657 &DAG](SDValue &Input,
13658 MutableArrayRef<int> InputMask) {
13659 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13660 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13661 !X86::mayFoldLoad(Input, Subtarget)))
13662 return;
13663 if (isNoopShuffleMask(InputMask))
13664 return;
13665 assert(isBroadcastShuffleMask(InputMask) &&
13666 "Expected to demand only the 0'th element.");
13667 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13668 for (auto I : enumerate(InputMask)) {
13669 int &InputMaskElt = I.value();
13670 if (InputMaskElt >= 0)
13671 InputMaskElt = I.index();
13672 }
13673 };
13674
13675 // Currently, we may need to produce one shuffle per input, and blend results.
13676 // It is possible that the shuffle for one of the inputs is already a no-op.
13677 // See if we can simplify non-no-op shuffles into broadcasts,
13678 // which we consider to be strictly better than an arbitrary shuffle.
13679 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13680 isNoopOrBroadcastShuffleMask(V2Mask)) {
13681 canonicalizeBroadcastableInput(V1, V1Mask);
13682 canonicalizeBroadcastableInput(V2, V2Mask);
13683 }
13684
13685 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13686 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13687 // the shuffle may be able to fold with a load or other benefit. However, when
13688 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13689 // pre-shuffle first is a better strategy.
13690 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13691 // Only prefer immediate blends to unpack/rotate.
13692 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13693 DAG, true))
13694 return BlendPerm;
13695 // If either input vector provides only a single element which is repeated
13696 // multiple times, unpacking from both input vectors would generate worse
13697 // code. e.g. for
13698 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13699 // it is better to process t4 first to create a vector of t4[0], then unpack
13700 // that vector with t2.
13701 if (!isSingleElementRepeatedMask(V1Mask) &&
13702 !isSingleElementRepeatedMask(V2Mask))
13703 if (SDValue UnpackPerm =
13704 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13705 return UnpackPerm;
13706 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13707 DL, VT, V1, V2, Mask, Subtarget, DAG))
13708 return RotatePerm;
13709 // Unpack/rotate failed - try again with variable blends.
13710 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13711 DAG))
13712 return BlendPerm;
13713 if (VT.getScalarSizeInBits() >= 32)
13714 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13715 DL, VT, V1, V2, Mask, Subtarget, DAG))
13716 return PermUnpack;
13717 }
13718
13719 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13720 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13721 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13722 // than half the elements coming from each source.
13723 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13724 V1Mask.assign(NumElts, -1);
13725 V2Mask.assign(NumElts, -1);
13726 FinalMask.assign(NumElts, -1);
13727 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13728 for (int j = 0; j != NumEltsPerLane; ++j) {
13729 int M = Mask[i + j];
13730 if (M >= 0 && M < NumElts) {
13731 V1Mask[i + (j / 2)] = M;
13732 FinalMask[i + j] = i + (j / 2);
13733 } else if (M >= NumElts) {
13734 V2Mask[i + (j / 2)] = M - NumElts;
13735 FinalMask[i + j] = i + (j / 2) + NumElts;
13736 }
13737 }
13738 }
13739
13740 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13741 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13742 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13743}
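
The first loop of the lowering above routes every V1 element to its final position through V1Mask, every V2 element through V2Mask, and then blends the two pre-shuffled vectors with a mask that only distinguishes the sources. A compact sketch of that split (hypothetical names):

#include <vector>

struct DecomposedShuffle {
  std::vector<int> V1Mask, V2Mask, FinalMask;
};

// Illustrative only: split a two-input mask into two single-input masks plus
// the final source-selecting blend mask.
static DecomposedShuffle decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  DecomposedShuffle D{std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1)};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      D.V1Mask[i] = M;          // shuffle the element into place within V1
      D.FinalMask[i] = i;       // then the blend takes position i from V1
    } else if (M >= NumElts) {
      D.V2Mask[i] = M - NumElts;
      D.FinalMask[i] = i + NumElts;
    }
  }
  return D;
}
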
13744
13745/// Try to lower a vector shuffle as a bit rotation.
13746///
13747/// Look for a repeated rotation pattern in each sub group.
13748/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13749static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13750 int NumElts = Mask.size();
13751 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13752
13753 int RotateAmt = -1;
13754 for (int i = 0; i != NumElts; i += NumSubElts) {
13755 for (int j = 0; j != NumSubElts; ++j) {
13756 int M = Mask[i + j];
13757 if (M < 0)
13758 continue;
13759 if (!isInRange(M, i, i + NumSubElts))
13760 return -1;
13761 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13762 if (0 <= RotateAmt && Offset != RotateAmt)
13763 return -1;
13764 RotateAmt = Offset;
13765 }
13766 }
13767 return RotateAmt;
13768}
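
A constexpr restatement of the matcher, checked on a mask that rotates each group of four elements by one (for vXi8 elements grouped as i32, that is an 8-bit VROTLI). The helper name is illustrative:

// Illustrative only: per-group rotation amount, or -1 if the mask doesn't
// describe the same in-group rotation everywhere.
constexpr int bitRotateAmt(const int *Mask, int NumElts, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1;                            // crosses the sub group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;
      RotateAmt = Offset;
    }
  return RotateAmt;
}

constexpr int RotMask[8] = {3, 0, 1, 2, 7, 4, 5, 6};
static_assert(bitRotateAmt(RotMask, 8, 4) == 1, "rotate left by one element");
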
13769
13770static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13771 const X86Subtarget &Subtarget,
13772 ArrayRef<int> Mask) {
13773 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13774 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13775
13776 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13777 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13778 int MaxSubElts = 64 / EltSizeInBits;
13779 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13780 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13781 if (RotateAmt < 0)
13782 continue;
13783
13784 int NumElts = Mask.size();
13785 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13786 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13787 return RotateAmt * EltSizeInBits;
13788 }
13789
13790 return -1;
13791}
13792
13793/// Lower shuffle using X86ISD::VROTLI rotations.
13794static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13795 ArrayRef<int> Mask,
13796 const X86Subtarget &Subtarget,
13797 SelectionDAG &DAG) {
13798 // Only XOP + AVX512 targets have bit rotation instructions.
13799 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13800 bool IsLegal =
13801 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13802 if (!IsLegal && Subtarget.hasSSE3())
13803 return SDValue();
13804
13805 MVT RotateVT;
13806 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13807 Subtarget, Mask);
13808 if (RotateAmt < 0)
13809 return SDValue();
13810
13811 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13812 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13813 // widen to vXi16 or more then the existing lowering will be better.
13814 if (!IsLegal) {
13815 if ((RotateAmt % 16) == 0)
13816 return SDValue();
13817 // TODO: Use getTargetVShiftByConstNode.
13818 unsigned ShlAmt = RotateAmt;
13819 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13820 V1 = DAG.getBitcast(RotateVT, V1);
13821 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13822 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13823 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13824 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13825 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13826 return DAG.getBitcast(VT, Rot);
13827 }
13828
13829 SDValue Rot =
13830 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13831 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13832 return DAG.getBitcast(VT, Rot);
13833}
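
The pre-XOP/AVX512 fallback above expands the rotate as OR(SHL, SRL). The underlying identity, as a scalar sketch assuming 32-bit lanes and 0 < RotateAmt < 32 (so neither shift is by the full width):

#include <cstdint>

// Illustrative only: rotl(x, r) == (x << r) | (x >> (w - r)) for 0 < r < w.
constexpr uint32_t rotl32(uint32_t X, unsigned R) {
  return (X << R) | (X >> (32 - R));
}
static_assert(rotl32(0x80000001u, 1) == 0x00000003u, "");
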
13834
13835/// Try to match a vector shuffle as an element rotation.
13836///
13837/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13838static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13839 ArrayRef<int> Mask) {
13840 int NumElts = Mask.size();
13841
13842 // We need to detect various ways of spelling a rotation:
13843 // [11, 12, 13, 14, 15, 0, 1, 2]
13844 // [-1, 12, 13, 14, -1, -1, 1, -1]
13845 // [-1, -1, -1, -1, -1, -1, 1, 2]
13846 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13847 // [-1, 4, 5, 6, -1, -1, 9, -1]
13848 // [-1, 4, 5, 6, -1, -1, -1, -1]
13849 int Rotation = 0;
13850 SDValue Lo, Hi;
13851 for (int i = 0; i < NumElts; ++i) {
13852 int M = Mask[i];
13853 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13854 "Unexpected mask index.");
13855 if (M < 0)
13856 continue;
13857
13858 // Determine where a rotated vector would have started.
13859 int StartIdx = i - (M % NumElts);
13860 if (StartIdx == 0)
13861 // The identity rotation isn't interesting, stop.
13862 return -1;
13863
13864 // If we found the tail of a vector the rotation must be the missing
13865 // front. If we found the head of a vector, it must be how much of the
13866 // head.
13867 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13868
13869 if (Rotation == 0)
13870 Rotation = CandidateRotation;
13871 else if (Rotation != CandidateRotation)
13872 // The rotations don't match, so we can't match this mask.
13873 return -1;
13874
13875 // Compute which value this mask is pointing at.
13876 SDValue MaskV = M < NumElts ? V1 : V2;
13877
13878 // Compute which of the two target values this index should be assigned
13879 // to. This reflects whether the high elements are remaining or the low
13880 // elements are remaining.
13881 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13882
13883 // Either set up this value if we've not encountered it before, or check
13884 // that it remains consistent.
13885 if (!TargetV)
13886 TargetV = MaskV;
13887 else if (TargetV != MaskV)
13888 // This may be a rotation, but it pulls from the inputs in some
13889 // unsupported interleaving.
13890 return -1;
13891 }
13892
13893 // Check that we successfully analyzed the mask, and normalize the results.
13894 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13895 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13896 if (!Lo)
13897 Lo = Hi;
13898 else if (!Hi)
13899 Hi = Lo;
13900
13901 V1 = Lo;
13902 V2 = Hi;
13903
13904 return Rotation;
13905}
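
To make the rotation detection above concrete, here is a minimal standalone sketch (a hypothetical helper, not part of this file) that reproduces the StartIdx/CandidateRotation arithmetic on a plain integer mask, ignoring the Lo/Hi source-vector bookkeeping:

#include <cassert>
#include <vector>

// Sketch only: returns the rotation amount encoded by a shuffle mask over two
// concatenated NumElts-wide inputs, or -1 if the mask is not a rotation.
static int matchRotationSketch(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                       // undef lanes are compatible with anything
    int StartIdx = i - (M % NumElts); // where a rotated vector would have started
    if (StartIdx == 0)
      return -1;                      // identity rotation, not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                      // inconsistent rotation amounts
  }
  return Rotation;
}

int main() {
  // Both spellings from the comment above encode a rotation by 3 elements.
  assert(matchRotationSketch({11, 12, 13, 14, 15, 0, 1, 2}) == 3);
  assert(matchRotationSketch({-1, 4, 5, 6, -1, -1, 9, -1}) == 3);
}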
13906
13907/// Try to lower a vector shuffle as a byte rotation.
13908///
13909/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13910/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13911/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13912/// try to generically lower a vector shuffle through such a pattern. It
13913/// does not check for the profitability of lowering either as PALIGNR or
13914/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13915/// This matches shuffle vectors that look like:
13916///
13917/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13918///
13919/// Essentially it concatenates V1 and V2, shifts right by some number of
13920/// elements, and takes the low elements as the result. Note that while this is
13921/// specified as a *right shift* because x86 is little-endian, it is a *left
13922/// rotate* of the vector lanes.
13923static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13924 ArrayRef<int> Mask) {
13925 // Don't accept any shuffles with zero elements.
13926 if (isAnyZero(Mask))
13927 return -1;
13928
13929 // PALIGNR works on 128-bit lanes.
13930 SmallVector<int, 16> RepeatedMask;
13931 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13932 return -1;
13933
13934 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13935 if (Rotation <= 0)
13936 return -1;
13937
13938 // PALIGNR rotates bytes, so we need to scale the
13939 // rotation based on how many bytes are in the vector lane.
13940 int NumElts = RepeatedMask.size();
13941 int Scale = 16 / NumElts;
13942 return Rotation * Scale;
13943}
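
A small hedged sketch of the element-to-byte scaling done above (byteRotationSketch is a hypothetical helper, for illustration only):

#include <cassert>

// PALIGNR takes its rotation as a byte immediate, so an element rotation is
// scaled by the element size within a 16-byte (128-bit) lane.
static int byteRotationSketch(int EltRotation, int NumEltsPerLane) {
  int Scale = 16 / NumEltsPerLane; // bytes per element
  return EltRotation * Scale;
}

int main() {
  assert(byteRotationSketch(3, 8) == 6); // v8i16: rotate by 3 elements -> 6 bytes
  assert(byteRotationSketch(1, 4) == 4); // v4i32: rotate by 1 element  -> 4 bytes
}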
13944
13945static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13946 SDValue V2, ArrayRef<int> Mask,
13947 const X86Subtarget &Subtarget,
13948 SelectionDAG &DAG) {
13949 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13950
13951 SDValue Lo = V1, Hi = V2;
13952 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13953 if (ByteRotation <= 0)
13954 return SDValue();
13955
13956 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13957 // PSLLDQ/PSRLDQ.
13958 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13959 Lo = DAG.getBitcast(ByteVT, Lo);
13960 Hi = DAG.getBitcast(ByteVT, Hi);
13961
13962 // SSSE3 targets can use the palignr instruction.
13963 if (Subtarget.hasSSSE3()) {
13964 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13965 "512-bit PALIGNR requires BWI instructions");
13966 return DAG.getBitcast(
13967 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13968 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13969 }
13970
13971 assert(VT.is128BitVector() &&
13972 "Rotate-based lowering only supports 128-bit lowering!");
13973 assert(Mask.size() <= 16 &&
13974 "Can shuffle at most 16 bytes in a 128-bit vector!");
13975 assert(ByteVT == MVT::v16i8 &&
13976 "SSE2 rotate lowering only needed for v16i8!");
13977
13978 // Default SSE2 implementation
13979 int LoByteShift = 16 - ByteRotation;
13980 int HiByteShift = ByteRotation;
13981
13982 SDValue LoShift =
13983 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13984 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13985 SDValue HiShift =
13986 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13987 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13988 return DAG.getBitcast(VT,
13989 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13990}
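
As a rough scalar analogy for the pre-SSSE3 VSHLDQ/VSRLDQ/OR sequence above (an illustrative sketch, not the DAG code itself), assuming 0 < Bytes < 8:

#include <cassert>
#include <cstdint>

// Rotating the concatenation Hi:Lo right by N bytes within one 64-bit "lane":
// (Lo << (64 - N*8)) | (Hi >> N*8), i.e. shift the two halves toward each
// other and OR them together. Shift amounts of 0 or 64 would be undefined, so
// this assumes 0 < Bytes < 8, just as the lowering keeps 0 < ByteRotation < 16.
static uint64_t rotateConcatRight(uint64_t Lo, uint64_t Hi, unsigned Bytes) {
  unsigned N = Bytes * 8;
  return (Lo << (64 - N)) | (Hi >> N);
}

int main() {
  // Result is bytes [3..7] of Hi followed by bytes [0..2] of Lo (little-endian).
  assert(rotateConcatRight(0x0807060504030201ULL, 0x1817161514131211ULL, 3) ==
         0x0302011817161514ULL);
}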
13991
13992/// Try to lower a vector shuffle as a dword/qword rotation.
13993///
13994/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13995/// rotation of the concatenation of two vectors; this routine will
13996/// try to generically lower a vector shuffle through such a pattern.
13997///
13998/// Essentially it concatenates V1 and V2, shifts right by some number of
13999/// elements, and takes the low elements as the result. Note that while this is
14000/// specified as a *right shift* because x86 is little-endian, it is a *left
14001/// rotate* of the vector lanes.
14002static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
14003 SDValue V2, ArrayRef<int> Mask,
14004 const X86Subtarget &Subtarget,
14005 SelectionDAG &DAG) {
14006 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
14007 "Only 32-bit and 64-bit elements are supported!");
14008
14009 // 128/256-bit vectors are only supported with VLX.
14010 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
14011 && "VLX required for 128/256-bit vectors");
14012
14013 SDValue Lo = V1, Hi = V2;
14014 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
14015 if (Rotation <= 0)
14016 return SDValue();
14017
14018 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
14019 DAG.getTargetConstant(Rotation, DL, MVT::i8));
14020}
14021
14022/// Try to lower a vector shuffle as a byte shift sequence.
14023static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
14024 SDValue V2, ArrayRef<int> Mask,
14025 const APInt &Zeroable,
14026 const X86Subtarget &Subtarget,
14027 SelectionDAG &DAG) {
14028 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
14029 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
14030
14031 // We need a shuffle that has zeros at one/both ends and a sequential
14032 // shuffle from one source within.
14033 unsigned ZeroLo = Zeroable.countr_one();
14034 unsigned ZeroHi = Zeroable.countl_one();
14035 if (!ZeroLo && !ZeroHi)
14036 return SDValue();
14037
14038 unsigned NumElts = Mask.size();
14039 unsigned Len = NumElts - (ZeroLo + ZeroHi);
14040 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
14041 return SDValue();
14042
14043 unsigned Scale = VT.getScalarSizeInBits() / 8;
14044 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
14045 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14046 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14047 return SDValue();
14048
14049 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14050 Res = DAG.getBitcast(MVT::v16i8, Res);
14051
14052 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14053 // inner sequential set of elements, possibly offset:
14054 // 01234567 --> zzzzzz01 --> 1zzzzzzz
14055 // 01234567 --> 4567zzzz --> zzzzz456
14056 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14057 if (ZeroLo == 0) {
14058 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14059 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14060 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14061 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14062 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14063 } else if (ZeroHi == 0) {
14064 unsigned Shift = Mask[ZeroLo] % NumElts;
14065 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14066 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14067 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14068 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14069 } else if (!Subtarget.hasSSSE3()) {
14070 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
14071 // by performing 3 byte shifts. Shuffle combining can kick in above that.
14072 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14073 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14074 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14075 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14076 Shift += Mask[ZeroLo] % NumElts;
14077 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14078 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14079 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14080 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14081 } else
14082 return SDValue();
14083
14084 return DAG.getBitcast(VT, Res);
14085}
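
A scalar analogy for the shift-based zero masking above, simplified to the case where the kept run stays in place (illustrative sketch only; assumes ZeroLo + ZeroHi < 8 so no shift reaches the full width):

#include <cassert>
#include <cstdint>

// Clearing ZeroHi high bytes and ZeroLo low bytes of a 64-bit value with three
// shifts, avoiding an AND with a constant mask.
static uint64_t keepMiddleBytes(uint64_t V, unsigned ZeroLo, unsigned ZeroHi) {
  V <<= ZeroHi * 8;            // drop the high ZeroHi bytes
  V >>= (ZeroHi + ZeroLo) * 8; // drop the low ZeroLo bytes as well
  V <<= ZeroLo * 8;            // move the kept run back into place
  return V;
}

int main() {
  // Keep bytes [2..5] of an 8-byte value, zeroing two bytes at each end.
  assert(keepMiddleBytes(0x8877665544332211ULL, 2, 2) == 0x0000665544330000ULL);
}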
14086
14087/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14088///
14089/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14090/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14091/// matches elements from one of the input vectors shuffled to the left or
14092/// right with zeroable elements 'shifted in'. It handles both the strictly
14093/// bit-wise element shifts and the byte shift across an entire 128-bit double
14094/// quad word lane.
14095///
14096/// PSLL : (little-endian) left bit shift.
14097/// [ zz, 0, zz, 2 ]
14098/// [ -1, 4, zz, -1 ]
14099/// PSRL : (little-endian) right bit shift.
14100/// [ 1, zz, 3, zz]
14101/// [ -1, -1, 7, zz]
14102/// PSLLDQ : (little-endian) left byte shift
14103/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14104/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14105/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14106/// PSRLDQ : (little-endian) right byte shift
14107/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14108/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14109/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14110static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14111 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14112 int MaskOffset, const APInt &Zeroable,
14113 const X86Subtarget &Subtarget) {
14114 int Size = Mask.size();
14115 unsigned SizeInBits = Size * ScalarSizeInBits;
14116
14117 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14118 for (int i = 0; i < Size; i += Scale)
14119 for (int j = 0; j < Shift; ++j)
14120 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14121 return false;
14122
14123 return true;
14124 };
14125
14126 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14127 for (int i = 0; i != Size; i += Scale) {
14128 unsigned Pos = Left ? i + Shift : i;
14129 unsigned Low = Left ? i : i + Shift;
14130 unsigned Len = Scale - Shift;
14131 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14132 return -1;
14133 }
14134
14135 int ShiftEltBits = ScalarSizeInBits * Scale;
14136 bool ByteShift = ShiftEltBits > 64;
14137 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14138 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14139 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14140
14141 // Normalize the scale for byte shifts to still produce an i64 element
14142 // type.
14143 Scale = ByteShift ? Scale / 2 : Scale;
14144
14145 // We need to round trip through the appropriate type for the shift.
14146 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14147 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14148 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14149 return (int)ShiftAmt;
14150 };
14151
14152 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14153 // keep doubling the size of the integer elements up to that. We can
14154 // then shift the elements of the integer vector by whole multiples of
14155 // their width within the elements of the larger integer vector. Test each
14156 // multiple to see if we can find a match with the moved element indices
14157 // and that the shifted in elements are all zeroable.
14158 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14159 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14160 for (int Shift = 1; Shift != Scale; ++Shift)
14161 for (bool Left : {true, false})
14162 if (CheckZeros(Shift, Scale, Left)) {
14163 int ShiftAmt = MatchShift(Shift, Scale, Left);
14164 if (0 < ShiftAmt)
14165 return ShiftAmt;
14166 }
14167
14168 // no match
14169 return -1;
14170}
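
The immediate returned by MatchShift is in bits for VSHLI/VSRLI and in bytes for VSHLDQ/VSRLDQ. A minimal sketch of that computation (hypothetical helper, for illustration):

#include <cassert>

// Mirrors the ShiftAmt computation above: once the grouped element width
// exceeds 64 bits the shift becomes a whole-lane byte shift.
static int shiftAmountSketch(int Shift, int ScalarSizeInBits, int Scale) {
  bool ByteShift = ScalarSizeInBits * Scale > 64;
  return Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
}

int main() {
  // v4i32 mask [1, zz, 3, zz]: pair i32s into i64s (Scale = 2) and VSRLI by 32.
  assert(shiftAmountSketch(/*Shift=*/1, /*ScalarSizeInBits=*/32, /*Scale=*/2) == 32);
  // v8i16 grouped into a full 128-bit lane (Scale = 8): VSRLDQ by 4 bytes.
  assert(shiftAmountSketch(/*Shift=*/2, /*ScalarSizeInBits=*/16, /*Scale=*/8) == 4);
}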
14171
14172static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14173 SDValue V2, ArrayRef<int> Mask,
14174 const APInt &Zeroable,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG, bool BitwiseOnly) {
14177 int Size = Mask.size();
14178 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14179
14180 MVT ShiftVT;
14181 SDValue V = V1;
14182 unsigned Opcode;
14183
14184 // Try to match shuffle against V1 shift.
14185 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14186 Mask, 0, Zeroable, Subtarget);
14187
14188 // If V1 failed, try to match shuffle against V2 shift.
14189 if (ShiftAmt < 0) {
14190 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14191 Mask, Size, Zeroable, Subtarget);
14192 V = V2;
14193 }
14194
14195 if (ShiftAmt < 0)
14196 return SDValue();
14197
14198 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14199 return SDValue();
14200
14201 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14202 "Illegal integer vector type");
14203 V = DAG.getBitcast(ShiftVT, V);
14204 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14205 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14206 return DAG.getBitcast(VT, V);
14207}
14208
14209// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14210// Remainder of lower half result is zero and upper half is all undef.
14211static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14212 ArrayRef<int> Mask, uint64_t &BitLen,
14213 uint64_t &BitIdx, const APInt &Zeroable) {
14214 int Size = Mask.size();
14215 int HalfSize = Size / 2;
14216 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14217 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14218
14219 // Upper half must be undefined.
14220 if (!isUndefUpperHalf(Mask))
14221 return false;
14222
14223 // Determine the extraction length from the part of the
14224 // lower half that isn't zeroable.
14225 int Len = HalfSize;
14226 for (; Len > 0; --Len)
14227 if (!Zeroable[Len - 1])
14228 break;
14229 assert(Len > 0 && "Zeroable shuffle mask");
14230
14231 // Attempt to match first Len sequential elements from the lower half.
14232 SDValue Src;
14233 int Idx = -1;
14234 for (int i = 0; i != Len; ++i) {
14235 int M = Mask[i];
14236 if (M == SM_SentinelUndef)
14237 continue;
14238 SDValue &V = (M < Size ? V1 : V2);
14239 M = M % Size;
14240
14241 // The extracted elements must start at a valid index and all mask
14242 // elements must be in the lower half.
14243 if (i > M || M >= HalfSize)
14244 return false;
14245
14246 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14247 Src = V;
14248 Idx = M - i;
14249 continue;
14250 }
14251 return false;
14252 }
14253
14254 if (!Src || Idx < 0)
14255 return false;
14256
14257 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14258 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14259 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14260 V1 = Src;
14261 return true;
14262}
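
The EXTRQ immediates are expressed in bits and wrap at 64 (a full 64-bit length encodes as 0). A hedged sketch of that encoding (illustrative helper, not the LLVM API):

#include <cassert>
#include <cstdint>

static void extrqImmSketch(int Len, int Idx, int ScalarBits, uint64_t &BitLen,
                           uint64_t &BitIdx) {
  BitLen = (uint64_t)(Len * ScalarBits) & 0x3f; // length in bits, mod 64
  BitIdx = (uint64_t)(Idx * ScalarBits) & 0x3f; // start position in bits, mod 64
}

int main() {
  uint64_t BitLen, BitIdx;
  // v8i16: extract 3 elements starting at element 1 -> 48 bits from bit 16.
  extrqImmSketch(3, 1, 16, BitLen, BitIdx);
  assert(BitLen == 48 && BitIdx == 16);
  // Extracting the whole lower half (64 bits) encodes a length of 0.
  extrqImmSketch(4, 0, 16, BitLen, BitIdx);
  assert(BitLen == 0 && BitIdx == 0);
}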
14263
14264// INSERTQ: Extract lowest Len elements from lower half of second source and
14265// insert over first source, starting at Idx.
14266// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14267static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14268 ArrayRef<int> Mask, uint64_t &BitLen,
14269 uint64_t &BitIdx) {
14270 int Size = Mask.size();
14271 int HalfSize = Size / 2;
14272 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14273
14274 // Upper half must be undefined.
14275 if (!isUndefUpperHalf(Mask))
14276 return false;
14277
14278 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14279 SDValue Base;
14280
14281 // Attempt to match first source from mask before insertion point.
14282 if (isUndefInRange(Mask, 0, Idx)) {
14283 /* EMPTY */
14284 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14285 Base = V1;
14286 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14287 Base = V2;
14288 } else {
14289 continue;
14290 }
14291
14292 // Extend the extraction length looking to match both the insertion of
14293 // the second source and the remaining elements of the first.
14294 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14295 SDValue Insert;
14296 int Len = Hi - Idx;
14297
14298 // Match insertion.
14299 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14300 Insert = V1;
14301 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14302 Insert = V2;
14303 } else {
14304 continue;
14305 }
14306
14307 // Match the remaining elements of the lower half.
14308 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14309 /* EMPTY */
14310 } else if ((!Base || (Base == V1)) &&
14311 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14312 Base = V1;
14313 } else if ((!Base || (Base == V2)) &&
14314 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14315 Size + Hi)) {
14316 Base = V2;
14317 } else {
14318 continue;
14319 }
14320
14321 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14322 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14323 V1 = Base;
14324 V2 = Insert;
14325 return true;
14326 }
14327 }
14328
14329 return false;
14330}
14331
14332/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14333static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14334 SDValue V2, ArrayRef<int> Mask,
14335 const APInt &Zeroable, SelectionDAG &DAG) {
14336 uint64_t BitLen, BitIdx;
14337 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14338 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14339 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14340 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14341
14342 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14343 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14344 V2 ? V2 : DAG.getUNDEF(VT),
14345 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14346 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14347
14348 return SDValue();
14349}
14350
14351/// Lower a vector shuffle as a zero or any extension.
14352///
14353/// Given a specific number of elements, element bit width, and extension
14354/// stride, produce either a zero or any extension based on the available
14355/// features of the subtarget. The extended elements are consecutive and
14356/// begin and can start from an offsetted element index in the input; to
14357/// avoid excess shuffling the offset must either being in the bottom lane
14358/// or at the start of a higher lane. All extended elements must be from
14359/// the same lane.
14360static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14361 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14362 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14363 assert(Scale > 1 && "Need a scale to extend.");
14364 int EltBits = VT.getScalarSizeInBits();
14365 int NumElements = VT.getVectorNumElements();
14366 int NumEltsPerLane = 128 / EltBits;
14367 int OffsetLane = Offset / NumEltsPerLane;
14368 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14369 "Only 8, 16, and 32 bit elements can be extended.");
14370 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14371 assert(0 <= Offset && "Extension offset must be positive.");
14372 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14373 "Extension offset must be in the first lane or start an upper lane.");
14374
14375 // Check that an index is in same lane as the base offset.
14376 auto SafeOffset = [&](int Idx) {
14377 return OffsetLane == (Idx / NumEltsPerLane);
14378 };
14379
14380 // Shift along an input so that the offset base moves to the first element.
14381 auto ShuffleOffset = [&](SDValue V) {
14382 if (!Offset)
14383 return V;
14384
14385 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14386 for (int i = 0; i * Scale < NumElements; ++i) {
14387 int SrcIdx = i + Offset;
14388 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14389 }
14390 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14391 };
14392
14393 // Found a valid a/zext mask! Try various lowering strategies based on the
14394 // input type and available ISA extensions.
14395 if (Subtarget.hasSSE41()) {
14396 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14397 // PUNPCK will catch this in a later shuffle match.
14398 if (Offset && Scale == 2 && VT.is128BitVector())
14399 return SDValue();
14400 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14401 NumElements / Scale);
14402 InputV = DAG.getBitcast(VT, InputV);
14403 InputV = ShuffleOffset(InputV);
14404 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14405 DL, ExtVT, InputV, DAG);
14406 return DAG.getBitcast(VT, InputV);
14407 }
14408
14409 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14410 InputV = DAG.getBitcast(VT, InputV);
14411
14412 // For any extends we can cheat for larger element sizes and use shuffle
14413 // instructions that can fold with a load and/or copy.
14414 if (AnyExt && EltBits == 32) {
14415 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14416 -1};
14417 return DAG.getBitcast(
14418 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14419 DAG.getBitcast(MVT::v4i32, InputV),
14420 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14421 }
14422 if (AnyExt && EltBits == 16 && Scale > 2) {
14423 int PSHUFDMask[4] = {Offset / 2, -1,
14424 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14425 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14426 DAG.getBitcast(MVT::v4i32, InputV),
14427 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14428 int PSHUFWMask[4] = {1, -1, -1, -1};
14429 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14430 return DAG.getBitcast(
14431 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14432 DAG.getBitcast(MVT::v8i16, InputV),
14433 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14434 }
14435
14436 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14437 // to 64-bits.
14438 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14439 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14440 assert(VT.is128BitVector() && "Unexpected vector width!");
14441
14442 int LoIdx = Offset * EltBits;
14443 SDValue Lo = DAG.getBitcast(
14444 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14445 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14446 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14447
14448 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14449 return DAG.getBitcast(VT, Lo);
14450
14451 int HiIdx = (Offset + 1) * EltBits;
14452 SDValue Hi = DAG.getBitcast(
14453 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14454 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14455 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14456 return DAG.getBitcast(VT,
14457 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14458 }
14459
14460 // If this would require more than 2 unpack instructions to expand, use
14461 // pshufb when available. We can only use more than 2 unpack instructions
14462 // when zero extending i8 elements which also makes it easier to use pshufb.
14463 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14464 assert(NumElements == 16 && "Unexpected byte vector width!");
14465 SDValue PSHUFBMask[16];
14466 for (int i = 0; i < 16; ++i) {
14467 int Idx = Offset + (i / Scale);
14468 if ((i % Scale == 0 && SafeOffset(Idx))) {
14469 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14470 continue;
14471 }
14472 PSHUFBMask[i] =
14473 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14474 }
14475 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14476 return DAG.getBitcast(
14477 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14478 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14479 }
14480
14481 // If we are extending from an offset, ensure we start on a boundary that
14482 // we can unpack from.
14483 int AlignToUnpack = Offset % (NumElements / Scale);
14484 if (AlignToUnpack) {
14485 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14486 for (int i = AlignToUnpack; i < NumElements; ++i)
14487 ShMask[i - AlignToUnpack] = i;
14488 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14489 Offset -= AlignToUnpack;
14490 }
14491
14492 // Otherwise emit a sequence of unpacks.
14493 do {
14494 unsigned UnpackLoHi = X86ISD::UNPCKL;
14495 if (Offset >= (NumElements / 2)) {
14496 UnpackLoHi = X86ISD::UNPCKH;
14497 Offset -= (NumElements / 2);
14498 }
14499
14500 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14501 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14502 : getZeroVector(InputVT, Subtarget, DAG, DL);
14503 InputV = DAG.getBitcast(InputVT, InputV);
14504 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14505 Scale /= 2;
14506 EltBits *= 2;
14507 NumElements /= 2;
14508 } while (Scale > 1);
14509 return DAG.getBitcast(VT, InputV);
14510}
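
A quick trace of the unpack loop above as a sketch (plain arithmetic, no DAG nodes): each interleave with zero doubles the element width and halves both the remaining scale and the element count.

#include <cassert>

int main() {
  // Zero-extending v16i8 elements to i32 (Scale = 4).
  int Scale = 4, EltBits = 8, NumElements = 16, Unpacks = 0;
  while (Scale > 1) {
    Scale /= 2;        // each unpack halves the remaining extension factor
    EltBits *= 2;      // ...while doubling the element width
    NumElements /= 2;  // ...and halving the element count
    ++Unpacks;
  }
  // Two unpack steps: i8 -> i16 -> i32, leaving a v4i32 result.
  assert(Unpacks == 2 && EltBits == 32 && NumElements == 4);
}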
14511
14512/// Try to lower a vector shuffle as a zero extension on any microarch.
14513///
14514/// This routine will try to do everything in its power to cleverly lower
14515/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14516/// check for the profitability of this lowering, it tries to aggressively
14517/// match this pattern. It will use all of the micro-architectural details it
14518/// can to emit an efficient lowering. It handles both blends with all-zero
14519/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14520/// masking out later).
14521///
14522/// The reason we have dedicated lowering for zext-style shuffles is that they
14523/// are both incredibly common and often quite performance sensitive.
14524static SDValue lowerShuffleAsZeroOrAnyExtend(
14525 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14526 const APInt &Zeroable, const X86Subtarget &Subtarget,
14527 SelectionDAG &DAG) {
14528 int Bits = VT.getSizeInBits();
14529 int NumLanes = Bits / 128;
14530 int NumElements = VT.getVectorNumElements();
14531 int NumEltsPerLane = NumElements / NumLanes;
14532 assert(VT.getScalarSizeInBits() <= 32 &&
14533 "Exceeds 32-bit integer zero extension limit");
14534 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14535
14536 // Define a helper function to check a particular ext-scale and lower to it if
14537 // valid.
14538 auto Lower = [&](int Scale) -> SDValue {
14539 SDValue InputV;
14540 bool AnyExt = true;
14541 int Offset = 0;
14542 int Matches = 0;
14543 for (int i = 0; i < NumElements; ++i) {
14544 int M = Mask[i];
14545 if (M < 0)
14546 continue; // Valid anywhere but doesn't tell us anything.
14547 if (i % Scale != 0) {
14548 // Each of the extended elements need to be zeroable.
14549 if (!Zeroable[i])
14550 return SDValue();
14551
14552 // We no longer are in the anyext case.
14553 AnyExt = false;
14554 continue;
14555 }
14556
14557 // Each of the base elements needs to be consecutive indices into the
14558 // same input vector.
14559 SDValue V = M < NumElements ? V1 : V2;
14560 M = M % NumElements;
14561 if (!InputV) {
14562 InputV = V;
14563 Offset = M - (i / Scale);
14564 } else if (InputV != V)
14565 return SDValue(); // Flip-flopping inputs.
14566
14567 // Offset must start in the lowest 128-bit lane or at the start of an
14568 // upper lane.
14569 // FIXME: Is it ever worth allowing a negative base offset?
14570 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14571 (Offset % NumEltsPerLane) == 0))
14572 return SDValue();
14573
14574 // If we are offsetting, all referenced entries must come from the same
14575 // lane.
14576 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14577 return SDValue();
14578
14579 if ((M % NumElements) != (Offset + (i / Scale)))
14580 return SDValue(); // Non-consecutive strided elements.
14581 Matches++;
14582 }
14583
14584 // If we fail to find an input, we have a zero-shuffle which should always
14585 // have already been handled.
14586 // FIXME: Maybe handle this here in case during blending we end up with one?
14587 if (!InputV)
14588 return SDValue();
14589
14590 // If we are offsetting, don't extend if we only match a single input, we
14591 // can always do better by using a basic PSHUF or PUNPCK.
14592 if (Offset != 0 && Matches < 2)
14593 return SDValue();
14594
14595 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14596 InputV, Mask, Subtarget, DAG);
14597 };
14598
14599 // The widest scale possible for extending is to a 64-bit integer.
14600 assert(Bits % 64 == 0 &&
14601 "The number of bits in a vector must be divisible by 64 on x86!");
14602 int NumExtElements = Bits / 64;
14603
14604 // Each iteration, try extending the elements half as much, but into twice as
14605 // many elements.
14606 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14607 assert(NumElements % NumExtElements == 0 &&
14608 "The input vector size must be divisible by the extended size.");
14609 if (SDValue V = Lower(NumElements / NumExtElements))
14610 return V;
14611 }
14612
14613 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14614 if (Bits != 128)
14615 return SDValue();
14616
14617 // Returns one of the source operands if the shuffle can be reduced to a
14618 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14619 auto CanZExtLowHalf = [&]() {
14620 for (int i = NumElements / 2; i != NumElements; ++i)
14621 if (!Zeroable[i])
14622 return SDValue();
14623 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14624 return V1;
14625 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14626 return V2;
14627 return SDValue();
14628 };
14629
14630 if (SDValue V = CanZExtLowHalf()) {
14631 V = DAG.getBitcast(MVT::v2i64, V);
14632 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14633 return DAG.getBitcast(VT, V);
14634 }
14635
14636 // No viable ext lowering found.
14637 return SDValue();
14638}
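
For reference, a sketch of the ext-scale search driven by the loop above: starting from the widest 64-bit extension, each iteration doubles the number of extended elements, i.e. halves the extension scale, until the original element count is reached.

#include <cassert>
#include <vector>

int main() {
  const int Bits = 128, NumElements = 8; // e.g. a v8i16 shuffle
  std::vector<int> ScalesTried;
  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
       NumExtElements *= 2)
    ScalesTried.push_back(NumElements / NumExtElements);
  // For v8i16 this tries i16->i64 (Scale 4) and then i16->i32 (Scale 2).
  assert((ScalesTried == std::vector<int>{4, 2}));
}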
14639
14640/// Try to get a scalar value for a specific element of a vector.
14641///
14642/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14643static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14644 SelectionDAG &DAG) {
14645 MVT VT = V.getSimpleValueType();
14646 MVT EltVT = VT.getVectorElementType();
14647 V = peekThroughBitcasts(V);
14648
14649 // If the bitcasts shift the element size, we can't extract an equivalent
14650 // element from it.
14651 MVT NewVT = V.getSimpleValueType();
14652 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14653 return SDValue();
14654
14655 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14656 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14657 // Ensure the scalar operand is the same size as the destination.
14658 // FIXME: Add support for scalar truncation where possible.
14659 SDValue S = V.getOperand(Idx);
14660 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14661 return DAG.getBitcast(EltVT, S);
14662 }
14663
14664 return SDValue();
14665}
14666
14667/// Helper to test for a load that can be folded with x86 shuffles.
14668///
14669/// This is particularly important because the set of instructions varies
14670/// significantly based on whether the operand is a load or not.
14671static bool isShuffleFoldableLoad(SDValue V) {
14672 return V->hasOneUse() &&
14673 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14674}
14675
14676template<typename T>
14677static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14678 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14679}
14680
14681template<typename T>
14682bool X86TargetLowering::isSoftFP16(T VT) const {
14683 return ::isSoftFP16(VT, Subtarget);
14684}
14685
14686/// Try to lower insertion of a single element into a zero vector.
14687///
14688/// This is a common pattern for which we have especially efficient lowerings
14689/// across all subtarget feature sets.
14690static SDValue lowerShuffleAsElementInsertion(
14691 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14692 const APInt &Zeroable, const X86Subtarget &Subtarget,
14693 SelectionDAG &DAG) {
14694 MVT ExtVT = VT;
14695 MVT EltVT = VT.getVectorElementType();
14696 unsigned NumElts = VT.getVectorNumElements();
14697 unsigned EltBits = VT.getScalarSizeInBits();
14698
14699 if (isSoftFP16(EltVT, Subtarget))
14700 return SDValue();
14701
14702 int V2Index =
14703 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14704 Mask.begin();
14705 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
14706 bool IsV1Zeroable = true;
14707 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14708 if (i != V2Index && !Zeroable[i]) {
14709 IsV1Zeroable = false;
14710 break;
14711 }
14712
14713 // Bail if a non-zero V1 isn't used in place.
14714 if (!IsV1Zeroable) {
14715 SmallVector<int, 8> V1Mask(Mask);
14716 V1Mask[V2Index] = -1;
14717 if (!isNoopShuffleMask(V1Mask))
14718 return SDValue();
14719 }
14720
14721 // Check for a single input from a SCALAR_TO_VECTOR node.
14722 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14723 // all the smarts here sunk into that routine. However, the current
14724 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14725 // vector shuffle lowering is dead.
14726 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14727 DAG);
14728 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14729 // We need to zext the scalar if it is smaller than an i32.
14730 V2S = DAG.getBitcast(EltVT, V2S);
14731 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14732 // Using zext to expand a narrow element won't work for non-zero
14733 // insertions. But we can use a masked constant vector if we're
14734 // inserting V2 into the bottom of V1.
14735 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
14736 return SDValue();
14737
14738 // Zero-extend directly to i32.
14739 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14740 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14741
14742 // If we're inserting into a constant, mask off the inserted index
14743 // and OR with the zero-extended scalar.
14744 if (!IsV1Zeroable) {
14745 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
14746 Bits[V2Index] = APInt::getZero(EltBits);
14747 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
14748 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
14749 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14750 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
14751 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
14752 }
14753 }
14754 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14755 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14756 EltVT == MVT::i16) {
14757 // Either not inserting from the low element of the input or the input
14758 // element size is too small to use VZEXT_MOVL to clear the high bits.
14759 return SDValue();
14760 }
14761
14762 if (!IsV1Zeroable) {
14763 // If V1 can't be treated as a zero vector we have fewer options to lower
14764 // this. We can't support integer vectors or non-zero targets cheaply.
14765 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14766 if (!VT.isFloatingPoint() || V2Index != 0)
14767 return SDValue();
14768 if (!VT.is128BitVector())
14769 return SDValue();
14770
14771 // Otherwise, use MOVSD, MOVSS or MOVSH.
14772 unsigned MovOpc = 0;
14773 if (EltVT == MVT::f16)
14774 MovOpc = X86ISD::MOVSH;
14775 else if (EltVT == MVT::f32)
14776 MovOpc = X86ISD::MOVSS;
14777 else if (EltVT == MVT::f64)
14778 MovOpc = X86ISD::MOVSD;
14779 else
14780 llvm_unreachable("Unsupported floating point element type to handle!")::llvm::llvm_unreachable_internal("Unsupported floating point element type to handle!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 14780)
;
14781 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14782 }
14783
14784 // This lowering only works for the low element with floating point vectors.
14785 if (VT.isFloatingPoint() && V2Index != 0)
14786 return SDValue();
14787
14788 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14789 if (ExtVT != VT)
14790 V2 = DAG.getBitcast(VT, V2);
14791
14792 if (V2Index != 0) {
14793 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14794 // the desired position. Otherwise it is more efficient to do a vector
14795 // shift left. We know that we can do a vector shift left because all
14796 // the inputs are zero.
14797 if (VT.isFloatingPoint() || NumElts <= 4) {
14798 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14799 V2Shuffle[V2Index] = 0;
14800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14801 } else {
14802 V2 = DAG.getBitcast(MVT::v16i8, V2);
14803 V2 = DAG.getNode(
14804 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14805 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
14806 V2 = DAG.getBitcast(VT, V2);
14807 }
14808 }
14809 return V2;
14810}
14811
14812/// Try to lower broadcast of a single - truncated - integer element,
14813/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14814///
14815/// This assumes we have AVX2.
14816static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14817 int BroadcastIdx,
14818 const X86Subtarget &Subtarget,
14819 SelectionDAG &DAG) {
14820 assert(Subtarget.hasAVX2() &&
14821 "We can only lower integer broadcasts with AVX2!");
14822
14823 MVT EltVT = VT.getVectorElementType();
14824 MVT V0VT = V0.getSimpleValueType();
14825
14826 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14827 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14828
14829 MVT V0EltVT = V0VT.getVectorElementType();
14830 if (!V0EltVT.isInteger())
14831 return SDValue();
14832
14833 const unsigned EltSize = EltVT.getSizeInBits();
14834 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14835
14836 // This is only a truncation if the original element type is larger.
14837 if (V0EltSize <= EltSize)
14838 return SDValue();
14839
14840 assert(((V0EltSize % EltSize) == 0) &&
14841 "Scalar type sizes must all be powers of 2 on x86!");
14842
14843 const unsigned V0Opc = V0.getOpcode();
14844 const unsigned Scale = V0EltSize / EltSize;
14845 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14846
14847 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14848 V0Opc != ISD::BUILD_VECTOR)
14849 return SDValue();
14850
14851 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14852
14853 // If we're extracting non-least-significant bits, shift so we can truncate.
14854 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14855 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14856 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14857 if (const int OffsetIdx = BroadcastIdx % Scale)
14858 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14859 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14860
14861 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14862 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14863}
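
A minimal scalar sketch of the index arithmetic above (hypothetical values, for illustration): to broadcast an i8 element out of a build_vector of i32s, pick the containing i32 and shift it right so the wanted byte lands in the low bits before truncating.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned EltSize = 8, V0EltSize = 32;
  const unsigned Scale = V0EltSize / EltSize;           // 4 i8 elements per i32
  const unsigned BroadcastIdx = 6;                      // byte index to broadcast
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale; // lives in i32 element 1
  const unsigned OffsetIdx = BroadcastIdx % Scale;      // as its byte 2
  uint32_t Scalar = 0xDDCCBBAA;                         // stand-in for V0[1]
  uint8_t Byte = (uint8_t)(Scalar >> (OffsetIdx * EltSize));
  assert(V0BroadcastIdx == 1 && OffsetIdx == 2 && Byte == 0xCC);
}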
14864
14865/// Test whether this can be lowered with a single SHUFPS instruction.
14866///
14867/// This is used to disable more specialized lowerings when the shufps lowering
14868/// will happen to be efficient.
14869static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14870 // This routine only handles 128-bit shufps.
14871 assert(Mask.size() == 4 && "Unsupported mask size!");
14872 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14873 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14874 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14875 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14876
14877 // To lower with a single SHUFPS we need to have the low half and high half
14878 // each requiring a single input.
14879 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14880 return false;
14881 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14882 return false;
14883
14884 return true;
14885}
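
The halves test above can be illustrated with a small sketch (hypothetical helper; mask indices 0-3 select V1 and 4-7 select V2):

#include <array>
#include <cassert>

// Each half of the 4-element mask must draw from a single input.
static bool singleShufpsSketch(std::array<int, 4> M) {
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  assert(singleShufpsSketch({0, 3, 6, 5}));  // low half from V1, high half from V2
  assert(!singleShufpsSketch({0, 4, 2, 6})); // low half mixes V1 and V2
}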
14886
14887/// Test whether the specified input (0 or 1) is in-place blended by the
14888/// given mask.
14889///
14890/// This returns true if the elements from a particular input are already in the
14891/// slot required by the given mask and require no permutation.
14892static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14893 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14894 int Size = Mask.size();
14895 for (int i = 0; i < Size; ++i)
14896 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14897 return false;
14898
14899 return true;
14900}
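
An illustrative standalone mirror of the in-place test above (inputInPlace is a made-up name for this sketch): every element taken from the chosen input (0 = first source, 1 = second source) must already sit in its own slot.

#include <cassert>
#include <vector>

static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  assert(inputInPlace(0, {0, 5, 2, 7}));  // V1's elements 0 and 2 already sit in slots 0 and 2
  assert(!inputInPlace(1, {0, 4, 2, 7})); // V2's element 0 would have to move into slot 1
  return 0;
}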
14901
14902/// If we are extracting two 128-bit halves of a vector and shuffling the
14903/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14904/// multi-shuffle lowering.
14905static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14906 SDValue N1, ArrayRef<int> Mask,
14907 SelectionDAG &DAG) {
14908 MVT VT = N0.getSimpleValueType();
14909 assert((VT.is128BitVector() &&
14910 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14911 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14912
14913 // Check that both sources are extracts of the same source vector.
14914 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14915 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14916 N0.getOperand(0) != N1.getOperand(0) ||
14917 !N0.hasOneUse() || !N1.hasOneUse())
14918 return SDValue();
14919
14920 SDValue WideVec = N0.getOperand(0);
14921 MVT WideVT = WideVec.getSimpleValueType();
14922 if (!WideVT.is256BitVector())
14923 return SDValue();
14924
14925 // Match extracts of each half of the wide source vector. Commute the shuffle
14926 // if the extract of the low half is N1.
14927 unsigned NumElts = VT.getVectorNumElements();
14928 SmallVector<int, 4> NewMask(Mask);
14929 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14930 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14931 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14932 ShuffleVectorSDNode::commuteMask(NewMask);
14933 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14934 return SDValue();
14935
14936 // Final bailout: if the mask is simple, we are better off using an extract
14937 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14938 // because that avoids a constant load from memory.
14939 if (NumElts == 4 &&
14940 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14941 return SDValue();
14942
14943 // Extend the shuffle mask with undef elements.
14944 NewMask.append(NumElts, -1);
14945
14946 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14947 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14948 NewMask);
14949 // This is free: ymm -> xmm.
14950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14951 DAG.getIntPtrConstant(0, DL));
14952}
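
A scalar model (illustrative only; the shuffle helper below is not LLVM code) of the rewrite performed above for a 256-bit source viewed as 8 x i32: shuffling the two extracted 128-bit halves with mask M gives the same lanes as shuffling the wide vector with M extended by undefs and then keeping lanes 0-3.

#include <cassert>
#include <vector>

// Generic two-input shuffle on plain integer "lanes"; -1 means undef.
static std::vector<int> shuffle(const std::vector<int> &A,
                                const std::vector<int> &B,
                                const std::vector<int> &M) {
  std::vector<int> R;
  for (int Idx : M)
    R.push_back(Idx < 0 ? -1
                        : (Idx < (int)A.size() ? A[Idx] : B[Idx - A.size()]));
  return R;
}

int main() {
  std::vector<int> Wide = {10, 11, 12, 13, 14, 15, 16, 17}; // 8 x i32 source
  std::vector<int> Lo(Wide.begin(), Wide.begin() + 4);      // extract at index 0
  std::vector<int> Hi(Wide.begin() + 4, Wide.end());        // extract at index 4
  std::vector<int> Mask = {0, 5, 2, 7};

  std::vector<int> Narrow = shuffle(Lo, Hi, Mask);

  std::vector<int> WideMask = Mask;
  WideMask.insert(WideMask.end(), 4, -1);                   // extend with undef
  std::vector<int> Undef(8, -1);
  std::vector<int> WideShuf = shuffle(Wide, Undef, WideMask);
  std::vector<int> First4(WideShuf.begin(), WideShuf.begin() + 4);

  assert(Narrow == First4); // same result, one wide shuffle instead of two extracts + shuffle
  return 0;
}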
14953
14954/// Try to lower broadcast of a single element.
14955///
14956/// For convenience, this code also bundles all of the subtarget feature set
14957/// filtering. While a little annoying to re-dispatch on type here, there isn't
14958/// a convenient way to factor it out.
14959static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14960 SDValue V2, ArrayRef<int> Mask,
14961 const X86Subtarget &Subtarget,
14962 SelectionDAG &DAG) {
14963 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14964 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14965 (Subtarget.hasAVX2() && VT.isInteger())))
14966 return SDValue();
14967
14968 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14969 // we can only broadcast from a register with AVX2.
14970 unsigned NumEltBits = VT.getScalarSizeInBits();
14971 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14972 ? X86ISD::MOVDDUP
14973 : X86ISD::VBROADCAST;
14974 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14975
14976 // Check that the mask is a broadcast.
14977 int BroadcastIdx = getSplatIndex(Mask);
14978 if (BroadcastIdx < 0)
14979 return SDValue();
14980 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14981 "a sorted mask where the broadcast "
14982 "comes from V1.");
14983
14984 // Go up the chain of (vector) values to find a scalar load that we can
14985 // combine with the broadcast.
14986 // TODO: Combine this logic with findEltLoadSrc() used by
14987 // EltsFromConsecutiveLoads().
14988 int BitOffset = BroadcastIdx * NumEltBits;
14989 SDValue V = V1;
14990 for (;;) {
14991 switch (V.getOpcode()) {
14992 case ISD::BITCAST: {
14993 V = V.getOperand(0);
14994 continue;
14995 }
14996 case ISD::CONCAT_VECTORS: {
14997 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14998 int OpIdx = BitOffset / OpBitWidth;
14999 V = V.getOperand(OpIdx);
15000 BitOffset %= OpBitWidth;
15001 continue;
15002 }
15003 case ISD::EXTRACT_SUBVECTOR: {
15004 // The extraction index adds to the existing offset.
15005 unsigned EltBitWidth = V.getScalarValueSizeInBits();
15006 unsigned Idx = V.getConstantOperandVal(1);
15007 unsigned BeginOffset = Idx * EltBitWidth;
15008 BitOffset += BeginOffset;
15009 V = V.getOperand(0);
15010 continue;
15011 }
15012 case ISD::INSERT_SUBVECTOR: {
15013 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
15014 int EltBitWidth = VOuter.getScalarValueSizeInBits();
15015 int Idx = (int)V.getConstantOperandVal(2);
15016 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
15017 int BeginOffset = Idx * EltBitWidth;
15018 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
15019 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
15020 BitOffset -= BeginOffset;
15021 V = VInner;
15022 } else {
15023 V = VOuter;
15024 }
15025 continue;
15026 }
15027 }
15028 break;
15029 }
15030 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
15031 BroadcastIdx = BitOffset / NumEltBits;
15032
15033 // Do we need to bitcast the source to retrieve the original broadcast index?
15034 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
15035
15036 // Check if this is a broadcast of a scalar. We special case lowering
15037 // for scalars so that we can more effectively fold with loads.
15038 // If the original value has a larger element type than the shuffle, the
15039 // broadcast element is in essence truncated. Make that explicit to ease
15040 // folding.
15041 if (BitCastSrc && VT.isInteger())
15042 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
15043 DL, VT, V, BroadcastIdx, Subtarget, DAG))
15044 return TruncBroadcast;
15045
15046 // Also check the simpler case, where we can directly reuse the scalar.
15047 if (!BitCastSrc &&
15048 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
15049 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
15050 V = V.getOperand(BroadcastIdx);
15051
15052 // If we can't broadcast from a register, check that the input is a load.
15053 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
15054 return SDValue();
15055 } else if (ISD::isNormalLoad(V.getNode()) &&
15056 cast<LoadSDNode>(V)->isSimple()) {
15057 // We do not check for one-use of the vector load because a broadcast load
15058 // is expected to be a win for code size, register pressure, and possibly
15059 // uops even if the original vector load is not eliminated.
15060
15061 // Reduce the vector load and shuffle to a broadcasted scalar load.
15062 LoadSDNode *Ld = cast<LoadSDNode>(V);
15063 SDValue BaseAddr = Ld->getOperand(1);
15064 MVT SVT = VT.getScalarType();
15065 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15066 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15067 SDValue NewAddr =
15068 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15069
15070 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15071 // than MOVDDUP.
15072 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15073 if (Opcode == X86ISD::VBROADCAST) {
15074 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15075 SDValue Ops[] = {Ld->getChain(), NewAddr};
15076 V = DAG.getMemIntrinsicNode(
15077 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15078 DAG.getMachineFunction().getMachineMemOperand(
15079 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15080 DAG.makeEquivalentMemoryOrdering(Ld, V);
15081 return DAG.getBitcast(VT, V);
15082 }
15083 assert(SVT == MVT::f64 && "Unexpected VT!");
15084 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15085 DAG.getMachineFunction().getMachineMemOperand(
15086 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15087 DAG.makeEquivalentMemoryOrdering(Ld, V);
15088 } else if (!BroadcastFromReg) {
15089 // We can't broadcast from a vector register.
15090 return SDValue();
15091 } else if (BitOffset != 0) {
15092 // We can only broadcast from the zero-element of a vector register,
15093 // but it can be advantageous to broadcast from the zero-element of a
15094 // subvector.
15095 if (!VT.is256BitVector() && !VT.is512BitVector())
15096 return SDValue();
15097
15098 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15099 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15100 return SDValue();
15101
15102 // Only broadcast the zero-element of a 128-bit subvector.
15103 if ((BitOffset % 128) != 0)
15104 return SDValue();
15105
15106 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15107 "Unexpected bit-offset");
15108 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15109 "Unexpected vector size");
15110 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15111 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15112 }
15113
15114 // On AVX we can use VBROADCAST directly for scalar sources.
15115 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15116 V = DAG.getBitcast(MVT::f64, V);
15117 if (Subtarget.hasAVX()) {
15118 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15119 return DAG.getBitcast(VT, V);
15120 }
15121 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15122 }
15123
15124 // If this is a scalar, do the broadcast on this type and bitcast.
15125 if (!V.getValueType().isVector()) {
15126 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15127 "Unexpected scalar size");
15128 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15129 VT.getVectorNumElements());
15130 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15131 }
15132
15133 // We only support broadcasting from 128-bit vectors to minimize the
15134 // number of patterns we need to deal with in isel. So extract down to
15135 // 128-bits, removing as many bitcasts as possible.
15136 if (V.getValueSizeInBits() > 128)
15137 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15138
15139 // Otherwise cast V to a vector with the same element type as VT, but
15140 // possibly narrower than VT. Then perform the broadcast.
15141 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15142 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15143 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15144}
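
An illustrative sketch (not the LLVM implementation; splatIndex here only mirrors the intent of getSplatIndex) of the splat test that gates the broadcast lowering above: every defined mask element must name the same source element, and the bit offset of that element is simply its index times the element width.

#include <cassert>
#include <vector>

// Return the common index if the mask is a splat, otherwise -1.
static int splatIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;               // undef lanes do not constrain the splat
    if (Idx >= 0 && M != Idx)
      return -1;              // two different source elements -> not a broadcast
    Idx = M;
  }
  return Idx;
}

int main() {
  assert(splatIndex({2, 2, -1, 2}) == 2); // broadcast of element 2
  assert(splatIndex({0, 1, 0, 1}) == -1); // not a splat
  int NumEltBits = 32;                    // e.g. v4i32
  assert(splatIndex({2, 2, -1, 2}) * NumEltBits == 64); // starting bit offset of the element
  return 0;
}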
15145
15146// Check for whether we can use INSERTPS to perform the shuffle. We only use
15147// INSERTPS when the V1 elements are already in the correct locations
15148// because otherwise we can just always use two SHUFPS instructions which
15149// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15150// perform INSERTPS if a single V1 element is out of place and all V2
15151// elements are zeroable.
15152static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15153 unsigned &InsertPSMask,
15154 const APInt &Zeroable,
15155 ArrayRef<int> Mask, SelectionDAG &DAG) {
15156 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15157 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15158 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15159
15160 // Attempt to match INSERTPS with one element from VA or VB being
15161 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15162 // are updated.
15163 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15164 ArrayRef<int> CandidateMask) {
15165 unsigned ZMask = 0;
15166 int VADstIndex = -1;
15167 int VBDstIndex = -1;
15168 bool VAUsedInPlace = false;
15169
15170 for (int i = 0; i < 4; ++i) {
15171 // Synthesize a zero mask from the zeroable elements (includes undefs).
15172 if (Zeroable[i]) {
15173 ZMask |= 1 << i;
15174 continue;
15175 }
15176
15177 // Flag if we use any VA inputs in place.
15178 if (i == CandidateMask[i]) {
15179 VAUsedInPlace = true;
15180 continue;
15181 }
15182
15183 // We can only insert a single non-zeroable element.
15184 if (VADstIndex >= 0 || VBDstIndex >= 0)
15185 return false;
15186
15187 if (CandidateMask[i] < 4) {
15188 // VA input out of place for insertion.
15189 VADstIndex = i;
15190 } else {
15191 // VB input for insertion.
15192 VBDstIndex = i;
15193 }
15194 }
15195
15196 // Don't bother if we have no (non-zeroable) element for insertion.
15197 if (VADstIndex < 0 && VBDstIndex < 0)
15198 return false;
15199
15200 // Determine element insertion src/dst indices. The src index is from the
15201 // start of the inserted vector, not the start of the concatenated vector.
15202 unsigned VBSrcIndex = 0;
15203 if (VADstIndex >= 0) {
15204 // If we have a VA input out of place, we use VA as the V2 element
15205 // insertion and don't use the original V2 at all.
15206 VBSrcIndex = CandidateMask[VADstIndex];
15207 VBDstIndex = VADstIndex;
15208 VB = VA;
15209 } else {
15210 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15211 }
15212
15213 // If no V1 inputs are used in place, then the result is created only from
15214 // the zero mask and the V2 insertion - so remove V1 dependency.
15215 if (!VAUsedInPlace)
15216 VA = DAG.getUNDEF(MVT::v4f32);
15217
15218 // Update V1, V2 and InsertPSMask accordingly.
15219 V1 = VA;
15220 V2 = VB;
15221
15222 // Insert the V2 element into the desired position.
15223 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15224 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15225 return true;
15226 };
15227
15228 if (matchAsInsertPS(V1, V2, Mask))
15229 return true;
15230
15231 // Commute and try again.
15232 SmallVector<int, 4> CommutedMask(Mask);
15233 ShuffleVectorSDNode::commuteMask(CommutedMask);
15234 if (matchAsInsertPS(V2, V1, CommutedMask))
15235 return true;
15236
15237 return false;
15238}
15239
15240static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15241 ArrayRef<int> Mask, const APInt &Zeroable,
15242 SelectionDAG &DAG) {
15243 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15244 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15245
15246 // Attempt to match the insertps pattern.
15247 unsigned InsertPSMask = 0;
15248 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15249 return SDValue();
15250
15251 // Insert the V2 element into the desired position.
15252 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15253 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15254}
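
A scalar model (illustrative, not LLVM code) of the INSERTPS immediate built above: bits [7:6] select the source element of V2, bits [5:4] select the destination slot in V1, and bits [3:0] are a zero mask applied to the result after the insertion.

#include <array>
#include <cassert>

static std::array<float, 4> insertps(std::array<float, 4> V1,
                                     const std::array<float, 4> &V2,
                                     unsigned Imm) {
  unsigned SrcIdx = (Imm >> 6) & 0x3;
  unsigned DstIdx = (Imm >> 4) & 0x3;
  unsigned ZMask = Imm & 0xF;
  V1[DstIdx] = V2[SrcIdx];       // insert the chosen V2 element
  for (unsigned i = 0; i < 4; ++i)
    if (ZMask & (1u << i))
      V1[i] = 0.0f;              // then zero the masked lanes
  return V1;
}

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  // Insert B[2] into slot 1 and zero slot 3: Imm = (2 << 6) | (1 << 4) | 0b1000.
  std::array<float, 4> R = insertps(A, B, (2u << 6) | (1u << 4) | 0x8);
  assert(R[0] == 1 && R[1] == 7 && R[2] == 3 && R[3] == 0);
  return 0;
}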
15255
15256/// Handle lowering of 2-lane 64-bit floating point shuffles.
15257///
15258/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15259/// support for floating point shuffles but not integer shuffles. These
15260/// instructions will incur a domain crossing penalty on some chips though so
15261/// it is better to avoid lowering through this for integer vectors where
15262/// possible.
15263static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15264 const APInt &Zeroable, SDValue V1, SDValue V2,
15265 const X86Subtarget &Subtarget,
15266 SelectionDAG &DAG) {
15267 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15268 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15269 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15270
15271 if (V2.isUndef()) {
15272 // Check for being able to broadcast a single element.
15273 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15274 Mask, Subtarget, DAG))
15275 return Broadcast;
15276
15277 // Straight shuffle of a single input vector. Simulate this by using the
15278 // single input as both of the "inputs" to this instruction.
15279 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15280
15281 if (Subtarget.hasAVX()) {
15282 // If we have AVX, we can use VPERMILPS which will allow folding a load
15283 // into the shuffle.
15284 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15285 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15286 }
15287
15288 return DAG.getNode(
15289 X86ISD::SHUFP, DL, MVT::v2f64,
15290 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15291 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15292 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15293 }
15294 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15295 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15296 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15297 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15298
15299 if (Subtarget.hasAVX2())
15300 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15301 return Extract;
15302
15303 // When loading a scalar and then shuffling it into a vector we can often do
15304 // the insertion cheaply.
15305 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15306 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15307 return Insertion;
15308 // Try inverting the insertion since for v2 masks it is easy to do and we
15309 // can't reliably sort the mask one way or the other.
15310 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15311 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15312 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15313 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15314 return Insertion;
15315
15316 // Try to use one of the special instruction patterns to handle two common
15317 // blend patterns if a zero-blend above didn't work.
15318 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15319 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15320 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15321 // We can either use a special instruction to load over the low double or
15322 // to move just the low double.
15323 return DAG.getNode(
15324 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15325 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15326
15327 if (Subtarget.hasSSE41())
15328 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15329 Zeroable, Subtarget, DAG))
15330 return Blend;
15331
15332 // Use dedicated unpack instructions for masks that match their pattern.
15333 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15334 return V;
15335
15336 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15337 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15338 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15339}
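
A scalar model (illustrative only) of the SHUFPD immediate computed above: bit 0 picks which element of the first source goes to lane 0, bit 1 picks which element of the second source goes to lane 1, which matches both the single-input and the final two-input immediate calculations in this function.

#include <array>
#include <cassert>

static std::array<double, 2> shufpd(const std::array<double, 2> &A,
                                    const std::array<double, 2> &B,
                                    unsigned Imm) {
  return {A[Imm & 1], B[(Imm >> 1) & 1]};
}

int main() {
  std::array<double, 2> V1 = {1.0, 2.0}, V2 = {3.0, 4.0};
  // Two-input mask {1, 2}: Imm = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1) = 1.
  std::array<double, 2> R = shufpd(V1, V2, 0x1);
  assert(R[0] == 2.0 && R[1] == 3.0);
  // Single-input mask {1, 0} with V1 used as both operands: Imm = 1.
  std::array<double, 2> S = shufpd(V1, V1, 0x1);
  assert(S[0] == 2.0 && S[1] == 1.0);
  return 0;
}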
15340
15341/// Handle lowering of 2-lane 64-bit integer shuffles.
15342///
15343/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15344/// the integer unit to minimize domain crossing penalties. However, for blends
15345/// it falls back to the floating point shuffle operation with appropriate bit
15346/// casting.
15347static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15348 const APInt &Zeroable, SDValue V1, SDValue V2,
15349 const X86Subtarget &Subtarget,
15350 SelectionDAG &DAG) {
15351 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15352 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15353 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15354
15355 if (V2.isUndef()) {
15356 // Check for being able to broadcast a single element.
15357 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15358 Mask, Subtarget, DAG))
15359 return Broadcast;
15360
15361 // Straight shuffle of a single input vector. For everything from SSE2
15362 // onward this has a single fast instruction with no scary immediates.
15363 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15364 V1 = DAG.getBitcast(MVT::v4i32, V1);
15365 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15366 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15367 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15368 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15369 return DAG.getBitcast(
15370 MVT::v2i64,
15371 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15372 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15373 }
15374 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15375 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15376 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15377 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15378
15379 if (Subtarget.hasAVX2())
15380 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15381 return Extract;
15382
15383 // Try to use shift instructions.
15384 if (SDValue Shift =
15385 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15386 DAG, /*BitwiseOnly*/ false))
15387 return Shift;
15388
15389 // When loading a scalar and then shuffling it into a vector we can often do
15390 // the insertion cheaply.
15391 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15392 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15393 return Insertion;
15394 // Try inverting the insertion since for v2 masks it is easy to do and we
15395 // can't reliably sort the mask one way or the other.
15396 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15397 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15398 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15399 return Insertion;
15400
15401 // We have different paths for blend lowering, but they all must use the
15402 // *exact* same predicate.
15403 bool IsBlendSupported = Subtarget.hasSSE41();
15404 if (IsBlendSupported)
15405 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15406 Zeroable, Subtarget, DAG))
15407 return Blend;
15408
15409 // Use dedicated unpack instructions for masks that match their pattern.
15410 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15411 return V;
15412
15413 // Try to use byte rotation instructions.
15414 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15415 if (Subtarget.hasSSSE3()) {
15416 if (Subtarget.hasVLX())
15417 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15418 Subtarget, DAG))
15419 return Rotate;
15420
15421 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15422 Subtarget, DAG))
15423 return Rotate;
15424 }
15425
15426 // If we have direct support for blends, we should lower by decomposing into
15427 // a permute. That will be faster than the domain cross.
15428 if (IsBlendSupported)
15429 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15430 Subtarget, DAG);
15431
15432 // We implement this with SHUFPD which is pretty lame because it will likely
15433 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15434 // However, all the alternatives are still more cycles and newer chips don't
15435 // have this problem. It would be really nice if x86 had better shuffles here.
15436 V1 = DAG.getBitcast(MVT::v2f64, V1);
15437 V2 = DAG.getBitcast(MVT::v2f64, V2);
15438 return DAG.getBitcast(MVT::v2i64,
15439 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15440}
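
A scalar check (illustrative, not LLVM code) of the single-input path above: a 64-bit lane shuffle is mapped to a 32-bit PSHUFD mask by widening each index m to the pair {2m, 2m+1}.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int, 2> Mask = {1, 0};                          // swap the two 64-bit lanes
  std::array<int, 4> Widened = {Mask[0] * 2, Mask[0] * 2 + 1,
                                Mask[1] * 2, Mask[1] * 2 + 1}; // {2, 3, 0, 1}

  // View a v2i64 as v4i32, apply the widened mask, and confirm the lanes swap.
  std::array<uint32_t, 4> V = {0xA0, 0xA1, 0xB0, 0xB1};      // lane0 = A0:A1, lane1 = B0:B1
  std::array<uint32_t, 4> R;
  for (int i = 0; i < 4; ++i)
    R[i] = V[Widened[i]];
  assert((R == std::array<uint32_t, 4>{0xB0, 0xB1, 0xA0, 0xA1}));
  return 0;
}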
15441
15442/// Lower a vector shuffle using the SHUFPS instruction.
15443///
15444/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15445/// It makes no assumptions about whether this is the *best* lowering, it simply
15446/// uses it.
15447static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15448 ArrayRef<int> Mask, SDValue V1,
15449 SDValue V2, SelectionDAG &DAG) {
15450 SDValue LowV = V1, HighV = V2;
15451 SmallVector<int, 4> NewMask(Mask);
15452 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15453
15454 if (NumV2Elements == 1) {
15455 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15456
15457 // Compute the index adjacent to V2Index and in the same half by toggling
15458 // the low bit.
15459 int V2AdjIndex = V2Index ^ 1;
15460
15461 if (Mask[V2AdjIndex] < 0) {
15462 // Handles all the cases where we have a single V2 element and an undef.
15463 // This will only ever happen in the high lanes because we commute the
15464 // vector otherwise.
15465 if (V2Index < 2)
15466 std::swap(LowV, HighV);
15467 NewMask[V2Index] -= 4;
15468 } else {
15469 // Handle the case where the V2 element ends up adjacent to a V1 element.
15470 // To make this work, blend them together as the first step.
15471 int V1Index = V2AdjIndex;
15472 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15473 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15474 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15475
15476 // Now proceed to reconstruct the final blend as we have the necessary
15477 // high or low half formed.
15478 if (V2Index < 2) {
15479 LowV = V2;
15480 HighV = V1;
15481 } else {
15482 HighV = V2;
15483 }
15484 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15485 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15486 }
15487 } else if (NumV2Elements == 2) {
15488 if (Mask[0] < 4 && Mask[1] < 4) {
15489 // Handle the easy case where we have V1 in the low lanes and V2 in the
15490 // high lanes.
15491 NewMask[2] -= 4;
15492 NewMask[3] -= 4;
15493 } else if (Mask[2] < 4 && Mask[3] < 4) {
15494 // We also handle the reversed case because this utility may get called
15495 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15496 // arrange things in the right direction.
15497 NewMask[0] -= 4;
15498 NewMask[1] -= 4;
15499 HighV = V1;
15500 LowV = V2;
15501 } else {
15502 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15503 // trying to place elements directly, just blend them and set up the final
15504 // shuffle to place them.
15505
15506 // The first two blend mask elements are for V1, the second two are for
15507 // V2.
15508 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15509 Mask[2] < 4 ? Mask[2] : Mask[3],
15510 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15511 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15512 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15513 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15514
15515 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15516 // a blend.
15517 LowV = HighV = V1;
15518 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15519 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15520 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15521 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15522 }
15523 } else if (NumV2Elements == 3) {
15524 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15525 // we can get here due to other paths (e.g. repeated mask matching) that we
15526 // don't want to do another round of lowerVECTOR_SHUFFLE.
15527 ShuffleVectorSDNode::commuteMask(NewMask);
15528 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15529 }
15530 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15531 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15532}
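
A scalar model (illustrative only; imm8ForMask merely mirrors what getV4X86ShuffleImm8ForMask produces) of the SHUFPS lowering above: the imm8 packs four 2-bit indices low to high, and SHUFPS takes its first two result lanes from the first operand and its last two from the second operand.

#include <array>
#include <cassert>

static unsigned imm8ForMask(const std::array<int, 4> &M) {
  return (M[0] & 3) | ((M[1] & 3) << 2) | ((M[2] & 3) << 4) | ((M[3] & 3) << 6);
}

static std::array<float, 4> shufps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

int main() {
  std::array<float, 4> V1 = {0, 1, 2, 3}, V2 = {4, 5, 6, 7};
  // Mask {2, 0, 5, 7}: low half reads V1, high half reads V2 (a "single SHUFPS" mask),
  // so after subtracting 4 from the high-half indices the immediate is built directly.
  std::array<int, 4> NewMask = {2, 0, 5 - 4, 7 - 4};
  std::array<float, 4> R = shufps(V1, V2, imm8ForMask(NewMask));
  assert((R == std::array<float, 4>{2, 0, 5, 7}));
  return 0;
}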
15533
15534/// Lower 4-lane 32-bit floating point shuffles.
15535///
15536/// Uses instructions exclusively from the floating point unit to minimize
15537/// domain crossing penalties, as these are sufficient to implement all v4f32
15538/// shuffles.
15539static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15540 const APInt &Zeroable, SDValue V1, SDValue V2,
15541 const X86Subtarget &Subtarget,
15542 SelectionDAG &DAG) {
15543 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15544 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15545 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15546
15547 if (Subtarget.hasSSE41())
15548 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15549 Zeroable, Subtarget, DAG))
15550 return Blend;
15551
15552 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15553
15554 if (NumV2Elements == 0) {
15555 // Check for being able to broadcast a single element.
15556 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15557 Mask, Subtarget, DAG))
15558 return Broadcast;
15559
15560 // Use even/odd duplicate instructions for masks that match their pattern.
15561 if (Subtarget.hasSSE3()) {
15562 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15563 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15564 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15565 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15566 }
15567
15568 if (Subtarget.hasAVX()) {
15569 // If we have AVX, we can use VPERMILPS which will allow folding a load
15570 // into the shuffle.
15571 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15572 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15573 }
15574
15575 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15576 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15577 if (!Subtarget.hasSSE2()) {
15578 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15579 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15580 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15581 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15582 }
15583
15584 // Otherwise, use a straight shuffle of a single input vector. We pass the
15585 // input vector to both operands to simulate this with a SHUFPS.
15586 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15588 }
15589
15590 if (Subtarget.hasSSE2())
15591 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15592 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15593 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15594 return ZExt;
15595 }
15596
15597 if (Subtarget.hasAVX2())
15598 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15599 return Extract;
15600
15601 // There are special ways we can lower some single-element blends. However, we
15602 // have custom ways we can lower more complex single-element blends below that
15603 // we defer to if both this and BLENDPS fail to match, so restrict this to
15604 // when the V2 input is targeting element 0 of the mask -- that is the fast
15605 // case here.
15606 if (NumV2Elements == 1 && Mask[0] >= 4)
15607 if (SDValue V = lowerShuffleAsElementInsertion(
15608 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15609 return V;
15610
15611 if (Subtarget.hasSSE41()) {
15612 // Use INSERTPS if we can complete the shuffle efficiently.
15613 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15614 return V;
15615
15616 if (!isSingleSHUFPSMask(Mask))
15617 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15618 V2, Mask, DAG))
15619 return BlendPerm;
15620 }
15621
15622 // Use low/high mov instructions. These are only valid in SSE1 because
15623 // otherwise they are widened to v2f64 and never get here.
15624 if (!Subtarget.hasSSE2()) {
15625 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15626 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15627 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15628 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15629 }
15630
15631 // Use dedicated unpack instructions for masks that match their pattern.
15632 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15633 return V;
15634
15635 // Otherwise fall back to a SHUFPS lowering strategy.
15636 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15637}
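
A scalar model (illustrative, not LLVM code) of the even/odd duplicate fast paths above: MOVSLDUP matches the single-input mask {0, 0, 2, 2} and MOVSHDUP matches {1, 1, 3, 3}.

#include <array>
#include <cassert>

static std::array<float, 4> movsldup(const std::array<float, 4> &V) {
  return {V[0], V[0], V[2], V[2]}; // duplicate the even elements
}
static std::array<float, 4> movshdup(const std::array<float, 4> &V) {
  return {V[1], V[1], V[3], V[3]}; // duplicate the odd elements
}

int main() {
  std::array<float, 4> V = {10, 11, 12, 13};
  assert((movsldup(V) == std::array<float, 4>{10, 10, 12, 12}));
  assert((movshdup(V) == std::array<float, 4>{11, 11, 13, 13}));
  return 0;
}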
15638
15639/// Lower 4-lane i32 vector shuffles.
15640///
15641/// We try to handle these with integer-domain shuffles where we can, but for
15642/// blends we use the floating point domain blend instructions.
15643static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SDValue V1, SDValue V2,
15645 const X86Subtarget &Subtarget,
15646 SelectionDAG &DAG) {
15647 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15648 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15649 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15650
15651 // Whenever we can lower this as a zext, that instruction is strictly faster
15652 // than any alternative. It also allows us to fold memory operands into the
15653 // shuffle in many cases.
15654 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15655 Zeroable, Subtarget, DAG))
15656 return ZExt;
15657
15658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15659
15660 // Try to use shift instructions if fast.
15661 if (Subtarget.preferLowerShuffleAsShift()) {
15662 if (SDValue Shift =
15663 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15664 Subtarget, DAG, /*BitwiseOnly*/ true))
15665 return Shift;
15666 if (NumV2Elements == 0)
15667 if (SDValue Rotate =
15668 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15669 return Rotate;
15670 }
15671
15672 if (NumV2Elements == 0) {
15673 // Try to use broadcast unless the mask only has one non-undef element.
15674 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15675 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15676 Mask, Subtarget, DAG))
15677 return Broadcast;
15678 }
15679
15680 // Straight shuffle of a single input vector. For everything from SSE2
15681 // onward this has a single fast instruction with no scary immediates.
15682 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15683 // but we aren't actually going to use the UNPCK instruction because doing
15684 // so prevents folding a load into this instruction or making a copy.
15685 const int UnpackLoMask[] = {0, 0, 1, 1};
15686 const int UnpackHiMask[] = {2, 2, 3, 3};
15687 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15688 Mask = UnpackLoMask;
15689 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15690 Mask = UnpackHiMask;
15691
15692 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15693 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15694 }
15695
15696 if (Subtarget.hasAVX2())
15697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15698 return Extract;
15699
15700 // Try to use shift instructions.
15701 if (SDValue Shift =
15702 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15703 DAG, /*BitwiseOnly*/ false))
15704 return Shift;
15705
15706 // There are special ways we can lower some single-element blends.
15707 if (NumV2Elements == 1)
15708 if (SDValue V = lowerShuffleAsElementInsertion(
15709 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15710 return V;
15711
15712 // We have different paths for blend lowering, but they all must use the
15713 // *exact* same predicate.
15714 bool IsBlendSupported = Subtarget.hasSSE41();
15715 if (IsBlendSupported)
15716 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15717 Zeroable, Subtarget, DAG))
15718 return Blend;
15719
15720 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15721 Zeroable, Subtarget, DAG))
15722 return Masked;
15723
15724 // Use dedicated unpack instructions for masks that match their pattern.
15725 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15726 return V;
15727
15728 // Try to use byte rotation instructions.
15729 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15730 if (Subtarget.hasSSSE3()) {
15731 if (Subtarget.hasVLX())
15732 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15733 Subtarget, DAG))
15734 return Rotate;
15735
15736 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15737 Subtarget, DAG))
15738 return Rotate;
15739 }
15740
15741 // Assume that a single SHUFPS is faster than an alternative sequence of
15742 // multiple instructions (even if the CPU has a domain penalty).
15743 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15744 if (!isSingleSHUFPSMask(Mask)) {
15745 // If we have direct support for blends, we should lower by decomposing into
15746 // a permute. That will be faster than the domain cross.
15747 if (IsBlendSupported)
15748 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15749 Subtarget, DAG);
15750
15751 // Try to lower by permuting the inputs into an unpack instruction.
15752 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15753 Mask, Subtarget, DAG))
15754 return Unpack;
15755 }
15756
15757 // We implement this with SHUFPS because it can blend from two vectors.
15758 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15759 // up the inputs, bypassing domain shift penalties that we would incur if we
15760 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15761 // relevant.
15762 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15763 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15764 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15765 return DAG.getBitcast(MVT::v4i32, ShufPS);
15766}
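
A small arithmetic check (illustrative only; imm8ForMask is a stand-in for getV4X86ShuffleImm8ForMask) of the single-input v4i32 path above: even the unpack-shaped masks are emitted as PSHUFD, so they reduce to plain 8-bit immediates.

#include <array>
#include <cassert>

static unsigned imm8ForMask(const std::array<int, 4> &M) {
  return (M[0] & 3) | ((M[1] & 3) << 2) | ((M[2] & 3) << 4) | ((M[3] & 3) << 6);
}

int main() {
  assert(imm8ForMask({0, 0, 1, 1}) == 0x50); // "unpacklo"-shaped mask
  assert(imm8ForMask({2, 2, 3, 3}) == 0xFA); // "unpackhi"-shaped mask
  return 0;
}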
15767
15768/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15769/// shuffle lowering, and the most complex part.
15770///
15771/// The lowering strategy is to try to form pairs of input lanes which are
15772/// targeted at the same half of the final vector, and then use a dword shuffle
15773/// to place them onto the right half, and finally unpack the paired lanes into
15774/// their final position.
15775///
15776/// The exact breakdown of how to form these dword pairs and align them on the
15777/// correct sides is really tricky. See the comments within the function for
15778/// more of the details.
15779///
15780/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15781/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15782/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15783/// vector, form the analogous 128-bit 8-element Mask.
15784static SDValue lowerV8I16GeneralSingleInputShuffle(
15785 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15786 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15787 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15788 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15789
15790 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15791 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15792 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15793
15794 // Attempt to directly match PSHUFLW or PSHUFHW.
15795 if (isUndefOrInRange(LoMask, 0, 4) &&
15796 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15797 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15798 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15799 }
15800 if (isUndefOrInRange(HiMask, 4, 8) &&
15801 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15802 for (int i = 0; i != 4; ++i)
15803 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15804 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15805 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15806 }
15807
15808 SmallVector<int, 4> LoInputs;
15809 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15810 array_pod_sort(LoInputs.begin(), LoInputs.end());
15811 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15812 SmallVector<int, 4> HiInputs;
15813 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15814 array_pod_sort(HiInputs.begin(), HiInputs.end());
15815 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15816 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15817 int NumHToL = LoInputs.size() - NumLToL;
15818 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15819 int NumHToH = HiInputs.size() - NumLToH;
15820 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15821 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15822 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15823 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15824
15825 // If we are shuffling values from one half - check how many different DWORD
15826 // pairs we need to create. If only 1 or 2 then we can perform this as a
15827 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15828 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15829 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15830 V = DAG.getNode(ShufWOp, DL, VT, V,
15831 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15832 V = DAG.getBitcast(PSHUFDVT, V);
15833 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15834 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15835 return DAG.getBitcast(VT, V);
15836 };
15837
15838 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15839 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15840 SmallVector<std::pair<int, int>, 4> DWordPairs;
15841 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15842
15843 // Collect the different DWORD pairs.
15844 for (int DWord = 0; DWord != 4; ++DWord) {
15845 int M0 = Mask[2 * DWord + 0];
15846 int M1 = Mask[2 * DWord + 1];
15847 M0 = (M0 >= 0 ? M0 % 4 : M0);
15848 M1 = (M1 >= 0 ? M1 % 4 : M1);
15849 if (M0 < 0 && M1 < 0)
15850 continue;
15851
15852 bool Match = false;
15853 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15854 auto &DWordPair = DWordPairs[j];
15855 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15856 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15857 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15858 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15859 PSHUFDMask[DWord] = DOffset + j;
15860 Match = true;
15861 break;
15862 }
15863 }
15864 if (!Match) {
15865 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15866 DWordPairs.push_back(std::make_pair(M0, M1));
15867 }
15868 }
15869
15870 if (DWordPairs.size() <= 2) {
15871 DWordPairs.resize(2, std::make_pair(-1, -1));
15872 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15873 DWordPairs[1].first, DWordPairs[1].second};
15874 if ((NumHToL + NumHToH) == 0)
15875 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15876 if ((NumLToL + NumLToH) == 0)
15877 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15878 }
15879 }
15880
15881 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15882 // such inputs we can swap two of the dwords across the half mark and end up
15883 // with <=2 inputs to each half in each half. Once there, we can fall through
15884 // to the generic code below. For example:
15885 //
15886 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15887 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15888 //
15889 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15890 // and an existing 2-into-2 on the other half. In this case we may have to
15891 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15892 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15893 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15894 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15895 // half than the one we target for fixing) will be fixed when we re-enter this
15896 // path. We will also combine away any sequence of PSHUFD instructions that
15897 // result into a single instruction. Here is an example of the tricky case:
15898 //
15899 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15900 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15901 //
15902 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15903 //
15904 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15905 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15906 //
15907 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15908 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15909 //
15910 // The result is fine to be handled by the generic logic.
15911 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15912 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15913 int AOffset, int BOffset) {
15914 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15915        "Must call this with A having 3 or 1 inputs from the A half.");
15916 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15917        "Must call this with B having 1 or 3 inputs from the B half.");
15918 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15919        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15920
15921 bool ThreeAInputs = AToAInputs.size() == 3;
15922
15923 // Compute the index of dword with only one word among the three inputs in
15924 // a half by taking the sum of the half with three inputs and subtracting
15925 // the sum of the actual three inputs. The difference is the remaining
15926 // slot.
15927 int ADWord = 0, BDWord = 0;
15928 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15929 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15930 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15931 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15932 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15933 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15934 int TripleNonInputIdx =
15935 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15936 TripleDWord = TripleNonInputIdx / 2;
15937
15938 // We use xor with one to compute the adjacent DWord to whichever one the
15939 // OneInput is in.
15940 OneInputDWord = (OneInput / 2) ^ 1;
15941
15942 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15943 // and BToA inputs. If there is also such a problem with the BToB and AToB
15944 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15945 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15946 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15947 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15948 // Compute how many inputs will be flipped by swapping these DWords. We
15949 // need
15950 // to balance this to ensure we don't form a 3-1 shuffle in the other
15951 // half.
15952 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15953 llvm::count(AToBInputs, 2 * ADWord + 1);
15954 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15955 llvm::count(BToBInputs, 2 * BDWord + 1);
15956 if ((NumFlippedAToBInputs == 1 &&
15957 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15958 (NumFlippedBToBInputs == 1 &&
15959 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15960 // We choose whether to fix the A half or B half based on whether that
15961 // half has zero flipped inputs. At zero, we may not be able to fix it
15962 // with that half. We also bias towards fixing the B half because that
15963 // will more commonly be the high half, and we have to bias one way.
15964 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15965 ArrayRef<int> Inputs) {
15966 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15967 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15968 // Determine whether the free index is in the flipped dword or the
15969 // unflipped dword based on where the pinned index is. We use this bit
15970 // in an xor to conditionally select the adjacent dword.
15971 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15972 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15973 if (IsFixIdxInput == IsFixFreeIdxInput)
15974 FixFreeIdx += 1;
15975 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15976 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15977        "We need to be changing the number of flipped inputs!");
15978 int PSHUFHalfMask[] = {0, 1, 2, 3};
15979 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15980 V = DAG.getNode(
15981 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15982 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15983 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15984
15985 for (int &M : Mask)
15986 if (M >= 0 && M == FixIdx)
15987 M = FixFreeIdx;
15988 else if (M >= 0 && M == FixFreeIdx)
15989 M = FixIdx;
15990 };
15991 if (NumFlippedBToBInputs != 0) {
15992 int BPinnedIdx =
15993 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15994 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15995 } else {
15996 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15997 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15998 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15999 }
16000 }
16001 }
16002
16003 int PSHUFDMask[] = {0, 1, 2, 3};
16004 PSHUFDMask[ADWord] = BDWord;
16005 PSHUFDMask[BDWord] = ADWord;
16006 V = DAG.getBitcast(
16007 VT,
16008 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16009 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16010
16011 // Adjust the mask to match the new locations of A and B.
16012 for (int &M : Mask)
16013 if (M >= 0 && M/2 == ADWord)
16014 M = 2 * BDWord + M % 2;
16015 else if (M >= 0 && M/2 == BDWord)
16016 M = 2 * ADWord + M % 2;
16017
16018 // Recurse back into this routine to re-compute state now that this isn't
16019 // a 3 and 1 problem.
16020 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
16021 };
16022 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
16023 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
16024 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
16025 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
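The dword swap that balanceSides relies on is a single PSHUFD. As a rough standalone sketch (not part of the file), the [0,2,1,3] dword permutation from the comment block above maps [a,b, c,d, e,f, g,h] to [a,b, e,f, c,d, g,h]; the letter values below are just labels taken from that comment, and the harness is invented for illustration.

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      __m128i V = _mm_setr_epi16('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h');
      // PSHUFD immediate for the dword order [0,2,1,3]; _MM_SHUFFLE lists the
      // selectors from the highest dword down to the lowest.
      V = _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 1, 2, 0));
      short Out[8];
      _mm_storeu_si128((__m128i *)Out, V);
      for (short W : Out)
        std::printf("%c ", (char)W);  // prints: a b e f c d g h
      std::printf("\n");
      return 0;
    }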
16026
16027 // At this point there are at most two inputs to the low and high halves from
16028 // each half. That means the inputs can always be grouped into dwords and
16029 // those dwords can then be moved to the correct half with a dword shuffle.
16030 // We use at most one low and one high word shuffle to collect these paired
16031 // inputs into dwords, and finally a dword shuffle to place them.
16032 int PSHUFLMask[4] = {-1, -1, -1, -1};
16033 int PSHUFHMask[4] = {-1, -1, -1, -1};
16034 int PSHUFDMask[4] = {-1, -1, -1, -1};
16035
16036 // First fix the masks for all the inputs that are staying in their
16037 // original halves. This will then dictate the targets of the cross-half
16038 // shuffles.
16039 auto fixInPlaceInputs =
16040 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
16041 MutableArrayRef<int> SourceHalfMask,
16042 MutableArrayRef<int> HalfMask, int HalfOffset) {
16043 if (InPlaceInputs.empty())
16044 return;
16045 if (InPlaceInputs.size() == 1) {
16046 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16047 InPlaceInputs[0] - HalfOffset;
16048 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
16049 return;
16050 }
16051 if (IncomingInputs.empty()) {
16052 // Just fix all of the in place inputs.
16053 for (int Input : InPlaceInputs) {
16054 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
16055 PSHUFDMask[Input / 2] = Input / 2;
16056 }
16057 return;
16058 }
16059
16060 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
16061 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16062 InPlaceInputs[0] - HalfOffset;
16063 // Put the second input next to the first so that they are packed into
16064 // a dword. We find the adjacent index by toggling the low bit.
16065 int AdjIndex = InPlaceInputs[0] ^ 1;
16066 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16067 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16068 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16069 };
16070 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16071 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16072
16073 // Now gather the cross-half inputs and place them into a free dword of
16074 // their target half.
16075 // FIXME: This operation could almost certainly be simplified dramatically to
16076 // look more like the 3-1 fixing operation.
16077 auto moveInputsToRightHalf = [&PSHUFDMask](
16078 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16079 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16080 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16081 int DestOffset) {
16082 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16083 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16084 };
16085 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16086 int Word) {
16087 int LowWord = Word & ~1;
16088 int HighWord = Word | 1;
16089 return isWordClobbered(SourceHalfMask, LowWord) ||
16090 isWordClobbered(SourceHalfMask, HighWord);
16091 };
16092
16093 if (IncomingInputs.empty())
16094 return;
16095
16096 if (ExistingInputs.empty()) {
16097 // Map any dwords with inputs from them into the right half.
16098 for (int Input : IncomingInputs) {
16099 // If the source half mask maps over the inputs, turn those into
16100 // swaps and use the swapped lane.
16101 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16102 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16103 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16104 Input - SourceOffset;
16105 // We have to swap the uses in our half mask in one sweep.
16106 for (int &M : HalfMask)
16107 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16108 M = Input;
16109 else if (M == Input)
16110 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16111 } else {
16112 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16113        Input - SourceOffset &&
16114        "Previous placement doesn't match!");
16115 }
16116 // Note that this correctly re-maps both when we do a swap and when
16117 // we observe the other side of the swap above. We rely on that to
16118 // avoid swapping the members of the input list directly.
16119 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16120 }
16121
16122 // Map the input's dword into the correct half.
16123 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16124 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16125 else
16126 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16127        Input / 2 &&
16128        "Previous placement doesn't match!");
16129 }
16130
16131 // And just directly shift any other-half mask elements to be same-half
16132 // as we will have mirrored the dword containing the element into the
16133 // same position within that half.
16134 for (int &M : HalfMask)
16135 if (M >= SourceOffset && M < SourceOffset + 4) {
16136 M = M - SourceOffset + DestOffset;
16137 assert(M >= 0 && "This should never wrap below zero!");
16138 }
16139 return;
16140 }
16141
16142 // Ensure we have the input in a viable dword of its current half. This
16143 // is particularly tricky because the original position may be clobbered
16144 // by inputs being moved and *staying* in that half.
16145 if (IncomingInputs.size() == 1) {
16146 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16147 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16148 SourceOffset;
16149 SourceHalfMask[InputFixed - SourceOffset] =
16150 IncomingInputs[0] - SourceOffset;
16151 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16152 InputFixed);
16153 IncomingInputs[0] = InputFixed;
16154 }
16155 } else if (IncomingInputs.size() == 2) {
16156 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16157 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16158 // We have two non-adjacent or clobbered inputs we need to extract from
16159 // the source half. To do this, we need to map them into some adjacent
16160 // dword slot in the source mask.
16161 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16162 IncomingInputs[1] - SourceOffset};
16163
16164 // If there is a free slot in the source half mask adjacent to one of
16165 // the inputs, place the other input in it. We use (Index XOR 1) to
16166 // compute an adjacent index.
16167 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16168 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16169 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16170 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16171 InputsFixed[1] = InputsFixed[0] ^ 1;
16172 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16173 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16174 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16175 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16176 InputsFixed[0] = InputsFixed[1] ^ 1;
16177 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16178 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16179 // The two inputs are in the same DWord but it is clobbered and the
16180 // adjacent DWord isn't used at all. Move both inputs to the free
16181 // slot.
16182 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16183 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16184 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16185 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16186 } else {
16187 // The only way we hit this point is if there is no clobbering
16188 // (because there are no off-half inputs to this half) and there is no
16189 // free slot adjacent to one of the inputs. In this case, we have to
16190 // swap an input with a non-input.
16191 for (int i = 0; i < 4; ++i)
16192 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16193        "We can't handle any clobbers here!");
16194 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16195        "Cannot have adjacent inputs here!");
16196
16197 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16198 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16199
16200 // We also have to update the final source mask in this case because
16201 // it may need to undo the above swap.
16202 for (int &M : FinalSourceHalfMask)
16203 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16204 M = InputsFixed[1] + SourceOffset;
16205 else if (M == InputsFixed[1] + SourceOffset)
16206 M = (InputsFixed[0] ^ 1) + SourceOffset;
16207
16208 InputsFixed[1] = InputsFixed[0] ^ 1;
16209 }
16210
16211 // Point everything at the fixed inputs.
16212 for (int &M : HalfMask)
16213 if (M == IncomingInputs[0])
16214 M = InputsFixed[0] + SourceOffset;
16215 else if (M == IncomingInputs[1])
16216 M = InputsFixed[1] + SourceOffset;
16217
16218 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16219 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16220 }
16221 } else {
16222 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16222)
;
16223 }
16224
16225 // Now hoist the DWord down to the right half.
16226 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16227 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16228 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16229 for (int &M : HalfMask)
16230 for (int Input : IncomingInputs)
16231 if (M == Input)
16232 M = FreeDWord * 2 + Input % 2;
16233 };
16234 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16235 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16236 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16237 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16238
16239 // Now enact all the shuffles we've computed to move the inputs into their
16240 // target half.
16241 if (!isNoopShuffleMask(PSHUFLMask))
16242 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16243 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16244 if (!isNoopShuffleMask(PSHUFHMask))
16245 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16246 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16247 if (!isNoopShuffleMask(PSHUFDMask))
16248 V = DAG.getBitcast(
16249 VT,
16250 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16251 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16252
16253 // At this point, each half should contain all its inputs, and we can then
16254 // just shuffle them into their final position.
16255 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16256        "Failed to lift all the high half inputs to the low mask!");
16257 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16258        "Failed to lift all the low half inputs to the high mask!");
16259
16260 // Do a half shuffle for the low mask.
16261 if (!isNoopShuffleMask(LoMask))
16262 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16263 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16264
16265 // Do a half shuffle with the high mask after shifting its values down.
16266 for (int &M : HiMask)
16267 if (M >= 0)
16268 M -= 4;
16269 if (!isNoopShuffleMask(HiMask))
16270 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16271 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16272
16273 return V;
16274}
16275
16276/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16277/// blend if only one input is used.
16278static SDValue lowerShuffleAsBlendOfPSHUFBs(
16279 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16280 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16281 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16282        "Lane crossing shuffle masks not supported");
16283
16284 int NumBytes = VT.getSizeInBits() / 8;
16285 int Size = Mask.size();
16286 int Scale = NumBytes / Size;
16287
16288 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16289 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16290 V1InUse = false;
16291 V2InUse = false;
16292
16293 for (int i = 0; i < NumBytes; ++i) {
16294 int M = Mask[i / Scale];
16295 if (M < 0)
16296 continue;
16297
16298 const int ZeroMask = 0x80;
16299 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16300 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16301 if (Zeroable[i / Scale])
16302 V1Idx = V2Idx = ZeroMask;
16303
16304 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16305 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16306 V1InUse |= (ZeroMask != V1Idx);
16307 V2InUse |= (ZeroMask != V2Idx);
16308 }
16309
16310 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16311 if (V1InUse)
16312 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16313 DAG.getBuildVector(ShufVT, DL, V1Mask));
16314 if (V2InUse)
16315 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16316 DAG.getBuildVector(ShufVT, DL, V2Mask));
16317
16318 // If we need shuffled inputs from both, blend the two.
16319 SDValue V;
16320 if (V1InUse && V2InUse)
16321 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16322 else
16323 V = V1InUse ? V1 : V2;
16324
16325 // Cast the result back to the correct type.
16326 return DAG.getBitcast(VT, V);
16327}
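To make the shuffle+blend built by lowerShuffleAsBlendOfPSHUFBs concrete, here is a minimal standalone sketch (not part of the file): two PSHUFBs whose byte masks either pick a source byte (0..15) or set bit 7 (the 0x80 ZeroMask) to force a zero lane, OR'd together. The interleaving byte masks and constant inputs are invented for the example; it requires SSSE3 (compile with -mssse3).

    #include <tmmintrin.h>  // _mm_shuffle_epi8 (PSHUFB)
    #include <cstdio>

    int main() {
      __m128i V1 = _mm_set1_epi8(0x11);
      __m128i V2 = _mm_set1_epi8(0x22);

      // Even bytes come from V1, odd bytes from V2; 0x80 zeroes the other lanes.
      const char Z = (char)0x80;
      __m128i M1 = _mm_setr_epi8(0, Z, 2, Z, 4, Z, 6, Z, 8, Z, 10, Z, 12, Z, 14, Z);
      __m128i M2 = _mm_setr_epi8(Z, 1, Z, 3, Z, 5, Z, 7, Z, 9, Z, 11, Z, 13, Z, 15);

      // Shuffle each input separately, then blend with a single OR.
      __m128i Blend = _mm_or_si128(_mm_shuffle_epi8(V1, M1),
                                   _mm_shuffle_epi8(V2, M2));

      unsigned char Out[16];
      _mm_storeu_si128((__m128i *)Out, Blend);
      for (unsigned char B : Out)
        std::printf("%02x ", B);  // prints alternating 11 22 11 22 ...
      std::printf("\n");
      return 0;
    }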
16328
16329/// Generic lowering of 8-lane i16 shuffles.
16330///
16331/// This handles both single-input shuffles and combined shuffle/blends with
16332/// two inputs. The single input shuffles are immediately delegated to
16333/// a dedicated lowering routine.
16334///
16335/// The blends are lowered in one of three fundamental ways. If there are few
16336/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16337/// of the input is significantly cheaper when lowered as an interleaving of
16338/// the two inputs, try to interleave them. Otherwise, blend the low and high
16339/// halves of the inputs separately (making them have relatively few inputs)
16340/// and then concatenate them.
16341static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16342 const APInt &Zeroable, SDValue V1, SDValue V2,
16343 const X86Subtarget &Subtarget,
16344 SelectionDAG &DAG) {
16345 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16346 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16348
16349 // Whenever we can lower this as a zext, that instruction is strictly faster
16350 // than any alternative.
16351 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG))
16353 return ZExt;
16354
16355 // Try to lower using a truncation.
16356 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16357 Subtarget, DAG))
16358 return V;
16359
16360 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16361
16362 if (NumV2Inputs == 0) {
16363 // Try to use shift instructions.
16364 if (SDValue Shift =
16365 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16366 Subtarget, DAG, /*BitwiseOnly*/ false))
16367 return Shift;
16368
16369 // Check for being able to broadcast a single element.
16370 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16371 Mask, Subtarget, DAG))
16372 return Broadcast;
16373
16374 // Try to use bit rotation instructions.
16375 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16376 Subtarget, DAG))
16377 return Rotate;
16378
16379 // Use dedicated unpack instructions for masks that match their pattern.
16380 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16381 return V;
16382
16383 // Use dedicated pack instructions for masks that match their pattern.
16384 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16385 Subtarget))
16386 return V;
16387
16388 // Try to use byte rotation instructions.
16389 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16390 Subtarget, DAG))
16391 return Rotate;
16392
16393 // Make a copy of the mask so it can be modified.
16394 SmallVector<int, 8> MutableMask(Mask);
16395 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16396 Subtarget, DAG);
16397 }
16398
16399 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16400        "All single-input shuffles should be canonicalized to be V1-input "
16401        "shuffles.");
16402
16403 // Try to use shift instructions.
16404 if (SDValue Shift =
16405 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16406 DAG, /*BitwiseOnly*/ false))
16407 return Shift;
16408
16409 // See if we can use SSE4A Extraction / Insertion.
16410 if (Subtarget.hasSSE4A())
16411 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16412 Zeroable, DAG))
16413 return V;
16414
16415 // There are special ways we can lower some single-element blends.
16416 if (NumV2Inputs == 1)
16417 if (SDValue V = lowerShuffleAsElementInsertion(
16418 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16419 return V;
16420
16421 // We have different paths for blend lowering, but they all must use the
16422 // *exact* same predicate.
16423 bool IsBlendSupported = Subtarget.hasSSE41();
16424 if (IsBlendSupported)
16425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16426 Zeroable, Subtarget, DAG))
16427 return Blend;
16428
16429 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return Masked;
16432
16433 // Use dedicated unpack instructions for masks that match their pattern.
16434 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16435 return V;
16436
16437 // Use dedicated pack instructions for masks that match their pattern.
16438 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16439 Subtarget))
16440 return V;
16441
16442 // Try to lower using a truncation.
16443 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16444 Subtarget, DAG))
16445 return V;
16446
16447 // Try to use byte rotation instructions.
16448 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16449 Subtarget, DAG))
16450 return Rotate;
16451
16452 if (SDValue BitBlend =
16453 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16454 return BitBlend;
16455
16456 // Try to use byte shift instructions to mask.
16457 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16458 Zeroable, Subtarget, DAG))
16459 return V;
16460
16461 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
16462 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16463 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16464 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16465 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16466 !Subtarget.hasVLX()) {
16467 // Check if this is part of a 256-bit vector truncation.
16468 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16469 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16470 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16471 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16472 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16473 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16474 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16475 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16476 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16477 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16478 } else {
16479 SmallVector<SDValue, 4> DWordClearOps(4,
16480 DAG.getConstant(0, DL, MVT::i32));
16481 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16482 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16483 SDValue DWordClearMask =
16484 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16485 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16486 DWordClearMask);
16487 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16488 DWordClearMask);
16489 }
16490 // Now pack things back together.
16491 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16492 if (NumEvenDrops == 2) {
16493 Result = DAG.getBitcast(MVT::v4i32, Result);
16494 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16495 }
16496 return Result;
16497 }
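The even-drop compaction branch above reduces, for NumEvenDrops == 1, to an AND that clears the upper word of every dword followed by a PACKUSDW. A minimal standalone sketch (not part of the file), assuming SSE4.1 (-msse4.1) and invented input values, keeps the even words of V1:V2, i.e. the mask [0,2,4,6,8,10,12,14]:

    #include <smmintrin.h>  // _mm_packus_epi32 (PACKUSDW)
    #include <cstdio>

    int main() {
      __m128i V1 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
      __m128i V2 = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);

      // Keep only the low word of each dword (drop the odd words).
      __m128i DWordClear = _mm_set1_epi32(0x0000FFFF);
      __m128i A = _mm_and_si128(V1, DWordClear);
      __m128i B = _mm_and_si128(V2, DWordClear);

      // PACKUSDW saturates each dword to an unsigned word; after the AND the
      // values already fit, so this simply selects the even words of V1 and V2.
      __m128i Packed = _mm_packus_epi32(A, B);

      short Out[8];
      _mm_storeu_si128((__m128i *)Out, Packed);
      for (short W : Out)
        std::printf("%d ", W);  // prints: 0 2 4 6 8 10 12 14
      std::printf("\n");
      return 0;
    }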
16498
16499 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16500 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16501 if (NumOddDrops == 1) {
16502 bool HasSSE41 = Subtarget.hasSSE41();
16503 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16504 DAG.getBitcast(MVT::v4i32, V1),
16505 DAG.getTargetConstant(16, DL, MVT::i8));
16506 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16507 DAG.getBitcast(MVT::v4i32, V2),
16508 DAG.getTargetConstant(16, DL, MVT::i8));
16509 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16510 MVT::v8i16, V1, V2);
16511 }
16512
16513 // Try to lower by permuting the inputs into an unpack instruction.
16514 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16515 Mask, Subtarget, DAG))
16516 return Unpack;
16517
16518 // If we can't directly blend but can use PSHUFB, that will be better as it
16519 // can both shuffle and set up the inefficient blend.
16520 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16521 bool V1InUse, V2InUse;
16522 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16523 Zeroable, DAG, V1InUse, V2InUse);
16524 }
16525
16526 // We can always bit-blend if we have to so the fallback strategy is to
16527 // decompose into single-input permutes and blends/unpacks.
16528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16529 Mask, Subtarget, DAG);
16530}
16531
16532/// Lower 8-lane 16-bit floating point shuffles.
16533static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16534 const APInt &Zeroable, SDValue V1, SDValue V2,
16535 const X86Subtarget &Subtarget,
16536 SelectionDAG &DAG) {
16537 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16538 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16539 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16540 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16541
16542 if (Subtarget.hasFP16()) {
16543 if (NumV2Elements == 0) {
16544 // Check for being able to broadcast a single element.
16545 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16546 Mask, Subtarget, DAG))
16547 return Broadcast;
16548 }
16549 if (NumV2Elements == 1 && Mask[0] >= 8)
16550 if (SDValue V = lowerShuffleAsElementInsertion(
16551 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16552 return V;
16553 }
16554
16555 V1 = DAG.getBitcast(MVT::v8i16, V1);
16556 V2 = DAG.getBitcast(MVT::v8i16, V2);
16557 return DAG.getBitcast(MVT::v8f16,
16558 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16559}
16560
16561 // Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
16562// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16563// the active subvector is extracted.
16564static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16565 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16566 const X86Subtarget &Subtarget,
16567 SelectionDAG &DAG) {
16568 MVT MaskVT = VT.changeTypeToInteger();
16569 SDValue MaskNode;
16570 MVT ShuffleVT = VT;
16571 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16572 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16573 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16574 ShuffleVT = V1.getSimpleValueType();
16575
16576 // Adjust mask to correct indices for the second input.
16577 int NumElts = VT.getVectorNumElements();
16578 unsigned Scale = 512 / VT.getSizeInBits();
16579 SmallVector<int, 32> AdjustedMask(Mask);
16580 for (int &M : AdjustedMask)
16581 if (NumElts <= M)
16582 M += (Scale - 1) * NumElts;
16583 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16584 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16585 } else {
16586 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16587 }
16588
16589 SDValue Result;
16590 if (V2.isUndef())
16591 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16592 else
16593 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16594
16595 if (VT != ShuffleVT)
16596 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16597
16598 return Result;
16599}
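The mask adjustment in lowerShuffleWithPERMV is easy to see with concrete numbers. A minimal plain C++ sketch (not part of the file, mask values invented): a v8i16 two-input shuffle padded to 512 bits gives Scale = 512/128 = 4, so any reference to V2 (mask entry >= 8) is bumped by (Scale - 1) * NumElts = 24, moving element 8+k to index 32+k, which is where V2's data sits in the second operand of the widened VPERMV3.

    #include <cstdio>
    #include <vector>

    int main() {
      const int NumElts = 8;        // v8i16
      const int Scale = 512 / 128;  // widened 4x -> v32i16
      // Two-input mask: entries < 8 read V1, entries >= 8 read V2.
      std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};

      // After widening, V1 occupies elements 0..7 of the first v32i16 operand
      // and V2 occupies elements 0..7 of the second, which VPERMV3 addresses
      // as indices 32..63, so V2 references shift by (Scale - 1) * NumElts.
      for (int &M : Mask)
        if (M >= NumElts)
          M += (Scale - 1) * NumElts;

      for (int M : Mask)
        std::printf("%d ", M);  // prints: 0 33 2 35 4 37 6 39
      std::printf("\n");
      return 0;
    }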
16600
16601/// Generic lowering of v16i8 shuffles.
16602///
16603/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16604/// detect any complexity reducing interleaving. If that doesn't help, it uses
16605/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16606/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16607/// back together.
16608static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16609 const APInt &Zeroable, SDValue V1, SDValue V2,
16610 const X86Subtarget &Subtarget,
16611 SelectionDAG &DAG) {
16612 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16613 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16614 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16615
16616 // Try to use shift instructions.
16617 if (SDValue Shift =
16618 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16619 DAG, /*BitwiseOnly*/ false))
16620 return Shift;
16621
16622 // Try to use byte rotation instructions.
16623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16624 Subtarget, DAG))
16625 return Rotate;
16626
16627 // Use dedicated pack instructions for masks that match their pattern.
16628 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16629 Subtarget))
16630 return V;
16631
16632 // Try to use a zext lowering.
16633 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG))
16635 return ZExt;
16636
16637 // Try to lower using a truncation.
16638 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16639 Subtarget, DAG))
16640 return V;
16641
16642 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16643 Subtarget, DAG))
16644 return V;
16645
16646 // See if we can use SSE4A Extraction / Insertion.
16647 if (Subtarget.hasSSE4A())
16648 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16649 Zeroable, DAG))
16650 return V;
16651
16652 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16653
16654 // For single-input shuffles, there are some nicer lowering tricks we can use.
16655 if (NumV2Elements == 0) {
16656 // Check for being able to broadcast a single element.
16657 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16658 Mask, Subtarget, DAG))
16659 return Broadcast;
16660
16661 // Try to use bit rotation instructions.
16662 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16663 Subtarget, DAG))
16664 return Rotate;
16665
16666 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16667 return V;
16668
16669 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16670 // Notably, this handles splat and partial-splat shuffles more efficiently.
16671 // However, it only makes sense if the pre-duplication shuffle simplifies
16672 // things significantly. Currently, this means we need to be able to
16673 // express the pre-duplication shuffle as an i16 shuffle.
16674 //
16675 // FIXME: We should check for other patterns which can be widened into an
16676 // i16 shuffle as well.
16677 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16678 for (int i = 0; i < 16; i += 2)
16679 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16680 return false;
16681
16682 return true;
16683 };
16684 auto tryToWidenViaDuplication = [&]() -> SDValue {
16685 if (!canWidenViaDuplication(Mask))
16686 return SDValue();
16687 SmallVector<int, 4> LoInputs;
16688 copy_if(Mask, std::back_inserter(LoInputs),
16689 [](int M) { return M >= 0 && M < 8; });
16690 array_pod_sort(LoInputs.begin(), LoInputs.end());
16691 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16692 LoInputs.end());
16693 SmallVector<int, 4> HiInputs;
16694 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16695 array_pod_sort(HiInputs.begin(), HiInputs.end());
16696 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16697 HiInputs.end());
16698
16699 bool TargetLo = LoInputs.size() >= HiInputs.size();
16700 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16701 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16702
16703 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16704 SmallDenseMap<int, int, 8> LaneMap;
16705 for (int I : InPlaceInputs) {
16706 PreDupI16Shuffle[I/2] = I/2;
16707 LaneMap[I] = I;
16708 }
16709 int j = TargetLo ? 0 : 4, je = j + 4;
16710 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16711 // Check if j is already a shuffle of this input. This happens when
16712 // there are two adjacent bytes after we move the low one.
16713 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16714 // If we haven't yet mapped the input, search for a slot into which
16715 // we can map it.
16716 while (j < je && PreDupI16Shuffle[j] >= 0)
16717 ++j;
16718
16719 if (j == je)
16720 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16721 return SDValue();
16722
16723 // Map this input with the i16 shuffle.
16724 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16725 }
16726
16727 // Update the lane map based on the mapping we ended up with.
16728 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16729 }
16730 V1 = DAG.getBitcast(
16731 MVT::v16i8,
16732 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16733 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16734
16735 // Unpack the bytes to form the i16s that will be shuffled into place.
16736 bool EvenInUse = false, OddInUse = false;
16737 for (int i = 0; i < 16; i += 2) {
16738 EvenInUse |= (Mask[i + 0] >= 0);
16739 OddInUse |= (Mask[i + 1] >= 0);
16740 if (EvenInUse && OddInUse)
16741 break;
16742 }
16743 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16744 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16745 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16746
16747 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16748 for (int i = 0; i < 16; ++i)
16749 if (Mask[i] >= 0) {
16750 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16751 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16752 if (PostDupI16Shuffle[i / 2] < 0)
16753 PostDupI16Shuffle[i / 2] = MappedMask;
16754 else
16755 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16756        "Conflicting entries in the original shuffle!");
16757 }
16758 return DAG.getBitcast(
16759 MVT::v16i8,
16760 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16761 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16762 };
16763 if (SDValue V = tryToWidenViaDuplication())
16764 return V;
16765 }
16766
16767 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Masked;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16773 return V;
16774
16775 // Try to use byte shift instructions to mask.
16776 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG))
16778 return V;
16779
16780 // Check for compaction patterns.
16781 bool IsSingleInput = V2.isUndef();
16782 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16783
16784 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16785 // with PSHUFB. It is important to do this before we attempt to generate any
16786 // blends but after all of the single-input lowerings. If the single input
16787 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16788 // want to preserve that and we can DAG combine any longer sequences into
16789 // a PSHUFB in the end. But once we start blending from multiple inputs,
16790 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16791 // and there are *very* few patterns that would actually be faster than the
16792 // PSHUFB approach because of its ability to zero lanes.
16793 //
16794 // If the mask is a binary compaction, we can more efficiently perform this
16795 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16796 //
16797 // FIXME: The only exceptions to the above are blends which are exact
16798 // interleavings with direct instructions supporting them. We currently don't
16799 // handle those well here.
16800 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16801 bool V1InUse = false;
16802 bool V2InUse = false;
16803
16804 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16805 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16806
16807 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16808 // do so. This avoids using them to handle blends-with-zero which is
16809 // important as a single pshufb is significantly faster for that.
16810 if (V1InUse && V2InUse) {
16811 if (Subtarget.hasSSE41())
16812 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16813 Zeroable, Subtarget, DAG))
16814 return Blend;
16815
16816 // We can use an unpack to do the blending rather than an or in some
16817 // cases. Even though the or may be (very minorly) more efficient, we
16818 // prefer this lowering because there are common cases where part of
16819 // the complexity of the shuffles goes away when we do the final blend as
16820 // an unpack.
16821 // FIXME: It might be worth trying to detect if the unpack-feeding
16822 // shuffles will both be pshufb, in which case we shouldn't bother with
16823 // this.
16824 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16825 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16826 return Unpack;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16831 DAG);
16832
16833 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16834 if (Subtarget.hasXOP()) {
16835 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16836 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16837 }
16838
16839 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16840 // PALIGNR will be cheaper than the second PSHUFB+OR.
16841 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16842 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16843 return V;
16844 }
16845
16846 return PSHUFB;
16847 }
16848
16849 // There are special ways we can lower some single-element blends.
16850 if (NumV2Elements == 1)
16851 if (SDValue V = lowerShuffleAsElementInsertion(
16852 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16853 return V;
16854
16855 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16856 return Blend;
16857
16858 // Check whether a compaction lowering can be done. This handles shuffles
16859 // which take every Nth element for some even N. See the helper function for
16860 // details.
16861 //
16862 // We special case these as they can be particularly efficiently handled with
16863 // the PACKUSWB instruction on x86 and they show up in common patterns of
16864 // rearranging bytes to truncate wide elements.
16865 if (NumEvenDrops) {
16866 // NumEvenDrops is the power of two stride of the elements. Another way of
16867 // thinking about it is that we need to drop the even elements this many
16868 // times to get the original input.
16869
16870 // First we need to zero all the dropped bytes.
16871 assert(NumEvenDrops <= 3 &&
16872        "No support for dropping even elements more than 3 times.");
16873 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16874 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16875 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16876 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16877 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16878 WordClearMask);
16879 if (!IsSingleInput)
16880 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16881 WordClearMask);
16882
16883 // Now pack things back together.
16884 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16885 IsSingleInput ? V1 : V2);
16886 for (int i = 1; i < NumEvenDrops; ++i) {
16887 Result = DAG.getBitcast(MVT::v8i16, Result);
16888 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16889 }
16890 return Result;
16891 }
16892
16893 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16894 if (NumOddDrops == 1) {
16895 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16896 DAG.getBitcast(MVT::v8i16, V1),
16897 DAG.getTargetConstant(8, DL, MVT::i8));
16898 if (!IsSingleInput)
16899 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16900 DAG.getBitcast(MVT::v8i16, V2),
16901 DAG.getTargetConstant(8, DL, MVT::i8));
16902 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16903 IsSingleInput ? V1 : V2);
16904 }
16905
16906 // Handle multi-input cases by blending/unpacking single-input shuffles.
16907 if (NumV2Elements > 0)
16908 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16909 Subtarget, DAG);
16910
16911 // The fallback path for single-input shuffles widens this into two v8i16
16912 // vectors with unpacks, shuffles those, and then pulls them back together
16913 // with a pack.
16914 SDValue V = V1;
16915
16916 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16917 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16918 for (int i = 0; i < 16; ++i)
16919 if (Mask[i] >= 0)
16920 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16921
16922 SDValue VLoHalf, VHiHalf;
16923 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16924 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16925 // i16s.
16926 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16927 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16928 // Use a mask to drop the high bytes.
16929 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16930 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16931 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16932
16933 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16934 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16935
16936 // Squash the masks to point directly into VLoHalf.
16937 for (int &M : LoBlendMask)
16938 if (M >= 0)
16939 M /= 2;
16940 for (int &M : HiBlendMask)
16941 if (M >= 0)
16942 M /= 2;
16943 } else {
16944 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16945 // VHiHalf so that we can blend them as i16s.
16946 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16947
16948 VLoHalf = DAG.getBitcast(
16949 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16950 VHiHalf = DAG.getBitcast(
16951 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16952 }
16953
16954 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16955 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16956
16957 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16958}
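The AND+PACKUS compaction used in the even-drop path above can be modelled outside of SelectionDAG. The following standalone sketch (plain C++, hypothetical helper name packus, assuming the NumEvenDrops == 1 single-input case) shows how clearing the high byte of each word and then packing with unsigned saturation extracts every even-positioned byte:

// Standalone model (not LLVM code) of one AND+PACKUSWB compaction step:
// clear the bytes that will be dropped, then pack words to bytes with
// unsigned saturation.
#include <array>
#include <cstdint>
#include <cstdio>

// Pack two v8i16 inputs into one v16i8 with unsigned saturation (PACKUSWB).
static std::array<uint8_t, 16> packus(const std::array<uint16_t, 8> &A,
                                      const std::array<uint16_t, 8> &B) {
  auto Sat = [](uint16_t W) -> uint8_t { return W > 0xFF ? 0xFF : uint8_t(W); };
  std::array<uint8_t, 16> R{};
  for (int i = 0; i < 8; ++i) {
    R[i] = Sat(A[i]);
    R[i + 8] = Sat(B[i]);
  }
  return R;
}

int main() {
  // 16 source bytes viewed as 8 little-endian words; we want every 2nd byte,
  // i.e. the low byte of each word (NumEvenDrops == 1).
  std::array<uint16_t, 8> V1 = {0x1100, 0x3322, 0x5544, 0x7766,
                                0x9988, 0xBBAA, 0xDDCC, 0xFFEE};
  // The word-clear mask keeps only the low byte of every word.
  for (auto &W : V1)
    W &= 0x00FF;
  // Single-input case: PACKUS(V1, V1) yields the 8 kept bytes twice.
  std::array<uint8_t, 16> Out = packus(V1, V1);
  for (uint8_t B : Out)
    std::printf("%02X ", B); // 00 22 44 66 88 AA CC EE 00 22 44 66 88 AA CC EE
  std::printf("\n");
  return 0;
}

For larger NumEvenDrops the lowering above simply repeats the pack step, which the sketch does not attempt to show.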
16959
16960/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16961///
16962/// This routine breaks down the specific type of 128-bit shuffle and
16963/// dispatches to the lowering routines accordingly.
16964static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16965 MVT VT, SDValue V1, SDValue V2,
16966 const APInt &Zeroable,
16967 const X86Subtarget &Subtarget,
16968 SelectionDAG &DAG) {
16969 switch (VT.SimpleTy) {
16970 case MVT::v2i64:
16971 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16972 case MVT::v2f64:
16973 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16974 case MVT::v4i32:
16975 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16976 case MVT::v4f32:
16977 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16978 case MVT::v8i16:
16979 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16980 case MVT::v8f16:
16981 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16982 case MVT::v16i8:
16983 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16984
16985 default:
16986 llvm_unreachable("Unimplemented!");
16987 }
16988}
16989
16990/// Generic routine to split vector shuffle into half-sized shuffles.
16991///
16992/// This routine just extracts two subvectors, shuffles them independently, and
16993/// then concatenates them back together. This should work effectively with all
16994/// AVX vector shuffle types.
16995static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16996 SDValue V2, ArrayRef<int> Mask,
16997 SelectionDAG &DAG, bool SimpleOnly) {
16998 assert(VT.getSizeInBits() >= 256 &&
16999        "Only for 256-bit or wider vector shuffles!");
17000 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
17001 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
17002
17003 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
17004 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
17005
17006 int NumElements = VT.getVectorNumElements();
17007 int SplitNumElements = NumElements / 2;
17008 MVT ScalarVT = VT.getVectorElementType();
17009 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
17010
17011 // Use splitVector/extractSubVector so that split build-vectors just build two
17012 // narrower build vectors. This helps shuffling with splats and zeros.
17013 auto SplitVector = [&](SDValue V) {
17014 SDValue LoV, HiV;
17015 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
17016 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
17017 DAG.getBitcast(SplitVT, HiV));
17018 };
17019
17020 SDValue LoV1, HiV1, LoV2, HiV2;
17021 std::tie(LoV1, HiV1) = SplitVector(V1);
17022 std::tie(LoV2, HiV2) = SplitVector(V2);
17023
17024 // Now create two 4-way blends of these half-width vectors.
17025 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
17026 bool &UseHiV1, bool &UseLoV2,
17027 bool &UseHiV2) {
17028 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
17029 for (int i = 0; i < SplitNumElements; ++i) {
17030 int M = HalfMask[i];
17031 if (M >= NumElements) {
17032 if (M >= NumElements + SplitNumElements)
17033 UseHiV2 = true;
17034 else
17035 UseLoV2 = true;
17036 } else if (M >= 0) {
17037 if (M >= SplitNumElements)
17038 UseHiV1 = true;
17039 else
17040 UseLoV1 = true;
17041 }
17042 }
17043 };
17044
17045 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
17046 if (!SimpleOnly)
17047 return true;
17048
17049 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17050 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17051
17052 return !(UseHiV1 || UseHiV2);
17053 };
17054
17055 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
17056 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
17057 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
17058 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
17059 for (int i = 0; i < SplitNumElements; ++i) {
17060 int M = HalfMask[i];
17061 if (M >= NumElements) {
17062 V2BlendMask[i] = M - NumElements;
17063 BlendMask[i] = SplitNumElements + i;
17064 } else if (M >= 0) {
17065 V1BlendMask[i] = M;
17066 BlendMask[i] = i;
17067 }
17068 }
17069
17070 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17071 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17072
17073 // Because the lowering happens after all combining takes place, we need to
17074 // manually combine these blend masks as much as possible so that we create
17075 // a minimal number of high-level vector shuffle nodes.
17076 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17077
17078 // First try just blending the halves of V1 or V2.
17079 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17080 return DAG.getUNDEF(SplitVT);
17081 if (!UseLoV2 && !UseHiV2)
17082 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17083 if (!UseLoV1 && !UseHiV1)
17084 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17085
17086 SDValue V1Blend, V2Blend;
17087 if (UseLoV1 && UseHiV1) {
17088 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17089 } else {
17090 // We only use half of V1 so map the usage down into the final blend mask.
17091 V1Blend = UseLoV1 ? LoV1 : HiV1;
17092 for (int i = 0; i < SplitNumElements; ++i)
17093 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17094 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17095 }
17096 if (UseLoV2 && UseHiV2) {
17097 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17098 } else {
17099 // We only use half of V2 so map the usage down into the final blend mask.
17100 V2Blend = UseLoV2 ? LoV2 : HiV2;
17101 for (int i = 0; i < SplitNumElements; ++i)
17102 if (BlendMask[i] >= SplitNumElements)
17103 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17104 }
17105 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17106 };
17107
17108 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17109 return SDValue();
17110
17111 SDValue Lo = HalfBlend(LoMask);
17112 SDValue Hi = HalfBlend(HiMask);
17113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17114}
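The index arithmetic inside HalfBlend above is easier to see in isolation. This standalone sketch (illustrative names only, not LLVM APIs) reproduces the split of one half-mask into the two per-input masks plus the final blend mask, assuming a v8i32-sized shuffle:

// Standalone sketch of the HalfBlend index math: for one half of a wide
// shuffle mask, build the two single-input masks and the final blend mask
// over the split-width vectors.
#include <cstdio>
#include <vector>

int main() {
  const int NumElements = 8;                 // e.g. v8i32
  const int SplitNumElements = NumElements / 2;
  // One half of a v8i32 mask; values >= 8 select from V2.
  std::vector<int> HalfMask = {2, 10, 3, 11};

  std::vector<int> V1BlendMask(SplitNumElements, -1);
  std::vector<int> V2BlendMask(SplitNumElements, -1);
  std::vector<int> BlendMask(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // index into (LoV2, HiV2)
      BlendMask[i] = SplitNumElements + i; // take lane i of the V2 blend
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // index into (LoV1, HiV1)
      BlendMask[i] = i;                    // take lane i of the V1 blend
    }
  }

  auto Dump = [](const char *Name, const std::vector<int> &Mask) {
    std::printf("%s:", Name);
    for (int M : Mask)
      std::printf(" %2d", M);
    std::printf("\n");
  };
  Dump("V1BlendMask", V1BlendMask); //  2 -1  3 -1
  Dump("V2BlendMask", V2BlendMask); // -1  2 -1  3
  Dump("BlendMask  ", BlendMask);   //  0  5  2  7 -> interleave of the two blends
  return 0;
}

The later "map the usage down" steps in the real routine then collapse BlendMask further when only one half of an input is used; the sketch stops before that simplification.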
17115
17116/// Either split a vector in halves or decompose the shuffles and the
17117/// blend/unpack.
17118///
17119/// This is provided as a good fallback for many lowerings of non-single-input
17120/// shuffles with more than one 128-bit lane. In those cases, we want to select
17121/// between splitting the shuffle into 128-bit components and stitching those
17122/// back together vs. extracting the single-input shuffles and blending those
17123/// results.
17124static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17125 SDValue V2, ArrayRef<int> Mask,
17126 const X86Subtarget &Subtarget,
17127 SelectionDAG &DAG) {
17128 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17129        "shuffles as it could then recurse on itself.");
17130 int Size = Mask.size();
17131
17132 // If this can be modeled as a broadcast of two elements followed by a blend,
17133 // prefer that lowering. This is especially important because broadcasts can
17134 // often fold with memory operands.
17135 auto DoBothBroadcast = [&] {
17136 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17137 for (int M : Mask)
17138 if (M >= Size) {
17139 if (V2BroadcastIdx < 0)
17140 V2BroadcastIdx = M - Size;
17141 else if (M - Size != V2BroadcastIdx)
17142 return false;
17143 } else if (M >= 0) {
17144 if (V1BroadcastIdx < 0)
17145 V1BroadcastIdx = M;
17146 else if (M != V1BroadcastIdx)
17147 return false;
17148 }
17149 return true;
17150 };
17151 if (DoBothBroadcast())
17152 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17153 DAG);
17154
17155 // If the inputs all stem from a single 128-bit lane of each input, then we
17156 // split them rather than blending because the split will decompose to
17157 // unusually few instructions.
17158 int LaneCount = VT.getSizeInBits() / 128;
17159 int LaneSize = Size / LaneCount;
17160 SmallBitVector LaneInputs[2];
17161 LaneInputs[0].resize(LaneCount, false);
17162 LaneInputs[1].resize(LaneCount, false);
17163 for (int i = 0; i < Size; ++i)
17164 if (Mask[i] >= 0)
17165 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17166 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17167 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17168 /*SimpleOnly*/ false);
17169
17170 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17171 // requires that the decomposed single-input shuffles don't end up here.
17172 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17173 DAG);
17174}
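The lane-usage check above reduces to counting which 128-bit lanes each input contributes. A minimal standalone sketch of that heuristic, under the assumption of a v8i32 mask, might look like this:

// Standalone sketch of the split-vs-blend heuristic: if each input feeds the
// result from at most one 128-bit lane, splitting is preferred.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 8, LaneSize = 4;                   // v8i32, two 128-bit lanes
  std::vector<int> Mask = {0, 1, 2, 3, 8, 9, 10, 11}; // V1 lane 0 then V2 lane 0
  int LaneInputs[2][2] = {{0, 0}, {0, 0}};            // [input][lane] usage flags

  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = 1;

  int V1Lanes = LaneInputs[0][0] + LaneInputs[0][1];
  int V2Lanes = LaneInputs[1][0] + LaneInputs[1][1];
  bool PreferSplit = V1Lanes <= 1 && V2Lanes <= 1;
  std::printf("V1 lanes=%d V2 lanes=%d -> %s\n", V1Lanes, V2Lanes,
              PreferSplit ? "split" : "decompose+blend"); // split
  return 0;
}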
17175
17176// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17177// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17178static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17179 SDValue V1, SDValue V2,
17180 ArrayRef<int> Mask,
17181 SelectionDAG &DAG) {
17182 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17183
17184 int LHSMask[4] = {-1, -1, -1, -1};
17185 int RHSMask[4] = {-1, -1, -1, -1};
17186 unsigned SHUFPMask = 0;
17187
17188 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17189 // perform the shuffle once the lanes have been shuffled in place.
17190 for (int i = 0; i != 4; ++i) {
17191 int M = Mask[i];
17192 if (M < 0)
17193 continue;
17194 int LaneBase = i & ~1;
17195 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17196 LaneMask[LaneBase + (M & 1)] = M;
17197 SHUFPMask |= (M & 1) << i;
17198 }
17199
17200 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17201 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17202 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17203 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17204}
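The loop above packs one bit of the SHUFPD immediate per destination element while routing each element into the matching lane of the LHS or RHS pre-shuffle. This standalone sketch (illustrative only) reproduces that packing for an example v4f64 mask:

// Standalone sketch of the SHUFPD lowering masks: even result elements come
// from the LHS lane shuffle, odd ones from the RHS lane shuffle, and the
// immediate selects low/high within each source lane.
#include <cstdio>

int main() {
  int Mask[4] = {2, 4, 1, 6}; // 0..3 pick from V1, 4..7 from V2
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;

  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                       // 128-bit lane of the result element
    int *LaneMask = (i & 1) ? RHSMask : LHSMask; // even -> LHS, odd -> RHS
    LaneMask[LaneBase + (M & 1)] = M;            // keep the element in its low/high slot
    SHUFPMask |= (M & 1) << i;                   // SHUFPD picks low (0) or high (1) per element
  }

  std::printf("LHSMask: %d %d %d %d\n", LHSMask[0], LHSMask[1], LHSMask[2], LHSMask[3]);
  std::printf("RHSMask: %d %d %d %d\n", RHSMask[0], RHSMask[1], RHSMask[2], RHSMask[3]);
  std::printf("SHUFPD imm: 0x%X\n", SHUFPMask); // 0x4 for this mask
  return 0;
}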
17205
17206/// Lower a vector shuffle crossing multiple 128-bit lanes as
17207/// a lane permutation followed by a per-lane permutation.
17208///
17209/// This is mainly for cases where we can have non-repeating permutes
17210/// in each lane.
17211///
17212/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17213/// we should investigate merging them.
17214static SDValue lowerShuffleAsLanePermuteAndPermute(
17215 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17216 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17217 int NumElts = VT.getVectorNumElements();
17218 int NumLanes = VT.getSizeInBits() / 128;
17219 int NumEltsPerLane = NumElts / NumLanes;
17220 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17221
17222 /// Attempts to find a sublane permute with the given size
17223 /// that gets all elements into their target lanes.
17224 ///
17225 /// If successful, returns the lowered shuffle as a cross-lane permute
17226 /// followed by an in-lane permute; otherwise returns an empty SDValue.
17227 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17228 int NumSublanesPerLane = NumSublanes / NumLanes;
17229 int NumEltsPerSublane = NumElts / NumSublanes;
17230
17231 SmallVector<int, 16> CrossLaneMask;
17232 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17233 // CrossLaneMask but one entry == one sublane.
17234 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17235
17236 for (int i = 0; i != NumElts; ++i) {
17237 int M = Mask[i];
17238 if (M < 0)
17239 continue;
17240
17241 int SrcSublane = M / NumEltsPerSublane;
17242 int DstLane = i / NumEltsPerLane;
17243
17244 // We only need to get the elements into the right lane, not sublane.
17245 // So search all sublanes that make up the destination lane.
17246 bool Found = false;
17247 int DstSubStart = DstLane * NumSublanesPerLane;
17248 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17249 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17250 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17251 continue;
17252
17253 Found = true;
17254 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17255 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17256 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17257 break;
17258 }
17259 if (!Found)
17260 return SDValue();
17261 }
17262
17263 // Fill CrossLaneMask using CrossLaneMaskLarge.
17264 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17265
17266 if (!CanUseSublanes) {
17267 // If we're only shuffling a single lowest lane and the rest are identity
17268 // then don't bother.
17269 // TODO - isShuffleMaskInputInPlace could be extended to something like
17270 // this.
17271 int NumIdentityLanes = 0;
17272 bool OnlyShuffleLowestLane = true;
17273 for (int i = 0; i != NumLanes; ++i) {
17274 int LaneOffset = i * NumEltsPerLane;
17275 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17276 i * NumEltsPerLane))
17277 NumIdentityLanes++;
17278 else if (CrossLaneMask[LaneOffset] != 0)
17279 OnlyShuffleLowestLane = false;
17280 }
17281 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17282 return SDValue();
17283 }
17284
17285 // Avoid returning the same shuffle operation. For example,
17286 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17287 // undef:v16i16
17288 if (CrossLaneMask == Mask || InLaneMask == Mask)
17289 return SDValue();
17290
17291 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17292 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17293 InLaneMask);
17294 };
17295
17296 // First attempt a solution with full lanes.
17297 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17298 return V;
17299
17300 // The rest of the solutions use sublanes.
17301 if (!CanUseSublanes)
17302 return SDValue();
17303
17304 // Then attempt a solution with 64-bit sublanes (vpermq).
17305 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17306 return V;
17307
17308 // If that doesn't work and we have fast variable cross-lane shuffle,
17309 // attempt 32-bit sublanes (vpermd).
17310 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17311 return SDValue();
17312
17313 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17314}
17315
17316 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17317static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17318 SmallVector<int> &InLaneMask) {
17319 int Size = Mask.size();
17320 InLaneMask.assign(Mask.begin(), Mask.end());
17321 for (int i = 0; i < Size; ++i) {
17322 int &M = InLaneMask[i];
17323 if (M < 0)
17324 continue;
17325 if (((M % Size) / LaneSize) != (i / LaneSize))
17326 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17327 }
17328}
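A standalone model of the routine above (plain C++, no LLVM types) makes the redirection visible: any element that would cross a 128-bit lane is rewritten to the same in-lane slot of a second operand, signalled by adding Size to the index, and the caller supplies the lane-flipped vector as that second operand:

// Standalone sketch of computeInLaneShuffleMask. Illustrative only; mirrors
// the index math, not the LLVM interfaces.
#include <cstdio>
#include <vector>

static void computeInLaneShuffleMask(const std::vector<int> &Mask, int LaneSize,
                                     std::vector<int> &InLaneMask) {
  int Size = (int)Mask.size();
  InLaneMask = Mask;
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
}

int main() {
  // v8i32 mask that swaps the two 128-bit lanes of V1.
  std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::vector<int> InLaneMask;
  computeInLaneShuffleMask(Mask, /*LaneSize=*/4, InLaneMask);
  for (int M : InLaneMask)
    std::printf("%d ", M); // 8 9 10 11 12 13 14 15 -> all from the flipped operand
  std::printf("\n");
  return 0;
}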
17329
17330/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17331/// source with a lane permutation.
17332///
17333/// This lowering strategy results in four instructions in the worst case for a
17334/// single-input cross lane shuffle which is lower than any other fully general
17335/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17336/// shuffle pattern should be handled prior to trying this lowering.
17337static SDValue lowerShuffleAsLanePermuteAndShuffle(
17338 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17339 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17340 // FIXME: This should probably be generalized for 512-bit vectors as well.
17341 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17342 int Size = Mask.size();
17343 int LaneSize = Size / 2;
17344
17345 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17346 // Only do this if the elements aren't all from the lower lane,
17347 // otherwise we're (probably) better off doing a split.
17348 if (VT == MVT::v4f64 &&
17349 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17350 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17351
17352 // If there are only inputs from one 128-bit lane, splitting will in fact be
17353 // less expensive. The flags track whether the given lane contains an element
17354 // that crosses to another lane.
17355 bool AllLanes;
17356 if (!Subtarget.hasAVX2()) {
17357 bool LaneCrossing[2] = {false, false};
17358 for (int i = 0; i < Size; ++i)
17359 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17360 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17361 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17362 } else {
17363 bool LaneUsed[2] = {false, false};
17364 for (int i = 0; i < Size; ++i)
17365 if (Mask[i] >= 0)
17366 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17367 AllLanes = LaneUsed[0] && LaneUsed[1];
17368 }
17369
17370 // TODO - we could support shuffling V2 in the Flipped input.
17371 assert(V2.isUndef() &&
17372        "This last part of this routine only works on single input shuffles");
17373
17374 SmallVector<int> InLaneMask;
17375 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17376
17377 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17378        "In-lane shuffle mask expected");
17379
17380 // If the shuffle doesn't use both 128-bit lanes and the in-lane mask is not
17381 // repeating, then we're better off splitting.
17382 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17383 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17384 /*SimpleOnly*/ false);
17385
17386 // Flip the lanes, and shuffle the results which should now be in-lane.
17387 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17388 SDValue Flipped = DAG.getBitcast(PVT, V1);
17389 Flipped =
17390 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17391 Flipped = DAG.getBitcast(VT, Flipped);
17392 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17393}
17394
17395/// Handle lowering 2-lane 128-bit shuffles.
17396static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17397 SDValue V2, ArrayRef<int> Mask,
17398 const APInt &Zeroable,
17399 const X86Subtarget &Subtarget,
17400 SelectionDAG &DAG) {
17401 if (V2.isUndef()) {
17402 // Attempt to match VBROADCAST*128 subvector broadcast load.
17403 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17404 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17405 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17406 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17407 MVT MemVT = VT.getHalfNumVectorElementsVT();
17408 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17409 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17410 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17411 VT, MemVT, Ld, Ofs, DAG))
17412 return BcstLd;
17413 }
17414
17415 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17416 if (Subtarget.hasAVX2())
17417 return SDValue();
17418 }
17419
17420 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17421
17422 SmallVector<int, 4> WidenedMask;
17423 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17424 return SDValue();
17425
17426 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17427 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17428
17429 // Try to use an insert into a zero vector.
17430 if (WidenedMask[0] == 0 && IsHighZero) {
17431 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17432 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17433 DAG.getIntPtrConstant(0, DL));
17434 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17435 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17436 DAG.getIntPtrConstant(0, DL));
17437 }
17438
17439 // TODO: If minimizing size and one of the inputs is a zero vector and the
17440 // zero vector has only one use, we could use a VPERM2X128 to save the
17441 // instruction bytes needed to explicitly generate the zero vector.
17442
17443 // Blends are faster and handle all the non-lane-crossing cases.
17444 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17445 Subtarget, DAG))
17446 return Blend;
17447
17448 // If either input operand is a zero vector, use VPERM2X128 because its mask
17449 // allows us to replace the zero input with an implicit zero.
17450 if (!IsLowZero && !IsHighZero) {
17451 // Check for patterns which can be matched with a single insert of a 128-bit
17452 // subvector.
17453 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17454 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17455
17456 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17457 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17458 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17459 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17460 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17461 OnlyUsesV1 ? V1 : V2,
17462 DAG.getIntPtrConstant(0, DL));
17463 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17464 DAG.getIntPtrConstant(2, DL));
17465 }
17466 }
17467
17468 // Try to use SHUF128 if possible.
17469 if (Subtarget.hasVLX()) {
17470 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17471 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17472 ((WidenedMask[1] % 2) << 1);
17473 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17474 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17475 }
17476 }
17477 }
17478
17479 // Otherwise form a 128-bit permutation. After accounting for undefs,
17480 // convert the 64-bit shuffle mask selection values into 128-bit
17481 // selection bits by dividing the indexes by 2 and shifting into positions
17482 // defined by a vperm2*128 instruction's immediate control byte.
17483
17484 // The immediate permute control byte looks like this:
17485 // [1:0] - select 128 bits from sources for low half of destination
17486 // [2] - ignore
17487 // [3] - zero low half of destination
17488 // [5:4] - select 128 bits from sources for high half of destination
17489 // [6] - ignore
17490 // [7] - zero high half of destination
17491
17492 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17493        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17494
17495 unsigned PermMask = 0;
17496 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17497 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17498
17499 // Check the immediate mask and replace unused sources with undef.
17500 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17501 V1 = DAG.getUNDEF(VT);
17502 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17503 V2 = DAG.getUNDEF(VT);
17504
17505 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17506 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17507}
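The immediate byte assembled at the end of the routine above follows the layout documented in its comment ([1:0]/[5:4] select a 128-bit source half, [3]/[7] zero the corresponding half of the result). A small standalone sketch with a hypothetical helper reproduces that packing:

// Standalone sketch of the VPERM2X128/VPERM2F128 immediate packing.
// WidenedMask entries select a 128-bit half:
// 0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi.
#include <cstdio>

static unsigned buildVPerm2X128Imm(int WidenedMask0, int WidenedMask1,
                                   bool IsLowZero, bool IsHighZero) {
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)WidenedMask0;         // [1:0] select, [3] zero low
  PermMask |= IsHighZero ? 0x80 : ((unsigned)WidenedMask1 << 4); // [5:4] select, [7] zero high
  return PermMask;
}

int main() {
  // Result = { V1.hi, V2.lo }  ->  imm 0x21
  std::printf("0x%02X\n", buildVPerm2X128Imm(1, 2, false, false));
  // Result = { V2.hi, zeroes } ->  imm 0x83
  std::printf("0x%02X\n", buildVPerm2X128Imm(3, 0, false, true));
  return 0;
}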
17508
17509/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17510/// shuffling each lane.
17511///
17512/// This attempts to create a repeated lane shuffle where each lane uses one
17513/// or two of the lanes of the inputs. The lanes of the input vectors are
17514/// shuffled in one or two independent shuffles to get the lanes into the
17515/// position needed by the final shuffle.
17516static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17517 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17518 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17519 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
17520
17521 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17522 return SDValue();
17523
17524 int NumElts = Mask.size();
17525 int NumLanes = VT.getSizeInBits() / 128;
17526 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17527 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17528 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17529
17530 // First pass will try to fill in the RepeatMask from lanes that need two
17531 // sources.
17532 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17533 int Srcs[2] = {-1, -1};
17534 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17535 for (int i = 0; i != NumLaneElts; ++i) {
17536 int M = Mask[(Lane * NumLaneElts) + i];
17537 if (M < 0)
17538 continue;
17539 // Determine which of the possible input lanes (NumLanes from each source)
17540 // this element comes from. Assign that as one of the sources for this
17541 // lane. We can assign up to 2 sources for this lane. If we run out of
17542 // sources we can't do anything.
17543 int LaneSrc = M / NumLaneElts;
17544 int Src;
17545 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17546 Src = 0;
17547 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17548 Src = 1;
17549 else
17550 return SDValue();
17551
17552 Srcs[Src] = LaneSrc;
17553 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17554 }
17555
17556 // If this lane has two sources, see if it fits with the repeat mask so far.
17557 if (Srcs[1] < 0)
17558 continue;
17559
17560 LaneSrcs[Lane][0] = Srcs[0];
17561 LaneSrcs[Lane][1] = Srcs[1];
17562
17563 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17564 assert(M1.size() == M2.size() && "Unexpected mask size");
17565 for (int i = 0, e = M1.size(); i != e; ++i)
17566 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17567 return false;
17568 return true;
17569 };
17570
17571 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17572 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17573 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17574 int M = Mask[i];
17575 if (M < 0)
17576 continue;
17577 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17578        "Unexpected mask element");
17579 MergedMask[i] = M;
17580 }
17581 };
17582
17583 if (MatchMasks(InLaneMask, RepeatMask)) {
17584 // Merge this lane mask into the final repeat mask.
17585 MergeMasks(InLaneMask, RepeatMask);
17586 continue;
17587 }
17588
17589 // Didn't find a match. Swap the operands and try again.
17590 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17591 ShuffleVectorSDNode::commuteMask(InLaneMask);
17592
17593 if (MatchMasks(InLaneMask, RepeatMask)) {
17594 // Merge this lane mask into the final repeat mask.
17595 MergeMasks(InLaneMask, RepeatMask);
17596 continue;
17597 }
17598
17599 // Couldn't find a match with the operands in either order.
17600 return SDValue();
17601 }
17602
17603 // Now handle any lanes with only one source.
17604 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17605 // If this lane has already been processed, skip it.
17606 if (LaneSrcs[Lane][0] >= 0)
17607 continue;
17608
17609 for (int i = 0; i != NumLaneElts; ++i) {
17610 int M = Mask[(Lane * NumLaneElts) + i];
17611 if (M < 0)
17612 continue;
17613
17614 // If RepeatMask isn't defined yet we can define it ourselves.
17615 if (RepeatMask[i] < 0)
17616 RepeatMask[i] = M % NumLaneElts;
17617
17618 if (RepeatMask[i] < NumElts) {
17619 if (RepeatMask[i] != M % NumLaneElts)
17620 return SDValue();
17621 LaneSrcs[Lane][0] = M / NumLaneElts;
17622 } else {
17623 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17624 return SDValue();
17625 LaneSrcs[Lane][1] = M / NumLaneElts;
17626 }
17627 }
17628
17629 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17630 return SDValue();
17631 }
17632
17633 SmallVector<int, 16> NewMask(NumElts, -1);
17634 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17635 int Src = LaneSrcs[Lane][0];
17636 for (int i = 0; i != NumLaneElts; ++i) {
17637 int M = -1;
17638 if (Src >= 0)
17639 M = Src * NumLaneElts + i;
17640 NewMask[Lane * NumLaneElts + i] = M;
17641 }
17642 }
17643 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17644 // Ensure we didn't get back the shuffle we started with.
17645 // FIXME: This is a hack to make up for some splat handling code in
17646 // getVectorShuffle.
17647 if (isa<ShuffleVectorSDNode>(NewV1) &&
17648 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17649 return SDValue();
17650
17651 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17652 int Src = LaneSrcs[Lane][1];
17653 for (int i = 0; i != NumLaneElts; ++i) {
17654 int M = -1;
17655 if (Src >= 0)
17656 M = Src * NumLaneElts + i;
17657 NewMask[Lane * NumLaneElts + i] = M;
17658 }
17659 }
17660 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17661 // Ensure we didn't get back the shuffle we started with.
17662 // FIXME: This is a hack to make up for some splat handling code in
17663 // getVectorShuffle.
17664 if (isa<ShuffleVectorSDNode>(NewV2) &&
17665 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17666 return SDValue();
17667
17668 for (int i = 0; i != NumElts; ++i) {
17669 if (Mask[i] < 0) {
17670 NewMask[i] = -1;
17671 continue;
17672 }
17673 NewMask[i] = RepeatMask[i % NumLaneElts];
17674 if (NewMask[i] < 0)
17675 continue;
17676
17677 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17678 }
17679 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17680}
17681
17682/// If the input shuffle mask results in a vector that is undefined in all upper
17683/// or lower half elements and that mask accesses only 2 halves of the
17684/// shuffle's operands, return true. A mask of half the width with mask indexes
17685/// adjusted to access the extracted halves of the original shuffle operands is
17686/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17687/// lower half of each input operand is accessed.
17688static bool
17689getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17690 int &HalfIdx1, int &HalfIdx2) {
17691 assert((Mask.size() == HalfMask.size() * 2) &&
17692        "Expected input mask to be twice as long as output");
17693
17694 // Exactly one half of the result must be undef to allow narrowing.
17695 bool UndefLower = isUndefLowerHalf(Mask);
17696 bool UndefUpper = isUndefUpperHalf(Mask);
17697 if (UndefLower == UndefUpper)
17698 return false;
17699
17700 unsigned HalfNumElts = HalfMask.size();
17701 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17702 HalfIdx1 = -1;
17703 HalfIdx2 = -1;
17704 for (unsigned i = 0; i != HalfNumElts; ++i) {
17705 int M = Mask[i + MaskIndexOffset];
17706 if (M < 0) {
17707 HalfMask[i] = M;
17708 continue;
17709 }
17710
17711 // Determine which of the 4 half vectors this element is from.
17712 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17713 int HalfIdx = M / HalfNumElts;
17714
17715 // Determine the element index into its half vector source.
17716 int HalfElt = M % HalfNumElts;
17717
17718 // We can shuffle with up to 2 half vectors, set the new 'half'
17719 // shuffle mask accordingly.
17720 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17721 HalfMask[i] = HalfElt;
17722 HalfIdx1 = HalfIdx;
17723 continue;
17724 }
17725 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17726 HalfMask[i] = HalfElt + HalfNumElts;
17727 HalfIdx2 = HalfIdx;
17728 continue;
17729 }
17730
17731 // Too many half vectors referenced.
17732 return false;
17733 }
17734
17735 return true;
17736}
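The half-mask extraction above boils down to dividing each mask index by the half width. This standalone sketch (illustrative only, and omitting the bail-out when more than two halves are referenced) reproduces the index math for a v8i32 mask with an undef upper half:

// Standalone sketch of getHalfShuffleMask: when one half of the result is
// undef, the live half can be re-expressed as a half-width shuffle of at most
// two extracted halves (0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi).
#include <cstdio>
#include <vector>

int main() {
  // v8i32 mask with an undef upper half, interleaving V1.lo and V2.hi.
  std::vector<int> Mask = {0, 12, 1, 13, -1, -1, -1, -1};
  int HalfNumElts = (int)Mask.size() / 2;
  std::vector<int> HalfMask(HalfNumElts, -1);
  int HalfIdx1 = -1, HalfIdx2 = -1;

  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i]; // upper half is undef, so read the lower half of Mask
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts; // which of the 4 half vectors
    int HalfElt = M % HalfNumElts; // element index inside that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    }
  }

  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask:", HalfIdx1, HalfIdx2);
  for (int M : HalfMask)
    std::printf(" %d", M); // HalfIdx1=0 HalfIdx2=3 HalfMask: 0 4 1 5
  std::printf("\n");
  return 0;
}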
17737
17738/// Given the output values from getHalfShuffleMask(), create a half width
17739/// shuffle of extracted vectors followed by an insert back to full width.
17740static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17741 ArrayRef<int> HalfMask, int HalfIdx1,
17742 int HalfIdx2, bool UndefLower,
17743 SelectionDAG &DAG, bool UseConcat = false) {
17744 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17745 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17746
17747 MVT VT = V1.getSimpleValueType();
17748 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17749 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17750
17751 auto getHalfVector = [&](int HalfIdx) {
17752 if (HalfIdx < 0)
17753 return DAG.getUNDEF(HalfVT);
17754 SDValue V = (HalfIdx < 2 ? V1 : V2);
17755 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17756 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17757 DAG.getIntPtrConstant(HalfIdx, DL));
17758 };
17759
17760 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17761 SDValue Half1 = getHalfVector(HalfIdx1);
17762 SDValue Half2 = getHalfVector(HalfIdx2);
17763 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17764 if (UseConcat) {
17765 SDValue Op0 = V;
17766 SDValue Op1 = DAG.getUNDEF(HalfVT);
17767 if (UndefLower)
17768 std::swap(Op0, Op1);
17769 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17770 }
17771
17772 unsigned Offset = UndefLower ? HalfNumElts : 0;
17773 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17774 DAG.getIntPtrConstant(Offset, DL));
17775}
17776
17777/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17778/// This allows for fast cases such as subvector extraction/insertion
17779/// or shuffling smaller vector types which can lower more efficiently.
17780static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17781 SDValue V2, ArrayRef<int> Mask,
17782 const X86Subtarget &Subtarget,
17783 SelectionDAG &DAG) {
17784 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17785        "Expected 256-bit or 512-bit vector");
17786
17787 bool UndefLower = isUndefLowerHalf(Mask);
17788 if (!UndefLower && !isUndefUpperHalf(Mask))
17789 return SDValue();
17790
17791 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17792        "Completely undef shuffle mask should have been simplified already");
17793
17794 // Upper half is undef and lower half is whole upper subvector.
17795 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17796 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17797 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17798 if (!UndefLower &&
17799 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17800 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17801 DAG.getIntPtrConstant(HalfNumElts, DL));
17802 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17803 DAG.getIntPtrConstant(0, DL));
17804 }
17805
17806 // Lower half is undef and upper half is whole lower subvector.
17807 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17808 if (UndefLower &&
17809 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17810 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17811 DAG.getIntPtrConstant(0, DL));
17812 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17813 DAG.getIntPtrConstant(HalfNumElts, DL));
17814 }
17815
17816 int HalfIdx1, HalfIdx2;
17817 SmallVector<int, 8> HalfMask(HalfNumElts);
17818 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17819 return SDValue();
17820
17821 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17822
17823 // Only shuffle the halves of the inputs when useful.
17824 unsigned NumLowerHalves =
17825 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17826 unsigned NumUpperHalves =
17827 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17828 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17829
17830 // Determine the larger pattern of undef/halves, then decide if it's worth
17831 // splitting the shuffle based on subtarget capabilities and types.
17832 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17833 if (!UndefLower) {
17834 // XXXXuuuu: no insert is needed.
17835 // Always extract lowers when setting lower - these are all free subreg ops.
17836 if (NumUpperHalves == 0)
17837 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17838 UndefLower, DAG);
17839
17840 if (NumUpperHalves == 1) {
17841 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17842 if (Subtarget.hasAVX2()) {
17843 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17844 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17845 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17846 (!isSingleSHUFPSMask(HalfMask) ||
17847 Subtarget.hasFastVariableCrossLaneShuffle()))
17848 return SDValue();
17849 // If this is a unary shuffle (assume that the 2nd operand is
17850 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17851 // are better off extracting the upper half of 1 operand and using a
17852 // narrow shuffle.
17853 if (EltWidth == 64 && V2.isUndef())
17854 return SDValue();
17855 }
17856 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17857 if (Subtarget.hasAVX512() && VT.is512BitVector())
17858 return SDValue();
17859 // Extract + narrow shuffle is better than the wide alternative.
17860 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17861 UndefLower, DAG);
17862 }
17863
17864 // Don't extract both uppers, instead shuffle and then extract.
17865 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17866 return SDValue();
17867 }
17868
17869 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17870 if (NumUpperHalves == 0) {
17871 // AVX2 has efficient 64-bit element cross-lane shuffles.
17872 // TODO: Refine to account for unary shuffle, splat, and other masks?
17873 if (Subtarget.hasAVX2() && EltWidth == 64)
17874 return SDValue();
17875 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17876 if (Subtarget.hasAVX512() && VT.is512BitVector())
17877 return SDValue();
17878 // Narrow shuffle + insert is better than the wide alternative.
17879 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17880 UndefLower, DAG);
17881 }
17882
17883 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17884 return SDValue();
17885}
17886
17887/// Handle case where shuffle sources are coming from the same 128-bit lane and
17888/// every lane can be represented as the same repeating mask - allowing us to
17889/// shuffle the sources with the repeating shuffle and then permute the result
17890/// to the destination lanes.
17891static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17892 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17893 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17894 int NumElts = VT.getVectorNumElements();
17895 int NumLanes = VT.getSizeInBits() / 128;
17896 int NumLaneElts = NumElts / NumLanes;
17897
17898 // On AVX2 we may be able to just shuffle the lowest elements and then
17899 // broadcast the result.
17900 if (Subtarget.hasAVX2()) {
17901 for (unsigned BroadcastSize : {16, 32, 64}) {
17902 if (BroadcastSize <= VT.getScalarSizeInBits())
17903 continue;
17904 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17905
17906 // Attempt to match a repeating pattern every NumBroadcastElts,
17907 // accounting for UNDEFs, but only referencing the lowest 128-bit
17908 // lane of the inputs.
17909 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17910 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17911 for (int j = 0; j != NumBroadcastElts; ++j) {
17912 int M = Mask[i + j];
17913 if (M < 0)
17914 continue;
17915 int &R = RepeatMask[j];
17916 if (0 != ((M % NumElts) / NumLaneElts))
17917 return false;
17918 if (0 <= R && R != M)
17919 return false;
17920 R = M;
17921 }
17922 return true;
17923 };
17924
17925 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17926 if (!FindRepeatingBroadcastMask(RepeatMask))
17927 continue;
17928
17929 // Shuffle the (lowest) repeated elements in place for broadcast.
17930 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17931
17932 // Shuffle the actual broadcast.
17933 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17934 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17935 for (int j = 0; j != NumBroadcastElts; ++j)
17936 BroadcastMask[i + j] = j;
17937 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17938 BroadcastMask);
17939 }
17940 }
17941
17942 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17943 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17944 return SDValue();
17945
17946 // Bail if we already have a repeated lane shuffle mask.
17947 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17948 return SDValue();
17949
17950 // Helper to look for a repeated mask in each split sublane, and check that
17951 // those sublanes can then be permuted into place.
17952 auto ShuffleSubLanes = [&](int SubLaneScale) {
17953 int NumSubLanes = NumLanes * SubLaneScale;
17954 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17955
17956 // Check that all the sources are coming from the same lane and see if we
17957 // can form a repeating shuffle mask (local to each sub-lane). At the same
17958 // time, determine the source sub-lane for each destination sub-lane.
17959 int TopSrcSubLane = -1;
17960 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17961 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17962 SubLaneScale,
17963 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17964
17965 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17966 // Extract the sub-lane mask, check that it all comes from the same lane
17967 // and normalize the mask entries to come from the first lane.
17968 int SrcLane = -1;
17969 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17970 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17971 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17972 if (M < 0)
17973 continue;
17974 int Lane = (M % NumElts) / NumLaneElts;
17975 if ((0 <= SrcLane) && (SrcLane != Lane))
17976 return SDValue();
17977 SrcLane = Lane;
17978 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17979 SubLaneMask[Elt] = LocalM;
17980 }
17981
17982 // Whole sub-lane is UNDEF.
17983 if (SrcLane < 0)
17984 continue;
17985
17986 // Attempt to match against the candidate repeated sub-lane masks.
17987 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17988 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17989 for (int i = 0; i != NumSubLaneElts; ++i) {
17990 if (M1[i] < 0 || M2[i] < 0)
17991 continue;
17992 if (M1[i] != M2[i])
17993 return false;
17994 }
17995 return true;
17996 };
17997
17998 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17999 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
18000 continue;
18001
18002 // Merge the sub-lane mask into the matching repeated sub-lane mask.
18003 for (int i = 0; i != NumSubLaneElts; ++i) {
18004 int M = SubLaneMask[i];
18005 if (M < 0)
18006 continue;
18007 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
18008        "Unexpected mask element");
18009 RepeatedSubLaneMask[i] = M;
18010 }
18011
18012 // Track the topmost source sub-lane - by setting the remaining to
18013 // UNDEF we can greatly simplify shuffle matching.
18014 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
18015 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
18016 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
18017 break;
18018 }
18019
18020 // Bail if we failed to find a matching repeated sub-lane mask.
18021 if (Dst2SrcSubLanes[DstSubLane] < 0)
18022 return SDValue();
18023 }
18024 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
18025 "Unexpected source lane");
18026
18027 // Create a repeating shuffle mask for the entire vector.
18028 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
18029 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
18030 int Lane = SubLane / SubLaneScale;
18031 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
18032 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
18033 int M = RepeatedSubLaneMask[Elt];
18034 if (M < 0)
18035 continue;
18036 int Idx = (SubLane * NumSubLaneElts) + Elt;
18037 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
18038 }
18039 }
18040
18041 // Shuffle each source sub-lane to its destination.
18042 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
18043 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
18044 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
18045 if (SrcSubLane < 0)
18046 continue;
18047 for (int j = 0; j != NumSubLaneElts; ++j)
18048 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
18049 }
18050
18051 // Avoid returning the same shuffle operation.
18052 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
18053 if (RepeatedMask == Mask || SubLaneMask == Mask)
18054 return SDValue();
18055
18056 SDValue RepeatedShuffle =
18057 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
18058
18059 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
18060 SubLaneMask);
18061 };
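// Worked example (assumed values, for illustration only): for MVT::v8f32 with
// SubLaneScale = 2 the lambda above uses NumSubLanes = 4 and NumSubLaneElts = 2.
// A mask element M = 13 (element 5 of V2) normalizes to
//   SrcLane = (13 % 8) / 4 = 1
//   LocalM  = (13 % 4) + 8 = 9
// so every repeated sub-lane mask refers only to the first lane of each
// operand, while Dst2SrcSubLanes records which source sub-lane feeds each
// destination sub-lane.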
18062
18063 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
18064 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18065 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18066 // Otherwise we can only permute whole 128-bit lanes.
18067 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18068 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18069 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18070 MinSubLaneScale = 2;
18071 MaxSubLaneScale =
18072 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18073 }
18074 if (Subtarget.hasBWI() && VT == MVT::v64i8)
18075 MinSubLaneScale = MaxSubLaneScale = 4;
18076
18077 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18078 if (SDValue Shuffle = ShuffleSubLanes(Scale))
18079 return Shuffle;
18080
18081 return SDValue();
18082}
18083
18084static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18085 bool &ForceV1Zero, bool &ForceV2Zero,
18086 unsigned &ShuffleImm, ArrayRef<int> Mask,
18087 const APInt &Zeroable) {
18088 int NumElts = VT.getVectorNumElements();
18089 assert(VT.getScalarSizeInBits() == 64 &&
18090 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18091 "Unexpected data type for VSHUFPD");
18092 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18093 "Illegal shuffle mask");
18094
18095 bool ZeroLane[2] = { true, true };
18096 for (int i = 0; i < NumElts; ++i)
18097 ZeroLane[i & 1] &= Zeroable[i];
18098
18099 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18100 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
18101 ShuffleImm = 0;
18102 bool ShufpdMask = true;
18103 bool CommutableMask = true;
18104 for (int i = 0; i < NumElts; ++i) {
18105 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18106 continue;
18107 if (Mask[i] < 0)
18108 return false;
18109 int Val = (i & 6) + NumElts * (i & 1);
18110 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18111 if (Mask[i] < Val || Mask[i] > Val + 1)
18112 ShufpdMask = false;
18113 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18114 CommutableMask = false;
18115 ShuffleImm |= (Mask[i] % 2) << i;
18116 }
18117
18118 if (!ShufpdMask && !CommutableMask)
18119 return false;
18120
18121 if (!ShufpdMask && CommutableMask)
18122 std::swap(V1, V2);
18123
18124 ForceV1Zero = ZeroLane[0];
18125 ForceV2Zero = ZeroLane[1];
18126 return true;
18127}
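For reference, a minimal standalone sketch (not LLVM code) of how the SHUFPD immediate falls out of the loop above, assuming NumElts = 4 (v4f64), a hypothetical mask {1, 4, 3, 7}, and no zeroable lanes:

#include <cassert>

int main() {
  const int NumElts = 4;
  const int Mask[4] = {1, 4, 3, 7};
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    // Expected base index per destination slot: 0, 4, 2, 6 (each slot may take
    // the base index or the one above it).
    int Val = (i & 6) + NumElts * (i & 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    // Bit i is set when slot i takes the odd element of its source pair.
    ShuffleImm |= (Mask[i] % 2) << i;
  }
  assert(ShufpdMask && ShuffleImm == 0xD); // 0b1101
  return 0;
}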
18128
18129static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18130 SDValue V2, ArrayRef<int> Mask,
18131 const APInt &Zeroable,
18132 const X86Subtarget &Subtarget,
18133 SelectionDAG &DAG) {
18134 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18135 "Unexpected data type for VSHUFPD");
18136
18137 unsigned Immediate = 0;
18138 bool ForceV1Zero = false, ForceV2Zero = false;
18139 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18140 Mask, Zeroable))
18141 return SDValue();
18142
18143 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18144 if (ForceV1Zero)
18145 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18146 if (ForceV2Zero)
18147 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18148
18149 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18150 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18151}
18152
18153// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18154// by zeroable elements in the remaining 24 elements. Turn this into two
18155// vmovqb instructions shuffled together.
18156static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18157 SDValue V1, SDValue V2,
18158 ArrayRef<int> Mask,
18159 const APInt &Zeroable,
18160 SelectionDAG &DAG) {
18161 assert(VT == MVT::v32i8 && "Unexpected type!");
18162
18163 // The first 8 indices should be every 8th element.
18164 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18165 return SDValue();
18166
18167 // Remaining elements need to be zeroable.
18168 if (Zeroable.countl_one() < (Mask.size() - 8))
18169 return SDValue();
18170
18171 V1 = DAG.getBitcast(MVT::v4i64, V1);
18172 V2 = DAG.getBitcast(MVT::v4i64, V2);
18173
18174 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18175 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18176
18177 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18178 // the upper bits of the result using an unpckldq.
18179 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18180 { 0, 1, 2, 3, 16, 17, 18, 19,
18181 4, 5, 6, 7, 20, 21, 22, 23 });
18182 // Insert the unpckldq into a zero vector to widen to v32i8.
18183 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18184 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18185 DAG.getIntPtrConstant(0, DL));
18186}
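A comment-form walkthrough of the lowering above, under an assumed input that matches the pattern (values chosen purely for illustration):

// Input: a v32i8 shuffle whose first 8 mask entries are
//   {0, 8, 16, 24, 32, 40, 48, 56} and whose remaining 24 lanes are zeroable.
//  - VTRUNC of V1 (viewed as v4i64) yields {V1[0], V1[8], V1[16], V1[24],
//    0, ..., 0} as a v16i8; the same applies to V2.
//  - The {0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23} shuffle interleaves the
//    two truncations in 4-byte chunks (an unpckldq), so the low 8 bytes are
//    the requested elements and the rest are the zeros produced by VTRUNC.
//  - Inserting into a zero v32i8 widens the result, matching the zeroable
//    upper 24 lanes.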
18187
18188// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18189// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18190// =>
18191// ul = unpckl v1, v2
18192// uh = unpckh v1, v2
18193// a = vperm ul, uh
18194// b = vperm ul, uh
18195//
18196// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18197// and permute. We cannot directly match v3 because it is split into two
18198// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18199// pair of 256-bit shuffles and makes sure the masks are consecutive.
18200//
18201// Once unpck and permute nodes are created, the permute corresponding to this
18202// shuffle is returned, while the other permute replaces the other half of the
18203// shuffle in the selection dag.
18204static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18205 SDValue V1, SDValue V2,
18206 ArrayRef<int> Mask,
18207 SelectionDAG &DAG) {
18208 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18209 VT != MVT::v32i8)
18210 return SDValue();
18211 // <B0, B1, B0+1, B1+1, ..., >
18212 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18213 unsigned Begin1) {
18214 size_t Size = Mask.size();
18215 assert(Size % 2 == 0 && "Expected even mask size");
18216 for (unsigned I = 0; I < Size; I += 2) {
18217 if (Mask[I] != (int)(Begin0 + I / 2) ||
18218 Mask[I + 1] != (int)(Begin1 + I / 2))
18219 return false;
18220 }
18221 return true;
18222 };
18223 // Check which half of the interleave this shuffle node produces.
18224 int NumElts = VT.getVectorNumElements();
18225 size_t FirstQtr = NumElts / 2;
18226 size_t ThirdQtr = NumElts + NumElts / 2;
18227 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18228 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18229 if (!IsFirstHalf && !IsSecondHalf)
18230 return SDValue();
18231
18232 // Find the intersection between shuffle users of V1 and V2.
18233 SmallVector<SDNode *, 2> Shuffles;
18234 for (SDNode *User : V1->uses())
18235 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18236 User->getOperand(1) == V2)
18237 Shuffles.push_back(User);
18238 // Limit user size to two for now.
18239 if (Shuffles.size() != 2)
18240 return SDValue();
18241 // Find out which half of the 512-bit shuffle each smaller shuffle corresponds to.
18242 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18243 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18244 SDNode *FirstHalf;
18245 SDNode *SecondHalf;
18246 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18247 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18248 FirstHalf = Shuffles[0];
18249 SecondHalf = Shuffles[1];
18250 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18251 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18252 FirstHalf = Shuffles[1];
18253 SecondHalf = Shuffles[0];
18254 } else {
18255 return SDValue();
18256 }
18257 // Lower into unpck and perm. Return the perm of this shuffle and replace
18258 // the other.
18259 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18260 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18261 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18262 DAG.getTargetConstant(0x20, DL, MVT::i8));
18263 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18264 DAG.getTargetConstant(0x31, DL, MVT::i8));
18265 if (IsFirstHalf) {
18266 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18267 return Perm1;
18268 }
18269 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18270 return Perm2;
18271}
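A minimal standalone sketch (not LLVM code) of the pair of half-masks this routine matches, assuming VT = MVT::v8i32; the helper mirrors IsInterleavingPattern above:

#include <cassert>
#include <vector>

static bool isInterleaving(const std::vector<int> &Mask, int Begin0, int Begin1) {
  for (size_t I = 0; I < Mask.size(); I += 2)
    if (Mask[I] != (int)(Begin0 + I / 2) || Mask[I + 1] != (int)(Begin1 + I / 2))
      return false;
  return true;
}

int main() {
  // First/second half of interleave(v1, v2) for NumElts = 8.
  std::vector<int> FirstHalf = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> SecondHalf = {4, 12, 5, 13, 6, 14, 7, 15};
  assert(isInterleaving(FirstHalf, 0, 8));   // Begin0 = 0,           Begin1 = NumElts
  assert(isInterleaving(SecondHalf, 4, 12)); // Begin0 = NumElts / 2, Begin1 = NumElts + NumElts / 2
  return 0;
}

The VPERM2X128 immediates 0x20 and 0x31 used above then stitch the low and high 128-bit halves of the two unpack results back into these two ymm outputs.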
18272
18273/// Handle lowering of 4-lane 64-bit floating point shuffles.
18274///
18275/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18276/// isn't available.
18277static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18278 const APInt &Zeroable, SDValue V1, SDValue V2,
18279 const X86Subtarget &Subtarget,
18280 SelectionDAG &DAG) {
18281 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18282 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18283 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18284
18285 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18286 Subtarget, DAG))
18287 return V;
18288
18289 if (V2.isUndef()) {
18290 // Check for being able to broadcast a single element.
18291 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18292 Mask, Subtarget, DAG))
18293 return Broadcast;
18294
18295 // Use low duplicate instructions for masks that match their pattern.
18296 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18297 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18298
18299 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18300 // Non-half-crossing single input shuffles can be lowered with an
18301 // interleaved permutation.
18302 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18303 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18304 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18305 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18306 }
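// Worked example for the VPERMILPI immediate just above (assumed mask, for
// illustration): Mask = {1, 0, 3, 2} sets
//   bit 0 = (Mask[0] == 1) = 1, bit 1 = (Mask[1] == 1) = 0,
//   bit 2 = (Mask[2] == 3) = 1, bit 3 = (Mask[3] == 3) = 0,
// giving VPERMILPMask = 0b0101, which swaps the two doubles in each 128-bit
// lane.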
18307
18308 // With AVX2 we have direct support for this permutation.
18309 if (Subtarget.hasAVX2())
18310 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18311 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18312
18313 // Try to create an in-lane repeating shuffle mask and then shuffle the
18314 // results into the target lanes.
18315 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18316 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18317 return V;
18318
18319 // Try to permute the lanes and then use a per-lane permute.
18320 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18321 Mask, DAG, Subtarget))
18322 return V;
18323
18324 // Otherwise, fall back.
18325 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18326 DAG, Subtarget);
18327 }
18328
18329 // Use dedicated unpack instructions for masks that match their pattern.
18330 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18331 return V;
18332
18333 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18334 Zeroable, Subtarget, DAG))
18335 return Blend;
18336
18337 // Check if the blend happens to exactly fit that of SHUFPD.
18338 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18339 Zeroable, Subtarget, DAG))
18340 return Op;
18341
18342 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18343 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18344
18345 // If we have lane crossing shuffles AND they don't all come from the lower
18346 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18347 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18348 // canonicalizes to a blend of splats, which isn't necessary for this combine.
18349 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18350 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18351 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18352 (V2.getOpcode() != ISD::BUILD_VECTOR))
18353 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18354
18355 // If we have one input in place, then we can permute the other input and
18356 // blend the result.
18357 if (V1IsInPlace || V2IsInPlace)
18358 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18359 Subtarget, DAG);
18360
18361 // Try to create an in-lane repeating shuffle mask and then shuffle the
18362 // results into the target lanes.
18363 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18364 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18365 return V;
18366
18367 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18368 // shuffle. However, if we have AVX2 and either input is already in place,
18369 // we will be able to shuffle the other input even across lanes in a single
18370 // instruction, so skip this pattern.
18371 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18372 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18373 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18374 return V;
18375
18376 // If we have VLX support, we can use VEXPAND.
18377 if (Subtarget.hasVLX())
18378 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18379 DAG, Subtarget))
18380 return V;
18381
18382 // If we have AVX2 then we always want to lower with a blend because at v4 we
18383 // can fully permute the elements.
18384 if (Subtarget.hasAVX2())
18385 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18386 Subtarget, DAG);
18387
18388 // Otherwise fall back on generic lowering.
18389 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18390 Subtarget, DAG);
18391}
18392
18393/// Handle lowering of 4-lane 64-bit integer shuffles.
18394///
18395/// This routine is only called when we have AVX2 and thus a reasonable
18396/// instruction set for v4i64 shuffling.
18397static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18398 const APInt &Zeroable, SDValue V1, SDValue V2,
18399 const X86Subtarget &Subtarget,
18400 SelectionDAG &DAG) {
18401 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18402 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18403 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18404 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18405
18406 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18407 Subtarget, DAG))
18408 return V;
18409
18410 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18411 Zeroable, Subtarget, DAG))
18412 return Blend;
18413
18414 // Check for being able to broadcast a single element.
18415 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18416 Subtarget, DAG))
18417 return Broadcast;
18418
18419 // Try to use shift instructions if fast.
18420 if (Subtarget.preferLowerShuffleAsShift())
18421 if (SDValue Shift =
18422 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18423 Subtarget, DAG, /*BitwiseOnly*/ true))
18424 return Shift;
18425
18426 if (V2.isUndef()) {
18427 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18428 // can use lower latency instructions that will operate on both lanes.
18429 SmallVector<int, 2> RepeatedMask;
18430 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18431 SmallVector<int, 4> PSHUFDMask;
18432 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18433 return DAG.getBitcast(
18434 MVT::v4i64,
18435 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18436 DAG.getBitcast(MVT::v8i32, V1),
18437 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18438 }
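// Worked example for the PSHUFD path just above (assumed mask, for
// illustration): Mask = {1, 0, 3, 2} repeats {1, 0} in both 128-bit lanes;
// narrowing to 32-bit elements turns each i64 index M into {2*M, 2*M+1},
// giving PSHUFDMask = {2, 3, 0, 1} (immediate 0x4E), which swaps the two
// halves of each lane without needing a cross-lane VPERMQ.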
18439
18440 // AVX2 provides a direct instruction for permuting a single input across
18441 // lanes.
18442 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18443 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18444 }
18445
18446 // Try to use shift instructions.
18447 if (SDValue Shift =
18448 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18449 DAG, /*BitwiseOnly*/ false))
18450 return Shift;
18451
18452 // If we have VLX support, we can use VALIGN or VEXPAND.
18453 if (Subtarget.hasVLX()) {
18454 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18455 Subtarget, DAG))
18456 return Rotate;
18457
18458 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18459 DAG, Subtarget))
18460 return V;
18461 }
18462
18463 // Try to use PALIGNR.
18464 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18465 Subtarget, DAG))
18466 return Rotate;
18467
18468 // Use dedicated unpack instructions for masks that match their pattern.
18469 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18470 return V;
18471
18472 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18473 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18474
18475 // If we have one input in place, then we can permute the other input and
18476 // blend the result.
18477 if (V1IsInPlace || V2IsInPlace)
18478 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18479 Subtarget, DAG);
18480
18481 // Try to create an in-lane repeating shuffle mask and then shuffle the
18482 // results into the target lanes.
18483 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18484 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18485 return V;
18486
18487 // Try to lower to PERMQ(BLENDD(V1,V2)).
18488 if (SDValue V =
18489 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18490 return V;
18491
18492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18493 // shuffle. However, if we have AVX2 and either input is already in place,
18494 // we will be able to shuffle the other input even across lanes in a single
18495 // instruction, so skip this pattern.
18496 if (!V1IsInPlace && !V2IsInPlace)
18497 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18498 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18499 return Result;
18500
18501 // Otherwise fall back on generic blend lowering.
18502 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18503 Subtarget, DAG);
18504}
18505
18506/// Handle lowering of 8-lane 32-bit floating point shuffles.
18507///
18508/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18509/// isn't available.
18510static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18511 const APInt &Zeroable, SDValue V1, SDValue V2,
18512 const X86Subtarget &Subtarget,
18513 SelectionDAG &DAG) {
18514 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18515 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18516 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18517
18518 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18519 Zeroable, Subtarget, DAG))
18520 return Blend;
18521
18522 // Check for being able to broadcast a single element.
18523 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18524 Subtarget, DAG))
18525 return Broadcast;
18526
18527 if (!Subtarget.hasAVX2()) {
18528 SmallVector<int> InLaneMask;
18529 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18530
18531 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18532 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18533 /*SimpleOnly*/ true))
18534 return R;
18535 }
18536 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18537 Zeroable, Subtarget, DAG))
18538 return DAG.getBitcast(MVT::v8f32, ZExt);
18539
18540 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18541 // options to efficiently lower the shuffle.
18542 SmallVector<int, 4> RepeatedMask;
18543 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18544 assert(RepeatedMask.size() == 4 &&
18545 "Repeated masks must be half the mask width!");
18546
18547 // Use even/odd duplicate instructions for masks that match their pattern.
18548 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18549 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18550 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18551 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
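// Worked example (assumed masks, for illustration): RepeatedMask {0, 0, 2, 2}
// corresponds to the full v8f32 mask {0, 0, 2, 2, 4, 4, 6, 6}, which MOVSLDUP
// implements by duplicating the even-index floats; {1, 1, 3, 3} likewise maps
// to MOVSHDUP for the odd-index floats. Both read only V1.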
18552
18553 if (V2.isUndef())
18554 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18555 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18556
18557 // Use dedicated unpack instructions for masks that match their pattern.
18558 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18559 return V;
18560
18561 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18562 // have already handled any direct blends.
18563 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18564 }
18565
18566 // Try to create an in-lane repeating shuffle mask and then shuffle the
18567 // results into the target lanes.
18568 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18569 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18570 return V;
18571
18572 // If we have a single input shuffle with different shuffle patterns in the
18573 // two 128-bit lanes use the variable mask to VPERMILPS.
18574 if (V2.isUndef()) {
18575 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18576 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18577 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18578 }
18579 if (Subtarget.hasAVX2()) {
18580 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18581 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18582 }
18583 // Otherwise, fall back.
18584 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18585 DAG, Subtarget);
18586 }
18587
18588 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18589 // shuffle.
18590 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18591 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18592 return Result;
18593
18594 // If we have VLX support, we can use VEXPAND.
18595 if (Subtarget.hasVLX())
18596 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18597 DAG, Subtarget))
18598 return V;
18599
18600 // Try to match an interleave of two v8f32s and lower them as unpck and
18601 // permutes using ymms. This needs to go before we try to split the vectors.
18602 //
18603 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18604 // this path inadvertently.
18605 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18606 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18607 Mask, DAG))
18608 return V;
18609
18610 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
18611 // to split, since after the split we get more efficient code using vpunpcklwd
18612 // and vpunpckhwd instructions than with vblend.
18613 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18614 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18615 DAG);
18616
18617 // If we have AVX2 then we always want to lower with a blend because at v8 we
18618 // can fully permute the elements.
18619 if (Subtarget.hasAVX2())
18620 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18621 Subtarget, DAG);
18622
18623 // Otherwise fall back on generic lowering.
18624 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18625 Subtarget, DAG);
18626}
18627
18628/// Handle lowering of 8-lane 32-bit integer shuffles.
18629///
18630/// This routine is only called when we have AVX2 and thus a reasonable
18631/// instruction set for v8i32 shuffling.
18632static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18633 const APInt &Zeroable, SDValue V1, SDValue V2,
18634 const X86Subtarget &Subtarget,
18635 SelectionDAG &DAG) {
18636 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18637 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18638 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18639 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18640
18641 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18642
18643 // Whenever we can lower this as a zext, that instruction is strictly faster
18644 // than any alternative. It also allows us to fold memory operands into the
18645 // shuffle in many cases.
18646 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18647 Zeroable, Subtarget, DAG))
18648 return ZExt;
18649
18650 // Try to match an interleave of two v8i32s and lower them as unpck and
18651 // permutes using ymms. This needs to go before we try to split the vectors.
18652 if (!Subtarget.hasAVX512())
18653 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18654 Mask, DAG))
18655 return V;
18656
18657 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
18658 // to split, since after the split we get more efficient code than with vblend
18659 // by using the vpunpcklwd and vpunpckhwd instructions.
18660 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18661 !Subtarget.hasAVX512())
18662 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18663 DAG);
18664
18665 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18666 Zeroable, Subtarget, DAG))
18667 return Blend;
18668
18669 // Check for being able to broadcast a single element.
18670 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18671 Subtarget, DAG))
18672 return Broadcast;
18673
18674 // Try to use shift instructions if fast.
18675 if (Subtarget.preferLowerShuffleAsShift()) {
18676 if (SDValue Shift =
18677 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18678 Subtarget, DAG, /*BitwiseOnly*/ true))
18679 return Shift;
18680 if (NumV2Elements == 0)
18681 if (SDValue Rotate =
18682 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18683 return Rotate;
18684 }
18685
18686 // If the shuffle mask is repeated in each 128-bit lane we can use more
18687 // efficient instructions that mirror the shuffles across the two 128-bit
18688 // lanes.
18689 SmallVector<int, 4> RepeatedMask;
18690 bool Is128BitLaneRepeatedShuffle =
18691 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18692 if (Is128BitLaneRepeatedShuffle) {
18693 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18694 if (V2.isUndef())
18695 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18696 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18697
18698 // Use dedicated unpack instructions for masks that match their pattern.
18699 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18700 return V;
18701 }
18702
18703 // Try to use shift instructions.
18704 if (SDValue Shift =
18705 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18706 DAG, /*BitwiseOnly*/ false))
18707 return Shift;
18708
18709 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18710 if (SDValue Rotate =
18711 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18712 return Rotate;
18713
18714 // If we have VLX support, we can use VALIGN or EXPAND.
18715 if (Subtarget.hasVLX()) {
18716 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18717 Subtarget, DAG))
18718 return Rotate;
18719
18720 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18721 DAG, Subtarget))
18722 return V;
18723 }
18724
18725 // Try to use byte rotation instructions.
18726 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18727 Subtarget, DAG))
18728 return Rotate;
18729
18730 // Try to create an in-lane repeating shuffle mask and then shuffle the
18731 // results into the target lanes.
18732 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18733 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18734 return V;
18735
18736 if (V2.isUndef()) {
18737 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18738 // because that should be faster than the variable permute alternatives.
18739 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18740 return V;
18741
18742 // If the shuffle patterns aren't repeated but it's a single input, directly
18743 // generate a cross-lane VPERMD instruction.
18744 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18745 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18746 }
18747
18748 // Assume that a single SHUFPS is faster than an alternative sequence of
18749 // multiple instructions (even if the CPU has a domain penalty).
18750 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18751 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18752 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18753 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18754 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18755 CastV1, CastV2, DAG);
18756 return DAG.getBitcast(MVT::v8i32, ShufPS);
18757 }
18758
18759 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18760 // shuffle.
18761 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18762 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18763 return Result;
18764
18765 // Otherwise fall back on generic blend lowering.
18766 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18767 Subtarget, DAG);
18768}
18769
18770/// Handle lowering of 16-lane 16-bit integer shuffles.
18771///
18772/// This routine is only called when we have AVX2 and thus a reasonable
18773/// instruction set for v16i16 shuffling.
18774static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18775 const APInt &Zeroable, SDValue V1, SDValue V2,
18776 const X86Subtarget &Subtarget,
18777 SelectionDAG &DAG) {
18778 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18779 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18780 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18781 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18782
18783 // Whenever we can lower this as a zext, that instruction is strictly faster
18784 // than any alternative. It also allows us to fold memory operands into the
18785 // shuffle in many cases.
18786 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18787 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18788 return ZExt;
18789
18790 // Check for being able to broadcast a single element.
18791 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18792 Subtarget, DAG))
18793 return Broadcast;
18794
18795 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18796 Zeroable, Subtarget, DAG))
18797 return Blend;
18798
18799 // Use dedicated unpack instructions for masks that match their pattern.
18800 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18801 return V;
18802
18803 // Use dedicated pack instructions for masks that match their pattern.
18804 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18805 Subtarget))
18806 return V;
18807
18808 // Try to lower using a truncation.
18809 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18810 Subtarget, DAG))
18811 return V;
18812
18813 // Try to use shift instructions.
18814 if (SDValue Shift =
18815 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18816 Subtarget, DAG, /*BitwiseOnly*/ false))
18817 return Shift;
18818
18819 // Try to use byte rotation instructions.
18820 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18821 Subtarget, DAG))
18822 return Rotate;
18823
18824 // Try to create an in-lane repeating shuffle mask and then shuffle the
18825 // results into the target lanes.
18826 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18827 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18828 return V;
18829
18830 if (V2.isUndef()) {
18831 // Try to use bit rotation instructions.
18832 if (SDValue Rotate =
18833 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18834 return Rotate;
18835
18836 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18837 // because that should be faster than the variable permute alternatives.
18838 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18839 return V;
18840
18841 // There are no generalized cross-lane shuffle operations available on i16
18842 // element types.
18843 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18844 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18845 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18846 return V;
18847
18848 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18849 DAG, Subtarget);
18850 }
18851
18852 SmallVector<int, 8> RepeatedMask;
18853 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18854 // As this is a single-input shuffle, the repeated mask should be
18855 // a strictly valid v8i16 mask that we can pass through to the v8i16
18856 // lowering to handle even the v16 case.
18857 return lowerV8I16GeneralSingleInputShuffle(
18858 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18859 }
18860 }
18861
18862 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18863 Zeroable, Subtarget, DAG))
18864 return PSHUFB;
18865
18866 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18867 if (Subtarget.hasBWI())
18868 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18869
18870 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18871 // shuffle.
18872 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18873 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18874 return Result;
18875
18876 // Try to permute the lanes and then use a per-lane permute.
18877 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18878 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18879 return V;
18880
18881 // Try to match an interleave of two v16i16s and lower them as unpck and
18882 // permutes using ymms.
18883 if (!Subtarget.hasAVX512())
18884 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18885 Mask, DAG))
18886 return V;
18887
18888 // Otherwise fall back on generic lowering.
18889 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18890 Subtarget, DAG);
18891}
18892
18893/// Handle lowering of 32-lane 8-bit integer shuffles.
18894///
18895/// This routine is only called when we have AVX2 and thus a reasonable
18896/// instruction set for v32i8 shuffling.
18897static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18898 const APInt &Zeroable, SDValue V1, SDValue V2,
18899 const X86Subtarget &Subtarget,
18900 SelectionDAG &DAG) {
18901 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18902 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18903 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18904 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18905
18906 // Whenever we can lower this as a zext, that instruction is strictly faster
18907 // than any alternative. It also allows us to fold memory operands into the
18908 // shuffle in many cases.
18909 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18910 Zeroable, Subtarget, DAG))
18911 return ZExt;
18912
18913 // Check for being able to broadcast a single element.
18914 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18915 Subtarget, DAG))
18916 return Broadcast;
18917
18918 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18919 Zeroable, Subtarget, DAG))
18920 return Blend;
18921
18922 // Use dedicated unpack instructions for masks that match their pattern.
18923 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18924 return V;
18925
18926 // Use dedicated pack instructions for masks that match their pattern.
18927 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18928 Subtarget))
18929 return V;
18930
18931 // Try to lower using a truncation.
18932 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18933 Subtarget, DAG))
18934 return V;
18935
18936 // Try to use shift instructions.
18937 if (SDValue Shift =
18938 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18939 DAG, /*BitwiseOnly*/ false))
18940 return Shift;
18941
18942 // Try to use byte rotation instructions.
18943 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18944 Subtarget, DAG))
18945 return Rotate;
18946
18947 // Try to use bit rotation instructions.
18948 if (V2.isUndef())
18949 if (SDValue Rotate =
18950 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18951 return Rotate;
18952
18953 // Try to create an in-lane repeating shuffle mask and then shuffle the
18954 // results into the target lanes.
18955 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18956 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18957 return V;
18958
18959 // There are no generalized cross-lane shuffle operations available on i8
18960 // element types.
18961 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18962 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18963 // because that should be faster than the variable permute alternatives.
18964 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18965 return V;
18966
18967 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18968 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18969 return V;
18970
18971 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18972 DAG, Subtarget);
18973 }
18974
18975 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18976 Zeroable, Subtarget, DAG))
18977 return PSHUFB;
18978
18979 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18980 if (Subtarget.hasVBMI())
18981 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18982
18983 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18984 // shuffle.
18985 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18986 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18987 return Result;
18988
18989 // Try to permute the lanes and then use a per-lane permute.
18990 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18991 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18992 return V;
18993
18994 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18995 // by zeroable elements in the remaining 24 elements. Turn this into two
18996 // vmovqb instructions shuffled together.
18997 if (Subtarget.hasVLX())
18998 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18999 Mask, Zeroable, DAG))
19000 return V;
19001
19002 // Try to match an interleave of two v32i8s and lower them as unpck and
19003 // permutes using ymms.
19004 if (!Subtarget.hasAVX512())
19005 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
19006 Mask, DAG))
19007 return V;
19008
19009 // Otherwise fall back on generic lowering.
19010 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
19011 Subtarget, DAG);
19012}
19013
19014/// High-level routine to lower various 256-bit x86 vector shuffles.
19015///
19016/// This routine either breaks down the specific type of a 256-bit x86 vector
19017/// shuffle or splits it into two 128-bit shuffles and fuses the results back
19018/// together based on the available instructions.
19019static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
19020 SDValue V1, SDValue V2, const APInt &Zeroable,
19021 const X86Subtarget &Subtarget,
19022 SelectionDAG &DAG) {
19023 // If we have a single input to the zero element, insert that into V1 if we
19024 // can do so cheaply.
19025 int NumElts = VT.getVectorNumElements();
19026 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19027
19028 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19029 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19030 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19031 return Insertion;
19032
19033 // Handle special cases where the lower or upper half is UNDEF.
19034 if (SDValue V =
19035 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19036 return V;
19037
19038 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
19039 // can check for those subtargets here and avoid much of the subtarget
19040 // querying in the per-vector-type lowering routines. With AVX1 we have
19041 // essentially *zero* ability to manipulate a 256-bit vector with integer
19042 // types. Since we'll use floating point types there eventually, just
19043 // immediately cast everything to a float and operate entirely in that domain.
19044 if (VT.isInteger() && !Subtarget.hasAVX2()) {
19045 int ElementBits = VT.getScalarSizeInBits();
19046 if (ElementBits < 32) {
19047 // No floating point type available, if we can't use the bit operations
19048 // for masking/blending then decompose into 128-bit vectors.
19049 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19050 Subtarget, DAG))
19051 return V;
19052 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19053 return V;
19054 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19055 }
19056
19057 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
19058 VT.getVectorNumElements());
19059 V1 = DAG.getBitcast(FpVT, V1);
19060 V2 = DAG.getBitcast(FpVT, V2);
19061 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
19062 }
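// Worked example (assumed types, for illustration): without AVX2, a v16i16 or
// v32i8 shuffle (ElementBits < 32) is handled as a bit mask/blend or split
// into two 128-bit shuffles, while v8i32 is bitcast to v8f32 and v4i64 to
// v4f64 so the whole shuffle stays in the AVX1 floating-point domain.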
19063
19064 if (VT == MVT::v16f16) {
19065 V1 = DAG.getBitcast(MVT::v16i16, V1);
19066 V2 = DAG.getBitcast(MVT::v16i16, V2);
19067 return DAG.getBitcast(MVT::v16f16,
19068 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19069 }
19070
19071 switch (VT.SimpleTy) {
19072 case MVT::v4f64:
19073 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19074 case MVT::v4i64:
19075 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19076 case MVT::v8f32:
19077 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19078 case MVT::v8i32:
19079 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19080 case MVT::v16i16:
19081 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19082 case MVT::v32i8:
19083 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19084
19085 default:
19086 llvm_unreachable("Not a valid 256-bit x86 vector type!");
19087 }
19088}
19089
19090/// Try to lower a vector shuffle as a 128-bit shuffles.
19091static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19092 const APInt &Zeroable, SDValue V1, SDValue V2,
19093 const X86Subtarget &Subtarget,
19094 SelectionDAG &DAG) {
19095 assert(VT.getScalarSizeInBits() == 64 &&
19096 "Unexpected element type size for 128bit shuffle.");
19097
19098 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
19099 // is most probably the better solution for that case.
19100   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19101
19102 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19103 SmallVector<int, 4> Widened128Mask;
19104 if (!canWidenShuffleElements(Mask, Widened128Mask))
19105 return SDValue();
19106   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19107
19108 // Try to use an insert into a zero vector.
19109 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19110 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19111 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19112 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19113 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19114 DAG.getIntPtrConstant(0, DL));
19115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19116 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19117 DAG.getIntPtrConstant(0, DL));
19118 }
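The Zeroable tests above read as follows for a v8i64/v8f64 shuffle: bit i of Zeroable means result element i is allowed to be zero, so 0xf0 covers elements 4-7 (the upper 256 bits) and 0x0c covers elements 2-3. A standalone model of that decision (hypothetical helper name, a plain bitmask in place of APInt) that returns how many low elements are kept, or 0 when the pattern does not apply:

#include <cstdint>

int zeroInsertSubvectorElts(uint8_t Zeroable, int WidenedMask0, int WidenedMask1) {
  if (WidenedMask0 != 0 || (Zeroable & 0xf0) != 0xf0)
    return 0;                        // Low lane must be in place and the top half zeroable.
  if ((Zeroable & 0x0c) == 0x0c)
    return 2;                        // Keep elements 0-1; elements 2-7 become zero.
  return WidenedMask1 == 1 ? 4 : 0;  // Keep elements 0-3; elements 4-7 become zero.
}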
19119
19120 // Check for patterns which can be matched with a single insert of a 256-bit
19121 // subvector.
19122 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19123 if (OnlyUsesV1 ||
19124 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19125 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19126 SDValue SubVec =
19127 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19128 DAG.getIntPtrConstant(0, DL));
19129 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19130 DAG.getIntPtrConstant(4, DL));
19131 }
19132
19133 // See if this is an insertion of the lower 128-bits of V2 into V1.
19134 bool IsInsert = true;
19135 int V2Index = -1;
19136 for (int i = 0; i < 4; ++i) {
19137     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19138 if (Widened128Mask[i] < 0)
19139 continue;
19140
19141 // Make sure all V1 subvectors are in place.
19142 if (Widened128Mask[i] < 4) {
19143 if (Widened128Mask[i] != i) {
19144 IsInsert = false;
19145 break;
19146 }
19147 } else {
19148       // Make sure we only have a single V2 index and it's the lowest 128-bits.
19149 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19150 IsInsert = false;
19151 break;
19152 }
19153 V2Index = i;
19154 }
19155 }
19156 if (IsInsert && V2Index >= 0) {
19157 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19158 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19159 DAG.getIntPtrConstant(0, DL));
19160 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19161 }
19162
19163   // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
19164 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19165 // possible we at least ensure the lanes stay sequential to help later
19166 // combines.
19167 SmallVector<int, 2> Widened256Mask;
19168 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19169 Widened128Mask.clear();
19170 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19171 }
19172
19173 // Try to lower to vshuf64x2/vshuf32x4.
19174 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19175 unsigned PermMask = 0;
19176   // Ensure elements came from the same Op.
19177 for (int i = 0; i < 4; ++i) {
19178     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19179 if (Widened128Mask[i] < 0)
19180 continue;
19181
19182 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19183 unsigned OpIndex = i / 2;
19184 if (Ops[OpIndex].isUndef())
19185 Ops[OpIndex] = Op;
19186 else if (Ops[OpIndex] != Op)
19187 return SDValue();
19188
19189 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19190 // bits defined by a vshuf64x2 instruction's immediate control byte.
19191 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19192 }
19193
19194 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19195 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19196}
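As an illustration of how the final SHUF128 immediate is assembled, here is a standalone sketch (plain C++, hypothetical helper name, std::optional in place of an empty SDValue): each of the four 128-bit destination lanes contributes a 2-bit selector, and destination lanes 0-1 must all come from Ops[0] while lanes 2-3 must all come from Ops[1].

#include <array>
#include <cstdio>
#include <optional>

std::optional<unsigned> buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  int Ops[2] = {-1, -1}; // -1 plays the role of an undef operand slot.
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Widened128Mask[i];
    if (M < 0)
      continue;               // Undef lane: any selector works, leave it as 0.
    int Op = M >= 4 ? 1 : 0;  // Lane taken from V1 (0) or V2 (1).
    int OpIndex = i / 2;      // Each destination half is tied to one operand.
    if (Ops[OpIndex] < 0)
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return std::nullopt;    // Mixed sources in one half: no single SHUF128.
    PermMask |= (M % 4) << (i * 2);
  }
  return PermMask;
}

int main() {
  // Mask {4, 5, 2, 3, 8, 9, 14, 15} widens to 128-bit lanes {2, 1, 4, 7}:
  // lanes 2 and 1 of V1, then lanes 0 and 3 of V2.
  if (auto Imm = buildShuf128Imm({2, 1, 4, 7}))
    std::printf("imm = 0x%x\n", *Imm); // 0xc6: pairs (3, 0, 1, 2) from high to low.
  return 0;
}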
19197
19198/// Handle lowering of 8-lane 64-bit floating point shuffles.
19199static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19200 const APInt &Zeroable, SDValue V1, SDValue V2,
19201 const X86Subtarget &Subtarget,
19202 SelectionDAG &DAG) {
19203   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19204   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19205   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19206
19207 if (V2.isUndef()) {
19208 // Use low duplicate instructions for masks that match their pattern.
19209 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19210 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19211
19212 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19213 // Non-half-crossing single input shuffles can be lowered with an
19214 // interleaved permutation.
19215 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19216 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19217 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19218 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19220 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19221 }
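The VPERMILPD immediate built above packs one bit per v8f64 element: bit i is set exactly when element i takes the odd double of its own 128-bit lane (mask value i|1) rather than the even one. A minimal standalone sketch of the same computation (hypothetical helper name, unsigned shifts to keep the packing well defined):

#include <array>

unsigned buildVPermilPDImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Mask[i] == int(i | 1)) // Odd element of the 128-bit lane slot i lives in.
      Imm |= 1u << i;
  return Imm;
}
// buildVPermilPDImm({1, 0, 2, 3, 5, 5, 6, 7}) == 0xb9 (bits 0, 3, 4, 5, 7 set).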
19222
19223 SmallVector<int, 4> RepeatedMask;
19224 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19225 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19226 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19227 }
19228
19229 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19230 V2, Subtarget, DAG))
19231 return Shuf128;
19232
19233 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19234 return Unpck;
19235
19236 // Check if the blend happens to exactly fit that of SHUFPD.
19237 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19238 Zeroable, Subtarget, DAG))
19239 return Op;
19240
19241 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19242 DAG, Subtarget))
19243 return V;
19244
19245 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19246 Zeroable, Subtarget, DAG))
19247 return Blend;
19248
19249 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19250}
19251
19252/// Handle lowering of 16-lane 32-bit floating point shuffles.
19253static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19254 const APInt &Zeroable, SDValue V1, SDValue V2,
19255 const X86Subtarget &Subtarget,
19256 SelectionDAG &DAG) {
19257   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19258   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19259   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19260
19261 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19262 // options to efficiently lower the shuffle.
19263 SmallVector<int, 4> RepeatedMask;
19264 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19265     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19266
19267 // Use even/odd duplicate instructions for masks that match their pattern.
19268 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19269 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19270 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19271 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19272
19273 if (V2.isUndef())
19274 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19275 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19276
19277 // Use dedicated unpack instructions for masks that match their pattern.
19278 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19279 return V;
19280
19281 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19282 Zeroable, Subtarget, DAG))
19283 return Blend;
19284
19285 // Otherwise, fall back to a SHUFPS sequence.
19286 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19287 }
19288
19289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19290 Zeroable, Subtarget, DAG))
19291 return Blend;
19292
19293 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19294 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19295 return DAG.getBitcast(MVT::v16f32, ZExt);
19296
19297 // Try to create an in-lane repeating shuffle mask and then shuffle the
19298 // results into the target lanes.
19299 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19300 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19301 return V;
19302
19303 // If we have a single input shuffle with different shuffle patterns in the
19304 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19305 if (V2.isUndef() &&
19306 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19307 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19308 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19309 }
19310
19311 // If we have AVX512F support, we can use VEXPAND.
19312 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19313 V1, V2, DAG, Subtarget))
19314 return V;
19315
19316 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19317}
19318
19319/// Handle lowering of 8-lane 64-bit integer shuffles.
19320static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19321 const APInt &Zeroable, SDValue V1, SDValue V2,
19322 const X86Subtarget &Subtarget,
19323 SelectionDAG &DAG) {
19324   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19325   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19326   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19327
19328 // Try to use shift instructions if fast.
19329 if (Subtarget.preferLowerShuffleAsShift())
19330 if (SDValue Shift =
19331 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19332 Subtarget, DAG, /*BitwiseOnly*/ true))
19333 return Shift;
19334
19335 if (V2.isUndef()) {
19336 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19337 // can use lower latency instructions that will operate on all four
19338 // 128-bit lanes.
19339 SmallVector<int, 2> Repeated128Mask;
19340 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19341 SmallVector<int, 4> PSHUFDMask;
19342 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19343 return DAG.getBitcast(
19344 MVT::v8i64,
19345 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19346 DAG.getBitcast(MVT::v16i32, V1),
19347 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19348 }
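The narrowShuffleMaskElts call above splits each 64-bit selection into two consecutive 32-bit selections, so a lane-repeated v2i64 mask such as {1, 0} becomes the v4i32 PSHUFD mask {2, 3, 0, 1}. A standalone sketch of that narrowing (hypothetical helper name, negative sentinels preserved):

#include <vector>

std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Narrowed;
  for (int M : Mask)
    for (int s = 0; s < Scale; ++s)
      Narrowed.push_back(M < 0 ? M : M * Scale + s); // Keep sentinels as-is.
  return Narrowed;
}
// narrowMaskElts(2, {1, 0}) == {2, 3, 0, 1}.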
19349
19350 SmallVector<int, 4> Repeated256Mask;
19351 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19352 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19353 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19354 }
19355
19356 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19357 V2, Subtarget, DAG))
19358 return Shuf128;
19359
19360 // Try to use shift instructions.
19361 if (SDValue Shift =
19362 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19363 DAG, /*BitwiseOnly*/ false))
19364 return Shift;
19365
19366 // Try to use VALIGN.
19367 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19368 Subtarget, DAG))
19369 return Rotate;
19370
19371 // Try to use PALIGNR.
19372 if (Subtarget.hasBWI())
19373 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19374 Subtarget, DAG))
19375 return Rotate;
19376
19377 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19378 return Unpck;
19379
19380 // If we have AVX512F support, we can use VEXPAND.
19381 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19382 DAG, Subtarget))
19383 return V;
19384
19385 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19386 Zeroable, Subtarget, DAG))
19387 return Blend;
19388
19389 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19390}
19391
19392/// Handle lowering of 16-lane 32-bit integer shuffles.
19393static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19394 const APInt &Zeroable, SDValue V1, SDValue V2,
19395 const X86Subtarget &Subtarget,
19396 SelectionDAG &DAG) {
19397   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19398   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19399   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19400
19401 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19402
19403 // Whenever we can lower this as a zext, that instruction is strictly faster
19404 // than any alternative. It also allows us to fold memory operands into the
19405 // shuffle in many cases.
19406 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19407 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19408 return ZExt;
19409
19410 // Try to use shift instructions if fast.
19411 if (Subtarget.preferLowerShuffleAsShift()) {
19412 if (SDValue Shift =
19413 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19414 Subtarget, DAG, /*BitwiseOnly*/ true))
19415 return Shift;
19416 if (NumV2Elements == 0)
19417 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19418 Subtarget, DAG))
19419 return Rotate;
19420 }
19421
19422   // If the shuffle mask is repeated in each 128-bit lane, we can use more
19423 // efficient instructions that mirror the shuffles across the four 128-bit
19424 // lanes.
19425 SmallVector<int, 4> RepeatedMask;
19426 bool Is128BitLaneRepeatedShuffle =
19427 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19428 if (Is128BitLaneRepeatedShuffle) {
19429     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19430 if (V2.isUndef())
19431 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19432 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19433
19434 // Use dedicated unpack instructions for masks that match their pattern.
19435 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19436 return V;
19437 }
19438
19439 // Try to use shift instructions.
19440 if (SDValue Shift =
19441 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19442 Subtarget, DAG, /*BitwiseOnly*/ false))
19443 return Shift;
19444
19445 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19446 if (SDValue Rotate =
19447 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19448 return Rotate;
19449
19450 // Try to use VALIGN.
19451 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19452 Subtarget, DAG))
19453 return Rotate;
19454
19455 // Try to use byte rotation instructions.
19456 if (Subtarget.hasBWI())
19457 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19458 Subtarget, DAG))
19459 return Rotate;
19460
19461 // Assume that a single SHUFPS is faster than using a permv shuffle.
19462 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19463 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19464 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19465 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19466 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19467 CastV1, CastV2, DAG);
19468 return DAG.getBitcast(MVT::v16i32, ShufPS);
19469 }
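The isSingleSHUFPSMask test used above encodes the SHUFPS constraint that the low two result elements must all come from one source and the high two must all come from one source (undefs are free). A rough standalone sketch of that check under those assumed semantics:

#include <array>

bool isSingleShufpsMask(const std::array<int, 4> &RepeatedMask) {
  // Indices 0-3 name the first source's lane, 4-7 the second's, -1 is undef.
  auto MixesSources = [](int A, int B) {
    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
  };
  return !MixesSources(RepeatedMask[0], RepeatedMask[1]) &&
         !MixesSources(RepeatedMask[2], RepeatedMask[3]);
}
// {0, 5, 2, 6} fits one SHUFPS only if neither half mixes sources; here the low
// half mixes (0 from V1, 5 from V2), so the routine falls back to other lowerings.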
19470
19471 // Try to create an in-lane repeating shuffle mask and then shuffle the
19472 // results into the target lanes.
19473 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19474 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19475 return V;
19476
19477 // If we have AVX512F support, we can use VEXPAND.
19478 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19479 DAG, Subtarget))
19480 return V;
19481
19482 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19483 Zeroable, Subtarget, DAG))
19484 return Blend;
19485
19486 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19487}
19488
19489/// Handle lowering of 32-lane 16-bit integer shuffles.
19490static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19491 const APInt &Zeroable, SDValue V1, SDValue V2,
19492 const X86Subtarget &Subtarget,
19493 SelectionDAG &DAG) {
19494   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19495   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19496   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19497   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19498
19499 // Whenever we can lower this as a zext, that instruction is strictly faster
19500 // than any alternative. It also allows us to fold memory operands into the
19501 // shuffle in many cases.
19502 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19503 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19504 return ZExt;
19505
19506 // Use dedicated unpack instructions for masks that match their pattern.
19507 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19508 return V;
19509
19510 // Use dedicated pack instructions for masks that match their pattern.
19511 if (SDValue V =
19512 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19513 return V;
19514
19515 // Try to use shift instructions.
19516 if (SDValue Shift =
19517 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19518 Subtarget, DAG, /*BitwiseOnly*/ false))
19519 return Shift;
19520
19521 // Try to use byte rotation instructions.
19522 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19523 Subtarget, DAG))
19524 return Rotate;
19525
19526 if (V2.isUndef()) {
19527 // Try to use bit rotation instructions.
19528 if (SDValue Rotate =
19529 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19530 return Rotate;
19531
19532 SmallVector<int, 8> RepeatedMask;
19533 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19534 // As this is a single-input shuffle, the repeated mask should be
19535 // a strictly valid v8i16 mask that we can pass through to the v8i16
19536 // lowering to handle even the v32 case.
19537 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19538 RepeatedMask, Subtarget, DAG);
19539 }
19540 }
19541
19542 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19543 Zeroable, Subtarget, DAG))
19544 return Blend;
19545
19546 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19547 Zeroable, Subtarget, DAG))
19548 return PSHUFB;
19549
19550 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19551}
19552
19553/// Handle lowering of 64-lane 8-bit integer shuffles.
19554static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19555 const APInt &Zeroable, SDValue V1, SDValue V2,
19556 const X86Subtarget &Subtarget,
19557 SelectionDAG &DAG) {
19558   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19559   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19560   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19561   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19562
19563 // Whenever we can lower this as a zext, that instruction is strictly faster
19564 // than any alternative. It also allows us to fold memory operands into the
19565 // shuffle in many cases.
19566 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19567 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19568 return ZExt;
19569
19570 // Use dedicated unpack instructions for masks that match their pattern.
19571 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19572 return V;
19573
19574 // Use dedicated pack instructions for masks that match their pattern.
19575 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19576 Subtarget))
19577 return V;
19578
19579 // Try to use shift instructions.
19580 if (SDValue Shift =
19581 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19582 DAG, /*BitwiseOnly*/ false))
19583 return Shift;
19584
19585 // Try to use byte rotation instructions.
19586 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19587 Subtarget, DAG))
19588 return Rotate;
19589
19590 // Try to use bit rotation instructions.
19591 if (V2.isUndef())
19592 if (SDValue Rotate =
19593 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19594 return Rotate;
19595
19596 // Lower as AND if possible.
19597 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19598 Zeroable, Subtarget, DAG))
19599 return Masked;
19600
19601 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19602 Zeroable, Subtarget, DAG))
19603 return PSHUFB;
19604
19605 // Try to create an in-lane repeating shuffle mask and then shuffle the
19606 // results into the target lanes.
19607 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19608 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19609 return V;
19610
19611 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19612 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19613 return Result;
19614
19615 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19616 Zeroable, Subtarget, DAG))
19617 return Blend;
19618
19619 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19620 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19621 // PALIGNR will be cheaper than the second PSHUFB+OR.
19622 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19623 Mask, Subtarget, DAG))
19624 return V;
19625
19626 // If we can't directly blend but can use PSHUFB, that will be better as it
19627 // can both shuffle and set up the inefficient blend.
19628 bool V1InUse, V2InUse;
19629 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19630 DAG, V1InUse, V2InUse);
19631 }
19632
19633 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19634 // shuffle.
19635 if (!V2.isUndef())
19636 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19637 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19638 return Result;
19639
19640 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19641 if (Subtarget.hasVBMI())
19642 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19643
19644 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19645}
19646
19647/// High-level routine to lower various 512-bit x86 vector shuffles.
19648///
19649/// This routine either breaks down the specific type of a 512-bit x86 vector
19650/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19651/// together based on the available instructions.
19652static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19653 MVT VT, SDValue V1, SDValue V2,
19654 const APInt &Zeroable,
19655 const X86Subtarget &Subtarget,
19656 SelectionDAG &DAG) {
19657   assert(Subtarget.hasAVX512() &&
19658          "Cannot lower 512-bit vectors w/ basic ISA!");
19659
19660 // If we have a single input to the zero element, insert that into V1 if we
19661 // can do so cheaply.
19662 int NumElts = Mask.size();
19663 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19664
19665 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19666 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19667 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19668 return Insertion;
19669
19670 // Handle special cases where the lower or upper half is UNDEF.
19671 if (SDValue V =
19672 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19673 return V;
19674
19675 // Check for being able to broadcast a single element.
19676 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19677 Subtarget, DAG))
19678 return Broadcast;
19679
19680 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19681 // Try using bit ops for masking and blending before falling back to
19682 // splitting.
19683 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19684 Subtarget, DAG))
19685 return V;
19686 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19687 return V;
19688
19689 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19690 }
19691
19692 if (VT == MVT::v32f16) {
19693 V1 = DAG.getBitcast(MVT::v32i16, V1);
19694 V2 = DAG.getBitcast(MVT::v32i16, V2);
19695 return DAG.getBitcast(MVT::v32f16,
19696 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19697 }
19698
19699 // Dispatch to each element type for lowering. If we don't have support for
19700 // specific element type shuffles at 512 bits, immediately split them and
19701 // lower them. Each lowering routine of a given type is allowed to assume that
19702 // the requisite ISA extensions for that element type are available.
19703 switch (VT.SimpleTy) {
19704 case MVT::v8f64:
19705 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19706 case MVT::v16f32:
19707 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19708 case MVT::v8i64:
19709 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19710 case MVT::v16i32:
19711 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19712 case MVT::v32i16:
19713 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19714 case MVT::v64i8:
19715 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19716
19717 default:
19718     llvm_unreachable("Not a valid 512-bit x86 vector type!");
19719 }
19720}
19721
19722static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19723 MVT VT, SDValue V1, SDValue V2,
19724 const X86Subtarget &Subtarget,
19725 SelectionDAG &DAG) {
19726 // Shuffle should be unary.
19727 if (!V2.isUndef())
19728 return SDValue();
19729
19730 int ShiftAmt = -1;
19731 int NumElts = Mask.size();
19732 for (int i = 0; i != NumElts; ++i) {
19733 int M = Mask[i];
19734     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19735            "Unexpected mask index.");
19736 if (M < 0)
19737 continue;
19738
19739 // The first non-undef element determines our shift amount.
19740 if (ShiftAmt < 0) {
19741 ShiftAmt = M - i;
19742 // Need to be shifting right.
19743 if (ShiftAmt <= 0)
19744 return SDValue();
19745 }
19746 // All non-undef elements must shift by the same amount.
19747 if (ShiftAmt != M - i)
19748 return SDValue();
19749 }
19750   assert(ShiftAmt >= 0 && "All undef?");
19751
19752   // Great, we found a shift right.
19753 MVT WideVT = VT;
19754 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19755 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19756 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19757 DAG.getUNDEF(WideVT), V1,
19758 DAG.getIntPtrConstant(0, DL));
19759 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19760 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19761 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19762 DAG.getIntPtrConstant(0, DL));
19763}
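The scan at the top of this routine can be modelled compactly: every defined mask element i must read element i + ShiftAmt for a single positive ShiftAmt, otherwise no lone KSHIFTR works. A standalone sketch (hypothetical helper name, plain std::vector mask):

#include <vector>

int matchUnaryShiftRightAmount(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;            // Undef elements match any shift.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;    // First defined element fixes the amount.
      if (ShiftAmt <= 0)
        return -1;         // Must be a right shift.
    } else if (ShiftAmt != M - i) {
      return -1;           // All defined elements must agree.
    }
  }
  return ShiftAmt;         // -1 if the mask was all-undef.
}
// matchUnaryShiftRightAmount({2, 3, -1, -1}) == 2 for a 4-element mask.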
19764
19765// Determine if this shuffle can be implemented with a KSHIFT instruction.
19766// Returns the shift amount if possible or -1 if not. This is a simplified
19767// version of matchShuffleAsShift.
19768static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19769 int MaskOffset, const APInt &Zeroable) {
19770 int Size = Mask.size();
19771
19772 auto CheckZeros = [&](int Shift, bool Left) {
19773 for (int j = 0; j < Shift; ++j)
19774 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19775 return false;
19776
19777 return true;
19778 };
19779
19780 auto MatchShift = [&](int Shift, bool Left) {
19781 unsigned Pos = Left ? Shift : 0;
19782 unsigned Low = Left ? 0 : Shift;
19783 unsigned Len = Size - Shift;
19784 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19785 };
19786
19787 for (int Shift = 1; Shift != Size; ++Shift)
19788 for (bool Left : {true, false})
19789 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19790 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19791 return Shift;
19792 }
19793
19794 return -1;
19795}
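For the two-sided matcher just above: a left shift by S moves element i to position i + S and zero-fills the bottom S lanes, while a right shift by S moves element i + S to position i and zero-fills the top S lanes; CheckZeros verifies the zero-filled region is Zeroable and MatchShift verifies the surviving region is sequential. A standalone sketch under those assumptions (hypothetical names, std::vector<bool> in place of APInt, unary mask with MaskOffset folded out):

#include <vector>

int matchMaskShift(const std::vector<int> &Mask, const std::vector<bool> &Zeroable,
                   bool &IsLeft) {
  int Size = (int)Mask.size(); // Zeroable is assumed to have the same size.
  auto Matches = [&](int Shift, bool Left) {
    for (int i = 0; i != Size; ++i) {
      bool InZeroFill = Left ? (i < Shift) : (i >= Size - Shift);
      if (InZeroFill) {
        if (!Zeroable[i])
          return false; // Shifted-in lanes must be allowed to become zero.
      } else {
        int Want = Left ? (i - Shift) : (i + Shift);
        if (Mask[i] >= 0 && Mask[i] != Want)
          return false; // Surviving lanes must stay sequential (undef is fine).
      }
    }
    return true;
  };
  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (Matches(Shift, Left)) {
        IsLeft = Left;
        return Shift;
      }
  return -1;
}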
19796
19797
19798// Lower vXi1 vector shuffles.
19799 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19800 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19801 // vector, shuffle, and then truncate it back.
19802static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19803 MVT VT, SDValue V1, SDValue V2,
19804 const APInt &Zeroable,
19805 const X86Subtarget &Subtarget,
19806 SelectionDAG &DAG) {
19807   assert(Subtarget.hasAVX512() &&
19808          "Cannot lower 512-bit vectors w/o basic ISA!");
19809
19810 int NumElts = Mask.size();
19811
19812 // Try to recognize shuffles that are just padding a subvector with zeros.
19813 int SubvecElts = 0;
19814 int Src = -1;
19815 for (int i = 0; i != NumElts; ++i) {
19816 if (Mask[i] >= 0) {
19817       // Grab the source from the first valid mask element. All subsequent
19818       // elements need to use this same source.
19819 if (Src < 0)
19820 Src = Mask[i] / NumElts;
19821 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19822 break;
19823 }
19824
19825 ++SubvecElts;
19826 }
19827   assert(SubvecElts != NumElts && "Identity shuffle?");
19828
19829   // Clip to a power of 2.
19830 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19831
19832 // Make sure the number of zeroable bits in the top at least covers the bits
19833 // not covered by the subvector.
19834 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19835     assert(Src >= 0 && "Expected a source!");
19836 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19837 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19838 Src == 0 ? V1 : V2,
19839 DAG.getIntPtrConstant(0, DL));
19840 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19841 DAG.getConstant(0, DL, VT),
19842 Extract, DAG.getIntPtrConstant(0, DL));
19843 }
19844
19845 // Try a simple shift right with undef elements. Later we'll try with zeros.
19846 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19847 DAG))
19848 return Shift;
19849
19850 // Try to match KSHIFTs.
19851 unsigned Offset = 0;
19852 for (SDValue V : { V1, V2 }) {
19853 unsigned Opcode;
19854 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19855 if (ShiftAmt >= 0) {
19856 MVT WideVT = VT;
19857 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19858 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19859 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19860 DAG.getUNDEF(WideVT), V,
19861 DAG.getIntPtrConstant(0, DL));
19862 // Widened right shifts need two shifts to ensure we shift in zeroes.
19863 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19864 int WideElts = WideVT.getVectorNumElements();
19865 // Shift left to put the original vector in the MSBs of the new size.
19866 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19867 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19868 // Increase the shift amount to account for the left shift.
19869 ShiftAmt += WideElts - NumElts;
19870 }
19871
19872 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19873 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19874 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19875 DAG.getIntPtrConstant(0, DL));
19876 }
19877 Offset += NumElts; // Increment for next iteration.
19878 }
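The widening dance in the loop above can be checked with plain integers standing in for mask registers: to right-shift an 8-element v8i1 payload held in a 16-bit register by ShiftAmt while shifting in zeroes, first KSHIFTL by 16 - 8 so the live bits occupy the MSBs, then KSHIFTR by 8 + ShiftAmt. A small worked example (standalone, bit i models mask element i):

#include <cassert>
#include <cstdint>

uint16_t kshiftrWidened(uint8_t Bits, int ShiftAmt) {
  uint16_t Wide = Bits;                       // INSERT_SUBVECTOR at index 0 (low bits).
  Wide = (uint16_t)(Wide << 8);               // KSHIFTL by WideElts - NumElts.
  Wide = (uint16_t)(Wide >> (8 + ShiftAmt));  // KSHIFTR by the adjusted amount.
  return Wide;
}

int main() {
  // A v8i1 right shift by 3 with zero fill: 10110001 -> 00010110.
  assert(kshiftrWidened(0b10110001, 3) == 0b00010110);
  return 0;
}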
19879
19880 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19881 // TODO: What other unary shuffles would benefit from this?
19882 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19883 V1->hasOneUse()) {
19884 SDValue Op0 = V1.getOperand(0);
19885 SDValue Op1 = V1.getOperand(1);
19886 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19887 EVT OpVT = Op0.getValueType();
19888 return DAG.getSetCC(
19889 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19890 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19891 }
19892
19893 MVT ExtVT;
19894 switch (VT.SimpleTy) {
19895 default:
19896     llvm_unreachable("Expected a vector of i1 elements");
19897 case MVT::v2i1:
19898 ExtVT = MVT::v2i64;
19899 break;
19900 case MVT::v4i1:
19901 ExtVT = MVT::v4i32;
19902 break;
19903 case MVT::v8i1:
19904     // Take a 512-bit type; more shuffles are available on KNL. If we have VLX,
19905     // use a 256-bit shuffle.
19906 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19907 break;
19908 case MVT::v16i1:
19909 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19910 // 256-bit operation available.
19911 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19912 break;
19913 case MVT::v32i1:
19914 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19915 // 256-bit operation available.
19916     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19917 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19918 break;
19919 case MVT::v64i1:
19920 // Fall back to scalarization. FIXME: We can do better if the shuffle
19921 // can be partitioned cleanly.
19922 if (!Subtarget.useBWIRegs())
19923 return SDValue();
19924 ExtVT = MVT::v64i8;
19925 break;
19926 }
19927
19928 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19929 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19930
19931 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19932   // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
19933 int NumElems = VT.getVectorNumElements();
19934 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19935 (Subtarget.hasDQI() && (NumElems < 32)))
19936 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19937 Shuffle, ISD::SETGT);
19938
19939 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19940}
19941
19942/// Helper function that returns true if the shuffle mask should be
19943/// commuted to improve canonicalization.
19944static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19945 int NumElements = Mask.size();
19946
19947 int NumV1Elements = 0, NumV2Elements = 0;
19948 for (int M : Mask)
19949 if (M < 0)
19950 continue;
19951 else if (M < NumElements)
19952 ++NumV1Elements;
19953 else
19954 ++NumV2Elements;
19955
19956 // Commute the shuffle as needed such that more elements come from V1 than
19957 // V2. This allows us to match the shuffle pattern strictly on how many
19958 // elements come from V1 without handling the symmetric cases.
19959 if (NumV2Elements > NumV1Elements)
19960 return true;
19961
19962   assert(NumV1Elements > 0 && "No V1 indices");
19963
19964 if (NumV2Elements == 0)
19965 return false;
19966
19967 // When the number of V1 and V2 elements are the same, try to minimize the
19968 // number of uses of V2 in the low half of the vector. When that is tied,
19969   // ensure that the sum of indices for V1 is equal to or lower than the sum of
19970 // indices for V2. When those are equal, try to ensure that the number of odd
19971 // indices for V1 is lower than the number of odd indices for V2.
19972 if (NumV1Elements == NumV2Elements) {
19973 int LowV1Elements = 0, LowV2Elements = 0;
19974 for (int M : Mask.slice(0, NumElements / 2))
19975 if (M >= NumElements)
19976 ++LowV2Elements;
19977 else if (M >= 0)
19978 ++LowV1Elements;
19979 if (LowV2Elements > LowV1Elements)
19980 return true;
19981 if (LowV2Elements == LowV1Elements) {
19982 int SumV1Indices = 0, SumV2Indices = 0;
19983 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19984 if (Mask[i] >= NumElements)
19985 SumV2Indices += i;
19986 else if (Mask[i] >= 0)
19987 SumV1Indices += i;
19988 if (SumV2Indices < SumV1Indices)
19989 return true;
19990 if (SumV2Indices == SumV1Indices) {
19991 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19992 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19993 if (Mask[i] >= NumElements)
19994 NumV2OddIndices += i % 2;
19995 else if (Mask[i] >= 0)
19996 NumV1OddIndices += i % 2;
19997 if (NumV2OddIndices < NumV1OddIndices)
19998 return true;
19999 }
20000 }
20001 }
20002
20003 return false;
20004}
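A worked example of the heuristic above: for a v4 mask {4, 5, 0, 6}, three elements come from V2 and one from V1, so the routine requests a commute; the caller then swaps the operands and renumbers the mask to {0, 1, 4, 2}. A plain C++ stand-in for that renumbering step (the same effect ShuffleVectorSDNode::commuteMask has):

#include <vector>

std::vector<int> commuteMask(std::vector<int> Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts; // Swap which source each index names.
  return Mask;
}
// commuteMask({4, 5, 0, 6}) == {0, 1, 4, 2}.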
20005
20006static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
20007 const X86Subtarget &Subtarget) {
20008 if (!Subtarget.hasAVX512())
20009 return false;
20010
20011 MVT VT = V1.getSimpleValueType().getScalarType();
20012 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
20013 return false;
20014
20015 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
20016 // are preferable to blendw/blendvb/masked-mov.
20017 if ((VT == MVT::i16 || VT == MVT::i8) &&
20018 V1.getSimpleValueType().getSizeInBits() < 512)
20019 return false;
20020
20021 auto HasMaskOperation = [&](SDValue V) {
20022     // TODO: Currently we only check a limited set of opcodes. We could probably
20023     // extend this to all binary operations by checking TLI.isBinOp().
20024 switch (V->getOpcode()) {
20025 default:
20026 return false;
20027 case ISD::ADD:
20028 case ISD::SUB:
20029 case ISD::AND:
20030 case ISD::XOR:
20031 case ISD::OR:
20032 case ISD::SMAX:
20033 case ISD::SMIN:
20034 case ISD::UMAX:
20035 case ISD::UMIN:
20036 case ISD::ABS:
20037 case ISD::SHL:
20038 case ISD::SRL:
20039 case ISD::SRA:
20040 case ISD::MUL:
20041 break;
20042 }
20043 if (!V->hasOneUse())
20044 return false;
20045
20046 return true;
20047 };
20048
20049 if (HasMaskOperation(V1) || HasMaskOperation(V2))
20050 return true;
20051
20052 return false;
20053}
20054
20055// Forward declaration.
20056static SDValue canonicalizeShuffleMaskWithHorizOp(
20057 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
20058 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
20059 const X86Subtarget &Subtarget);
20060
20061 /// Top-level lowering for x86 vector shuffles.
20062///
20063/// This handles decomposition, canonicalization, and lowering of all x86
20064/// vector shuffles. Most of the specific lowering strategies are encapsulated
20065/// above in helper routines. The canonicalization attempts to widen shuffles
20066/// to involve fewer lanes of wider elements, consolidate symmetric patterns
20067/// s.t. only one of the two inputs needs to be tested, etc.
20068static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20069 SelectionDAG &DAG) {
20070 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20071 ArrayRef<int> OrigMask = SVOp->getMask();
20072 SDValue V1 = Op.getOperand(0);
20073 SDValue V2 = Op.getOperand(1);
20074 MVT VT = Op.getSimpleValueType();
20075 int NumElements = VT.getVectorNumElements();
20076 SDLoc DL(Op);
20077 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20078
20079   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20080          "Can't lower MMX shuffles");
20081
20082 bool V1IsUndef = V1.isUndef();
20083 bool V2IsUndef = V2.isUndef();
20084 if (V1IsUndef && V2IsUndef)
20085 return DAG.getUNDEF(VT);
20086
20087   // When we create a shuffle node we put the UNDEF node in the second operand,
20088 // but in some cases the first operand may be transformed to UNDEF.
20089 // In this case we should just commute the node.
20090 if (V1IsUndef)
20091 return DAG.getCommutedVectorShuffle(*SVOp);
20092
20093 // Check for non-undef masks pointing at an undef vector and make the masks
20094 // undef as well. This makes it easier to match the shuffle based solely on
20095 // the mask.
20096 if (V2IsUndef &&
20097 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20098 SmallVector<int, 8> NewMask(OrigMask);
20099 for (int &M : NewMask)
20100 if (M >= NumElements)
20101 M = -1;
20102 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20103 }
20104
20105 // Check for illegal shuffle mask element index values.
20106 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20107 (void)MaskUpperLimit;
20108   assert(llvm::all_of(OrigMask,
20109                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20110          "Out of bounds shuffle index");
20111
20112 // We actually see shuffles that are entirely re-arrangements of a set of
20113 // zero inputs. This mostly happens while decomposing complex shuffles into
20114 // simple ones. Directly lower these as a buildvector of zeros.
20115 APInt KnownUndef, KnownZero;
20116 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20117
20118 APInt Zeroable = KnownUndef | KnownZero;
20119 if (Zeroable.isAllOnes())
20120 return getZeroVector(VT, Subtarget, DAG, DL);
20121
20122 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20123
20124 // Try to collapse shuffles into using a vector type with fewer elements but
20125 // wider element types. We cap this to not form integers or floating point
20126 // elements wider than 64 bits. It does not seem beneficial to form i128
20127 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20128 SmallVector<int, 16> WidenedMask;
20129 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20130 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20131 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20132 // Shuffle mask widening should not interfere with a broadcast opportunity
20133 // by obfuscating the operands with bitcasts.
20134 // TODO: Avoid lowering directly from this top-level function: make this
20135 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20136 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20137 Subtarget, DAG))
20138 return Broadcast;
20139
20140 MVT NewEltVT = VT.isFloatingPoint()
20141 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20142 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20143 int NewNumElts = NumElements / 2;
20144 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20145 // Make sure that the new vector type is legal. For example, v2f64 isn't
20146 // legal on SSE1.
20147 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20148 if (V2IsZero) {
20149 // Modify the new Mask to take all zeros from the all-zero vector.
20150 // Choose indices that are blend-friendly.
20151 bool UsedZeroVector = false;
20152         assert(is_contained(WidenedMask, SM_SentinelZero) &&
20153                "V2's non-undef elements are used?!");
20154 for (int i = 0; i != NewNumElts; ++i)
20155 if (WidenedMask[i] == SM_SentinelZero) {
20156 WidenedMask[i] = i + NewNumElts;
20157 UsedZeroVector = true;
20158 }
20159 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20160 // some elements to be undef.
20161 if (UsedZeroVector)
20162 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20163 }
20164 V1 = DAG.getBitcast(NewVT, V1);
20165 V2 = DAG.getBitcast(NewVT, V2);
20166 return DAG.getBitcast(
20167 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20168 }
20169 }
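canWidenShuffleElements, used above, succeeds when every pair of narrow mask elements either selects a naturally aligned even/odd pair from the same wide element or is undef; the Zeroable/zero-vector handling is layered on top of that. A simplified standalone sketch that ignores the zeroable cases (hypothetical helper name, even-sized mask assumed):

#include <optional>
#include <vector>

std::optional<std::vector<int>> widenMask(const std::vector<int> &Mask) {
  std::vector<int> Wide;
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Wide.push_back(-1);                                   // Both undef: wide element is undef.
    } else if (Lo >= 0 && Lo % 2 == 0 && (Hi < 0 || Hi == Lo + 1)) {
      Wide.push_back(Lo / 2);                               // Aligned pair (or odd half undef).
    } else if (Hi >= 0 && Hi % 2 == 1 && Lo < 0) {
      Wide.push_back(Hi / 2);                               // Even half undef, odd half aligned.
    } else {
      return std::nullopt;                                  // Pair straddles wide elements.
    }
  }
  return Wide;
}
// *widenMask({0, 1, 6, 7, -1, -1, 4, 5}) == std::vector<int>({0, 3, -1, 2}).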
20170
20171 SmallVector<SDValue> Ops = {V1, V2};
20172 SmallVector<int> Mask(OrigMask);
20173
20174 // Canonicalize the shuffle with any horizontal ops inputs.
20175 // NOTE: This may update Ops and Mask.
20176 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20177 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20178 return DAG.getBitcast(VT, HOp);
20179
20180 V1 = DAG.getBitcast(VT, Ops[0]);
20181 V2 = DAG.getBitcast(VT, Ops[1]);
20182   assert(NumElements == (int)Mask.size() &&
20183          "canonicalizeShuffleMaskWithHorizOp "
20184          "shouldn't alter the shuffle mask size");
20185
20186 // Commute the shuffle if it will improve canonicalization.
20187 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20188 ShuffleVectorSDNode::commuteMask(Mask);
20189 std::swap(V1, V2);
20190 }
20191
20192 // For each vector width, delegate to a specialized lowering routine.
20193 if (VT.is128BitVector())
20194 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20195
20196 if (VT.is256BitVector())
20197 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20198
20199 if (VT.is512BitVector())
20200 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20201
20202 if (Is1BitVector)
20203 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20204
20205 llvm_unreachable("Unimplemented!");
20206}
20207
20208/// Try to lower a VSELECT instruction to a vector shuffle.
20209static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20210 const X86Subtarget &Subtarget,
20211 SelectionDAG &DAG) {
20212 SDValue Cond = Op.getOperand(0);
20213 SDValue LHS = Op.getOperand(1);
20214 SDValue RHS = Op.getOperand(2);
20215 MVT VT = Op.getSimpleValueType();
20216
20217 // Only non-legal VSELECTs reach this lowering; convert those into generic
20218 // shuffles and re-use the shuffle lowering path for blends.
20219 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20220 SmallVector<int, 32> Mask;
20221 if (createShuffleMaskFromVSELECT(Mask, Cond))
20222 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20223 }
20224
20225 return SDValue();
20226}
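// Illustrative sketch (not the actual createShuffleMaskFromVSELECT helper):
// with a constant condition, a true lane keeps the LHS element (index i) and
// a false lane takes the RHS element (index i + NumElts). For a hypothetical
// 4-lane condition <true,false,false,true> the resulting mask is
//   Mask = {0, 5, 6, 3}   // lanes 1 and 2 come from RHS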
20227
20228SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20229 SDValue Cond = Op.getOperand(0);
20230 SDValue LHS = Op.getOperand(1);
20231 SDValue RHS = Op.getOperand(2);
20232
20233 SDLoc dl(Op);
20234 MVT VT = Op.getSimpleValueType();
20235 if (isSoftFP16(VT)) {
20236 MVT NVT = VT.changeVectorElementTypeToInteger();
20237 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20238 DAG.getBitcast(NVT, LHS),
20239 DAG.getBitcast(NVT, RHS)));
20240 }
20241
20242 // A vselect where all conditions and data are constants can be optimized into
20243 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20244 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20245 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20246 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20247 return SDValue();
20248
20249 // Try to lower this to a blend-style vector shuffle. This can handle all
20250 // constant condition cases.
20251 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20252 return BlendOp;
20253
20254 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20255 // with patterns on the mask registers on AVX-512.
20256 MVT CondVT = Cond.getSimpleValueType();
20257 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20258 if (CondEltSize == 1)
20259 return Op;
20260
20261 // Variable blends are only legal from SSE4.1 onward.
20262 if (!Subtarget.hasSSE41())
20263 return SDValue();
20264
20265 unsigned EltSize = VT.getScalarSizeInBits();
20266 unsigned NumElts = VT.getVectorNumElements();
20267
20268 // Expand v32i16/v64i8 without BWI.
20269 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20270 return SDValue();
20271
20272 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20273 // into an i1 condition so that we can use the mask-based 512-bit blend
20274 // instructions.
20275 if (VT.getSizeInBits() == 512) {
20276 // Build a mask by testing the condition against zero.
20277 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20278 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20279 DAG.getConstant(0, dl, CondVT),
20280 ISD::SETNE);
20281 // Now return a new VSELECT using the mask.
20282 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20283 }
20284
20285 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20286 if (CondEltSize != EltSize) {
20287 // If we don't have a sign splat, rely on the expansion.
20288 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20289 return SDValue();
20290
20291 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20292 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20293 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20294 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20295 }
20296
20297 // Only some types will be legal on some subtargets. If we can emit a legal
20298 // VSELECT-matching blend, return Op, but if we need to expand, return
20299 // a null value.
20300 switch (VT.SimpleTy) {
20301 default:
20302 // Most of the vector types have blends past SSE4.1.
20303 return Op;
20304
20305 case MVT::v32i8:
20306 // The byte blends for AVX vectors were introduced only in AVX2.
20307 if (Subtarget.hasAVX2())
20308 return Op;
20309
20310 return SDValue();
20311
20312 case MVT::v8i16:
20313 case MVT::v16i16: {
20314 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20315 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20316 Cond = DAG.getBitcast(CastVT, Cond);
20317 LHS = DAG.getBitcast(CastVT, LHS);
20318 RHS = DAG.getBitcast(CastVT, RHS);
20319 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20320 return DAG.getBitcast(VT, Select);
20321 }
20322 }
20323}
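// Note on the v8i16/v16i16 case above (illustrative, assuming the usual X86
// setting of ZeroOrNegativeOneBooleanContent for vector booleans): each i16
// condition lane is 0x0000 or 0xFFFF, so after the bitcast both of its bytes
// carry the same truth value and a byte-wise blend is equivalent, e.g.
//   cond = <0xFFFF, 0x0000>  --bitcast-->  <0xFF, 0xFF, 0x00, 0x00>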
20324
20325static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20326 MVT VT = Op.getSimpleValueType();
20327 SDValue Vec = Op.getOperand(0);
20328 SDValue Idx = Op.getOperand(1);
20329 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20330 SDLoc dl(Op);
20331
20332 if (!Vec.getSimpleValueType().is128BitVector())
20333 return SDValue();
20334
20335 if (VT.getSizeInBits() == 8) {
20336 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20337 // we're going to zero extend the register or fold the store.
20338 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20339 !X86::mayFoldIntoStore(Op))
20340 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20341 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20342 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20343
20344 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20345 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20346 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20347 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20348 }
20349
20350 if (VT == MVT::f32) {
20351 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20352 // the result back to FR32 register. It's only worth matching if the
20353 // result has a single use which is a store or a bitcast to i32. And in
20354 // the case of a store, it's not worth it if the index is a constant 0,
20355 // because a MOVSSmr can be used instead, which is smaller and faster.
20356 if (!Op.hasOneUse())
20357 return SDValue();
20358 SDNode *User = *Op.getNode()->use_begin();
20359 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20360 (User->getOpcode() != ISD::BITCAST ||
20361 User->getValueType(0) != MVT::i32))
20362 return SDValue();
20363 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20364 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20365 return DAG.getBitcast(MVT::f32, Extract);
20366 }
20367
20368 if (VT == MVT::i32 || VT == MVT::i64)
20369 return Op;
20370
20371 return SDValue();
20372}
20373
20374/// Extract one bit from mask vector, like v16i1 or v8i1.
20375/// AVX-512 feature.
20376static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20377 const X86Subtarget &Subtarget) {
20378 SDValue Vec = Op.getOperand(0);
20379 SDLoc dl(Vec);
20380 MVT VecVT = Vec.getSimpleValueType();
20381 SDValue Idx = Op.getOperand(1);
20382 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20383 MVT EltVT = Op.getSimpleValueType();
20384
20385 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20386 "Unexpected vector type in ExtractBitFromMaskVector");
20387
20388 // A variable index can't be handled in mask registers,
20389 // so extend the vector to VR512/VR128.
20390 if (!IdxC) {
20391 unsigned NumElts = VecVT.getVectorNumElements();
20392 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20393 // than extending to 128/256-bit.
20394 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20395 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20396 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20398 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20399 }
20400
20401 unsigned IdxVal = IdxC->getZExtValue();
20402 if (IdxVal == 0) // the operation is legal
20403 return Op;
20404
20405 // Extend to natively supported kshift.
20406 unsigned NumElems = VecVT.getVectorNumElements();
20407 MVT WideVecVT = VecVT;
20408 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20409 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20410 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20411 DAG.getUNDEF(WideVecVT), Vec,
20412 DAG.getIntPtrConstant(0, dl));
20413 }
20414
20415 // Use kshiftr instruction to move to the lower element.
20416 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20417 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20418
20419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20420 DAG.getIntPtrConstant(0, dl));
20421}
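// Scalar analogy of the KSHIFTR lowering above (illustrative only): viewing
// the k-register as an integer bit-mask, extracting element IdxVal is simply
//   bool Bit = (MaskBits >> IdxVal) & 1;
// i.e. shift the wanted bit down to position 0 and read the low element.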
20422
20423SDValue
20424X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20425 SelectionDAG &DAG) const {
20426 SDLoc dl(Op);
20427 SDValue Vec = Op.getOperand(0);
20428 MVT VecVT = Vec.getSimpleValueType();
20429 SDValue Idx = Op.getOperand(1);
20430 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20431
20432 if (VecVT.getVectorElementType() == MVT::i1)
20433 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20434
20435 if (!IdxC) {
20436 // It's more profitable to go through memory (1 cycle throughput)
20437 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20438 // The IACA tool was used to get the performance estimates
20439 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20440 //
20441 // example : extractelement <16 x i8> %a, i32 %i
20442 //
20443 // Block Throughput: 3.00 Cycles
20444 // Throughput Bottleneck: Port5
20445 //
20446 // | Num Of | Ports pressure in cycles | |
20447 // | Uops | 0 - DV | 5 | 6 | 7 | |
20448 // ---------------------------------------------
20449 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20450 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20451 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20452 // Total Num Of Uops: 4
20453 //
20454 //
20455 // Block Throughput: 1.00 Cycles
20456 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20457 //
20458 // | | Ports pressure in cycles | |
20459 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20460 // ---------------------------------------------------------
20461 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20462 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20463 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20464 // Total Num Of Uops: 4
20465
20466 return SDValue();
20467 }
20468
20469 unsigned IdxVal = IdxC->getZExtValue();
20470
20471 // If this is a 256-bit or 512-bit vector result, first extract a 128-bit
20472 // subvector and then extract the element from that 128-bit vector.
20473 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20474 // Get the 128-bit vector.
20475 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20476 MVT EltVT = VecVT.getVectorElementType();
20477
20478 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20479 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20480
20481 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20482 // this can be done with a mask.
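// For example (illustrative): extracting element 6 of a v8i32 has
// ElemsPerChunk = 4, so 6 & (4 - 1) = 2, the element's position within the
// 128-bit chunk extracted above.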
20483 IdxVal &= ElemsPerChunk - 1;
20484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20485 DAG.getIntPtrConstant(IdxVal, dl));
20486 }
20487
20488 assert(VecVT.is128BitVector() && "Unexpected vector length");
20489
20490 MVT VT = Op.getSimpleValueType();
20491
20492 if (VT == MVT::i16) {
20493 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20494 // we're going to zero extend the register or fold the store (SSE41 only).
20495 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20496 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20497 if (Subtarget.hasFP16())
20498 return Op;
20499
20500 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20501 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20502 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20503 }
20504
20505 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20506 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20507 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20508 }
20509
20510 if (Subtarget.hasSSE41())
20511 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20512 return Res;
20513
20514 // TODO: We only extract a single element from v16i8; we can probably afford
20515 // to be more aggressive here before falling back on the default approach of
20516 // spilling to the stack.
20517 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20518 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20519 int DWordIdx = IdxVal / 4;
20520 if (DWordIdx == 0) {
20521 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20522 DAG.getBitcast(MVT::v4i32, Vec),
20523 DAG.getIntPtrConstant(DWordIdx, dl));
20524 int ShiftVal = (IdxVal % 4) * 8;
20525 if (ShiftVal != 0)
20526 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20527 DAG.getConstant(ShiftVal, dl, MVT::i8));
20528 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20529 }
20530
20531 int WordIdx = IdxVal / 2;
20532 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20533 DAG.getBitcast(MVT::v8i16, Vec),
20534 DAG.getIntPtrConstant(WordIdx, dl));
20535 int ShiftVal = (IdxVal % 2) * 8;
20536 if (ShiftVal != 0)
20537 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20538 DAG.getConstant(ShiftVal, dl, MVT::i8));
20539 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20540 }
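// Worked example (illustrative): extracting byte 5 of a v16i8 takes the
// word path above with WordIdx = 5 / 2 = 2, i.e. extract i16 element 2 of
// the v8i16 bitcast, shift right by (5 % 2) * 8 = 8 bits, then truncate.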
20541
20542 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20543 if (IdxVal == 0)
20544 return Op;
20545
20546 // Shuffle the element to the lowest element, then movss or movsh.
20547 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20548 Mask[0] = static_cast<int>(IdxVal);
20549 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20550 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20551 DAG.getIntPtrConstant(0, dl));
20552 }
20553
20554 if (VT.getSizeInBits() == 64) {
20555 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20556 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20557 // to match extract_elt for f64.
20558 if (IdxVal == 0)
20559 return Op;
20560
20561 // UNPCKHPD the element to the lowest double word, then movsd.
20562 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20563 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20564 int Mask[2] = { 1, -1 };
20565 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20567 DAG.getIntPtrConstant(0, dl));
20568 }
20569
20570 return SDValue();
20571}
20572
20573/// Insert one bit to mask vector, like v16i1 or v8i1.
20574/// AVX-512 feature.
20575static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20576 const X86Subtarget &Subtarget) {
20577 SDLoc dl(Op);
20578 SDValue Vec = Op.getOperand(0);
20579 SDValue Elt = Op.getOperand(1);
20580 SDValue Idx = Op.getOperand(2);
20581 MVT VecVT = Vec.getSimpleValueType();
20582
20583 if (!isa<ConstantSDNode>(Idx)) {
20584 // Non-constant index. Extend the source and destination,
20585 // insert the element, and then truncate the result.
20586 unsigned NumElts = VecVT.getVectorNumElements();
20587 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20588 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20589 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20590 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20591 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20592 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20593 }
20594
20595 // Copy into a k-register, extract to v1i1 and insert_subvector.
20596 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20597 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20598}
20599
20600SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20601 SelectionDAG &DAG) const {
20602 MVT VT = Op.getSimpleValueType();
20603 MVT EltVT = VT.getVectorElementType();
20604 unsigned NumElts = VT.getVectorNumElements();
20605 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20606
20607 if (EltVT == MVT::i1)
20608 return InsertBitToMaskVector(Op, DAG, Subtarget);
20609
20610 SDLoc dl(Op);
20611 SDValue N0 = Op.getOperand(0);
20612 SDValue N1 = Op.getOperand(1);
20613 SDValue N2 = Op.getOperand(2);
20614 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20615
20616 if (!N2C) {
20617 // For variable insertion indices we're usually better off spilling to the stack,
20618 // but AVX512 can use a variable compare+select by comparing against all
20619 // possible vector indices, and FP insertion has less gpr->simd traffic.
20620 if (!(Subtarget.hasBWI() ||
20621 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20622 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20623 return SDValue();
20624
20625 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20626 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20627 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20628 return SDValue();
20629
20630 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20631 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20632 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20633
20634 SmallVector<SDValue, 16> RawIndices;
20635 for (unsigned I = 0; I != NumElts; ++I)
20636 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20637 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20638
20639 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20640 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20641 ISD::CondCode::SETEQ);
20642 }
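// Worked example (illustrative): for a v4i32 insert at a variable index N2,
// the select above computes
//   result[i] = (N2 == i) ? N1 : N0[i]   for i = 0..3
// by splatting N2 and N1 and comparing against the constant vector {0,1,2,3}.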
20643
20644 if (N2C->getAPIntValue().uge(NumElts))
20645 return SDValue();
20646 uint64_t IdxVal = N2C->getZExtValue();
20647
20648 bool IsZeroElt = X86::isZeroNode(N1);
20649 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20650
20651 if (IsZeroElt || IsAllOnesElt) {
20652 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20653 // We don't deal with i8 0 since it appears to be handled elsewhere.
20654 if (IsAllOnesElt &&
20655 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20656 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20657 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20658 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20659 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20660 CstVectorElts[IdxVal] = OnesCst;
20661 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20662 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20663 }
20664 // See if we can do this more efficiently with a blend shuffle with a
20665 // rematerializable vector.
20666 if (Subtarget.hasSSE41() &&
20667 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20668 SmallVector<int, 8> BlendMask;
20669 for (unsigned i = 0; i != NumElts; ++i)
20670 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20671 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20672 : getOnesVector(VT, DAG, dl);
20673 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20674 }
20675 }
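// Worked example (illustrative): with the 'OR' blend above, inserting -1 into
// lane 3 of a v16i8 without SSE4.1 becomes
//   N0 | <0, 0, 0, -1, 0, ..., 0>
// since OR with 0 preserves a lane and OR with all-ones forces it to -1.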
20676
20677 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20678 // into that, and then insert the subvector back into the result.
20679 if (VT.is256BitVector() || VT.is512BitVector()) {
20680 // With a 256-bit vector, we can insert into the zero element efficiently
20681 // using a blend if we have AVX or AVX2 and the right data type.
20682 if (VT.is256BitVector() && IdxVal == 0) {
20683 // TODO: It is worthwhile to cast integer to floating point and back
20684 // and incur a domain crossing penalty if that's what we'll end up
20685 // doing anyway after extracting to a 128-bit vector.
20686 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20687 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20688 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20689 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20690 DAG.getTargetConstant(1, dl, MVT::i8));
20691 }
20692 }
20693
20694 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20695 assert(isPowerOf2_32(NumEltsIn128) &&
20696 "Vectors will always have power-of-two number of elements.");
20697
20698 // If we are not inserting into the low 128-bit vector chunk,
20699 // then prefer the broadcast+blend sequence.
20700 // FIXME: relax the profitability check iff all N1 uses are insertions.
20701 if (IdxVal >= NumEltsIn128 &&
20702 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20703 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20704 X86::mayFoldLoad(N1, Subtarget)))) {
20705 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20706 SmallVector<int, 8> BlendMask;
20707 for (unsigned i = 0; i != NumElts; ++i)
20708 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20709 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20710 }
20711
20712 // Get the desired 128-bit vector chunk.
20713 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20714
20715 // Insert the element into the desired chunk.
20716 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20717 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20718
20719 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20720 DAG.getIntPtrConstant(IdxIn128, dl));
20721
20722 // Insert the changed part back into the bigger vector
20723 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20724 }
20725 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20726
20727 // This will be just movw/movd/movq/movsh/movss/movsd.
20728 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20729 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20730 EltVT == MVT::f16 || EltVT == MVT::i64) {
20731 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20732 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20733 }
20734
20735 // We can't directly insert an i8 or i16 into a vector, so zero extend
20736 // it to i32 first.
20737 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20738 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20739 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20740 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20741 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20742 return DAG.getBitcast(VT, N1);
20743 }
20744 }
20745
20746 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20747 // argument. SSE41 is required for pinsrb.
20748 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20749 unsigned Opc;
20750 if (VT == MVT::v8i16) {
20751 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20752 Opc = X86ISD::PINSRW;
20753 } else {
20754 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20755 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20756 Opc = X86ISD::PINSRB;
20757 }
20758
20759 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20760 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20761 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20762 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20763 }
20764
20765 if (Subtarget.hasSSE41()) {
20766 if (EltVT == MVT::f32) {
20767 // Bits [7:6] of the constant are the source select. This will always be
20768 // zero here. The DAG Combiner may combine an extract_elt index into
20769 // these bits. For example (insert (extract, 3), 2) could be matched by
20770 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20771 // Bits [5:4] of the constant are the destination select. This is the
20772 // value of the incoming immediate.
20773 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20774 // combine either bitwise AND or insert of float 0.0 to set these bits.
20775
20776 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20777 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20778 // If this is an insertion of 32-bits into the low 32-bits of
20779 // a vector, we prefer to generate a blend with immediate rather
20780 // than an insertps. Blends are simpler operations in hardware and so
20781 // will always have equal or better performance than insertps.
20782 // But if optimizing for size and there's a load folding opportunity,
20783 // generate insertps because blendps does not have a 32-bit memory
20784 // operand form.
20785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20786 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20787 DAG.getTargetConstant(1, dl, MVT::i8));
20788 }
20789 // Create this as a scalar-to-vector.
20790 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20791 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20792 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20793 }
20794
20795 // PINSR* works with constant index.
20796 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20797 return Op;
20798 }
20799
20800 return SDValue();
20801}
20802
20803static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 SDLoc dl(Op);
20806 MVT OpVT = Op.getSimpleValueType();
20807
20808 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
20809 // further combines.
20810 if (X86::isZeroNode(Op.getOperand(0)))
20811 return getZeroVector(OpVT, Subtarget, DAG, dl);
20812
20813 // If this is a wider-than-128-bit vector result, first insert into a 128-bit
20814 // vector and then insert that into the full-width vector.
20815 if (!OpVT.is128BitVector()) {
20816 // Insert into a 128-bit vector.
20817 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20818 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20819 OpVT.getVectorNumElements() / SizeFactor);
20820
20821 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20822
20823 // Insert the 128-bit vector.
20824 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20825 }
20826 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20827 "Expected an SSE type!");
20828
20829 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20830 // tblgen.
20831 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20832 return Op;
20833
20834 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20835 return DAG.getBitcast(
20836 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20837}
20838
20839// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20840// simple superregister reference or explicit instructions to insert
20841// the upper bits of a vector.
20842static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20843 SelectionDAG &DAG) {
20844 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20845
20846 return insert1BitVector(Op, DAG, Subtarget);
20847}
20848
20849static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20850 SelectionDAG &DAG) {
20851 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20852 "Only vXi1 extract_subvectors need custom lowering");
20853
20854 SDLoc dl(Op);
20855 SDValue Vec = Op.getOperand(0);
20856 uint64_t IdxVal = Op.getConstantOperandVal(1);
20857
20858 if (IdxVal == 0) // the operation is legal
20859 return Op;
20860
20861 MVT VecVT = Vec.getSimpleValueType();
20862 unsigned NumElems = VecVT.getVectorNumElements();
20863
20864 // Extend to natively supported kshift.
20865 MVT WideVecVT = VecVT;
20866 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20867 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20868 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20869 DAG.getUNDEF(WideVecVT), Vec,
20870 DAG.getIntPtrConstant(0, dl));
20871 }
20872
20873 // Shift to the LSB.
20874 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20875 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20876
20877 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20878 DAG.getIntPtrConstant(0, dl));
20879}
20880
20881// Returns the appropriate wrapper opcode for a global reference.
20882unsigned X86TargetLowering::getGlobalWrapperKind(
20883 const GlobalValue *GV, const unsigned char OpFlags) const {
20884 // References to absolute symbols are never PC-relative.
20885 if (GV && GV->isAbsoluteSymbolRef())
20886 return X86ISD::Wrapper;
20887
20888 CodeModel::Model M = getTargetMachine().getCodeModel();
20889 if (Subtarget.isPICStyleRIPRel() &&
20890 (M == CodeModel::Small || M == CodeModel::Kernel))
20891 return X86ISD::WrapperRIP;
20892
20893 // In the medium model, functions can always be referenced RIP-relatively,
20894 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20895 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20896 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20897 return X86ISD::WrapperRIP;
20898
20899 // GOTPCREL references must always use RIP.
20900 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20901 return X86ISD::WrapperRIP;
20902
20903 return X86ISD::Wrapper;
20904}
20905
20906// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20907 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
20908 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20909 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20910 // be used to form an addressing mode. These wrapped nodes will be selected
20911// into MOV32ri.
20912SDValue
20913X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20914 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20915
20916 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20917 // global base reg.
20918 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20919
20920 auto PtrVT = getPointerTy(DAG.getDataLayout());
20921 SDValue Result = DAG.getTargetConstantPool(
20922 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20923 SDLoc DL(CP);
20924 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20925 // With PIC, the address is actually $g + Offset.
20926 if (OpFlag) {
20927 Result =
20928 DAG.getNode(ISD::ADD, DL, PtrVT,
20929 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20930 }
20931
20932 return Result;
20933}
20934
20935SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20936 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20937
20938 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20939 // global base reg.
20940 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20941
20942 auto PtrVT = getPointerTy(DAG.getDataLayout());
20943 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20944 SDLoc DL(JT);
20945 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20946
20947 // With PIC, the address is actually $g + Offset.
20948 if (OpFlag)
20949 Result =
20950 DAG.getNode(ISD::ADD, DL, PtrVT,
20951 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20952
20953 return Result;
20954}
20955
20956SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20957 SelectionDAG &DAG) const {
20958 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20959}
20960
20961SDValue
20962X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20963 // Create the TargetBlockAddressAddress node.
20964 unsigned char OpFlags =
20965 Subtarget.classifyBlockAddressReference();
20966 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20967 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20968 SDLoc dl(Op);
20969 auto PtrVT = getPointerTy(DAG.getDataLayout());
20970 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20971 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20972
20973 // With PIC, the address is actually $g + Offset.
20974 if (isGlobalRelativeToPICBase(OpFlags)) {
20975 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20976 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20977 }
20978
20979 return Result;
20980}
20981
20982/// Creates target global address or external symbol nodes for calls or
20983/// other uses.
20984SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20985 bool ForCall) const {
20986 // Unpack the global address or external symbol.
20987 const SDLoc &dl = SDLoc(Op);
20988 const GlobalValue *GV = nullptr;
20989 int64_t Offset = 0;
20990 const char *ExternalSym = nullptr;
20991 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20992 GV = G->getGlobal();
20993 Offset = G->getOffset();
20994 } else {
20995 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20996 ExternalSym = ES->getSymbol();
20997 }
20998
20999 // Calculate some flags for address lowering.
21000 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
21001 unsigned char OpFlags;
21002 if (ForCall)
21003 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
21004 else
21005 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
21006 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
21007 bool NeedsLoad = isGlobalStubReference(OpFlags);
21008
21009 CodeModel::Model M = DAG.getTarget().getCodeModel();
21010 auto PtrVT = getPointerTy(DAG.getDataLayout());
21011 SDValue Result;
21012
21013 if (GV) {
21014 // Create a target global address if this is a global. If possible, fold the
21015 // offset into the global address reference. Otherwise, ADD it on later.
21016 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
21017 // allowed because if the address of foo is 0, the ELF R_X86_64_32
21018 // relocation will compute to a negative value, which is invalid.
21019 int64_t GlobalOffset = 0;
21020 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
21021 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
21022 std::swap(GlobalOffset, Offset);
21023 }
21024 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
21025 } else {
21026 // If this is not a global address, this must be an external symbol.
21027 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
21028 }
21029
21030 // If this is a direct call, avoid the wrapper if we don't need to do any
21031 // loads or adds. This allows SDAG ISel to match direct calls.
21032 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
21033 return Result;
21034
21035 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
21036
21037 // With PIC, the address is actually $g + Offset.
21038 if (HasPICReg) {
21039 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
21040 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
21041 }
21042
21043 // For globals that require a load from a stub to get the address, emit the
21044 // load.
21045 if (NeedsLoad)
21046 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
21047 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21048
21049 // If there was a non-zero offset that we didn't fold, create an explicit
21050 // addition for it.
21051 if (Offset != 0)
21052 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
21053 DAG.getConstant(Offset, dl, PtrVT));
21054
21055 return Result;
21056}
21057
21058SDValue
21059X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
21060 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
21061}
21062
21063static SDValue
21064GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21065 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21066 unsigned char OperandFlags, bool LocalDynamic = false) {
21067 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21068 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21069 SDLoc dl(GA);
21070 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21071 GA->getValueType(0),
21072 GA->getOffset(),
21073 OperandFlags);
21074
21075 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21076 : X86ISD::TLSADDR;
21077
21078 if (InGlue) {
21079 SDValue Ops[] = { Chain, TGA, *InGlue };
21080 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21081 } else {
21082 SDValue Ops[] = { Chain, TGA };
21083 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21084 }
21085
21086 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21087 MFI.setAdjustsStack(true);
21088 MFI.setHasCalls(true);
21089
21090 SDValue Glue = Chain.getValue(1);
21091 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21092}
21093
21094// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21095static SDValue
21096LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21097 const EVT PtrVT) {
21098 SDValue InGlue;
21099 SDLoc dl(GA); // ? function entry point might be better
21100 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21101 DAG.getNode(X86ISD::GlobalBaseReg,
21102 SDLoc(), PtrVT), InGlue);
21103 InGlue = Chain.getValue(1);
21104
21105 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21106}
21107
21108// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21109static SDValue
21110LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21111 const EVT PtrVT) {
21112 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21113 X86::RAX, X86II::MO_TLSGD);
21114}
21115
21116// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21117static SDValue
21118LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21119 const EVT PtrVT) {
21120 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21121 X86::EAX, X86II::MO_TLSGD);
21122}
21123
21124static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21125 SelectionDAG &DAG, const EVT PtrVT,
21126 bool Is64Bit, bool Is64BitLP64) {
21127 SDLoc dl(GA);
21128
21129 // Get the start address of the TLS block for this module.
21130 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21131 .getInfo<X86MachineFunctionInfo>();
21132 MFI->incNumLocalDynamicTLSAccesses();
21133
21134 SDValue Base;
21135 if (Is64Bit) {
21136 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21137 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21138 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21139 } else {
21140 SDValue InGlue;
21141 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21142 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21143 InGlue = Chain.getValue(1);
21144 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21145 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21146 }
21147
21148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21149 // of Base.
21150
21151 // Build x@dtpoff.
21152 unsigned char OperandFlags = X86II::MO_DTPOFF;
21153 unsigned WrapperKind = X86ISD::Wrapper;
21154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21155 GA->getValueType(0),
21156 GA->getOffset(), OperandFlags);
21157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21158
21159 // Add x@dtpoff with the base.
21160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21161}
21162
21163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21164static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21165 const EVT PtrVT, TLSModel::Model model,
21166 bool is64Bit, bool isPIC) {
21167 SDLoc dl(GA);
21168
21169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21170 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21171 is64Bit ? 257 : 256));
21172
21173 SDValue ThreadPointer =
21174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21175 MachinePointerInfo(Ptr));
21176
21177 unsigned char OperandFlags = 0;
21178 // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
21179 // initial exec.
21180 unsigned WrapperKind = X86ISD::Wrapper;
21181 if (model == TLSModel::LocalExec) {
21182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21183 } else if (model == TLSModel::InitialExec) {
21184 if (is64Bit) {
21185 OperandFlags = X86II::MO_GOTTPOFF;
21186 WrapperKind = X86ISD::WrapperRIP;
21187 } else {
21188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21189 }
21190 } else {
21191 llvm_unreachable("Unexpected model");
21192 }
21193
21194 // emit "addl x@ntpoff,%eax" (local exec)
21195 // or "addl x@indntpoff,%eax" (initial exec)
21196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21197 SDValue TGA =
21198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21199 GA->getOffset(), OperandFlags);
21200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21201
21202 if (model == TLSModel::InitialExec) {
21203 if (isPIC && !is64Bit) {
21204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21206 Offset);
21207 }
21208
21209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21210 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21211 }
21212
21213 // The address of the thread local variable is the add of the thread
21214 // pointer with the offset of the variable.
21215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21216}
21217
21218SDValue
21219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21220
21221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21222
21223 if (DAG.getTarget().useEmulatedTLS())
21224 return LowerToTLSEmulatedModel(GA, DAG);
21225
21226 const GlobalValue *GV = GA->getGlobal();
21227 auto PtrVT = getPointerTy(DAG.getDataLayout());
21228 bool PositionIndependent = isPositionIndependent();
21229
21230 if (Subtarget.isTargetELF()) {
21231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21232 switch (model) {
21233 case TLSModel::GeneralDynamic:
21234 if (Subtarget.is64Bit()) {
21235 if (Subtarget.isTarget64BitLP64())
21236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21238 }
21239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21240 case TLSModel::LocalDynamic:
21241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21242 Subtarget.isTarget64BitLP64());
21243 case TLSModel::InitialExec:
21244 case TLSModel::LocalExec:
21245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21246 PositionIndependent);
21247 }
21248 llvm_unreachable("Unknown TLS model.");
21249 }
21250
21251 if (Subtarget.isTargetDarwin()) {
21252 // Darwin only has one model of TLS. Lower to that.
21253 unsigned char OpFlag = 0;
21254 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21255 X86ISD::WrapperRIP : X86ISD::Wrapper;
21256
21257 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21258 // global base reg.
21259 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21260 if (PIC32)
21261 OpFlag = X86II::MO_TLVP_PIC_BASE;
21262 else
21263 OpFlag = X86II::MO_TLVP;
21264 SDLoc DL(Op);
21265 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21266 GA->getValueType(0),
21267 GA->getOffset(), OpFlag);
21268 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21269
21270 // With PIC32, the address is actually $g + Offset.
21271 if (PIC32)
21272 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21273 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21274 Offset);
21275
21276 // Lowering the machine ISD will make sure everything is in the right
21277 // location.
21278 SDValue Chain = DAG.getEntryNode();
21279 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21280 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21281 SDValue Args[] = { Chain, Offset };
21282 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21283 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21284
21285 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21286 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21287 MFI.setAdjustsStack(true);
21288
21289 // And our return value (tls address) is in the standard call return value
21290 // location.
21291 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21292 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21293 }
21294
21295 if (Subtarget.isOSWindows()) {
21296 // Just use the implicit TLS architecture
21297 // Need to generate something similar to:
21298 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21299 // ; from TEB
21300 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21301 // mov rcx, qword [rdx+rcx*8]
21302 // mov eax, .tls$:tlsvar
21303 // [rax+rcx] contains the address
21304 // Windows 64bit: gs:0x58
21305 // Windows 32bit: fs:__tls_array
21306
21307 SDLoc dl(GA);
21308 SDValue Chain = DAG.getEntryNode();
21309
21310 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21311 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21312 // use its literal value of 0x2C.
21313 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21314 ? Type::getInt8PtrTy(*DAG.getContext(),
21315 256)
21316 : Type::getInt32PtrTy(*DAG.getContext(),
21317 257));
21318
21319 SDValue TlsArray = Subtarget.is64Bit()
21320 ? DAG.getIntPtrConstant(0x58, dl)
21321 : (Subtarget.isTargetWindowsGNU()
21322 ? DAG.getIntPtrConstant(0x2C, dl)
21323 : DAG.getExternalSymbol("_tls_array", PtrVT));
21324
21325 SDValue ThreadPointer =
21326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21327
21328 SDValue res;
21329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21330 res = ThreadPointer;
21331 } else {
21332 // Load the _tls_index variable
21333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21334 if (Subtarget.is64Bit())
21335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21336 MachinePointerInfo(), MVT::i32);
21337 else
21338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21339
21340 const DataLayout &DL = DAG.getDataLayout();
21341 SDValue Scale =
21342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21344
21345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21346 }
21347
21348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21349
21350 // Get the offset of start of .tls section
21351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21352 GA->getValueType(0),
21353 GA->getOffset(), X86II::MO_SECREL);
21354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21355
21356 // The address of the thread local variable is the add of the thread
21357 // pointer with the offset of the variable.
21358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21359 }
21360
21361 llvm_unreachable("TLS not implemented for this target.");
21362}
21363
21364/// Lower SRA_PARTS and friends, which return two i32 values
21365/// and take a 2 x i32 value to shift plus a shift amount.
21366/// TODO: Can this be moved to general expansion code?
21367static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21368 SDValue Lo, Hi;
21369 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21370 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21371}
21372
21373// Try to use a packed vector operation to handle i64 on 32-bit targets when
21374// AVX512DQ is enabled.
21375static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21376 const X86Subtarget &Subtarget) {
21377 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21378 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21379 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21380 Op.getOpcode() == ISD::UINT_TO_FP) &&
21381 "Unexpected opcode!");
21382 bool IsStrict = Op->isStrictFPOpcode();
21383 unsigned OpNo = IsStrict ? 1 : 0;
21384 SDValue Src = Op.getOperand(OpNo);
21385 MVT SrcVT = Src.getSimpleValueType();
21386 MVT VT = Op.getSimpleValueType();
21387
21388 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21389 (VT != MVT::f32 && VT != MVT::f64))
21390 return SDValue();
21391
21392 // Pack the i64 into a vector, do the operation and extract.
21393
21394 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21395 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21396 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21397 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21398
21399 SDLoc dl(Op);
21400 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21401 if (IsStrict) {
21402 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21403 {Op.getOperand(0), InVec});
21404 SDValue Chain = CvtVec.getValue(1);
21405 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21406 DAG.getIntPtrConstant(0, dl));
21407 return DAG.getMergeValues({Value, Chain}, dl);
21408 }
21409
21410 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21411
21412 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21413 DAG.getIntPtrConstant(0, dl));
21414}
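// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming an
// AVX-512DQ target and <immintrin.h>: the same pack / convert / extract idea as
// LowerI64IntToFP_AVX512DQ above, written with intrinsics for the f64 case.
#include <immintrin.h>
#include <cstdint>

static double i64ToDoubleViaVector(int64_t X) {
  __m512i V = _mm512_set1_epi64(X);                  // lane 0 holds X
  __m512d C = _mm512_cvtepi64_pd(V);                 // VCVTQQ2PD: packed i64 -> f64
  return _mm_cvtsd_f64(_mm512_castpd512_pd128(C));   // read back lane 0
}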
21415
21416// Try to use a packed vector operation to handle i64 on 32-bit targets.
21417static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21418 const X86Subtarget &Subtarget) {
21419 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21420 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21421 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21422 Op.getOpcode() == ISD::UINT_TO_FP) &&
21423 "Unexpected opcode!");
21424 bool IsStrict = Op->isStrictFPOpcode();
21425 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21426 MVT SrcVT = Src.getSimpleValueType();
21427 MVT VT = Op.getSimpleValueType();
21428
21429 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21430 return SDValue();
21431
21432 // Pack the i64 into a vector, do the operation and extract.
21433
21434 assert(Subtarget.hasFP16() && "Expected FP16");
21435
21436 SDLoc dl(Op);
21437 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21438 if (IsStrict) {
21439 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21440 {Op.getOperand(0), InVec});
21441 SDValue Chain = CvtVec.getValue(1);
21442 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21443 DAG.getIntPtrConstant(0, dl));
21444 return DAG.getMergeValues({Value, Chain}, dl);
21445 }
21446
21447 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21448
21449 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21450 DAG.getIntPtrConstant(0, dl));
21451}
21452
21453static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21454 const X86Subtarget &Subtarget) {
21455 switch (Opcode) {
21456 case ISD::SINT_TO_FP:
21457 // TODO: Handle wider types with AVX/AVX512.
21458 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21459 return false;
21460 // CVTDQ2PS or (V)CVTDQ2PD
21461 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21462
21463 case ISD::UINT_TO_FP:
21464 // TODO: Handle wider types and i64 elements.
21465 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21466 return false;
21467 // VCVTUDQ2PS or VCVTUDQ2PD
21468 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21469
21470 default:
21471 return false;
21472 }
21473}
21474
21475/// Given a scalar cast operation that is extracted from a vector, try to
21476/// vectorize the cast op followed by extraction. This will avoid an expensive
21477/// round-trip between XMM and GPR.
21478static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21479 const X86Subtarget &Subtarget) {
21480 // TODO: This could be enhanced to handle smaller integer types by peeking
21481 // through an extend.
21482 SDValue Extract = Cast.getOperand(0);
21483 MVT DestVT = Cast.getSimpleValueType();
21484 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21485 !isa<ConstantSDNode>(Extract.getOperand(1)))
21486 return SDValue();
21487
21488 // See if we have a 128-bit vector cast op for this type of cast.
21489 SDValue VecOp = Extract.getOperand(0);
21490 MVT FromVT = VecOp.getSimpleValueType();
21491 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21492 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21493 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21494 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21495 return SDValue();
21496
21497 // If we are extracting from a non-zero element, first shuffle the source
21498 // vector to allow extracting from element zero.
21499 SDLoc DL(Cast);
21500 if (!isNullConstant(Extract.getOperand(1))) {
21501 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21502 Mask[0] = Extract.getConstantOperandVal(1);
21503 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21504 }
21505 // If the source vector is wider than 128-bits, extract the low part. Do not
21506 // create an unnecessarily wide vector cast op.
21507 if (FromVT != Vec128VT)
21508 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21509
21510 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21511 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21512 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21513 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21514 DAG.getIntPtrConstant(0, DL));
21515}
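// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming SSE2 and
// <immintrin.h>: the shape of the transform above for
// sitofp(extractelement <4 x i32> %v, 2) -> float. Rather than moving the
// element to a GPR, shuffle it to lane 0, run the packed CVTDQ2PS, and read
// lane 0 back out.
#include <immintrin.h>

static float convertElement2(__m128i V) {
  __m128i Shuf = _mm_shuffle_epi32(V, _MM_SHUFFLE(2, 2, 2, 2)); // lane 0 = element 2
  __m128 Cvt = _mm_cvtepi32_ps(Shuf);                           // packed i32 -> f32
  return _mm_cvtss_f32(Cvt);                                    // extract lane 0
}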
21516
21517/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21518/// try to vectorize the cast ops. This will avoid an expensive round-trip
21519/// between XMM and GPR.
21520static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21521 const X86Subtarget &Subtarget) {
21522 // TODO: Allow FP_TO_UINT.
21523 SDValue CastToInt = CastToFP.getOperand(0);
21524 MVT VT = CastToFP.getSimpleValueType();
21525 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21526 return SDValue();
21527
21528 MVT IntVT = CastToInt.getSimpleValueType();
21529 SDValue X = CastToInt.getOperand(0);
21530 MVT SrcVT = X.getSimpleValueType();
21531 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21532 return SDValue();
21533
21534 // See if we have 128-bit vector cast instructions for this type of cast.
21535 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21536 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21537 IntVT != MVT::i32)
21538 return SDValue();
21539
21540 unsigned SrcSize = SrcVT.getSizeInBits();
21541 unsigned IntSize = IntVT.getSizeInBits();
21542 unsigned VTSize = VT.getSizeInBits();
21543 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21544 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21545 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21546
21547 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21548 unsigned ToIntOpcode =
21549 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21550 unsigned ToFPOpcode =
21551 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21552
21553 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21554 //
21555 // We are not defining the high elements (for example, zero them) because
21556 // that could nullify any performance advantage that we hoped to gain from
21557 // this vector op hack. We do not expect any adverse effects (like denorm
21558 // penalties) with cast ops.
21559 SDLoc DL(CastToFP);
21560 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21561 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21562 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21563 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21564 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21565}
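// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming SSE2 and
// <immintrin.h>: the scalar pattern handled by lowerFPToIntToFP above,
// float y = (float)(int)x, kept entirely in XMM registers instead of bouncing
// the intermediate i32 through a GPR.
#include <immintrin.h>

static float truncToIntAndBack(float X) {
  __m128 V = _mm_set_ss(X);            // X in lane 0, upper lanes zeroed
  __m128i I = _mm_cvttps_epi32(V);     // CVTTPS2DQ: truncating f32 -> i32
  __m128 R = _mm_cvtepi32_ps(I);       // CVTDQ2PS: i32 -> f32
  return _mm_cvtss_f32(R);
}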
21566
21567static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21568 const X86Subtarget &Subtarget) {
21569 SDLoc DL(Op);
21570 bool IsStrict = Op->isStrictFPOpcode();
21571 MVT VT = Op->getSimpleValueType(0);
21572 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21573
21574 if (Subtarget.hasDQI()) {
21575 assert(!Subtarget.hasVLX() && "Unexpected features");
21576
21577 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21578 Src.getSimpleValueType() == MVT::v4i64) &&
21579 "Unsupported custom type");
21580
21581 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21582 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21583 "Unexpected VT!");
21584 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21585
21586 // Need to concat with zero vector for strict fp to avoid spurious
21587 // exceptions.
21588 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21589 : DAG.getUNDEF(MVT::v8i64);
21590 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21591 DAG.getIntPtrConstant(0, DL));
21592 SDValue Res, Chain;
21593 if (IsStrict) {
21594 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21595 {Op->getOperand(0), Src});
21596 Chain = Res.getValue(1);
21597 } else {
21598 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21599 }
21600
21601 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21602 DAG.getIntPtrConstant(0, DL));
21603
21604 if (IsStrict)
21605 return DAG.getMergeValues({Res, Chain}, DL);
21606 return Res;
21607 }
21608
21609 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21610 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21611 if (VT != MVT::v4f32 || IsSigned)
21612 return SDValue();
21613
21614 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21615 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21616 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21617 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21618 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21619 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21620 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21621 SmallVector<SDValue, 4> SignCvts(4);
21622 SmallVector<SDValue, 4> Chains(4);
21623 for (int i = 0; i != 4; ++i) {
21624 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21625 DAG.getIntPtrConstant(i, DL));
21626 if (IsStrict) {
21627 SignCvts[i] =
21628 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21629 {Op.getOperand(0), Elt});
21630 Chains[i] = SignCvts[i].getValue(1);
21631 } else {
21632 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21633 }
21634 }
21635 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21636
21637 SDValue Slow, Chain;
21638 if (IsStrict) {
21639 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21640 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21641 {Chain, SignCvt, SignCvt});
21642 Chain = Slow.getValue(1);
21643 } else {
21644 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21645 }
21646
21647 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21648 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21649
21650 if (IsStrict)
21651 return DAG.getMergeValues({Cvt, Chain}, DL);
21652
21653 return Cvt;
21654}
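// A minimal standalone sketch (not from X86ISelLowering.cpp): the scalar trick
// the unsigned v4i64 -> v4f32 path above vectorizes. If the sign bit is set,
// halve the value while OR-ing back its low bit (so the final rounding is
// unaffected), convert as signed, then double the result with an FP add.
#include <cstdint>

static float u64ToFloat(uint64_t U) {
  if ((int64_t)U >= 0)
    return (float)(int64_t)U;               // already fits in the signed range
  uint64_t Halved = (U >> 1) | (U & 1);     // keep the low bit as a sticky bit
  float F = (float)(int64_t)Halved;
  return F + F;                             // multiply back by two
}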
21655
21656static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21657 bool IsStrict = Op->isStrictFPOpcode();
21658 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21659 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21660 MVT VT = Op.getSimpleValueType();
21661 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21662 SDLoc dl(Op);
21663
21664 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21665 if (IsStrict)
21666 return DAG.getNode(
21667 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21668 {Chain,
21669 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21670 Rnd});
21671 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21672 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21673}
21674
21675static bool isLegalConversion(MVT VT, bool IsSigned,
21676 const X86Subtarget &Subtarget) {
21677 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21678 return true;
21679 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21680 return true;
21681 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21682 return true;
21683 if (Subtarget.useAVX512Regs()) {
21684 if (VT == MVT::v16i32)
21685 return true;
21686 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21687 return true;
21688 }
21689 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21690 (VT == MVT::v2i64 || VT == MVT::v4i64))
21691 return true;
21692 return false;
21693}
21694
21695SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21696 SelectionDAG &DAG) const {
21697 bool IsStrict = Op->isStrictFPOpcode();
21698 unsigned OpNo = IsStrict ? 1 : 0;
21699 SDValue Src = Op.getOperand(OpNo);
21700 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21701 MVT SrcVT = Src.getSimpleValueType();
21702 MVT VT = Op.getSimpleValueType();
21703 SDLoc dl(Op);
21704
21705 if (isSoftFP16(VT))
21706 return promoteXINT_TO_FP(Op, DAG);
21707 else if (isLegalConversion(SrcVT, true, Subtarget))
21708 return Op;
21709
21710 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21711 return LowerWin64_INT128_TO_FP(Op, DAG);
21712
21713 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21714 return Extract;
21715
21716 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21717 return R;
21718
21719 if (SrcVT.isVector()) {
21720 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21721 // Note: since v2f64 is a legal type, we don't need to zero-extend the
21722 // source for strict FP.
21723 if (IsStrict)
21724 return DAG.getNode(
21725 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21726 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21727 DAG.getUNDEF(SrcVT))});
21728 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21729 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21730 DAG.getUNDEF(SrcVT)));
21731 }
21732 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21733 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21734
21735 return SDValue();
21736 }
21737
21738 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21739 "Unknown SINT_TO_FP to lower!");
21740
21741 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21742
21743 // These are really Legal; return the operand so the caller accepts it as
21744 // Legal.
21745 if (SrcVT == MVT::i32 && UseSSEReg)
21746 return Op;
21747 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21748 return Op;
21749
21750 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21751 return V;
21752 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21753 return V;
21754
21755 // SSE doesn't have an i16 conversion so we need to promote.
21756 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21757 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21758 if (IsStrict)
21759 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21760 {Chain, Ext});
21761
21762 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21763 }
21764
21765 if (VT == MVT::f128 || !Subtarget.hasX87())
21766 return SDValue();
21767
21768 SDValue ValueToStore = Src;
21769 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21770 // Bitcasting to f64 here allows us to do a single 64-bit store from
21771 // an SSE register, avoiding the store forwarding penalty that would come
21772 // with two 32-bit stores.
21773 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21774
21775 unsigned Size = SrcVT.getStoreSize();
21776 Align Alignment(Size);
21777 MachineFunction &MF = DAG.getMachineFunction();
21778 auto PtrVT = getPointerTy(MF.getDataLayout());
21779 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21780 MachinePointerInfo MPI =
21781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21782 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21783 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21784 std::pair<SDValue, SDValue> Tmp =
21785 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21789
21790 return Tmp.first;
21791}
21792
21793std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21794 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21795 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21796 // Build the FILD
21797 SDVTList Tys;
21798 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21799 if (useSSE)
21800 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21801 else
21802 Tys = DAG.getVTList(DstVT, MVT::Other);
21803
21804 SDValue FILDOps[] = {Chain, Pointer};
21805 SDValue Result =
21806 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21807 Alignment, MachineMemOperand::MOLoad);
21808 Chain = Result.getValue(1);
21809
21810 if (useSSE) {
21811 MachineFunction &MF = DAG.getMachineFunction();
21812 unsigned SSFISize = DstVT.getStoreSize();
21813 int SSFI =
21814 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21815 auto PtrVT = getPointerTy(MF.getDataLayout());
21816 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21817 Tys = DAG.getVTList(MVT::Other);
21818 SDValue FSTOps[] = {Chain, Result, StackSlot};
21819 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21820 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21821 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21822
21823 Chain =
21824 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21825 Result = DAG.getLoad(
21826 DstVT, DL, Chain, StackSlot,
21827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21828 Chain = Result.getValue(1);
21829 }
21830
21831 return { Result, Chain };
21832}
21833
21834/// Horizontal vector math instructions may be slower than normal math with
21835/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21836/// implementation, and likely shuffle complexity of the alternate sequence.
21837static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21838 const X86Subtarget &Subtarget) {
21839 bool IsOptimizingSize = DAG.shouldOptForSize();
21840 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21841 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21842}
21843
21844/// 64-bit unsigned integer to double expansion.
21845static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21846 const X86Subtarget &Subtarget) {
21847 // We can't use this algorithm for strict fp: it produces -0.0 instead of +0.0
21848 // when converting 0 while rounding toward negative infinity. The caller will
21849 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21850 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21851 // This algorithm is not obvious. Here is what we're trying to output:
21852 /*
21853 movq %rax, %xmm0
21854 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21855 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21856 #ifdef __SSE3__
21857 haddpd %xmm0, %xmm0
21858 #else
21859 pshufd $0x4e, %xmm0, %xmm1
21860 addpd %xmm1, %xmm0
21861 #endif
21862 */
21863
21864 SDLoc dl(Op);
21865 LLVMContext *Context = DAG.getContext();
21866
21867 // Build some magic constants.
21868 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21869 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21870 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21871 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21872
21873 SmallVector<Constant*,2> CV1;
21874 CV1.push_back(
21875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21876 APInt(64, 0x4330000000000000ULL))));
21877 CV1.push_back(
21878 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21879 APInt(64, 0x4530000000000000ULL))));
21880 Constant *C1 = ConstantVector::get(CV1);
21881 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21882
21883 // Load the 64-bit value into an XMM register.
21884 SDValue XR1 =
21885 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21886 SDValue CLod0 = DAG.getLoad(
21887 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21888 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21889 SDValue Unpck1 =
21890 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21891
21892 SDValue CLod1 = DAG.getLoad(
21893 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21894 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21895 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21896 // TODO: Are there any fast-math-flags to propagate here?
21897 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21898 SDValue Result;
21899
21900 if (Subtarget.hasSSE3() &&
21901 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21902 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21903 } else {
21904 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21905 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21906 }
21907 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21908 DAG.getIntPtrConstant(0, dl));
21909 return Result;
21910}
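// A minimal standalone sketch (not from X86ISelLowering.cpp): the arithmetic
// behind the constant-pool trick above. Planting the two 32-bit halves of the
// u64 in the mantissas of 2^52 and 2^84 yields the exact doubles (2^52 + lo) and
// (2^84 + hi * 2^32); subtracting the biases (the subpd) and summing the halves
// (the haddpd) reconstructs the value with a single rounding.
#include <cstdint>
#include <cstring>

static double u64ToDouble(uint64_t U) {
  uint32_t Lo = (uint32_t)U, Hi = (uint32_t)(U >> 32);
  uint64_t LoBits = 0x4330000000000000ULL | Lo;   // bits of 2^52 + Lo
  uint64_t HiBits = 0x4530000000000000ULL | Hi;   // bits of 2^84 + Hi * 2^32
  double DLo, DHi;
  std::memcpy(&DLo, &LoBits, sizeof(DLo));
  std::memcpy(&DHi, &HiBits, sizeof(DHi));
  DLo -= 0x1.0p52;                                // exact: leaves (double)Lo
  DHi -= 0x1.0p84;                                // exact: leaves Hi * 2^32
  return DHi + DLo;                               // one rounding, as in haddpd
}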
21911
21912/// 32-bit unsigned integer to float expansion.
21913static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21914 const X86Subtarget &Subtarget) {
21915 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21916 SDLoc dl(Op);
21917 // FP constant to bias correct the final result.
21918 SDValue Bias = DAG.getConstantFP(
21919 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21920
21921 // Load the 32-bit value into an XMM register.
21922 SDValue Load =
21923 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21924
21925 // Zero out the upper parts of the register.
21926 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21927
21928 // Or the load with the bias.
21929 SDValue Or = DAG.getNode(
21930 ISD::OR, dl, MVT::v2i64,
21931 DAG.getBitcast(MVT::v2i64, Load),
21932 DAG.getBitcast(MVT::v2i64,
21933 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21934 Or =
21935 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21936 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21937
21938 if (Op.getNode()->isStrictFPOpcode()) {
21939 // Subtract the bias.
21940 // TODO: Are there any fast-math-flags to propagate here?
21941 SDValue Chain = Op.getOperand(0);
21942 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21943 {Chain, Or, Bias});
21944
21945 if (Op.getValueType() == Sub.getValueType())
21946 return Sub;
21947
21948 // Handle final rounding.
21949 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21950 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21951
21952 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21953 }
21954
21955 // Subtract the bias.
21956 // TODO: Are there any fast-math-flags to propagate here?
21957 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21958
21959 // Handle final rounding.
21960 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21961}
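// A minimal standalone sketch (not from X86ISelLowering.cpp): the single-constant
// variant of the same bias trick, matching LowerUINT_TO_FP_i32 above. OR-ing a
// u32 into the mantissa of 2^52 gives exactly 2^52 + value, so subtracting the
// bias leaves the exact double, which is then rounded to the destination type.
#include <cstdint>
#include <cstring>

static double u32ToDouble(uint32_t U) {
  uint64_t Bits = 0x4330000000000000ULL | U;   // bits of 2^52 + U
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                          // exact: every u32 fits in a double
}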
21962
21963static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21964 const X86Subtarget &Subtarget,
21965 const SDLoc &DL) {
21966 if (Op.getSimpleValueType() != MVT::v2f64)
21967 return SDValue();
21968
21969 bool IsStrict = Op->isStrictFPOpcode();
21970
21971 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21972 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21973
21974 if (Subtarget.hasAVX512()) {
21975 if (!Subtarget.hasVLX()) {
21976 // Let generic type legalization widen this.
21977 if (!IsStrict)
21978 return SDValue();
21979 // Otherwise pad the integer input with 0s and widen the operation.
21980 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21981 DAG.getConstant(0, DL, MVT::v2i32));
21982 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21983 {Op.getOperand(0), N0});
21984 SDValue Chain = Res.getValue(1);
21985 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21986 DAG.getIntPtrConstant(0, DL));
21987 return DAG.getMergeValues({Res, Chain}, DL);
21988 }
21989
21990 // Legalize to v4i32 type.
21991 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21992 DAG.getUNDEF(MVT::v2i32));
21993 if (IsStrict)
21994 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21995 {Op.getOperand(0), N0});
21996 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21997 }
21998
21999 // Zero extend to 2i64, OR with the floating point representation of 2^52.
22000 // This gives us the floating point equivalent of 2^52 + the i32 integer
22001 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
22002 // point leaving just our i32 integers in double format.
22003 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
22004 SDValue VBias = DAG.getConstantFP(
22005 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
22006 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
22007 DAG.getBitcast(MVT::v2i64, VBias));
22008 Or = DAG.getBitcast(MVT::v2f64, Or);
22009
22010 if (IsStrict)
22011 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
22012 {Op.getOperand(0), Or, VBias});
22013 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
22014}
22015
22016static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
22017 const X86Subtarget &Subtarget) {
22018 SDLoc DL(Op);
22019 bool IsStrict = Op->isStrictFPOpcode();
22020 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
22021 MVT VecIntVT = V.getSimpleValueType();
22022 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
22023 "Unsupported custom type");
22024
22025 if (Subtarget.hasAVX512()) {
22026 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
22027 assert(!Subtarget.hasVLX() && "Unexpected features");
22028 MVT VT = Op->getSimpleValueType(0);
22029
22030 // v8i32->v8f64 is legal with AVX512 so just return it.
22031 if (VT == MVT::v8f64)
22032 return Op;
22033
22034 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
22035 "Unexpected VT!");
22036 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22037 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22038 // Need to concat with zero vector for strict fp to avoid spurious
22039 // exceptions.
22040 SDValue Tmp =
22041 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
22042 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
22043 DAG.getIntPtrConstant(0, DL));
22044 SDValue Res, Chain;
22045 if (IsStrict) {
22046 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
22047 {Op->getOperand(0), V});
22048 Chain = Res.getValue(1);
22049 } else {
22050 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
22051 }
22052
22053 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
22054 DAG.getIntPtrConstant(0, DL));
22055
22056 if (IsStrict)
22057 return DAG.getMergeValues({Res, Chain}, DL);
22058 return Res;
22059 }
22060
22061 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
22062 Op->getSimpleValueType(0) == MVT::v4f64) {
22063 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
22064 Constant *Bias = ConstantFP::get(
22065 *DAG.getContext(),
22066 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22067 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22068 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22069 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22070 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22071 SDValue VBias = DAG.getMemIntrinsicNode(
22072 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22073 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22074 MachineMemOperand::MOLoad);
22075
22076 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22077 DAG.getBitcast(MVT::v4i64, VBias));
22078 Or = DAG.getBitcast(MVT::v4f64, Or);
22079
22080 if (IsStrict)
22081 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22082 {Op.getOperand(0), Or, VBias});
22083 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22084 }
22085
22086 // The algorithm is the following:
22087 // #ifdef __SSE4_1__
22088 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22089 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22090 // (uint4) 0x53000000, 0xaa);
22091 // #else
22092 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22093 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22094 // #endif
22095 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22096 // return (float4) lo + fhi;
22097
22098 bool Is128 = VecIntVT == MVT::v4i32;
22099 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22100 // If we convert to something other than the supported type, e.g., to v4f64,
22101 // abort early.
22102 if (VecFloatVT != Op->getSimpleValueType(0))
22103 return SDValue();
22104
22105 // In the #ifdef/#else code, we have in common:
22106 // - The vector of constants:
22107 // -- 0x4b000000
22108 // -- 0x53000000
22109 // - A shift:
22110 // -- v >> 16
22111
22112 // Create the splat vector for 0x4b000000.
22113 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22114 // Create the splat vector for 0x53000000.
22115 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22116
22117 // Create the right shift.
22118 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22119 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22120
22121 SDValue Low, High;
22122 if (Subtarget.hasSSE41()) {
22123 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22124 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22125 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22126 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22127 // Low will be bitcasted right away, so do not bother bitcasting back to its
22128 // original type.
22129 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22130 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22131 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22132 // (uint4) 0x53000000, 0xaa);
22133 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22134 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22135 // High will be bitcasted right away, so do not bother bitcasting back to
22136 // its original type.
22137 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22138 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22139 } else {
22140 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22141 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22142 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22143 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22144
22145 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22146 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22147 }
22148
22149 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22150 SDValue VecCstFSub = DAG.getConstantFP(
22151 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22152
22153 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22154 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22155 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22156 // enabled. See PR24512.
22157 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22158 // TODO: Are there any fast-math-flags to propagate here?
22159 // (float4) lo;
22160 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22161 // return (float4) lo + fhi;
22162 if (IsStrict) {
22163 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22164 {Op.getOperand(0), HighBitcast, VecCstFSub});
22165 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22166 {FHigh.getValue(1), LowBitcast, FHigh});
22167 }
22168
22169 SDValue FHigh =
22170 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22171 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22172}
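// A minimal standalone sketch (not from X86ISelLowering.cpp): the scalar form of
// the lo/hi split used above for u32 -> f32 without AVX-512. 0x4b000000 is the
// float 2^23 and 0x53000000 is 2^39, so planting the 16-bit halves in their
// mantissas gives exact floats; one exact subtraction and a final addition
// reassemble the value with a single rounding.
#include <cstdint>
#include <cstring>

static float u32ToFloat(uint32_t V) {
  uint32_t LoBits = (V & 0xffff) | 0x4b000000;   // bits of 2^23 + (V & 0xffff)
  uint32_t HiBits = (V >> 16)    | 0x53000000;   // bits of 2^39 + (V >> 16) * 2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // exact: (V >> 16) * 2^16 - 2^23
  return Lo + FHi;                               // one rounding for the result
}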
22173
22174static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22175 const X86Subtarget &Subtarget) {
22176 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22177 SDValue N0 = Op.getOperand(OpNo);
22178 MVT SrcVT = N0.getSimpleValueType();
22179 SDLoc dl(Op);
22180
22181 switch (SrcVT.SimpleTy) {
22182 default:
22183 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22183)
;
22184 case MVT::v2i32:
22185 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22186 case MVT::v4i32:
22187 case MVT::v8i32:
22188 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22189 case MVT::v2i64:
22190 case MVT::v4i64:
22191 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22192 }
22193}
22194
22195SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22196 SelectionDAG &DAG) const {
22197 bool IsStrict = Op->isStrictFPOpcode();
22198 unsigned OpNo = IsStrict ? 1 : 0;
22199 SDValue Src = Op.getOperand(OpNo);
22200 SDLoc dl(Op);
22201 auto PtrVT = getPointerTy(DAG.getDataLayout());
22202 MVT SrcVT = Src.getSimpleValueType();
22203 MVT DstVT = Op->getSimpleValueType(0);
22204 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22205
22206 // Bail out when we don't have native conversion instructions.
22207 if (DstVT == MVT::f128)
22208 return SDValue();
22209
22210 if (isSoftFP16(DstVT))
22211 return promoteXINT_TO_FP(Op, DAG);
22212 else if (isLegalConversion(SrcVT, false, Subtarget))
22213 return Op;
22214
22215 if (DstVT.isVector())
22216 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22217
22218 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22219 return LowerWin64_INT128_TO_FP(Op, DAG);
22220
22221 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22222 return Extract;
22223
22224 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22225 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22226 // Conversions from unsigned i32 to f32/f64 are legal,
22227 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22228 return Op;
22229 }
22230
22231 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22232 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22233 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22234 if (IsStrict)
22235 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22236 {Chain, Src});
22237 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22238 }
22239
22240 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22241 return V;
22242 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22243 return V;
22244
22245 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22246 // infinity. It produces -0.0, so disable under strictfp.
22247 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22248 !IsStrict)
22249 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22250 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22251 // negative infinity, so disable it under strictfp and use FILD instead.
22252 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22253 !IsStrict)
22254 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22255 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22256 (DstVT == MVT::f32 || DstVT == MVT::f64))
22257 return SDValue();
22258
22259 // Make a 64-bit buffer, and use it to build an FILD.
22260 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22261 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22262 Align SlotAlign(8);
22263 MachinePointerInfo MPI =
22264 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22265 if (SrcVT == MVT::i32) {
22266 SDValue OffsetSlot =
22267 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22268 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22269 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22270 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22271 std::pair<SDValue, SDValue> Tmp =
22272 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22273 if (IsStrict)
22274 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22275
22276 return Tmp.first;
22277 }
22278
22279 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22280 SDValue ValueToStore = Src;
22281 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22282 // Bitcasting to f64 here allows us to do a single 64-bit store from
22283 // an SSE register, avoiding the store forwarding penalty that would come
22284 // with two 32-bit stores.
22285 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22286 }
22287 SDValue Store =
22288 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22289 // For i64 source, we need to add the appropriate power of 2 if the input
22290 // was negative. We must be careful to do the computation in x87 extended
22291 // precision, not in SSE.
22292 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22293 SDValue Ops[] = { Store, StackSlot };
22294 SDValue Fild =
22295 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22296 SlotAlign, MachineMemOperand::MOLoad);
22297 Chain = Fild.getValue(1);
22298
22299
22300 // Check whether the sign bit is set.
22301 SDValue SignSet = DAG.getSetCC(
22302 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22303 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22304
22305 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22306 APInt FF(64, 0x5F80000000000000ULL);
22307 SDValue FudgePtr = DAG.getConstantPool(
22308 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22309 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22310
22311 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22312 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22313 SDValue Four = DAG.getIntPtrConstant(4, dl);
22314 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22315 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22316
22317 // Load the value out, extending it from f32 to f80.
22318 SDValue Fudge = DAG.getExtLoad(
22319 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22320 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22321 CPAlignment);
22322 Chain = Fudge.getValue(1);
22323 // Extend everything to 80 bits to force it to be done on x87.
22324 // TODO: Are there any fast-math-flags to propagate here?
22325 if (IsStrict) {
22326 unsigned Opc = ISD::STRICT_FADD;
22327 // Windows needs the precision control changed to 80 bits around this add.
22328 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22329 Opc = X86ISD::STRICT_FP80_ADD;
22330
22331 SDValue Add =
22332 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22333 // STRICT_FP_ROUND can't handle equal types.
22334 if (DstVT == MVT::f80)
22335 return Add;
22336 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22337 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22338 }
22339 unsigned Opc = ISD::FADD;
22340 // Windows needs the precision control changed to 80 bits around this add.
22341 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22342 Opc = X86ISD::FP80_ADD;
22343
22344 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22345 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22346 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22347}
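// A minimal standalone sketch (not from X86ISelLowering.cpp), assuming long
// double is the x87 80-bit type as on this x86 target: the scalar shape of the
// FILD-plus-fudge path above. FILD reads the 64-bit slot as signed, so when the
// source had its top bit set the result is short by exactly 2^64; adding that
// constant in extended precision before the final rounding restores the value.
#include <cstdint>

static double u64ToDoubleViaSigned(uint64_t U) {
  long double L = (long double)(int64_t)U;   // what FILD produces
  if ((int64_t)U < 0)                        // sign bit was set
    L += 0x1.0p64L;                          // the 2^64 "fudge" constant
  return (double)L;                          // FP_ROUND to the destination type
}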
22348
22349// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22350// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22351// just return an SDValue().
22352// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22353// to i16, i32 or i64, and we lower it to a legal sequence and return the
22354// result.
22355SDValue
22356X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22357 bool IsSigned, SDValue &Chain) const {
22358 bool IsStrict = Op->isStrictFPOpcode();
22359 SDLoc DL(Op);
22360
22361 EVT DstTy = Op.getValueType();
22362 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22363 EVT TheVT = Value.getValueType();
22364 auto PtrVT = getPointerTy(DAG.getDataLayout());
22365
22366 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22367 // f16 must be promoted before using the lowering in this routine.
22368 // fp128 does not use this lowering.
22369 return SDValue();
22370 }
22371
22372 // If using FIST to compute an unsigned i64, we'll need some fixup
22373 // to handle values above the maximum signed i64. A FIST is always
22374 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22375 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22376
22377 // FIXME: This does not generate an invalid exception if the input does not
22378 // fit in i32. PR44019
22379 if (!IsSigned && DstTy != MVT::i64) {
22380 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22381 // The low 32 bits of the fist result will have the correct uint32 result.
22382 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22383 DstTy = MVT::i64;
22384 }
22385
22386 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22387 DstTy.getSimpleVT() >= MVT::i16 &&
22388 "Unknown FP_TO_INT to lower!");
22389
22390 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22391 // stack slot.
22392 MachineFunction &MF = DAG.getMachineFunction();
22393 unsigned MemSize = DstTy.getStoreSize();
22394 int SSFI =
22395 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22396 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22397
22398 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22399
22400 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22401
22402 if (UnsignedFixup) {
22403 //
22404 // Conversion to unsigned i64 is implemented with a select,
22405 // depending on whether the source value fits in the range
22406 // of a signed i64. Let Thresh be the FP equivalent of
22407 // 0x8000000000000000ULL.
22408 //
22409 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22410 // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
22411 // FistSrc = (Value - FltOfs);
22412 // Fist-to-mem64 FistSrc
22413 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22414 // to XOR'ing the high 32 bits with Adjust.
22415 //
22416 // Being a power of 2, Thresh is exactly representable in all FP formats.
22417 // For X87 we'd like to use the smallest FP type for this constant, but
22418 // for DAG type consistency we have to match the FP operand type.
22419
22420 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22421 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22422 bool LosesInfo = false;
22423 if (TheVT == MVT::f64)
22424 // The rounding mode is irrelevant as the conversion should be exact.
22425 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22426 &LosesInfo);
22427 else if (TheVT == MVT::f80)
22428 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22429 APFloat::rmNearestTiesToEven, &LosesInfo);
22430
22431 assert(Status == APFloat::opOK && !LosesInfo &&
22432 "FP conversion should have been exact");
22433
22434 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22435
22436 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22437 *DAG.getContext(), TheVT);
22438 SDValue Cmp;
22439 if (IsStrict) {
22440 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22441 /*IsSignaling*/ true);
22442 Chain = Cmp.getValue(1);
22443 } else {
22444 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22445 }
22446
22447 // Our preferred lowering of
22448 //
22449 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22450 //
22451 // is
22452 //
22453 // (Value >= Thresh) << 63
22454 //
22455 // but since we can get here after LegalOperations, DAGCombine might do the
22456 // wrong thing if we create a select. So, directly create the preferred
22457 // version.
22458 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22459 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22460 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22461
22462 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22463 DAG.getConstantFP(0.0, DL, TheVT));
22464
22465 if (IsStrict) {
22466 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22467 { Chain, Value, FltOfs });
22468 Chain = Value.getValue(1);
22469 } else
22470 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22471 }
22472
22473 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22474
22475 // FIXME This causes a redundant load/store if the SSE-class value is already
22476 // in memory, such as if it is on the callstack.
22477 if (isScalarFPTypeInSSEReg(TheVT)) {
22478 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22479 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22480 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22481 SDValue Ops[] = { Chain, StackSlot };
22482
22483 unsigned FLDSize = TheVT.getStoreSize();
22484 assert(FLDSize <= MemSize && "Stack slot not big enough");
22485 MachineMemOperand *MMO = MF.getMachineMemOperand(
22486 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22487 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22488 Chain = Value.getValue(1);
22489 }
22490
22491 // Build the FP_TO_INT*_IN_MEM
22492 MachineMemOperand *MMO = MF.getMachineMemOperand(
22493 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22494 SDValue Ops[] = { Chain, Value, StackSlot };
22495 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22496 DAG.getVTList(MVT::Other),
22497 Ops, DstTy, MMO);
22498
22499 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22500 Chain = Res.getValue(1);
22501
22502 // If we need an unsigned fixup, XOR the result with adjust.
22503 if (UnsignedFixup)
22504 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
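// Worked example (illustrative, assuming Thresh == 2^63 for a 64-bit
// unsigned conversion): for an input of 2^63 + 4096.0 the compare is true,
// so FltOfs == Thresh and the value handed to FP_TO_INT_IN_MEM is 4096.0;
// the signed conversion yields 4096, whose sign bit is clear, so XORing
// with Adjust == 0x8000000000000000 adds 2^63 back and gives 2^63 + 4096.
// In-range inputs get Adjust == 0 and pass through unchanged.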
22505
22506 return Res;
22507}
22508
22509static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22510 const X86Subtarget &Subtarget) {
22511 MVT VT = Op.getSimpleValueType();
22512 SDValue In = Op.getOperand(0);
22513 MVT InVT = In.getSimpleValueType();
22514 SDLoc dl(Op);
22515 unsigned Opc = Op.getOpcode();
22516
22517   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22518   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22519          "Unexpected extension opcode");
22520   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22521          "Expected same number of elements");
22522   assert((VT.getVectorElementType() == MVT::i16 ||
22523           VT.getVectorElementType() == MVT::i32 ||
22524           VT.getVectorElementType() == MVT::i64) &&
22525          "Unexpected element type");
22526   assert((InVT.getVectorElementType() == MVT::i8 ||
22527           InVT.getVectorElementType() == MVT::i16 ||
22528           InVT.getVectorElementType() == MVT::i32) &&
22529          "Unexpected element type");
22530
22531 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22532
22533 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22534     assert(InVT == MVT::v32i8 && "Unexpected VT!");
22535 return splitVectorIntUnary(Op, DAG);
22536 }
22537
22538 if (Subtarget.hasInt256())
22539 return Op;
22540
22541 // Optimize vectors in AVX mode:
22542 //
22543 // v8i16 -> v8i32
22544 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22545 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22546 // Concat upper and lower parts.
22547 //
22548 // v4i32 -> v4i64
22549 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22550 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22551 // Concat upper and lower parts.
22552 //
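// Illustration of the zero-extend case (little-endian lanes): for a v8i16
// input {e0..e7}, OpLo zero-extends the low half in-register, giving
// {e0,e1,e2,e3} as i32. OpHi interleaves the high half with zero via
// vpunpckhwd, producing the words {e4,0,e5,0,e6,0,e7,0}, which reread as
// v4i32 is {e4,e5,e6,e7} zero-extended. The concat forms the v8i32 result.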
22553 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22554 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22555
22556 // Short-circuit if we can determine that each 128-bit half is the same value.
22557 // Otherwise, this is difficult to match and optimize.
22558 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22559 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22560 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22561
22562 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22563 SDValue Undef = DAG.getUNDEF(InVT);
22564 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22565 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22566 OpHi = DAG.getBitcast(HalfVT, OpHi);
22567
22568 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22569}
22570
22571// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22572static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22573 const SDLoc &dl, SelectionDAG &DAG) {
22574   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22575 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22576 DAG.getIntPtrConstant(0, dl));
22577 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22578 DAG.getIntPtrConstant(8, dl));
22579 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22580 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22581 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22582 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22583}
22584
22585static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22586 const X86Subtarget &Subtarget,
22587 SelectionDAG &DAG) {
22588 MVT VT = Op->getSimpleValueType(0);
22589 SDValue In = Op->getOperand(0);
22590 MVT InVT = In.getSimpleValueType();
22591   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22592 SDLoc DL(Op);
22593 unsigned NumElts = VT.getVectorNumElements();
22594
22595   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22596   // avoids a constant pool load.
22597 if (VT.getVectorElementType() != MVT::i8) {
22598 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22599 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22600 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22601 }
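// Why this works: sign-extending an i1 yields 0 or all-ones, and the
// logical shift right by (scalar size - 1) collapses that to 0 or 1, i.e.
// the zero-extended mask bit, without materializing a constant-pool vector.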
22602
22603 // Extend VT if BWI is not supported.
22604 MVT ExtVT = VT;
22605 if (!Subtarget.hasBWI()) {
22606 // If v16i32 is to be avoided, we'll need to split and concatenate.
22607 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22608 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22609
22610 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22611 }
22612
22613 // Widen to 512-bits if VLX is not supported.
22614 MVT WideVT = ExtVT;
22615 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22616 NumElts *= 512 / ExtVT.getSizeInBits();
22617 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22618 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22619 In, DAG.getIntPtrConstant(0, DL));
22620 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22621 NumElts);
22622 }
22623
22624 SDValue One = DAG.getConstant(1, DL, WideVT);
22625 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22626
22627 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22628
22629 // Truncate if we had to extend above.
22630 if (VT != ExtVT) {
22631 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22632 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22633 }
22634
22635 // Extract back to 128/256-bit if we widened.
22636 if (WideVT != VT)
22637 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22638 DAG.getIntPtrConstant(0, DL));
22639
22640 return SelectedVal;
22641}
22642
22643static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22644 SelectionDAG &DAG) {
22645 SDValue In = Op.getOperand(0);
22646 MVT SVT = In.getSimpleValueType();
22647
22648 if (SVT.getVectorElementType() == MVT::i1)
22649 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22650
22651   assert(Subtarget.hasAVX() && "Expected AVX support");
22652 return LowerAVXExtend(Op, DAG, Subtarget);
22653}
22654
22655/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22656/// It makes use of the fact that vectors with enough leading sign/zero bits
22657/// prevent the PACKSS/PACKUS from saturating the results.
22658/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22659/// within each 128-bit lane.
22660static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22661 const SDLoc &DL, SelectionDAG &DAG,
22662 const X86Subtarget &Subtarget) {
22663   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22664          "Unexpected PACK opcode");
22665   assert(DstVT.isVector() && "VT not a vector?");
22666
22667 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22668 if (!Subtarget.hasSSE2())
22669 return SDValue();
22670
22671 EVT SrcVT = In.getValueType();
22672
22673 // No truncation required, we might get here due to recursive calls.
22674 if (SrcVT == DstVT)
22675 return In;
22676
22677 // We only support vector truncation to 64bits or greater from a
22678 // 128bits or greater source.
22679 unsigned DstSizeInBits = DstVT.getSizeInBits();
22680 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22681 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22682 return SDValue();
22683
22684 unsigned NumElems = SrcVT.getVectorNumElements();
22685 if (!isPowerOf2_32(NumElems))
22686 return SDValue();
22687
22688 LLVMContext &Ctx = *DAG.getContext();
22689   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22690   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22691
22692 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22693
22694 // Pack to the largest type possible:
22695 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22696 EVT InVT = MVT::i16, OutVT = MVT::i8;
22697 if (SrcVT.getScalarSizeInBits() > 16 &&
22698 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22699 InVT = MVT::i32;
22700 OutVT = MVT::i16;
22701 }
22702
22703 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22704 if (SrcVT.is128BitVector()) {
22705 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22706 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22707 In = DAG.getBitcast(InVT, In);
22708 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22709 Res = extractSubVector(Res, 0, DAG, DL, 64);
22710 return DAG.getBitcast(DstVT, Res);
22711 }
22712
22713 // Split lower/upper subvectors.
22714 SDValue Lo, Hi;
22715 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22716
22717 unsigned SubSizeInBits = SrcSizeInBits / 2;
22718 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22719 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22720
22721 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22722 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22723 Lo = DAG.getBitcast(InVT, Lo);
22724 Hi = DAG.getBitcast(InVT, Hi);
22725 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22726 return DAG.getBitcast(DstVT, Res);
22727 }
22728
22729 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22730 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22731 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22732 Lo = DAG.getBitcast(InVT, Lo);
22733 Hi = DAG.getBitcast(InVT, Hi);
22734 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22735
22736 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22737 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22738 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22739 SmallVector<int, 64> Mask;
22740 int Scale = 64 / OutVT.getScalarSizeInBits();
22741 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22742 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
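// For example (illustrative): with vXi16 output elements, Scale == 4 and
// the scaled mask is {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}, i.e. the
// base 64-bit-chunk mask {0,2,1,3} rewritten in terms of i16 elements.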
22743
22744 if (DstVT.is256BitVector())
22745 return DAG.getBitcast(DstVT, Res);
22746
22747 // If 512bit -> 128bit truncate another stage.
22748 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22749 Res = DAG.getBitcast(PackedVT, Res);
22750 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22751 }
22752
22753 // Recursively pack lower/upper subvectors, concat result and pack again.
22754   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22755 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22756 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22757 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22758
22759 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22760 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22761 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22762}
22763
22764static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22765 const X86Subtarget &Subtarget) {
22766
22767 SDLoc DL(Op);
22768 MVT VT = Op.getSimpleValueType();
22769 SDValue In = Op.getOperand(0);
22770 MVT InVT = In.getSimpleValueType();
22771
22772   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22773
22774 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22775 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22776 if (InVT.getScalarSizeInBits() <= 16) {
22777 if (Subtarget.hasBWI()) {
22778 // legal, will go to VPMOVB2M, VPMOVW2M
22779 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22780 // We need to shift to get the lsb into sign position.
22781 // Shift packed bytes not supported natively, bitcast to word
22782 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22783 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22784 DAG.getBitcast(ExtVT, In),
22785 DAG.getConstant(ShiftInx, DL, ExtVT));
22786 In = DAG.getBitcast(InVT, In);
22787 }
22788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22789 In, ISD::SETGT);
22790 }
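// The setcc above (zero on the left, SETGT) is true exactly when In is
// negative, i.e. when the original low bit, now in the sign position, was
// set; this is the pattern isel matches to VPMOVB2M/VPMOVW2M.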
22791 // Use TESTD/Q, extended vector to packed dword/qword.
22792     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22793            "Unexpected vector type.");
22794 unsigned NumElts = InVT.getVectorNumElements();
22795     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22796 // We need to change to a wider element type that we have support for.
22798     // For 8 element vectors this is easy: we either extend to v8i32 or v8i64.
22798 // For 16 element vectors we extend to v16i32 unless we are explicitly
22799 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22800 // we need to split into two 8 element vectors which we can extend to v8i32,
22801 // truncate and concat the results. There's an additional complication if
22802 // the original type is v16i8. In that case we can't split the v16i8
22803 // directly, so we need to shuffle high elements to low and use
22804 // sign_extend_vector_inreg.
22805 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22806 SDValue Lo, Hi;
22807 if (InVT == MVT::v16i8) {
22808 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22809 Hi = DAG.getVectorShuffle(
22810 InVT, DL, In, In,
22811 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22812 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22813 } else {
22814         assert(InVT == MVT::v16i16 && "Unexpected VT!");
22815 Lo = extract128BitVector(In, 0, DAG, DL);
22816 Hi = extract128BitVector(In, 8, DAG, DL);
22817 }
22818 // We're split now, just emit two truncates and a concat. The two
22819 // truncates will trigger legalization to come back to this function.
22820 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22821 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22822 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22823 }
22824 // We either have 8 elements or we're allowed to use 512-bit vectors.
22825 // If we have VLX, we want to use the narrowest vector that can get the
22826 // job done so we use vXi32.
22827 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22828 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22829 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22830 InVT = ExtVT;
22831 ShiftInx = InVT.getScalarSizeInBits() - 1;
22832 }
22833
22834 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22835 // We need to shift to get the lsb into sign position.
22836 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22837 DAG.getConstant(ShiftInx, DL, InVT));
22838 }
22839 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22840 if (Subtarget.hasDQI())
22841 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22842 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22843}
22844
22845SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22846 SDLoc DL(Op);
22847 MVT VT = Op.getSimpleValueType();
22848 SDValue In = Op.getOperand(0);
22849 MVT InVT = In.getSimpleValueType();
22850 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22851
22852   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22853          "Invalid TRUNCATE operation");
22854
22855 // If we're called by the type legalizer, handle a few cases.
22856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22857 if (!TLI.isTypeLegal(InVT)) {
22858 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22859 VT.is128BitVector()) {
22860       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22861              "Unexpected subtarget!");
22862 // The default behavior is to truncate one step, concatenate, and then
22863 // truncate the remainder. We'd rather produce two 64-bit results and
22864 // concatenate those.
22865 SDValue Lo, Hi;
22866 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22867
22868 EVT LoVT, HiVT;
22869 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22870
22871 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22872 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22873 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22874 }
22875
22876 // Otherwise let default legalization handle it.
22877 return SDValue();
22878 }
22879
22880 if (VT.getVectorElementType() == MVT::i1)
22881 return LowerTruncateVecI1(Op, DAG, Subtarget);
22882
22883 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22884 if (Subtarget.hasAVX512()) {
22885 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22886       assert(VT == MVT::v32i8 && "Unexpected VT!");
22887 return splitVectorIntUnary(Op, DAG);
22888 }
22889
22890     // word to byte only under BWI. Otherwise we have to promote to v16i32
22891 // and then truncate that. But we should only do that if we haven't been
22892 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22893 // handled by isel patterns.
22894 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22895 Subtarget.canExtendTo512DQ())
22896 return Op;
22897 }
22898
22899 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22900 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22901
22902 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22903 // that extend all the way to the packed/truncated value.
22904 // Pre-SSE41 we can only use PACKUSWB.
22905 KnownBits Known = DAG.computeKnownBits(In);
22906 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22907 if (SDValue V =
22908 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22909 return V;
22910
22911 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22912 // extend all the way to the packed/truncated value.
22913 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22914 if (SDValue V =
22915 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22916 return V;
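// Worked example (illustrative): for a v8i32 -> v8i16 truncate,
// InNumEltBits is 32 and NumPackedSignBits is 16, so the PACKSS path is
// taken only when ComputeNumSignBits reports more than 16 sign bits, i.e.
// every element already fits in i16 and PACKSSDW cannot saturate. The
// PACKUS path is analogous but needs the top bits known zero: 16 of them
// with SSE4.1 PACKUSDW, 24 in the pre-SSE4.1 PACKUSWB-only case.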
22917
22918 // Handle truncation of V256 to V128 using shuffles.
22919   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22920
22921 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22922 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22923 if (Subtarget.hasInt256()) {
22924 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22925 In = DAG.getBitcast(MVT::v8i32, In);
22926 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22928 DAG.getIntPtrConstant(0, DL));
22929 }
22930
22931 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22932 DAG.getIntPtrConstant(0, DL));
22933 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22934 DAG.getIntPtrConstant(2, DL));
22935 static const int ShufMask[] = {0, 2, 4, 6};
22936 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22937 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22938 }
22939
22940 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22941 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22942 if (Subtarget.hasInt256()) {
22943 // The PSHUFB mask:
22944 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22945 -1, -1, -1, -1, -1, -1, -1, -1,
22946 16, 17, 20, 21, 24, 25, 28, 29,
22947 -1, -1, -1, -1, -1, -1, -1, -1 };
22948 In = DAG.getBitcast(MVT::v32i8, In);
22949 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22950 In = DAG.getBitcast(MVT::v4i64, In);
22951
22952 static const int ShufMask2[] = {0, 2, -1, -1};
22953 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22954 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22955 DAG.getIntPtrConstant(0, DL));
22956 return DAG.getBitcast(MVT::v8i16, In);
22957 }
22958
22959 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22960 DAG.getIntPtrConstant(0, DL));
22961 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22962 DAG.getIntPtrConstant(4, DL));
22963
22964 // The PSHUFB mask:
22965 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22966
22967 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22968 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22969
22970 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22971 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22972
22973 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22974 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22975
22976 // The MOVLHPS Mask:
22977 static const int ShufMask2[] = {0, 1, 4, 5};
22978 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22979 return DAG.getBitcast(MVT::v8i16, res);
22980 }
22981
22982 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22983     // Use an AND to zero the upper bits for PACKUS.
22984 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22985
22986 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22987 DAG.getIntPtrConstant(0, DL));
22988 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22989 DAG.getIntPtrConstant(8, DL));
22990 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22991 }
22992
22993 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22993)
;
22994}
22995
22996// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22997// behaves on out of range inputs to generate optimized conversions.
22998static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22999 SelectionDAG &DAG,
23000 const X86Subtarget &Subtarget) {
23001 MVT SrcVT = Src.getSimpleValueType();
23002 unsigned DstBits = VT.getScalarSizeInBits();
23003   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
23004
23005 // Calculate the converted result for values in the range 0 to
23006 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23007 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
23008 SDValue Big =
23009 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
23010 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
23011 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
23012
23013 // The "CVTTP2SI" instruction conveniently sets the sign bit if
23014 // and only if the value was out of range. So we can use that
23015   // as our indicator of whether to use "Big" instead of "Small".
23016 //
23017 // Use "Small" if "IsOverflown" has all bits cleared
23018 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23019
23020 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
23021 // use the slightly slower blendv select instead.
23022 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
23023 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
23024 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
23025 }
23026
23027 SDValue IsOverflown =
23028 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
23029 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
23030 return DAG.getNode(ISD::OR, dl, VT, Small,
23031 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23032}
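// Worked example for expandFP_TO_UINT_SSE above (illustrative, v4f32 ->
// v4i32 without AVX512): for a lane holding 3.0e9f, Small overflows and
// cvttps2dq returns the "integer indefinite" value 0x80000000, so the
// arithmetic shift makes IsOverflown all-ones and the result is
// 0x80000000 | cvttps2dq(3.0e9f - 2^31) = 0x80000000 | 852516352
// = 3000000000. For in-range lanes the sign bit of Small is clear,
// IsOverflown is zero, and Small is returned unchanged.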
23033
23034SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
23035 bool IsStrict = Op->isStrictFPOpcode();
23036 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
23037 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
23038 MVT VT = Op->getSimpleValueType(0);
23039 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23040 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
23041 MVT SrcVT = Src.getSimpleValueType();
23042 SDLoc dl(Op);
23043
23044 SDValue Res;
23045 if (isSoftFP16(SrcVT)) {
23046 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
23047 if (IsStrict)
23048 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23049 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
23050 {NVT, MVT::Other}, {Chain, Src})});
23051 return DAG.getNode(Op.getOpcode(), dl, VT,
23052 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
23053 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
23054 return Op;
23055 }
23056
23057 if (VT.isVector()) {
23058 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
23059 MVT ResVT = MVT::v4i32;
23060 MVT TruncVT = MVT::v4i1;
23061 unsigned Opc;
23062 if (IsStrict)
23063 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
23064 else
23065 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23066
23067 if (!IsSigned && !Subtarget.hasVLX()) {
23068         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23069 // Widen to 512-bits.
23070 ResVT = MVT::v8i32;
23071 TruncVT = MVT::v8i1;
23072 Opc = Op.getOpcode();
23073 // Need to concat with zero vector for strict fp to avoid spurious
23074 // exceptions.
23075 // TODO: Should we just do this for non-strict as well?
23076 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23077 : DAG.getUNDEF(MVT::v8f64);
23078 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23079 DAG.getIntPtrConstant(0, dl));
23080 }
23081 if (IsStrict) {
23082 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23083 Chain = Res.getValue(1);
23084 } else {
23085 Res = DAG.getNode(Opc, dl, ResVT, Src);
23086 }
23087
23088 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23089 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23090 DAG.getIntPtrConstant(0, dl));
23091 if (IsStrict)
23092 return DAG.getMergeValues({Res, Chain}, dl);
23093 return Res;
23094 }
23095
23096 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23097 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23098 return Op;
23099
23100 MVT ResVT = VT;
23101 MVT EleVT = VT.getVectorElementType();
23102 if (EleVT != MVT::i64)
23103 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23104
23105 if (SrcVT != MVT::v8f16) {
23106 SDValue Tmp =
23107 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23108 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23109 Ops[0] = Src;
23110 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23111 }
23112
23113 if (IsStrict) {
23114 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23115 : X86ISD::STRICT_CVTTP2UI,
23116 dl, {ResVT, MVT::Other}, {Chain, Src});
23117 Chain = Res.getValue(1);
23118 } else {
23119 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23120 ResVT, Src);
23121 }
23122
23123 // TODO: Need to add exception check code for strict FP.
23124 if (EleVT.getSizeInBits() < 16) {
23125 ResVT = MVT::getVectorVT(EleVT, 8);
23126 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23127 }
23128
23129 if (ResVT != VT)
23130 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23131 DAG.getIntPtrConstant(0, dl));
23132
23133 if (IsStrict)
23134 return DAG.getMergeValues({Res, Chain}, dl);
23135 return Res;
23136 }
23137
23138 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23139 if (VT.getVectorElementType() == MVT::i16) {
23140       assert((SrcVT.getVectorElementType() == MVT::f32 ||
23141               SrcVT.getVectorElementType() == MVT::f64) &&
23142              "Expected f32/f64 vector!");
23143 MVT NVT = VT.changeVectorElementType(MVT::i32);
23144 if (IsStrict) {
23145 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23146 : ISD::STRICT_FP_TO_UINT,
23147 dl, {NVT, MVT::Other}, {Chain, Src});
23148 Chain = Res.getValue(1);
23149 } else {
23150 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23151 NVT, Src);
23152 }
23153
23154 // TODO: Need to add exception check code for strict FP.
23155 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23156
23157 if (IsStrict)
23158 return DAG.getMergeValues({Res, Chain}, dl);
23159 return Res;
23160 }
23161
23162 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23163 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23164       assert(!IsSigned && "Expected unsigned conversion!");
23165       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23166 return Op;
23167 }
23168
23169 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23170 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23171 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23172 Subtarget.useAVX512Regs()) {
23173       assert(!IsSigned && "Expected unsigned conversion!");
23174       assert(!Subtarget.hasVLX() && "Unexpected features!");
23175 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23176 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23177 // Need to concat with zero vector for strict fp to avoid spurious
23178 // exceptions.
23179 // TODO: Should we just do this for non-strict as well?
23180 SDValue Tmp =
23181 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23182 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23183 DAG.getIntPtrConstant(0, dl));
23184
23185 if (IsStrict) {
23186 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23187 {Chain, Src});
23188 Chain = Res.getValue(1);
23189 } else {
23190 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23191 }
23192
23193 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23194 DAG.getIntPtrConstant(0, dl));
23195
23196 if (IsStrict)
23197 return DAG.getMergeValues({Res, Chain}, dl);
23198 return Res;
23199 }
23200
23201 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23202 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23203 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23204 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23205       assert(!Subtarget.hasVLX() && "Unexpected features!");
23206 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23207 // Need to concat with zero vector for strict fp to avoid spurious
23208 // exceptions.
23209 // TODO: Should we just do this for non-strict as well?
23210 SDValue Tmp =
23211 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23212 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23213 DAG.getIntPtrConstant(0, dl));
23214
23215 if (IsStrict) {
23216 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23217 {Chain, Src});
23218 Chain = Res.getValue(1);
23219 } else {
23220 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23221 }
23222
23223 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23224 DAG.getIntPtrConstant(0, dl));
23225
23226 if (IsStrict)
23227 return DAG.getMergeValues({Res, Chain}, dl);
23228 return Res;
23229 }
23230
23231 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23232 if (!Subtarget.hasVLX()) {
23233         // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
23234         // type legalizer and then widened again by vector op legalization.
23235 if (!IsStrict)
23236 return SDValue();
23237
23238 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23239 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23240 {Src, Zero, Zero, Zero});
23241 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23242 {Chain, Tmp});
23243 SDValue Chain = Tmp.getValue(1);
23244 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23245 DAG.getIntPtrConstant(0, dl));
23246 return DAG.getMergeValues({Tmp, Chain}, dl);
23247 }
23248
23249       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23250 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23251 DAG.getUNDEF(MVT::v2f32));
23252 if (IsStrict) {
23253 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23254 : X86ISD::STRICT_CVTTP2UI;
23255 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23256 }
23257 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23258 return DAG.getNode(Opc, dl, VT, Tmp);
23259 }
23260
23261 // Generate optimized instructions for pre AVX512 unsigned conversions from
23262 // vXf32 to vXi32.
23263 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23264 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23265 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23266       assert(!IsSigned && "Expected unsigned conversion!");
23267 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23268 }
23269
23270 return SDValue();
23271 }
23272
23273   assert(!VT.isVector());
23274
23275 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23276
23277 if (!IsSigned && UseSSEReg) {
23278 // Conversions from f32/f64 with AVX512 should be legal.
23279 if (Subtarget.hasAVX512())
23280 return Op;
23281
23282 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23283 // behaves on out of range inputs to generate optimized conversions.
23284 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23285 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23286 unsigned DstBits = VT.getScalarSizeInBits();
23287 APInt UIntLimit = APInt::getSignMask(DstBits);
23288 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23289 DAG.getConstant(UIntLimit, dl, VT));
23290 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23291
23292 // Calculate the converted result for values in the range:
23293 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23294 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23295 SDValue Small =
23296 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23297 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23298 SDValue Big = DAG.getNode(
23299 X86ISD::CVTTS2SI, dl, VT,
23300 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23301 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23302
23303 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23304 // and only if the value was out of range. So we can use that
23305       // as our indicator of whether to use "Big" instead of "Small".
23306 //
23307 // Use "Small" if "IsOverflown" has all bits cleared
23308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23309 SDValue IsOverflown = DAG.getNode(
23310 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23311 return DAG.getNode(ISD::OR, dl, VT, Small,
23312 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23313 }
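// This mirrors expandFP_TO_UINT_SSE in scalar form: for i32, an input x in
// [2^31, 2^32) makes cvttss2si/cvttsd2si return 0x80000000, the shift
// yields all-ones, and 0x80000000 | cvtt(x - 2^31) reconstructs x; the i64
// case is analogous with 2^63.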
23314
23315 // Use default expansion for i64.
23316 if (VT == MVT::i64)
23317 return SDValue();
23318
23319     assert(VT == MVT::i32 && "Unexpected VT!");
23320
23321 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23322 // FIXME: This does not generate an invalid exception if the input does not
23323 // fit in i32. PR44019
23324 if (Subtarget.is64Bit()) {
23325 if (IsStrict) {
23326 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23327 {Chain, Src});
23328 Chain = Res.getValue(1);
23329 } else
23330 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23331
23332 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23333 if (IsStrict)
23334 return DAG.getMergeValues({Res, Chain}, dl);
23335 return Res;
23336 }
23337
23338 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23339 // use fisttp which will be handled later.
23340 if (!Subtarget.hasSSE3())
23341 return SDValue();
23342 }
23343
23344 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23345 // FIXME: This does not generate an invalid exception if the input does not
23346 // fit in i16. PR44019
23347 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23348     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23349 if (IsStrict) {
23350 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23351 {Chain, Src});
23352 Chain = Res.getValue(1);
23353 } else
23354 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23355
23356 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23357 if (IsStrict)
23358 return DAG.getMergeValues({Res, Chain}, dl);
23359 return Res;
23360 }
23361
23362 // If this is a FP_TO_SINT using SSEReg we're done.
23363 if (UseSSEReg && IsSigned)
23364 return Op;
23365
23366 // fp128 needs to use a libcall.
23367 if (SrcVT == MVT::f128) {
23368 RTLIB::Libcall LC;
23369 if (IsSigned)
23370 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23371 else
23372 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23373
23374 MakeLibCallOptions CallOptions;
23375 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23376 SDLoc(Op), Chain);
23377
23378 if (IsStrict)
23379 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23380
23381 return Tmp.first;
23382 }
23383
23384 // Fall back to X87.
23385 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23386 if (IsStrict)
23387 return DAG.getMergeValues({V, Chain}, dl);
23388 return V;
23389 }
23390
23391 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23391)
;
23392}
23393
23394SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23395 SelectionDAG &DAG) const {
23396 SDValue Src = Op.getOperand(0);
23397 MVT SrcVT = Src.getSimpleValueType();
23398
23399 if (SrcVT == MVT::f16)
23400 return SDValue();
23401
23402 // If the source is in an SSE register, the node is Legal.
23403 if (isScalarFPTypeInSSEReg(SrcVT))
23404 return Op;
23405
23406 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23407}
23408
23409SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23410 SelectionDAG &DAG) const {
23411 EVT DstVT = N->getValueType(0);
23412 SDValue Src = N->getOperand(0);
23413 EVT SrcVT = Src.getValueType();
23414
23415 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23416 // f16 must be promoted before using the lowering in this routine.
23417 // fp128 does not use this lowering.
23418 return SDValue();
23419 }
23420
23421 SDLoc DL(N);
23422 SDValue Chain = DAG.getEntryNode();
23423
23424 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23425
23426 // If we're converting from SSE, the stack slot needs to hold both types.
23427 // Otherwise it only needs to hold the DstVT.
23428 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23429 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23430 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23431 MachinePointerInfo MPI =
23432 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23433
23434 if (UseSSE) {
23435     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23436 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23437 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23438 SDValue Ops[] = { Chain, StackPtr };
23439
23440 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23441 /*Align*/ std::nullopt,
23442 MachineMemOperand::MOLoad);
23443 Chain = Src.getValue(1);
23444 }
23445
23446 SDValue StoreOps[] = { Chain, Src, StackPtr };
23447 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23448 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23449 MachineMemOperand::MOStore);
23450
23451 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23452}
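// Flow of the helper above: when the source is in an SSE register it is
// spilled and reloaded through an x87 FLD, then X86ISD::FIST stores the
// value as an integer using the current x87 rounding mode (matching
// lrint/llrint semantics), and the result is loaded back from the slot.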
23453
23454SDValue
23455X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23456 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23457 // but making use of X86 specifics to produce better instruction sequences.
23458 SDNode *Node = Op.getNode();
23459 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23460 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23461 SDLoc dl(SDValue(Node, 0));
23462 SDValue Src = Node->getOperand(0);
23463
23464 // There are three types involved here: SrcVT is the source floating point
23465 // type, DstVT is the type of the result, and TmpVT is the result of the
23466 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23467 // DstVT).
23468 EVT SrcVT = Src.getValueType();
23469 EVT DstVT = Node->getValueType(0);
23470 EVT TmpVT = DstVT;
23471
23472 // This code is only for floats and doubles. Fall back to generic code for
23473 // anything else.
23474 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23475 return SDValue();
23476
23477 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23478 unsigned SatWidth = SatVT.getScalarSizeInBits();
23479 unsigned DstWidth = DstVT.getScalarSizeInBits();
23480 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23481   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23482          "Expected saturation width smaller than result width");
23483
23484 // Promote result of FP_TO_*INT to at least 32 bits.
23485 if (TmpWidth < 32) {
23486 TmpVT = MVT::i32;
23487 TmpWidth = 32;
23488 }
23489
23490  // Promote unsigned 32-bit conversions to 64-bit, so that we can use a
23491  // native signed conversion instead.
23492 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23493 TmpVT = MVT::i64;
23494 TmpWidth = 64;
23495 }
23496
23497 // If the saturation width is smaller than the size of the temporary result,
23498 // we can always use signed conversion, which is native.
23499 if (SatWidth < TmpWidth)
23500 FpToIntOpcode = ISD::FP_TO_SINT;
23501
23502 // Determine minimum and maximum integer values and their corresponding
23503 // floating-point values.
23504 APInt MinInt, MaxInt;
23505 if (IsSigned) {
23506 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23507 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23508 } else {
23509 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23510 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23511 }
23512
23513 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23514 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23515
23516 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23517 MinInt, IsSigned, APFloat::rmTowardZero);
23518 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23519 MaxInt, IsSigned, APFloat::rmTowardZero);
23520 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23521 && !(MaxStatus & APFloat::opStatus::opInexact);
23522
23523 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23524 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23525
23526 // If the integer bounds are exactly representable as floats, emit a
23527 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23528 if (AreExactFloatBounds) {
23529 if (DstVT != TmpVT) {
23530 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23531 SDValue MinClamped = DAG.getNode(
23532 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23533 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23534 SDValue BothClamped = DAG.getNode(
23535 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23536 // Convert clamped value to integer.
23537 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23538
23539 // NaN will become INDVAL, with the top bit set and the rest zero.
23540 // Truncation will discard the top bit, resulting in zero.
23541 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23542 }
23543
23544 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23545 SDValue MinClamped = DAG.getNode(
23546 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23547 // Clamp by MaxFloat from above. NaN cannot occur.
23548 SDValue BothClamped = DAG.getNode(
23549 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23550 // Convert clamped value to integer.
23551 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23552
23553 if (!IsSigned) {
23554 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23555 // which is zero.
23556 return FpToInt;
23557 }
23558
23559 // Otherwise, select zero if Src is NaN.
23560 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23561 return DAG.getSelectCC(
23562 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23563 }
23564
23565 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23566 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23567
23568 // Result of direct conversion, which may be selected away.
23569 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23570
23571 if (DstVT != TmpVT) {
23572 // NaN will become INDVAL, with the top bit set and the rest zero.
23573 // Truncation will discard the top bit, resulting in zero.
23574 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23575 }
23576
23577 SDValue Select = FpToInt;
23578 // For signed conversions where we saturate to the same size as the
23579 // result type of the fptoi instructions, INDVAL coincides with integer
23580 // minimum, so we don't need to explicitly check it.
23581 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23582 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23583 // MinInt if Src is NaN.
23584 Select = DAG.getSelectCC(
23585 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23586 }
23587
23588 // If Src OGT MaxFloat, select MaxInt.
23589 Select = DAG.getSelectCC(
23590 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23591
23592 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23593 // is already zero. The promoted case was already handled above.
23594 if (!IsSigned || DstVT != TmpVT) {
23595 return Select;
23596 }
23597
23598 // Otherwise, select 0 if Src is NaN.
23599 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23600 return DAG.getSelectCC(
23601 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23602}
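// Illustrative sketch (standalone, not part of the lowering itself; helper
// name is made up): a scalar reference for the saturating semantics produced
// above. Out-of-range values clamp to the integer min/max and NaN maps to
// zero, mirroring the MinFloat/MaxFloat clamps and the SETUO select.
#include <cmath>
#include <cstdint>
#include <limits>
static int32_t fptosi_sat_i32_ref(double x) {
  if (std::isnan(x))
    return 0;                                    // NaN saturates to 0
  if (x < (double)std::numeric_limits<int32_t>::min())
    return std::numeric_limits<int32_t>::min();  // clamp from below
  if (x > (double)std::numeric_limits<int32_t>::max())
    return std::numeric_limits<int32_t>::max();  // clamp from above
  return (int32_t)x;                             // in range: truncate toward zero
}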
23603
23604SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23605 bool IsStrict = Op->isStrictFPOpcode();
23606
23607 SDLoc DL(Op);
23608 MVT VT = Op.getSimpleValueType();
23609 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23610 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23611 MVT SVT = In.getSimpleValueType();
23612
23613 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23614 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23615 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23616 !Subtarget.getTargetTriple().isOSDarwin()))
23617 return SDValue();
23618
23619 if (SVT == MVT::f16) {
23620 if (Subtarget.hasFP16())
23621 return Op;
23622
23623 if (VT != MVT::f32) {
23624 if (IsStrict)
23625 return DAG.getNode(
23626 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23628 {MVT::f32, MVT::Other}, {Chain, In})});
23629
23630 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23631 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23632 }
23633
23634 if (!Subtarget.hasF16C()) {
23635 if (!Subtarget.getTargetTriple().isOSDarwin())
23636 return SDValue();
23637
23638      assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23639
23640 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23641 TargetLowering::CallLoweringInfo CLI(DAG);
23642 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23643
23644 In = DAG.getBitcast(MVT::i16, In);
23645 TargetLowering::ArgListTy Args;
23646 TargetLowering::ArgListEntry Entry;
23647 Entry.Node = In;
23648 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23649 Entry.IsSExt = false;
23650 Entry.IsZExt = true;
23651 Args.push_back(Entry);
23652
23653 SDValue Callee = DAG.getExternalSymbol(
23654 getLibcallName(RTLIB::FPEXT_F16_F32),
23655 getPointerTy(DAG.getDataLayout()));
23656 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23657 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23658 std::move(Args));
23659
23660 SDValue Res;
23661 std::tie(Res,Chain) = LowerCallTo(CLI);
23662 if (IsStrict)
23663 Res = DAG.getMergeValues({Res, Chain}, DL);
23664
23665 return Res;
23666 }
23667
23668 In = DAG.getBitcast(MVT::i16, In);
23669 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23670 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23671 DAG.getIntPtrConstant(0, DL));
23672 SDValue Res;
23673 if (IsStrict) {
23674 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23675 {Chain, In});
23676 Chain = Res.getValue(1);
23677 } else {
23678 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23679 DAG.getTargetConstant(4, DL, MVT::i32));
23680 }
23681 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23682 DAG.getIntPtrConstant(0, DL));
23683 if (IsStrict)
23684 return DAG.getMergeValues({Res, Chain}, DL);
23685 return Res;
23686 }
23687
23688 if (!SVT.isVector())
23689 return Op;
23690
23691 if (SVT.getVectorElementType() == MVT::f16) {
23692    assert(Subtarget.hasF16C() && "Unexpected features!");
23693 if (SVT == MVT::v2f16)
23694 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23695 DAG.getUNDEF(MVT::v2f16));
23696 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23697 DAG.getUNDEF(MVT::v4f16));
23698 if (IsStrict)
23699 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23700 {Op->getOperand(0), Res});
23701 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23702 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23703 return Op;
23704 }
23705
23706  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23707
23708 SDValue Res =
23709 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23710 if (IsStrict)
23711 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23712 {Op->getOperand(0), Res});
23713 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23714}
23715
23716SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23717 bool IsStrict = Op->isStrictFPOpcode();
23718
23719 SDLoc DL(Op);
23720 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23721 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23722 MVT VT = Op.getSimpleValueType();
23723 MVT SVT = In.getSimpleValueType();
23724
23725 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23726 return SDValue();
23727
23728 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23729 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23730 if (!Subtarget.getTargetTriple().isOSDarwin())
23731 return SDValue();
23732
23733 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23734 TargetLowering::CallLoweringInfo CLI(DAG);
23735 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23736
23737 TargetLowering::ArgListTy Args;
23738 TargetLowering::ArgListEntry Entry;
23739 Entry.Node = In;
23740 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23741 Entry.IsSExt = false;
23742 Entry.IsZExt = true;
23743 Args.push_back(Entry);
23744
23745 SDValue Callee = DAG.getExternalSymbol(
23746 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23747 : RTLIB::FPROUND_F32_F16),
23748 getPointerTy(DAG.getDataLayout()));
23749 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23750 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23751 std::move(Args));
23752
23753 SDValue Res;
23754 std::tie(Res, Chain) = LowerCallTo(CLI);
23755
23756 Res = DAG.getBitcast(MVT::f16, Res);
23757
23758 if (IsStrict)
23759 Res = DAG.getMergeValues({Res, Chain}, DL);
23760
23761 return Res;
23762 }
23763
23764 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23765 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23766 return SDValue();
23767
23768 if (VT.isVector())
23769 return Op;
23770
23771 SDValue Res;
23772 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23773 MVT::i32);
23774 if (IsStrict) {
23775 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23776 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23777 DAG.getIntPtrConstant(0, DL));
23778 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23779 {Chain, Res, Rnd});
23780 Chain = Res.getValue(1);
23781 } else {
23782 // FIXME: Should we use zeros for upper elements for non-strict?
23783 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23784 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23785 }
23786
23787 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23788 DAG.getIntPtrConstant(0, DL));
23789 Res = DAG.getBitcast(MVT::f16, Res);
23790
23791 if (IsStrict)
23792 return DAG.getMergeValues({Res, Chain}, DL);
23793
23794 return Res;
23795 }
23796
23797 return Op;
23798}
23799
23800static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23801 bool IsStrict = Op->isStrictFPOpcode();
23802 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23803  assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23804         "Unexpected VT!");
23805
23806 SDLoc dl(Op);
23807 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23808 DAG.getConstant(0, dl, MVT::v8i16), Src,
23809 DAG.getIntPtrConstant(0, dl));
23810
23811 SDValue Chain;
23812 if (IsStrict) {
23813 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23814 {Op.getOperand(0), Res});
23815 Chain = Res.getValue(1);
23816 } else {
23817 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23818 }
23819
23820 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23821 DAG.getIntPtrConstant(0, dl));
23822
23823 if (IsStrict)
23824 return DAG.getMergeValues({Res, Chain}, dl);
23825
23826 return Res;
23827}
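// Illustrative sketch (standalone, assumes an F16C-capable target; helper
// name is made up): the same scalar f16 -> f32 path expressed with
// intrinsics, i.e. place the i16 bits into lane 0 and use VCVTPH2PS.
#include <immintrin.h>
#include <cstdint>
static float half_to_float_ref(uint16_t h) {
  __m128i v = _mm_cvtsi32_si128(h);  // i16 bits into element 0, rest zero
  __m128 f = _mm_cvtph_ps(v);        // VCVTPH2PS: 4 x f16 -> 4 x f32
  return _mm_cvtss_f32(f);           // extract element 0
}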
23828
23829static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23830 bool IsStrict = Op->isStrictFPOpcode();
23831 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23832  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23833         "Unexpected VT!");
23834
23835 SDLoc dl(Op);
23836 SDValue Res, Chain;
23837 if (IsStrict) {
23838 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23839 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23840 DAG.getIntPtrConstant(0, dl));
23841 Res = DAG.getNode(
23842 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23843 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23844 Chain = Res.getValue(1);
23845 } else {
23846 // FIXME: Should we use zeros for upper elements for non-strict?
23847 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23848 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23849 DAG.getTargetConstant(4, dl, MVT::i32));
23850 }
23851
23852 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23853 DAG.getIntPtrConstant(0, dl));
23854
23855 if (IsStrict)
23856 return DAG.getMergeValues({Res, Chain}, dl);
23857
23858 return Res;
23859}
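// Illustrative sketch of the reverse direction (standalone, F16C assumed;
// helper name is made up). The rounding immediate 4 used above corresponds
// to _MM_FROUND_CUR_DIRECTION.
#include <immintrin.h>
#include <cstdint>
static uint16_t float_to_half_ref(float f) {
  __m128 v = _mm_set_ss(f);                               // f into lane 0
  __m128i h = _mm_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION);  // VCVTPS2PH
  return (uint16_t)_mm_extract_epi16(h, 0);               // take lane 0
}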
23860
23861SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23862 SelectionDAG &DAG) const {
23863 SDLoc DL(Op);
23864 MakeLibCallOptions CallOptions;
23865 RTLIB::Libcall LC =
23866 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23867 SDValue Res =
23868 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23869 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23870 DAG.getBitcast(MVT::i32, Res));
23871}
23872
23873/// Depending on uarch and/or optimizing for size, we might prefer to use a
23874/// vector operation in place of the typical scalar operation.
23875static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23876 const X86Subtarget &Subtarget) {
23877 // If both operands have other uses, this is probably not profitable.
23878 SDValue LHS = Op.getOperand(0);
23879 SDValue RHS = Op.getOperand(1);
23880 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23881 return Op;
23882
23883 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23884 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23885 if (IsFP && !Subtarget.hasSSE3())
23886 return Op;
23887 if (!IsFP && !Subtarget.hasSSSE3())
23888 return Op;
23889
23890 // Extract from a common vector.
23891 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23892 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23893 LHS.getOperand(0) != RHS.getOperand(0) ||
23894 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23895 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23896 !shouldUseHorizontalOp(true, DAG, Subtarget))
23897 return Op;
23898
23899 // Allow commuted 'hadd' ops.
23900 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23901 unsigned HOpcode;
23902 switch (Op.getOpcode()) {
23903 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23904 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23905 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23906 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23907 default:
23908    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23909 }
23910 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23911 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23912 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23913 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23914 std::swap(LExtIndex, RExtIndex);
23915
23916 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23917 return Op;
23918
23919 SDValue X = LHS.getOperand(0);
23920 EVT VecVT = X.getValueType();
23921 unsigned BitWidth = VecVT.getSizeInBits();
23922 unsigned NumLanes = BitWidth / 128;
23923 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23924  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23925         "Not expecting illegal vector widths here");
23926
23927 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23928 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23929 SDLoc DL(Op);
23930 if (BitWidth == 256 || BitWidth == 512) {
23931 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23932 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23933 LExtIndex %= NumEltsPerLane;
23934 }
23935
23936 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23937 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23938 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23939 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23940 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23942 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23943}
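// Illustrative sketch of the rewrite commented above for the f32 case
// (standalone, assumes SSE3; helper name is made up):
#include <immintrin.h>
static float add_low_pair_ref(__m128 x) {
  // add (extractelt x, 0), (extractelt x, 1) --> extractelt (haddps x, x), 0
  __m128 h = _mm_hadd_ps(x, x);
  return _mm_cvtss_f32(h);
}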
23944
23945/// Depending on uarch and/or optimizing for size, we might prefer to use a
23946/// vector operation in place of the typical scalar operation.
23947SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23948  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23949         "Only expecting float/double");
23950 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23951}
23952
23953/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23954/// This mode isn't supported in hardware on X86. But as long as we aren't
23955/// compiling with trapping math, we can emulate this with
23956/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23957static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23958 SDValue N0 = Op.getOperand(0);
23959 SDLoc dl(Op);
23960 MVT VT = Op.getSimpleValueType();
23961
23962 // N0 += copysign(nextafter(0.5, 0.0), N0)
23963 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23964 bool Ignored;
23965 APFloat Point5Pred = APFloat(0.5f);
23966 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23967 Point5Pred.next(/*nextDown*/true);
23968
23969 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23970 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23971 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23972
23973 // Truncate the result to remove fraction.
23974 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23975}
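// Illustrative sketch of the emulation described in the comment above, in
// plain scalar form (standalone; helper name is made up). nextafter(0.5, 0.0)
// is the largest value strictly below 0.5, which is the rationale given above
// for why halfway cases round away from zero without pushing values just
// below .5 upward.
#include <cmath>
static double fround_ref(double x) {
  double pred_half = std::nextafter(0.5, 0.0);          // 0.5 minus one ulp
  return std::trunc(x + std::copysign(pred_half, x));   // ties away from zero
}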
23976
23977/// The only differences between FABS and FNEG are the mask and the logic op.
23978/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23979static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23980  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23981         "Wrong opcode for lowering FABS or FNEG.");
23982
23983 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23984
23985 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23986 // into an FNABS. We'll lower the FABS after that if it is still in use.
23987 if (IsFABS)
23988 for (SDNode *User : Op->uses())
23989 if (User->getOpcode() == ISD::FNEG)
23990 return Op;
23991
23992 SDLoc dl(Op);
23993 MVT VT = Op.getSimpleValueType();
23994
23995 bool IsF128 = (VT == MVT::f128);
23996  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23997         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23998         "Unexpected type in LowerFABSorFNEG");
23999
24000 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
24001 // decide if we should generate a 16-byte constant mask when we only need 4 or
24002 // 8 bytes for the scalar case.
24003
24004 // There are no scalar bitwise logical SSE/AVX instructions, so we
24005 // generate a 16-byte vector constant and logic op even for the scalar case.
24006 // Using a 16-byte mask allows folding the load of the mask with
24007 // the logic op, so it can save (~4 bytes) on code size.
24008 bool IsFakeVector = !VT.isVector() && !IsF128;
24009 MVT LogicVT = VT;
24010 if (IsFakeVector)
24011 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24012 : (VT == MVT::f32) ? MVT::v4f32
24013 : MVT::v8f16;
24014
24015 unsigned EltBits = VT.getScalarSizeInBits();
24016 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
24017 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
24018 APInt::getSignMask(EltBits);
24019 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24020 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
24021
24022 SDValue Op0 = Op.getOperand(0);
24023 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
24024 unsigned LogicOp = IsFABS ? X86ISD::FAND :
24025 IsFNABS ? X86ISD::FOR :
24026 X86ISD::FXOR;
24027 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
24028
24029 if (VT.isVector() || IsF128)
24030 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24031
24032 // For the scalar case extend to a 128-bit vector, perform the logic op,
24033 // and extract the scalar result back out.
24034 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
24035 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24036 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
24037 DAG.getIntPtrConstant(0, dl));
24038}
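// Illustrative sketch of the masks chosen above, applied to a scalar f32
// (standalone; helper names are made up): FABS clears the sign bit with
// 0x7f..., FNEG flips it with 0x80....
#include <cstdint>
#include <cstring>
static float fabs_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0x7fffffffu;                 // FAND with the sign-cleared mask
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}
static float fneg_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 0x80000000u;                 // FXOR with the sign mask
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}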
24039
24040static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
24041 SDValue Mag = Op.getOperand(0);
24042 SDValue Sign = Op.getOperand(1);
24043 SDLoc dl(Op);
24044
24045 // If the sign operand is smaller, extend it first.
24046 MVT VT = Op.getSimpleValueType();
24047 if (Sign.getSimpleValueType().bitsLT(VT))
24048 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
24049
24050 // And if it is bigger, shrink it first.
24051 if (Sign.getSimpleValueType().bitsGT(VT))
24052 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
24053 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
24054
24055 // At this point the operands and the result should have the same
24056 // type, and that won't be f80 since that is not custom lowered.
24057 bool IsF128 = (VT == MVT::f128);
24058  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
24059         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
24060         "Unexpected type in LowerFCOPYSIGN");
24061
24062 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24063
24064 // Perform all scalar logic operations as 16-byte vectors because there are no
24065 // scalar FP logic instructions in SSE.
24066 // TODO: This isn't necessary. If we used scalar types, we might avoid some
24067 // unnecessary splats, but we might miss load folding opportunities. Should
24068 // this decision be based on OptimizeForSize?
24069 bool IsFakeVector = !VT.isVector() && !IsF128;
24070 MVT LogicVT = VT;
24071 if (IsFakeVector)
24072 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24073 : (VT == MVT::f32) ? MVT::v4f32
24074 : MVT::v8f16;
24075
24076 // The mask constants are automatically splatted for vector types.
24077 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24078 SDValue SignMask = DAG.getConstantFP(
24079 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24080 SDValue MagMask = DAG.getConstantFP(
24081 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24082
24083 // First, clear all bits but the sign bit from the second operand (sign).
24084 if (IsFakeVector)
24085 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24086 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24087
24088 // Next, clear the sign bit from the first operand (magnitude).
24089 // TODO: If we had general constant folding for FP logic ops, this check
24090 // wouldn't be necessary.
24091 SDValue MagBits;
24092 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24093 APFloat APF = Op0CN->getValueAPF();
24094 APF.clearSign();
24095 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24096 } else {
24097 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24098 if (IsFakeVector)
24099 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24100 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24101 }
24102
24103 // OR the magnitude value with the sign bit.
24104 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24105 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24106 DAG.getIntPtrConstant(0, dl));
24107}
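// Illustrative sketch of the AND/AND/OR sequence built above, on scalar f32
// bits (standalone; helper name is made up):
#include <cstdint>
#include <cstring>
static float copysign_ref(float mag, float sgn) {
  uint32_t m, s;
  std::memcpy(&m, &mag, sizeof(m));
  std::memcpy(&s, &sgn, sizeof(s));
  uint32_t r = (m & 0x7fffffffu)   // MagBits: magnitude with its sign cleared
             | (s & 0x80000000u);  // SignBit: only the sign of the second operand
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}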
24108
24109static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24110 SDValue N0 = Op.getOperand(0);
24111 SDLoc dl(Op);
24112 MVT VT = Op.getSimpleValueType();
24113
24114 MVT OpVT = N0.getSimpleValueType();
24115  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24116         "Unexpected type for FGETSIGN");
24117
24118 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24119 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24120 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24121 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24122 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24123 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24124 return Res;
24125}
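// Illustrative sketch of what (AND (MOVMSK ...) 1) computes for element 0,
// on scalar f32 bits (standalone; helper name is made up):
#include <cstdint>
#include <cstring>
static int fgetsign_ref(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return (bits >> 31) & 1;   // the sign bit of the value
}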
24126
24127/// Helper for attempting to create a X86ISD::BT node.
24128static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24129 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24130 // instruction. Since the shift amount is in-range-or-undefined, we know
24131 // that doing a bittest on the i32 value is ok. We extend to i32 because
24132 // the encoding for the i16 version is larger than the i32 version.
24133 // Also promote i16 to i32 for performance / code size reason.
24134 if (Src.getValueType().getScalarSizeInBits() < 32)
24135 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24136
24137 // No legal type found, give up.
24138 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24139 return SDValue();
24140
24141 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24142 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24143 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24144 // known to be zero.
24145 if (Src.getValueType() == MVT::i64 &&
24146 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24147 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24148
24149 // If the operand types disagree, extend the shift amount to match. Since
24150 // BT ignores high bits (like shifts) we can use anyextend.
24151 if (Src.getValueType() != BitNo.getValueType()) {
24152 // Peek through a mask/modulo operation.
24153 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24154 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24155 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24156 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24157 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24158 BitNo.getOperand(0)),
24159 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24160 BitNo.getOperand(1)));
24161 else
24162 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24163 }
24164
24165 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24166}
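// Illustrative sketch of the BT semantics the helper relies on (standalone;
// helper name is made up): with a register operand, the bit index is taken
// modulo the operand width, so any high bits of BitNo are ignored.
#include <cstdint>
static bool bit_test_ref(uint32_t src, uint32_t bitno) {
  return (src >> (bitno % 32)) & 1;   // BT r32, r32 uses bitno mod 32
}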
24167
24168/// Helper for creating a X86ISD::SETCC node.
24169static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24170 SelectionDAG &DAG) {
24171 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24172 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24173}
24174
24175/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24176/// recognizable memcmp expansion.
24177static bool isOrXorXorTree(SDValue X, bool Root = true) {
24178 if (X.getOpcode() == ISD::OR)
24179 return isOrXorXorTree(X.getOperand(0), false) &&
24180 isOrXorXorTree(X.getOperand(1), false);
24181 if (Root)
24182 return false;
24183 return X.getOpcode() == ISD::XOR;
24184}
24185
24186/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24187/// expansion.
24188template <typename F>
24189static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24190 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24191 SDValue Op0 = X.getOperand(0);
24192 SDValue Op1 = X.getOperand(1);
24193 if (X.getOpcode() == ISD::OR) {
24194 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24195 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24196 if (VecVT != CmpVT)
24197 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24198 if (HasPT)
24199 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24200 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24201 }
24202 if (X.getOpcode() == ISD::XOR) {
24203 SDValue A = SToV(Op0);
24204 SDValue B = SToV(Op1);
24205 if (VecVT != CmpVT)
24206 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24207 if (HasPT)
24208 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24209 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24210 }
24211  llvm_unreachable("Impossible");
24212}
24213
24214/// Try to map a 128-bit or larger integer comparison to vector instructions
24215/// before type legalization splits it up into chunks.
24216static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24217 ISD::CondCode CC,
24218 const SDLoc &DL,
24219 SelectionDAG &DAG,
24220 const X86Subtarget &Subtarget) {
24221  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24222
24223 // We're looking for an oversized integer equality comparison.
24224 EVT OpVT = X.getValueType();
24225 unsigned OpSize = OpVT.getSizeInBits();
24226 if (!OpVT.isScalarInteger() || OpSize < 128)
24227 return SDValue();
24228
24229 // Ignore a comparison with zero because that gets special treatment in
24230 // EmitTest(). But make an exception for the special case of a pair of
24231 // logically-combined vector-sized operands compared to zero. This pattern may
24232 // be generated by the memcmp expansion pass with oversized integer compares
24233 // (see PR33325).
24234 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24235 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24236 return SDValue();
24237
24238 // Don't perform this combine if constructing the vector will be expensive.
24239 auto IsVectorBitCastCheap = [](SDValue X) {
24240 X = peekThroughBitcasts(X);
24241 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24242 X.getOpcode() == ISD::LOAD;
24243 };
24244 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24245 !IsOrXorXorTreeCCZero)
24246 return SDValue();
24247
24248 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24249 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24250 // Otherwise use PCMPEQ (plus AND) and mask testing.
24251 bool NoImplicitFloatOps =
24252 DAG.getMachineFunction().getFunction().hasFnAttribute(
24253 Attribute::NoImplicitFloat);
24254 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24255 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24256 (OpSize == 256 && Subtarget.hasAVX()) ||
24257 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24258 bool HasPT = Subtarget.hasSSE41();
24259
24260    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
24261 // vector registers are essentially free. (Technically, widening registers
24262 // prevents load folding, but the tradeoff is worth it.)
24263 bool PreferKOT = Subtarget.preferMaskRegisters();
24264 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24265
24266 EVT VecVT = MVT::v16i8;
24267 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24268 if (OpSize == 256) {
24269 VecVT = MVT::v32i8;
24270 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24271 }
24272 EVT CastVT = VecVT;
24273 bool NeedsAVX512FCast = false;
24274 if (OpSize == 512 || NeedZExt) {
24275 if (Subtarget.hasBWI()) {
24276 VecVT = MVT::v64i8;
24277 CmpVT = MVT::v64i1;
24278 if (OpSize == 512)
24279 CastVT = VecVT;
24280 } else {
24281 VecVT = MVT::v16i32;
24282 CmpVT = MVT::v16i1;
24283 CastVT = OpSize == 512 ? VecVT
24284 : OpSize == 256 ? MVT::v8i32
24285 : MVT::v4i32;
24286 NeedsAVX512FCast = true;
24287 }
24288 }
24289
24290 auto ScalarToVector = [&](SDValue X) -> SDValue {
24291 bool TmpZext = false;
24292 EVT TmpCastVT = CastVT;
24293 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24294 SDValue OrigX = X.getOperand(0);
24295 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24296 if (OrigSize < OpSize) {
24297 if (OrigSize == 128) {
24298 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24299 X = OrigX;
24300 TmpZext = true;
24301 } else if (OrigSize == 256) {
24302 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24303 X = OrigX;
24304 TmpZext = true;
24305 }
24306 }
24307 }
24308 X = DAG.getBitcast(TmpCastVT, X);
24309 if (!NeedZExt && !TmpZext)
24310 return X;
24311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24312 DAG.getConstant(0, DL, VecVT), X,
24313 DAG.getVectorIdxConstant(0, DL));
24314 };
24315
24316 SDValue Cmp;
24317 if (IsOrXorXorTreeCCZero) {
24318 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24319 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24320 // Use 2 vector equality compares and 'and' the results before doing a
24321 // MOVMSK.
24322 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24323 } else {
24324 SDValue VecX = ScalarToVector(X);
24325 SDValue VecY = ScalarToVector(Y);
24326 if (VecVT != CmpVT) {
24327 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24328 } else if (HasPT) {
24329 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24330 } else {
24331 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24332 }
24333 }
24334 // AVX512 should emit a setcc that will lower to kortest.
24335 if (VecVT != CmpVT) {
24336 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24337 : CmpVT == MVT::v32i1 ? MVT::i32
24338 : MVT::i16;
24339 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24340 DAG.getConstant(0, DL, KRegVT), CC);
24341 }
24342 if (HasPT) {
24343 SDValue BCCmp =
24344 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24345 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24346 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24347 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24348 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24349 }
24350 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24351 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24352 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24353    assert(Cmp.getValueType() == MVT::v16i8 &&
24354           "Non 128-bit vector on pre-SSE41 target");
24355 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24356 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24357 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24358 }
24359
24360 return SDValue();
24361}
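// Illustrative sketch of the memcmp-expansion shape this combine looks for,
// written as scalar 64-bit chunks (standalone; helper name is made up):
//   setcc i128 (or (xor A, B), (xor C, D)), 0, eq
#include <cstdint>
static bool equal128_ref(const uint64_t a[2], const uint64_t b[2]) {
  return ((a[0] ^ b[0]) | (a[1] ^ b[1])) == 0;
}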
24362
24363/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24364/// style scalarized (associative) reduction patterns. Partial reductions
24365/// are supported when the pointer SrcMask is non-null.
24366/// TODO - move this to SelectionDAG?
24367static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24368 SmallVectorImpl<SDValue> &SrcOps,
24369 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24370 SmallVector<SDValue, 8> Opnds;
24371 DenseMap<SDValue, APInt> SrcOpMap;
24372 EVT VT = MVT::Other;
24373
24374  // Recognize a special case where a vector is cast into a wide integer to
24375 // test all 0s.
24376  assert(Op.getOpcode() == unsigned(BinOp) &&
24377         "Unexpected bit reduction opcode");
24378 Opnds.push_back(Op.getOperand(0));
24379 Opnds.push_back(Op.getOperand(1));
24380
24381 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24382 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24383 // BFS traverse all BinOp operands.
24384 if (I->getOpcode() == unsigned(BinOp)) {
24385 Opnds.push_back(I->getOperand(0));
24386 Opnds.push_back(I->getOperand(1));
24387 // Re-evaluate the number of nodes to be traversed.
24388 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24389 continue;
24390 }
24391
24392    // Quit if this is not an EXTRACT_VECTOR_ELT.
24393 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24394 return false;
24395
24396    // Quit if the index is not a constant.
24397 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24398 if (!Idx)
24399 return false;
24400
24401 SDValue Src = I->getOperand(0);
24402 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24403 if (M == SrcOpMap.end()) {
24404 VT = Src.getValueType();
24405 // Quit if not the same type.
24406 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24407 return false;
24408 unsigned NumElts = VT.getVectorNumElements();
24409 APInt EltCount = APInt::getZero(NumElts);
24410 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24411 SrcOps.push_back(Src);
24412 }
24413
24414 // Quit if element already used.
24415 unsigned CIdx = Idx->getZExtValue();
24416 if (M->second[CIdx])
24417 return false;
24418 M->second.setBit(CIdx);
24419 }
24420
24421 if (SrcMask) {
24422 // Collect the source partial masks.
24423 for (SDValue &SrcOp : SrcOps)
24424 SrcMask->push_back(SrcOpMap[SrcOp]);
24425 } else {
24426 // Quit if not all elements are used.
24427 for (const auto &I : SrcOpMap)
24428 if (!I.second.isAllOnes())
24429 return false;
24430 }
24431
24432 return true;
24433}
24434
24435// Helper function for comparing all bits of two vectors.
24436static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24437 ISD::CondCode CC, const APInt &OriginalMask,
24438 const X86Subtarget &Subtarget,
24439 SelectionDAG &DAG, X86::CondCode &X86CC) {
24440 EVT VT = LHS.getValueType();
24441 unsigned ScalarSize = VT.getScalarSizeInBits();
24442 if (OriginalMask.getBitWidth() != ScalarSize) {
24443    assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24444 return SDValue();
24445 }
24446
24447  // Quit if not convertible to a legal scalar or 128/256-bit vector.
24448 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24449 return SDValue();
24450
24451 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24452 if (VT.isFloatingPoint())
24453 return SDValue();
24454
24455  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24456 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24457
24458 APInt Mask = OriginalMask;
24459
24460 auto MaskBits = [&](SDValue Src) {
24461 if (Mask.isAllOnes())
24462 return Src;
24463 EVT SrcVT = Src.getValueType();
24464 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24465 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24466 };
24467
24468 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24469 if (VT.getSizeInBits() < 128) {
24470 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24471 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24472 if (IntVT != MVT::i64)
24473 return SDValue();
24474 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24475 MVT::i32, MVT::i32);
24476 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24477 MVT::i32, MVT::i32);
24478 SDValue Lo =
24479 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24480 SDValue Hi =
24481 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24483 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24484 DAG.getConstant(0, DL, MVT::i32));
24485 }
24486 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24487 DAG.getBitcast(IntVT, MaskBits(LHS)),
24488 DAG.getBitcast(IntVT, MaskBits(RHS)));
24489 }
24490
24491 // Without PTEST, a masked v2i64 or-reduction is not faster than
24492 // scalarization.
24493 bool UseKORTEST = Subtarget.useAVX512Regs();
24494 bool UsePTEST = Subtarget.hasSSE41();
24495 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24496 return SDValue();
24497
24498 // Split down to 128/256/512-bit vector.
24499 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24500
24501 // If the input vector has vector elements wider than the target test size,
24502 // then cast to <X x i64> so it will safely split.
24503 if (ScalarSize > TestSize) {
24504 if (!Mask.isAllOnes())
24505 return SDValue();
24506 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24507 LHS = DAG.getBitcast(VT, LHS);
24508 RHS = DAG.getBitcast(VT, RHS);
24509 Mask = APInt::getAllOnes(64);
24510 }
24511
24512 if (VT.getSizeInBits() > TestSize) {
24513 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24514 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24515 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24516 while (VT.getSizeInBits() > TestSize) {
24517 auto Split = DAG.SplitVector(LHS, DL);
24518 VT = Split.first.getValueType();
24519 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24520 }
24521 RHS = DAG.getAllOnesConstant(DL, VT);
24522 } else if (!UsePTEST && !KnownRHS.isZero()) {
24523 // MOVMSK Special Case:
24524 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24525 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24526 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24527 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24528 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24529 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24530 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24531 V = DAG.getSExtOrTrunc(V, DL, VT);
24532 while (VT.getSizeInBits() > TestSize) {
24533 auto Split = DAG.SplitVector(V, DL);
24534 VT = Split.first.getValueType();
24535 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24536 }
24537 V = DAG.getNOT(DL, V, VT);
24538 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24539 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24540 DAG.getConstant(0, DL, MVT::i32));
24541 } else {
24542 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24543 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24544 while (VT.getSizeInBits() > TestSize) {
24545 auto Split = DAG.SplitVector(V, DL);
24546 VT = Split.first.getValueType();
24547 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24548 }
24549 LHS = V;
24550 RHS = DAG.getConstant(0, DL, VT);
24551 }
24552 }
24553
24554 if (UseKORTEST && VT.is512BitVector()) {
24555 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24556 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24557 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24558 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24559 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24560 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24561 }
24562
24563 if (UsePTEST) {
24564 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24565 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24566 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24567 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24568 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24569 }
24570
24571  assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24572 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24573 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24574 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24575 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24576 V = DAG.getNOT(DL, V, MaskVT);
24577 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24578 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24579 DAG.getConstant(0, DL, MVT::i32));
24580}
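// Illustrative sketch of the pre-SSE4.1 tail above (standalone, assumes SSE2;
// helper name is made up). It is the equivalent of the PCMPEQ/NOT/MOVMSK/CMP-0
// sequence: all bytes are equal iff the movemask of the equality compare is
// 0xFFFF.
#include <immintrin.h>
static bool all_bytes_equal_ref(__m128i x, __m128i y) {
  __m128i eq = _mm_cmpeq_epi8(x, y);        // PCMPEQB: 0xFF where bytes match
  return _mm_movemask_epi8(eq) == 0xFFFF;   // PMOVMSKB: all 16 lanes matched
}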
24581
24582// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24583// to CMP(MOVMSK(PCMPEQB(X,Y))).
24584static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24585 ISD::CondCode CC, const SDLoc &DL,
24586 const X86Subtarget &Subtarget,
24587 SelectionDAG &DAG,
24588 X86::CondCode &X86CC) {
24589  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24590
24591 bool CmpNull = isNullConstant(RHS);
24592 bool CmpAllOnes = isAllOnesConstant(RHS);
24593 if (!CmpNull && !CmpAllOnes)
24594 return SDValue();
24595
24596 SDValue Op = LHS;
24597 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24598 return SDValue();
24599
24600 // Check whether we're masking/truncating an OR-reduction result, in which
24601 // case track the masked bits.
24602 // TODO: Add CmpAllOnes support.
24603 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24604 if (CmpNull) {
24605 switch (Op.getOpcode()) {
24606 case ISD::TRUNCATE: {
24607 SDValue Src = Op.getOperand(0);
24608 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24609 Op.getScalarValueSizeInBits());
24610 Op = Src;
24611 break;
24612 }
24613 case ISD::AND: {
24614 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24615 Mask = Cst->getAPIntValue();
24616 Op = Op.getOperand(0);
24617 }
24618 break;
24619 }
24620 }
24621 }
24622
24623 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24624
24625 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24626 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24627 SmallVector<SDValue, 8> VecIns;
24628 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24629 EVT VT = VecIns[0].getValueType();
24630    assert(llvm::all_of(VecIns,
24631                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
24632           "Reduction source vector mismatch");
24633
24634 // Quit if not splittable to scalar/128/256/512-bit vector.
24635 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24636 return SDValue();
24637
24638 // If more than one full vector is evaluated, AND/OR them first before
24639 // PTEST.
24640 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24641 Slot += 2, e += 1) {
24642 // Each iteration will AND/OR 2 nodes and append the result until there is
24643 // only 1 node left, i.e. the final value of all vectors.
24644 SDValue LHS = VecIns[Slot];
24645 SDValue RHS = VecIns[Slot + 1];
24646 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24647 }
24648
24649 return LowerVectorAllEqual(DL, VecIns.back(),
24650 CmpNull ? DAG.getConstant(0, DL, VT)
24651 : DAG.getAllOnesConstant(DL, VT),
24652 CC, Mask, Subtarget, DAG, X86CC);
24653 }
24654
24655 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24656 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24657 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24658 ISD::NodeType BinOp;
24659 if (SDValue Match =
24660 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24661 EVT MatchVT = Match.getValueType();
24662 return LowerVectorAllEqual(DL, Match,
24663 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24664 : DAG.getAllOnesConstant(DL, MatchVT),
24665 CC, Mask, Subtarget, DAG, X86CC);
24666 }
24667 }
24668
24669 if (Mask.isAllOnes()) {
24670 assert(!Op.getValueType().isVector() &&
24671 "Illegal vector type for reduction pattern");
24672 SDValue Src = peekThroughBitcasts(Op);
24673 if (Src.getValueType().isFixedLengthVector() &&
24674 Src.getValueType().getScalarType() == MVT::i1) {
24675 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24676 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24677 if (Src.getOpcode() == ISD::SETCC) {
24678 SDValue LHS = Src.getOperand(0);
24679 SDValue RHS = Src.getOperand(1);
24680 EVT LHSVT = LHS.getValueType();
24681 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24682 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24683 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24684 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24685 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24686 X86CC);
24687 }
24688 }
24689 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24690 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24691 // Peek through truncation, mask the LSB and compare against zero/LSB.
24692 if (Src.getOpcode() == ISD::TRUNCATE) {
24693 SDValue Inner = Src.getOperand(0);
24694 EVT InnerVT = Inner.getValueType();
24695 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24696 unsigned BW = InnerVT.getScalarSizeInBits();
24697 APInt SrcMask = APInt(BW, 1);
24698 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24699 return LowerVectorAllEqual(DL, Inner,
24700 DAG.getConstant(Cmp, DL, InnerVT), CC,
24701 SrcMask, Subtarget, DAG, X86CC);
24702 }
24703 }
24704 }
24705 }
24706
24707 return SDValue();
24708}
24709
24710/// Return true if \c Op has a use that doesn't just read flags.
24711static bool hasNonFlagsUse(SDValue Op) {
24712 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24713 ++UI) {
24714 SDNode *User = *UI;
24715 unsigned UOpNo = UI.getOperandNo();
24716 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24717 // Look past the truncate.
24718 UOpNo = User->use_begin().getOperandNo();
24719 User = *User->use_begin();
24720 }
24721
24722 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24723 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24724 return true;
24725 }
24726 return false;
24727}
24728
24729// Transform to an x86-specific ALU node with flags if there is a chance of
24730// using an RMW op or only the flags are used. Otherwise, leave
24731// the node alone and emit a 'cmp' or 'test' instruction.
24732static bool isProfitableToUseFlagOp(SDValue Op) {
24733 for (SDNode *U : Op->uses())
24734 if (U->getOpcode() != ISD::CopyToReg &&
24735 U->getOpcode() != ISD::SETCC &&
24736 U->getOpcode() != ISD::STORE)
24737 return false;
24738
24739 return true;
24740}
24741
24742/// Emit nodes that will be selected as "test Op0,Op0", or something
24743/// equivalent.
24744static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24745 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24746 // CF and OF aren't always set the way we want. Determine which
24747 // of these we need.
24748 bool NeedCF = false;
24749 bool NeedOF = false;
24750 switch (X86CC) {
24751 default: break;
24752 case X86::COND_A: case X86::COND_AE:
24753 case X86::COND_B: case X86::COND_BE:
24754 NeedCF = true;
24755 break;
24756 case X86::COND_G: case X86::COND_GE:
24757 case X86::COND_L: case X86::COND_LE:
24758 case X86::COND_O: case X86::COND_NO: {
24759 // Check whether we really need to set the
24760 // Overflow flag. If NoSignedWrap is present,
24761 // it is not actually needed.
24762 switch (Op->getOpcode()) {
24763 case ISD::ADD:
24764 case ISD::SUB:
24765 case ISD::MUL:
24766 case ISD::SHL:
24767 if (Op.getNode()->getFlags().hasNoSignedWrap())
24768 break;
24769 [[fallthrough]];
24770 default:
24771 NeedOF = true;
24772 break;
24773 }
24774 break;
24775 }
24776 }
24777 // See if we can use the EFLAGS value from the operand instead of
24778 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24779 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24780 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24781 // Emit a CMP with 0, which is the TEST pattern.
24782 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24783 DAG.getConstant(0, dl, Op.getValueType()));
24784 }
24785 unsigned Opcode = 0;
24786 unsigned NumOperands = 0;
24787
24788 SDValue ArithOp = Op;
24789
24790 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24791 // which may be the result of a CAST. We use the variable 'Op', which is the
24792 // non-casted variable when we check for possible users.
24793 switch (ArithOp.getOpcode()) {
24794 case ISD::AND:
24795 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24796 // because a TEST instruction will be better.
24797 if (!hasNonFlagsUse(Op))
24798 break;
24799
24800 [[fallthrough]];
24801 case ISD::ADD:
24802 case ISD::SUB:
24803 case ISD::OR:
24804 case ISD::XOR:
24805 if (!isProfitableToUseFlagOp(Op))
24806 break;
24807
24808 // Otherwise use a regular EFLAGS-setting instruction.
24809 switch (ArithOp.getOpcode()) {
24810 default: llvm_unreachable("unexpected operator!")::llvm::llvm_unreachable_internal("unexpected operator!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24810)
;
24811 case ISD::ADD: Opcode = X86ISD::ADD; break;
24812 case ISD::SUB: Opcode = X86ISD::SUB; break;
24813 case ISD::XOR: Opcode = X86ISD::XOR; break;
24814 case ISD::AND: Opcode = X86ISD::AND; break;
24815 case ISD::OR: Opcode = X86ISD::OR; break;
24816 }
24817
24818 NumOperands = 2;
24819 break;
24820 case X86ISD::ADD:
24821 case X86ISD::SUB:
24822 case X86ISD::OR:
24823 case X86ISD::XOR:
24824 case X86ISD::AND:
24825 return SDValue(Op.getNode(), 1);
24826 case ISD::SSUBO:
24827 case ISD::USUBO: {
24828 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24829 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24830 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24831 Op->getOperand(1)).getValue(1);
24832 }
24833 default:
24834 break;
24835 }
24836
24837 if (Opcode == 0) {
24838 // Emit a CMP with 0, which is the TEST pattern.
24839 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24840 DAG.getConstant(0, dl, Op.getValueType()));
24841 }
24842 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24843 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24844
24845 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24846 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24847 return SDValue(New.getNode(), 1);
24848}
24849
24850/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24851/// equivalent.
24852static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24853 const SDLoc &dl, SelectionDAG &DAG,
24854 const X86Subtarget &Subtarget) {
24855 if (isNullConstant(Op1))
24856 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24857
24858 EVT CmpVT = Op0.getValueType();
24859
24860 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24861 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24862
24863 // Only promote the compare up to I32 if it is a 16 bit operation
24864 // with an immediate. 16 bit immediates are to be avoided.
24865 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24866 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24867 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24868 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24869 // Don't do this if the immediate can fit in 8-bits.
24870 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24871 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24872 unsigned ExtendOp =
24873 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24874 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24875 // For equality comparisons try to use SIGN_EXTEND if the input was
24876 // truncate from something with enough sign bits.
24877 if (Op0.getOpcode() == ISD::TRUNCATE) {
24878 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24879 ExtendOp = ISD::SIGN_EXTEND;
24880 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24881 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24882 ExtendOp = ISD::SIGN_EXTEND;
24883 }
24884 }
24885
24886 CmpVT = MVT::i32;
24887 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24888 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24889 }
24890 }
24891
24892 // Try to shrink i64 compares if the input has enough zero bits.
24893 // FIXME: Do this for non-constant compares for constant on LHS?
24894 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24895 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24896 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24897 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24898 CmpVT = MVT::i32;
24899 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24900 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24901 }
24902
24903 // 0-x == y --> x+y == 0
24904 // 0-x != y --> x+y != 0
24905 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24906 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24907 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24908 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24909 return Add.getValue(1);
24910 }
24911
24912 // x == 0-y --> x+y == 0
24913 // x != 0-y --> x+y != 0
24914 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24915 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24916 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24917 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24918 return Add.getValue(1);
24919 }
24920
24921 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24922 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24923 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24924 return Sub.getValue(1);
24925}
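// Minimal standalone sketch (illustrative; the helper name is made up) of the
// wrapping-arithmetic identity behind the two transforms above: for equality or
// inequality compares, (0 - X) == Y tests the same thing as (X + Y) == 0.
#include <cstdint>
constexpr bool NegCmpMatchesAdd(uint32_t X, uint32_t Y) {
  return ((0u - X) == Y) == ((X + Y) == 0u); // both sides wrap mod 2^32
}
static_assert(NegCmpMatchesAdd(0u, 0u) && NegCmpMatchesAdd(3u, 0xFFFFFFFDu) &&
              NegCmpMatchesAdd(0x80000000u, 0x80000000u) && NegCmpMatchesAdd(5u, 7u),
              "0-x == y is the same test as x+y == 0 under wraparound");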
24926
24927/// Check if replacement of SQRT with RSQRT should be disabled.
24928bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24929 EVT VT = Op.getValueType();
24930
24931 // We don't need to replace SQRT with RSQRT for half type.
24932 if (VT.getScalarType() == MVT::f16)
24933 return true;
24934
24935 // We never want to use both SQRT and RSQRT instructions for the same input.
24936 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24937 return false;
24938
24939 if (VT.isVector())
24940 return Subtarget.hasFastVectorFSQRT();
24941 return Subtarget.hasFastScalarFSQRT();
24942}
24943
24944/// The minimum architected relative accuracy is 2^-12. We need one
24945/// Newton-Raphson step to have a good float result (24 bits of precision).
24946SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24947 SelectionDAG &DAG, int Enabled,
24948 int &RefinementSteps,
24949 bool &UseOneConstNR,
24950 bool Reciprocal) const {
24951 SDLoc DL(Op);
24952 EVT VT = Op.getValueType();
24953
24954 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24955 // It is likely not profitable to do this for f64 because a double-precision
24956 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24957 // instructions: convert to single, rsqrtss, convert back to double, refine
24958 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24959 // along with FMA, this could be a throughput win.
24960 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24961 // after legalize types.
24962 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24963 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24964 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24965 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24966 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24967 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24968 RefinementSteps = 1;
24969
24970 UseOneConstNR = false;
24971 // There is no FRSQRT for 512-bits, but there is RSQRT14.
24972 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24973 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24974 if (RefinementSteps == 0 && !Reciprocal)
24975 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24976 return Estimate;
24977 }
24978
24979 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24980 Subtarget.hasFP16()) {
24981 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24982 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24983 RefinementSteps = 0;
24984
24985 if (VT == MVT::f16) {
24986 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24987 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24988 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24989 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24991 }
24992
24993 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24994 }
24995 return SDValue();
24996}
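// Scalar intrinsics sketch (illustrative only) of the single Newton-Raphson
// refinement step that RefinementSteps == 1 requests above: starting from the
// ~12-bit RSQRTSS estimate E, E' = E * (1.5 - 0.5 * A * E * E) roughly doubles
// the number of correct bits, which is enough for float.
#include <xmmintrin.h>
static inline float RefinedRsqrt(float A) {
  float E = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(A))); // RSQRTSS estimate
  return E * (1.5f - 0.5f * A * E * E);                 // one NR step
}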
24997
24998/// The minimum architected relative accuracy is 2^-12. We need one
24999/// Newton-Raphson step to have a good float result (24 bits of precision).
25000SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
25001 int Enabled,
25002 int &RefinementSteps) const {
25003 SDLoc DL(Op);
25004 EVT VT = Op.getValueType();
25005
25006 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
25007 // It is likely not profitable to do this for f64 because a double-precision
25008 // reciprocal estimate with refinement on x86 prior to FMA requires
25009 // 15 instructions: convert to single, rcpss, convert back to double, refine
25010 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
25011 // along with FMA, this could be a throughput win.
25012
25013 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
25014 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
25015 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
25016 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
25017 // Enable estimate codegen with 1 refinement step for vector division.
25018 // Scalar division estimates are disabled because they break too much
25019 // real-world code. These defaults are intended to match GCC behavior.
25020 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
25021 return SDValue();
25022
25023 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25024 RefinementSteps = 1;
25025
25026 // There is no FRCP for 512-bits, but there is RCP14.
25027 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
25028 return DAG.getNode(Opcode, DL, VT, Op);
25029 }
25030
25031 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
25032 Subtarget.hasFP16()) {
25033 if (RefinementSteps == ReciprocalEstimate::Unspecified)
25034 RefinementSteps = 0;
25035
25036 if (VT == MVT::f16) {
25037 SDValue Zero = DAG.getIntPtrConstant(0, DL);
25038 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
25039 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
25040 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
25041 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
25042 }
25043
25044 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
25045 }
25046 return SDValue();
25047}
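// Companion sketch (illustrative only) for the reciprocal path: one
// Newton-Raphson step E' = E * (2 - A * E) on top of the ~12-bit RCPSS estimate.
#include <xmmintrin.h>
static inline float RefinedRecip(float A) {
  float E = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(A))); // RCPSS estimate
  return E * (2.0f - A * E);                          // one NR step
}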
25048
25049/// If we have at least two divisions that use the same divisor, convert to
25050/// multiplication by a reciprocal. This may need to be adjusted for a given
25051/// CPU if a division's cost is not at least twice the cost of a multiplication.
25052/// This is because we still need one division to calculate the reciprocal and
25053/// then we need two multiplies by that reciprocal as replacements for the
25054/// original divisions.
25055unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
25056 return 2;
25057}
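// Sketch (illustrative, fast-math style rewrite) of the transform this threshold
// gates: with at least two divisions by the same value, one real division
// produces the reciprocal and each original division becomes a multiply.
static inline void DivideBothBy(float A, float B, float D, float &X, float &Y) {
  float R = 1.0f / D; // the single remaining division
  X = A * R;          // was A / D
  Y = B * R;          // was B / D
}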
25058
25059SDValue
25060X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
25061 SelectionDAG &DAG,
25062 SmallVectorImpl<SDNode *> &Created) const {
25063 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
25064 if (isIntDivCheap(N->getValueType(0), Attr))
25065 return SDValue(N,0); // Lower SDIV as SDIV
25066
25067 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25068 "Unexpected divisor!");
25069
25070 // Only perform this transform if CMOV is supported otherwise the select
25071 // below will become a branch.
25072 if (!Subtarget.canUseCMOV())
25073 return SDValue();
25074
25075 // fold (sdiv X, pow2)
25076 EVT VT = N->getValueType(0);
25077 // FIXME: Support i8.
25078 if (VT != MVT::i16 && VT != MVT::i32 &&
25079 !(Subtarget.is64Bit() && VT == MVT::i64))
25080 return SDValue();
25081
25082 unsigned Lg2 = Divisor.countr_zero();
25083
25084 // If the divisor is 2 or -2, the default expansion is better.
25085 if (Lg2 == 1)
25086 return SDValue();
25087
25088 SDLoc DL(N);
25089 SDValue N0 = N->getOperand(0);
25090 SDValue Zero = DAG.getConstant(0, DL, VT);
25091 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25092 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25093
25094 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25095 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25096 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25097 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25098
25099 Created.push_back(Cmp.getNode());
25100 Created.push_back(Add.getNode());
25101 Created.push_back(CMov.getNode());
25102
25103 // Divide by pow2.
25104 SDValue SRA =
25105 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25106
25107 // If we're dividing by a positive value, we're done. Otherwise, we must
25108 // negate the result.
25109 if (Divisor.isNonNegative())
25110 return SRA;
25111
25112 Created.push_back(SRA.getNode());
25113 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25114}
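// Standalone scalar sketch (illustrative; assumes an arithmetic right shift for
// negative values, as on the targets handled here) of the expansion built above:
// for a negative dividend, add (2^Lg2 - 1) before the arithmetic shift, then
// negate the result if the divisor itself was negative.
#include <cstdint>
constexpr int32_t SDivByPow2(int32_t N0, unsigned Lg2, bool DivisorIsNegative) {
  int32_t Biased = N0 < 0 ? N0 + ((int32_t(1) << Lg2) - 1) : N0; // the CMOV above
  int32_t Quot = Biased >> Lg2;                                  // the SRA above
  return DivisorIsNegative ? -Quot : Quot;                       // the final SUB above
}
static_assert(SDivByPow2(-7, 2, false) == -7 / 4, "rounds toward zero like SDIV");
static_assert(SDivByPow2(7, 3, true) == 7 / -8, "negated-power-of-2 case");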
25115
25116/// Result of 'and' is compared against zero. Change to a BT node if possible.
25117/// Returns the BT node and the condition code needed to use it.
25118static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25119 SelectionDAG &DAG, X86::CondCode &X86CC) {
25120 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25121 SDValue Op0 = And.getOperand(0);
25122 SDValue Op1 = And.getOperand(1);
25123 if (Op0.getOpcode() == ISD::TRUNCATE)
25124 Op0 = Op0.getOperand(0);
25125 if (Op1.getOpcode() == ISD::TRUNCATE)
25126 Op1 = Op1.getOperand(0);
25127
25128 SDValue Src, BitNo;
25129 if (Op1.getOpcode() == ISD::SHL)
25130 std::swap(Op0, Op1);
25131 if (Op0.getOpcode() == ISD::SHL) {
25132 if (isOneConstant(Op0.getOperand(0))) {
25133 // If we looked past a truncate, check that it's only truncating away
25134 // known zeros.
25135 unsigned BitWidth = Op0.getValueSizeInBits();
25136 unsigned AndBitWidth = And.getValueSizeInBits();
25137 if (BitWidth > AndBitWidth) {
25138 KnownBits Known = DAG.computeKnownBits(Op0);
25139 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25140 return SDValue();
25141 }
25142 Src = Op1;
25143 BitNo = Op0.getOperand(1);
25144 }
25145 } else if (Op1.getOpcode() == ISD::Constant) {
25146 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25147 uint64_t AndRHSVal = AndRHS->getZExtValue();
25148 SDValue AndLHS = Op0;
25149
25150 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25151 Src = AndLHS.getOperand(0);
25152 BitNo = AndLHS.getOperand(1);
25153 } else {
25154 // Use BT if the immediate can't be encoded in a TEST instruction or we
25155 // are optimizing for size and the immediate won't fit in a byte.
25156 bool OptForSize = DAG.shouldOptForSize();
25157 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25158 isPowerOf2_64(AndRHSVal)) {
25159 Src = AndLHS;
25160 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25161 Src.getValueType());
25162 }
25163 }
25164 }
25165
25166 // No patterns found, give up.
25167 if (!Src.getNode())
25168 return SDValue();
25169
25170 // Remove any bit flip.
25171 if (isBitwiseNot(Src)) {
25172 Src = Src.getOperand(0);
25173 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25174 }
25175
25176 // Attempt to create the X86ISD::BT node.
25177 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25178 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25179 return BT;
25180 }
25181
25182 return SDValue();
25183}
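// Minimal sketch (illustrative helpers, not from this file) of the equivalence
// the BT lowering above relies on: (X & (1 << N)) == 0 and ((X >> N) & 1) == 0
// both just ask whether bit N of X is clear, which is what BT reports in CF.
#include <cstdint>
constexpr bool BitIsSet(uint64_t X, unsigned N) { return (X >> N) & 1; }
static_assert(((0b1010u & (1u << 3)) != 0) == BitIsSet(0b1010u, 3), "shl form");
static_assert((((0b1010u >> 1) & 1u) != 0) == BitIsSet(0b1010u, 1), "srl form");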
25184
25185// Check if pre-AVX condcode can be performed by a single FCMP op.
25186static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25187 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25188}
25189
25190/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25191/// CMPs.
25192static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25193 SDValue &Op1, bool &IsAlwaysSignaling) {
25194 unsigned SSECC;
25195 bool Swap = false;
25196
25197 // SSE Condition code mapping:
25198 // 0 - EQ
25199 // 1 - LT
25200 // 2 - LE
25201 // 3 - UNORD
25202 // 4 - NEQ
25203 // 5 - NLT
25204 // 6 - NLE
25205 // 7 - ORD
25206 switch (SetCCOpcode) {
25207 default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25207)
;
25208 case ISD::SETOEQ:
25209 case ISD::SETEQ: SSECC = 0; break;
25210 case ISD::SETOGT:
25211 case ISD::SETGT: Swap = true; [[fallthrough]];
25212 case ISD::SETLT:
25213 case ISD::SETOLT: SSECC = 1; break;
25214 case ISD::SETOGE:
25215 case ISD::SETGE: Swap = true; [[fallthrough]];
25216 case ISD::SETLE:
25217 case ISD::SETOLE: SSECC = 2; break;
25218 case ISD::SETUO: SSECC = 3; break;
25219 case ISD::SETUNE:
25220 case ISD::SETNE: SSECC = 4; break;
25221 case ISD::SETULE: Swap = true; [[fallthrough]];
25222 case ISD::SETUGE: SSECC = 5; break;
25223 case ISD::SETULT: Swap = true; [[fallthrough]];
25224 case ISD::SETUGT: SSECC = 6; break;
25225 case ISD::SETO: SSECC = 7; break;
25226 case ISD::SETUEQ: SSECC = 8; break;
25227 case ISD::SETONE: SSECC = 12; break;
25228 }
25229 if (Swap)
25230 std::swap(Op0, Op1);
25231
25232 switch (SetCCOpcode) {
25233 default:
25234 IsAlwaysSignaling = true;
25235 break;
25236 case ISD::SETEQ:
25237 case ISD::SETOEQ:
25238 case ISD::SETUEQ:
25239 case ISD::SETNE:
25240 case ISD::SETONE:
25241 case ISD::SETUNE:
25242 case ISD::SETO:
25243 case ISD::SETUO:
25244 IsAlwaysSignaling = false;
25245 break;
25246 }
25247
25248 return SSECC;
25249}
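// Intrinsics-level sketch (illustrative) of the Swap handling above: SSE has no
// greater-than predicate, so SETGT is emitted as the LT predicate (SSECC == 1)
// with the operands exchanged.
#include <xmmintrin.h>
static inline __m128 CmpGTps(__m128 A, __m128 B) {
  return _mm_cmplt_ps(B, A); // CMPPS imm8 = 1 (LT) on swapped operands == A > B
}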
25250
25251 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25252/// concatenate the result back.
25253static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25254 ISD::CondCode Cond, SelectionDAG &DAG,
25255 const SDLoc &dl) {
25256 assert(VT.isInteger() && VT == LHS.getValueType() &&
25257 VT == RHS.getValueType() && "Unsupported VTs!");
25258
25259 SDValue CC = DAG.getCondCode(Cond);
25260
25261 // Extract the LHS Lo/Hi vectors
25262 SDValue LHS1, LHS2;
25263 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25264
25265 // Extract the RHS Lo/Hi vectors
25266 SDValue RHS1, RHS2;
25267 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25268
25269 // Issue the operation on the smaller types and concatenate the result back
25270 EVT LoVT, HiVT;
25271 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25272 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25273 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25274 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25275}
25276
25277static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25278
25279 SDValue Op0 = Op.getOperand(0);
25280 SDValue Op1 = Op.getOperand(1);
25281 SDValue CC = Op.getOperand(2);
25282 MVT VT = Op.getSimpleValueType();
25283 SDLoc dl(Op);
25284
25285 assert(VT.getVectorElementType() == MVT::i1 &&
25286 "Cannot set masked compare for this operation");
25287
25288 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25289
25290 // Prefer SETGT over SETLT.
25291 if (SetCCOpcode == ISD::SETLT) {
25292 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25293 std::swap(Op0, Op1);
25294 }
25295
25296 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25297}
25298
25299/// Given a buildvector constant, return a new vector constant with each element
25300/// incremented or decremented. If incrementing or decrementing would result in
25301/// unsigned overflow or underflow or this is not a simple vector constant,
25302/// return an empty value.
25303static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25304 bool NSW) {
25305 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25306 if (!BV || !V.getValueType().isSimple())
25307 return SDValue();
25308
25309 MVT VT = V.getSimpleValueType();
25310 MVT EltVT = VT.getVectorElementType();
25311 unsigned NumElts = VT.getVectorNumElements();
25312 SmallVector<SDValue, 8> NewVecC;
25313 SDLoc DL(V);
25314 for (unsigned i = 0; i < NumElts; ++i) {
25315 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25316 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25317 return SDValue();
25318
25319 // Avoid overflow/underflow.
25320 const APInt &EltC = Elt->getAPIntValue();
25321 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25322 return SDValue();
25323 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25324 (!IsInc && EltC.isMinSignedValue())))
25325 return SDValue();
25326
25327 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25328 }
25329
25330 return DAG.getBuildVector(VT, DL, NewVecC);
25331}
25332
25333/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25334/// Op0 u<= Op1:
25335/// t = psubus Op0, Op1
25336/// pcmpeq t, <0..0>
25337static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25338 ISD::CondCode Cond, const SDLoc &dl,
25339 const X86Subtarget &Subtarget,
25340 SelectionDAG &DAG) {
25341 if (!Subtarget.hasSSE2())
25342 return SDValue();
25343
25344 MVT VET = VT.getVectorElementType();
25345 if (VET != MVT::i8 && VET != MVT::i16)
25346 return SDValue();
25347
25348 switch (Cond) {
25349 default:
25350 return SDValue();
25351 case ISD::SETULT: {
25352 // If the comparison is against a constant we can turn this into a
25353 // setule. With psubus, setule does not require a swap. This is
25354 // beneficial because the constant in the register is no longer
25355 // clobbered as the destination, so it can be hoisted out of a loop.
25356 // Only do this pre-AVX since vpcmp* is no longer destructive.
25357 if (Subtarget.hasAVX())
25358 return SDValue();
25359 SDValue ULEOp1 =
25360 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25361 if (!ULEOp1)
25362 return SDValue();
25363 Op1 = ULEOp1;
25364 break;
25365 }
25366 case ISD::SETUGT: {
25367 // If the comparison is against a constant, we can turn this into a setuge.
25368 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25369 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25370 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25371 SDValue UGEOp1 =
25372 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25373 if (!UGEOp1)
25374 return SDValue();
25375 Op1 = Op0;
25376 Op0 = UGEOp1;
25377 break;
25378 }
25379 // Psubus is better than flip-sign because it requires no inversion.
25380 case ISD::SETUGE:
25381 std::swap(Op0, Op1);
25382 break;
25383 case ISD::SETULE:
25384 break;
25385 }
25386
25387 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25388 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25389 DAG.getConstant(0, dl, VT));
25390}
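// Intrinsics sketch (illustrative, unsigned-byte case) of the PSUBUS + PCMPEQ
// idiom built above: the saturating difference A - B is zero exactly when
// A u<= B lane-wise.
#include <emmintrin.h>
static inline __m128i CmpULE_epu8(__m128i A, __m128i B) {
  __m128i Diff = _mm_subs_epu8(A, B);                // PSUBUSB
  return _mm_cmpeq_epi8(Diff, _mm_setzero_si128());  // PCMPEQB against zero
}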
25391
25392static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25393 SelectionDAG &DAG) {
25394 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25395 Op.getOpcode() == ISD::STRICT_FSETCCS;
25396 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25397 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25398 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25399 MVT VT = Op->getSimpleValueType(0);
25400 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25401 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25402 SDLoc dl(Op);
25403
25404 if (isFP) {
25405 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25406 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25407 if (isSoftFP16(EltVT, Subtarget))
25408 return SDValue();
25409
25410 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25411 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25412
25413 // If we have a strict compare with a vXi1 result and the input is 128/256
25414 // bits we can't use a masked compare unless we have VLX. If we use a wider
25415 // compare like we do for non-strict, we might trigger spurious exceptions
25416 // from the upper elements. Instead emit a AVX compare and convert to mask.
25417 unsigned Opc;
25418 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25419 (!IsStrict || Subtarget.hasVLX() ||
25420 Op0.getSimpleValueType().is512BitVector())) {
25421#ifndef NDEBUG
25422 unsigned Num = VT.getVectorNumElements();
25423 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25424#endif
25425 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25426 } else {
25427 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25428 // The SSE/AVX packed FP comparison nodes are defined with a
25429 // floating-point vector result that matches the operand type. This allows
25430 // them to work with an SSE1 target (integer vector types are not legal).
25431 VT = Op0.getSimpleValueType();
25432 }
25433
25434 SDValue Cmp;
25435 bool IsAlwaysSignaling;
25436 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25437 if (!Subtarget.hasAVX()) {
25438 // TODO: We could use following steps to handle a quiet compare with
25439 // signaling encodings.
25440 // 1. Get ordered masks from a quiet ISD::SETO
25441 // 2. Use the masks to mask potential unordered elements in operand A, B
25442 // 3. Get the compare results of masked A, B
25443 // 4. Calculating final result using the mask and result from 3
25444 // But currently, we just fall back to scalar operations.
25445 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25446 return SDValue();
25447
25448 // Insert an extra signaling instruction to raise exception.
25449 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25450 SDValue SignalCmp = DAG.getNode(
25451 Opc, dl, {VT, MVT::Other},
25452 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25453 // FIXME: It seems we need to update the flags of all new strict nodes.
25454 // Otherwise, mayRaiseFPException in MI will return false due to
25455 // NoFPExcept = false by default. However, I didn't find it in other
25456 // patches.
25457 SignalCmp->setFlags(Op->getFlags());
25458 Chain = SignalCmp.getValue(1);
25459 }
25460
25461 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25462 // emit two comparisons and a logic op to tie them together.
25463 if (!cheapX86FSETCC_SSE(Cond)) {
25464 // LLVM predicate is SETUEQ or SETONE.
25465 unsigned CC0, CC1;
25466 unsigned CombineOpc;
25467 if (Cond == ISD::SETUEQ) {
25468 CC0 = 3; // UNORD
25469 CC1 = 0; // EQ
25470 CombineOpc = X86ISD::FOR;
25471 } else {
25472 assert(Cond == ISD::SETONE);
25473 CC0 = 7; // ORD
25474 CC1 = 4; // NEQ
25475 CombineOpc = X86ISD::FAND;
25476 }
25477
25478 SDValue Cmp0, Cmp1;
25479 if (IsStrict) {
25480 Cmp0 = DAG.getNode(
25481 Opc, dl, {VT, MVT::Other},
25482 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25483 Cmp1 = DAG.getNode(
25484 Opc, dl, {VT, MVT::Other},
25485 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25486 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25487 Cmp1.getValue(1));
25488 } else {
25489 Cmp0 = DAG.getNode(
25490 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25491 Cmp1 = DAG.getNode(
25492 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25493 }
25494 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25495 } else {
25496 if (IsStrict) {
25497 Cmp = DAG.getNode(
25498 Opc, dl, {VT, MVT::Other},
25499 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25500 Chain = Cmp.getValue(1);
25501 } else
25502 Cmp = DAG.getNode(
25503 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25504 }
25505 } else {
25506 // Handle all other FP comparisons here.
25507 if (IsStrict) {
25508 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25509 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25510 Cmp = DAG.getNode(
25511 Opc, dl, {VT, MVT::Other},
25512 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25513 Chain = Cmp.getValue(1);
25514 } else
25515 Cmp = DAG.getNode(
25516 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25517 }
25518
25519 if (VT.getFixedSizeInBits() >
25520 Op.getSimpleValueType().getFixedSizeInBits()) {
25521 // We emitted a compare with an XMM/YMM result. Finish converting to a
25522 // mask register using a vptestm.
25523 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25524 Cmp = DAG.getBitcast(CastVT, Cmp);
25525 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25526 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25527 } else {
25528 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25529 // the result type of SETCC. The bitcast is expected to be optimized
25530 // away during combining/isel.
25531 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25532 }
25533
25534 if (IsStrict)
25535 return DAG.getMergeValues({Cmp, Chain}, dl);
25536
25537 return Cmp;
25538 }
25539
25540 assert(!IsStrict && "Strict SETCC only handles FP operands.");
25541
25542 MVT VTOp0 = Op0.getSimpleValueType();
25543 (void)VTOp0;
25544 assert(VTOp0 == Op1.getSimpleValueType() &&
25545 "Expected operands with same type!");
25546 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25547 "Invalid number of packed elements for source and destination!");
25548
25549 // The non-AVX512 code below works under the assumption that source and
25550 // destination types are the same.
25551 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25552 "Value types for source and destination must be the same!");
25553
25554 // The result is boolean, but operands are int/float
25555 if (VT.getVectorElementType() == MVT::i1) {
25556 // In AVX-512 architecture setcc returns mask with i1 elements,
25557 // But there is no compare instruction for i8 and i16 elements in KNL.
25558 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25559 "Unexpected operand type");
25560 return LowerIntVSETCC_AVX512(Op, DAG);
25561 }
25562
25563 // Lower using XOP integer comparisons.
25564 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25565 // Translate compare code to XOP PCOM compare mode.
25566 unsigned CmpMode = 0;
25567 switch (Cond) {
25568 default: llvm_unreachable("Unexpected SETCC condition");
25569 case ISD::SETULT:
25570 case ISD::SETLT: CmpMode = 0x00; break;
25571 case ISD::SETULE:
25572 case ISD::SETLE: CmpMode = 0x01; break;
25573 case ISD::SETUGT:
25574 case ISD::SETGT: CmpMode = 0x02; break;
25575 case ISD::SETUGE:
25576 case ISD::SETGE: CmpMode = 0x03; break;
25577 case ISD::SETEQ: CmpMode = 0x04; break;
25578 case ISD::SETNE: CmpMode = 0x05; break;
25579 }
25580
25581 // Are we comparing unsigned or signed integers?
25582 unsigned Opc =
25583 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25584
25585 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25586 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25587 }
25588
25589 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25590 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25591 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25592 SDValue BC0 = peekThroughBitcasts(Op0);
25593 if (BC0.getOpcode() == ISD::AND) {
25594 APInt UndefElts;
25595 SmallVector<APInt, 64> EltBits;
25596 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25597 VT.getScalarSizeInBits(), UndefElts,
25598 EltBits, false, false)) {
25599 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25600 Cond = ISD::SETEQ;
25601 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25602 }
25603 }
25604 }
25605 }
25606
25607 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25608 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25609 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25610 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25611 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25612 unsigned BitWidth = VT.getScalarSizeInBits();
25613 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25614
25615 SDValue Result = Op0.getOperand(0);
25616 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25617 DAG.getConstant(ShiftAmt, dl, VT));
25618 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25619 DAG.getConstant(BitWidth - 1, dl, VT));
25620 return Result;
25621 }
25622 }
25623
25624 // Break 256-bit integer vector compare into smaller ones.
25625 if (VT.is256BitVector() && !Subtarget.hasInt256())
25626 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25627
25628 // Break 512-bit integer vector compare into smaller ones.
25629 // TODO: Try harder to use VPCMPx + VPMOV2x?
25630 if (VT.is512BitVector())
25631 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25632
25633 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25634 // not-of-PCMPEQ:
25635 // X != INT_MIN --> X >s INT_MIN
25636 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25637 // +X != 0 --> +X >s 0
25638 APInt ConstValue;
25639 if (Cond == ISD::SETNE &&
25640 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25641 if (ConstValue.isMinSignedValue())
25642 Cond = ISD::SETGT;
25643 else if (ConstValue.isMaxSignedValue())
25644 Cond = ISD::SETLT;
25645 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25646 Cond = ISD::SETGT;
25647 }
25648
25649 // If both operands are known non-negative, then an unsigned compare is the
25650 // same as a signed compare and there's no need to flip signbits.
25651 // TODO: We could check for more general simplifications here since we're
25652 // computing known bits.
25653 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25654 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25655
25656 // Special case: Use min/max operations for unsigned compares.
25657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25658 if (ISD::isUnsignedIntSetCC(Cond) &&
25659 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25660 TLI.isOperationLegal(ISD::UMIN, VT)) {
25661 // If we have a constant operand, increment/decrement it and change the
25662 // condition to avoid an invert.
25663 if (Cond == ISD::SETUGT) {
25664 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25665 if (SDValue UGTOp1 =
25666 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25667 Op1 = UGTOp1;
25668 Cond = ISD::SETUGE;
25669 }
25670 }
25671 if (Cond == ISD::SETULT) {
25672 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25673 if (SDValue ULTOp1 =
25674 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25675 Op1 = ULTOp1;
25676 Cond = ISD::SETULE;
25677 }
25678 }
25679 bool Invert = false;
25680 unsigned Opc;
25681 switch (Cond) {
25682 default: llvm_unreachable("Unexpected condition code")::llvm::llvm_unreachable_internal("Unexpected condition code"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25682)
;
25683 case ISD::SETUGT: Invert = true; [[fallthrough]];
25684 case ISD::SETULE: Opc = ISD::UMIN; break;
25685 case ISD::SETULT: Invert = true; [[fallthrough]];
25686 case ISD::SETUGE: Opc = ISD::UMAX; break;
25687 }
25688
25689 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25690 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25691
25692 // If the logical-not of the result is required, perform that now.
25693 if (Invert)
25694 Result = DAG.getNOT(dl, Result, VT);
25695
25696 return Result;
25697 }
25698
25699 // Try to use SUBUS and PCMPEQ.
25700 if (FlipSigns)
25701 if (SDValue V =
25702 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25703 return V;
25704
25705 // We are handling one of the integer comparisons here. Since SSE only has
25706 // GT and EQ comparisons for integer, swapping operands and multiple
25707 // operations may be required for some comparisons.
25708 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25709 : X86ISD::PCMPGT;
25710 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25711 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25712 bool Invert = Cond == ISD::SETNE ||
25713 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25714
25715 if (Swap)
25716 std::swap(Op0, Op1);
25717
25718 // Check that the operation in question is available (most are plain SSE2,
25719 // but PCMPGTQ and PCMPEQQ have different requirements).
25720 if (VT == MVT::v2i64) {
25721 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25722 assert(Subtarget.hasSSE2() && "Don't know how to lower!")(static_cast <bool> (Subtarget.hasSSE2() && "Don't know how to lower!"
) ? void (0) : __assert_fail ("Subtarget.hasSSE2() && \"Don't know how to lower!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25722, __extension__
__PRETTY_FUNCTION__))
;
25723
25724 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25725 // the odd elements over the even elements.
25726 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25727 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25728 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25729
25730 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25731 static const int MaskHi[] = { 1, 1, 3, 3 };
25732 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25733
25734 return DAG.getBitcast(VT, Result);
25735 }
25736
25737 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25738 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25739 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25740
25741 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25742 static const int MaskHi[] = { 1, 1, 3, 3 };
25743 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25744
25745 return DAG.getBitcast(VT, Result);
25746 }
25747
25748 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25749 // bits of the inputs before performing those operations. The lower
25750 // compare is always unsigned.
25751 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25752 : 0x0000000080000000ULL,
25753 dl, MVT::v2i64);
25754
25755 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25756 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25757
25758 // Cast everything to the right type.
25759 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25760 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25761
25762 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25763 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25764 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25765
25766 // Create masks for only the low parts/high parts of the 64 bit integers.
25767 static const int MaskHi[] = { 1, 1, 3, 3 };
25768 static const int MaskLo[] = { 0, 0, 2, 2 };
25769 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25770 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25771 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25772
25773 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25774 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25775
25776 if (Invert)
25777 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25778
25779 return DAG.getBitcast(VT, Result);
25780 }
25781
25782 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25783 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25784 // pcmpeqd + pshufd + pand.
25785 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25786
25787 // First cast everything to the right type.
25788 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25789 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25790
25791 // Do the compare.
25792 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25793
25794 // Make sure the lower and upper halves are both all-ones.
25795 static const int Mask[] = { 1, 0, 3, 2 };
25796 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25797 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25798
25799 if (Invert)
25800 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25801
25802 return DAG.getBitcast(VT, Result);
25803 }
25804 }
25805
25806 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25807 // bits of the inputs before performing those operations.
25808 if (FlipSigns) {
25809 MVT EltVT = VT.getVectorElementType();
25810 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25811 VT);
25812 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25813 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25814 }
25815
25816 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25817
25818 // If the logical-not of the result is required, perform that now.
25819 if (Invert)
25820 Result = DAG.getNOT(dl, Result, VT);
25821
25822 return Result;
25823}
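// A few standalone scalar sketches (illustrative helpers, assuming two's
// complement int32_t/int64_t) of the identities this lowering leans on.
#include <cstdint>
#include <algorithm>
// PCMPGTQ emulation above: signed 64-bit A > B decided from the dword halves,
// (hi_a > hi_b) | ((hi_a == hi_b) & (lo_a >u lo_b)).
constexpr bool SGT64ViaHalves(int64_t A, int64_t B) {
  uint32_t ALo = uint32_t(uint64_t(A)), BLo = uint32_t(uint64_t(B));
  int32_t AHi = int32_t(uint64_t(A) >> 32), BHi = int32_t(uint64_t(B) >> 32);
  return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
}
static_assert(SGT64ViaHalves(1, -1) && !SGT64ViaHalves(-2, 5), "pcmpgtq sketch");
static_assert(SGT64ViaHalves(int64_t(1) << 33, int64_t(1) << 32), "high-half case");
// FlipSigns: an unsigned compare becomes a signed one after XORing the sign bit.
constexpr bool UGTViaSignFlip(uint32_t A, uint32_t B) {
  return int32_t(A ^ 0x80000000u) > int32_t(B ^ 0x80000000u);
}
static_assert(UGTViaSignFlip(0xFFFFFFFFu, 1u) && !UGTViaSignFlip(0u, 1u), "");
// Min/max special case: A u<= B exactly when A == umin(A, B).
constexpr bool ULEViaUMin(uint32_t A, uint32_t B) { return std::min(A, B) == A; }
static_assert(ULEViaUMin(3u, 7u) && ULEViaUMin(7u, 7u) && !ULEViaUMin(9u, 7u), "");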
25824
25825// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25826static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25827 const SDLoc &dl, SelectionDAG &DAG,
25828 const X86Subtarget &Subtarget,
25829 SDValue &X86CC) {
25830 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25831
25832 // Must be a bitcast from vXi1.
25833 if (Op0.getOpcode() != ISD::BITCAST)
25834 return SDValue();
25835
25836 Op0 = Op0.getOperand(0);
25837 MVT VT = Op0.getSimpleValueType();
25838 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25839 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25840 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25841 return SDValue();
25842
25843 X86::CondCode X86Cond;
25844 if (isNullConstant(Op1)) {
25845 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25846 } else if (isAllOnesConstant(Op1)) {
25847 // C flag is set for all ones.
25848 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25849 } else
25850 return SDValue();
25851
25852 // If the input is an AND, we can combine its operands into the KTEST.
25853 bool KTestable = false;
25854 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25855 KTestable = true;
25856 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25857 KTestable = true;
25858 if (!isNullConstant(Op1))
25859 KTestable = false;
25860 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25861 SDValue LHS = Op0.getOperand(0);
25862 SDValue RHS = Op0.getOperand(1);
25863 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25864 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25865 }
25866
25867 // If the input is an OR, we can combine its operands into the KORTEST.
25868 SDValue LHS = Op0;
25869 SDValue RHS = Op0;
25870 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25871 LHS = Op0.getOperand(0);
25872 RHS = Op0.getOperand(1);
25873 }
25874
25875 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25876 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25877}
25878
25879/// Emit flags for the given setcc condition and operands. Also returns the
25880/// corresponding X86 condition code constant in X86CC.
25881SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25882 ISD::CondCode CC, const SDLoc &dl,
25883 SelectionDAG &DAG,
25884 SDValue &X86CC) const {
25885 // Equality Combines.
25886 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25887 X86::CondCode X86CondCode;
25888
25889 // Optimize to BT if possible.
25890 // Lower (X & (1 << N)) == 0 to BT(X, N).
25891 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25892 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
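    // Illustrative example (editorial, not taken from a testcase):
    //   (X & (1 << 5)) == 0  -->  BT X, 5  followed by SETAE,
    // i.e. the tested bit lands in CF and the setcc reads CF directly instead
    // of going through a full TEST/CMP.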
25893 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25894 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25895 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25896 return BT;
25897 }
25898 }
25899
25900 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
25901 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25902 X86CondCode)) {
25903 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25904 return CmpZ;
25905 }
25906
25907 // Try to lower using KORTEST or KTEST.
25908 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25909 return Test;
25910
25911 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25912 // of these.
25913 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25914 // If the input is a setcc, then reuse the input setcc or use a new one
25915 // with the inverted condition.
25916 if (Op0.getOpcode() == X86ISD::SETCC) {
25917 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25918
25919 X86CC = Op0.getOperand(0);
25920 if (Invert) {
25921 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25922 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25923 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25924 }
25925
25926 return Op0.getOperand(1);
25927 }
25928 }
25929
25930 // Try to use the carry flag from the add in place of a separate CMP for:
25931 // (seteq (add X, -1), -1). Similar for setne.
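    // Editorial example of the pattern: (seteq (add X, -1), -1) holds only
    // when X == 0, and ADD X, -1 sets CF exactly when X != 0, so COND_AE /
    // COND_B read the answer straight from the ADD's flags.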
25932 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25933 Op0.getOperand(1) == Op1) {
25934 if (isProfitableToUseFlagOp(Op0)) {
25935 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25936
25937 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25938 Op0.getOperand(1));
25939 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25940 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25941 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25942 return SDValue(New.getNode(), 1);
25943 }
25944 }
25945 }
25946
25947 X86::CondCode CondCode =
25948 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25949 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25950
25951 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25952 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25953 return EFLAGS;
25954}
25955
25956SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25957
25958 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25959 Op.getOpcode() == ISD::STRICT_FSETCCS;
25960 MVT VT = Op->getSimpleValueType(0);
25961
25962 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25963
25964 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25965 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25966 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25967 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25968 SDLoc dl(Op);
25969 ISD::CondCode CC =
25970 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25971
25972 if (isSoftFP16(Op0.getValueType()))
25973 return SDValue();
25974
25975 // Handle f128 first, since one possible outcome is a normal integer
25976 // comparison which gets handled by emitFlagsForSetcc.
25977 if (Op0.getValueType() == MVT::f128) {
25978 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25979 Op.getOpcode() == ISD::STRICT_FSETCCS);
25980
25981 // If softenSetCCOperands returned a scalar, use it.
25982 if (!Op1.getNode()) {
25983 assert(Op0.getValueType() == Op.getValueType() &&
25984        "Unexpected setcc expansion!");
25985 if (IsStrict)
25986 return DAG.getMergeValues({Op0, Chain}, dl);
25987 return Op0;
25988 }
25989 }
25990
25991 if (Op0.getSimpleValueType().isInteger()) {
25992 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
25993 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
25994 // this may translate to fewer uops depending on uarch implementation. The
25995 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25996 // canonicalize to that CondCode.
25997 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25998 // encoding size - so it must either already be an i8 or i32 immediate, or it
25999 // shrinks down to that. We don't do this for any i64's to avoid additional
26000 // constant materializations.
26001 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
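    // Illustrative example (editorial, assuming i32 operands): (x >s 9)
    // becomes (x >=s 10), i.e. a CMP against 10 followed by a GE-based setcc,
    // which avoids the ZF read the strict greater-than form would require.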
26002 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
26003 const APInt &Op1Val = Op1C->getAPIntValue();
26004 if (!Op1Val.isZero()) {
26005 // Ensure the constant+1 doesn't overflow.
26006 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
26007 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
26008 APInt Op1ValPlusOne = Op1Val + 1;
26009 if (Op1ValPlusOne.isSignedIntN(32) &&
26010 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
26011 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
26012 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
26013 : ISD::CondCode::SETUGE;
26014 }
26015 }
26016 }
26017 }
26018
26019 SDValue X86CC;
26020 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
26021 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26022 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26023 }
26024
26025 // Handle floating point.
26026 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
26027 if (CondCode == X86::COND_INVALID)
26028 return SDValue();
26029
26030 SDValue EFLAGS;
26031 if (IsStrict) {
26032 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
26033 EFLAGS =
26034 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
26035 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
26036 Chain = EFLAGS.getValue(1);
26037 } else {
26038 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
26039 }
26040
26041 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
26042 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26043 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26044}
26045
26046SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
26047 SDValue LHS = Op.getOperand(0);
26048 SDValue RHS = Op.getOperand(1);
26049 SDValue Carry = Op.getOperand(2);
26050 SDValue Cond = Op.getOperand(3);
26051 SDLoc DL(Op);
26052
26053 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
26054 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
26055
26056 // Recreate the carry if needed.
26057 EVT CarryVT = Carry.getValueType();
26058 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
26059 Carry, DAG.getAllOnesConstant(DL, CarryVT));
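  // Editorial note: Carry is a 0/1 value in a register here; adding all-ones
  // to it produces a carry-out (CF = 1) exactly when Carry was 1, which
  // re-materializes the flag so the SBB below can consume it.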
26060
26061 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
26062 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
26063 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
26064}
26065
26066// This function returns three things: the arithmetic computation itself
26067// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
26068// flag and the condition code define the case in which the arithmetic
26069// computation overflows.
26070static std::pair<SDValue, SDValue>
26071getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26072 assert(Op.getResNo() == 0 && "Unexpected result number!");
26073 SDValue Value, Overflow;
26074 SDValue LHS = Op.getOperand(0);
26075 SDValue RHS = Op.getOperand(1);
26076 unsigned BaseOp = 0;
26077 SDLoc DL(Op);
26078 switch (Op.getOpcode()) {
26079 default: llvm_unreachable("Unknown ovf instruction!");
26080 case ISD::SADDO:
26081 BaseOp = X86ISD::ADD;
26082 Cond = X86::COND_O;
26083 break;
26084 case ISD::UADDO:
26085 BaseOp = X86ISD::ADD;
26086 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26087 break;
26088 case ISD::SSUBO:
26089 BaseOp = X86ISD::SUB;
26090 Cond = X86::COND_O;
26091 break;
26092 case ISD::USUBO:
26093 BaseOp = X86ISD::SUB;
26094 Cond = X86::COND_B;
26095 break;
26096 case ISD::SMULO:
26097 BaseOp = X86ISD::SMUL;
26098 Cond = X86::COND_O;
26099 break;
26100 case ISD::UMULO:
26101 BaseOp = X86ISD::UMUL;
26102 Cond = X86::COND_O;
26103 break;
26104 }
26105
26106 if (BaseOp) {
26107 // Also sets EFLAGS.
26108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26109 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26110 Overflow = Value.getValue(1);
26111 }
26112
26113 return std::make_pair(Value, Overflow);
26114}
26115
26116static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26117 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26118 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26119 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26120 // has only one use.
26121 SDLoc DL(Op);
26122 X86::CondCode Cond;
26123 SDValue Value, Overflow;
26124 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26125
26126 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26127 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26128 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26129}
26130
26131/// Return true if opcode is a X86 logical comparison.
26132static bool isX86LogicalCmp(SDValue Op) {
26133 unsigned Opc = Op.getOpcode();
26134 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26135 Opc == X86ISD::FCMP)
26136 return true;
26137 if (Op.getResNo() == 1 &&
26138 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26139 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26140 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26141 return true;
26142
26143 return false;
26144}
26145
26146static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26147 if (V.getOpcode() != ISD::TRUNCATE)
26148 return false;
26149
26150 SDValue VOp0 = V.getOperand(0);
26151 unsigned InBits = VOp0.getValueSizeInBits();
26152 unsigned Bits = V.getValueSizeInBits();
26153 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26154}
26155
26156SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26157 bool AddTest = true;
26158 SDValue Cond = Op.getOperand(0);
26159 SDValue Op1 = Op.getOperand(1);
26160 SDValue Op2 = Op.getOperand(2);
26161 SDLoc DL(Op);
26162 MVT VT = Op1.getSimpleValueType();
26163 SDValue CC;
26164
26165 if (isSoftFP16(VT)) {
26166 MVT NVT = VT.changeTypeToInteger();
26167 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26168 DAG.getBitcast(NVT, Op1),
26169 DAG.getBitcast(NVT, Op2)));
26170 }
26171
26172 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26173 // are available or VBLENDV if AVX is available.
26174 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26175 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26176 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26177 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26178 bool IsAlwaysSignaling;
26179 unsigned SSECC =
26180 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26181 CondOp0, CondOp1, IsAlwaysSignaling);
26182
26183 if (Subtarget.hasAVX512()) {
26184 SDValue Cmp =
26185 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26186 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26187 assert(!VT.isVector() && "Not a scalar type?");
26188 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26189 }
26190
26191 if (SSECC < 8 || Subtarget.hasAVX()) {
26192 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26193 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26194
26195 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26196 // of 3 logic instructions for size savings and potentially speed.
26197 // Unfortunately, there is no scalar form of VBLENDV.
26198
26199 // If either operand is a +0.0 constant, don't try this. We can expect to
26200 // optimize away at least one of the logic instructions later in that
26201 // case, so that sequence would be faster than a variable blend.
26202
26203 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26204 // uses XMM0 as the selection register. That may need just as many
26205 // instructions as the AND/ANDN/OR sequence due to register moves, so
26206 // don't bother.
26207 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26208 !isNullFPConstant(Op2)) {
26209 // Convert to vectors, do a VSELECT, and convert back to scalar.
26210 // All of the conversions should be optimized away.
26211 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26212 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26213 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26214 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26215
26216 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26217 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26218
26219 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26220
26221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26222 VSel, DAG.getIntPtrConstant(0, DL));
26223 }
26224 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26225 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26226 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26227 }
26228 }
26229
26230 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26231 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26232 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26233 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26234 }
26235
26236 if (Cond.getOpcode() == ISD::SETCC &&
26237 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26238 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26239 Cond = NewCond;
26240 // If the condition was updated, it's possible that the operands of the
26241 // select were also updated (for example, EmitTest has a RAUW). Refresh
26242 // the local references to the select operands in case they got stale.
26243 Op1 = Op.getOperand(1);
26244 Op2 = Op.getOperand(2);
26245 }
26246 }
26247
26248 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26249 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26250 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26251 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26252 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26253 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26254 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26255 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26256 if (Cond.getOpcode() == X86ISD::SETCC &&
26257 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26258 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26259 SDValue Cmp = Cond.getOperand(1);
26260 SDValue CmpOp0 = Cmp.getOperand(0);
26261 unsigned CondCode = Cond.getConstantOperandVal(0);
26262
26263 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26264 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25891 // handling to keep the CMP with 0. This should be removed by
26266 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26267 // cttz_zero_undef.
26268 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26269 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26270 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26271 };
26272 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26273 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26274 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26275 // Keep Cmp.
26276 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26277 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26278 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26279 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26280
26281 // 'X - 1' sets the carry flag if X == 0.
26282 // '0 - X' sets the carry flag if X != 0.
26283 // Convert the carry flag to a -1/0 mask with sbb:
26284 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26285 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26286 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26287 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
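      // Editorial sketch (for, say, i32): the SUB chosen above leaves CF = 1
      // exactly in the cases where the select must produce -1, SETCC_CARRY
      // (an SBB-style node) expands that carry into an all-ones/zero mask, and
      // the trailing OR then yields -1 in those cases and Y otherwise.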
26288 SDValue Sub;
26289 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26290 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26291 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26292 } else {
26293 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26294 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26295 }
26296 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26297 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26298 Sub.getValue(1));
26299 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26300 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26301 CmpOp0.getOpcode() == ISD::AND &&
26302 isOneConstant(CmpOp0.getOperand(1))) {
26303 SDValue Src1, Src2;
26304 // True if Op2 is an XOR or OR operator and one of its operands
26305 // is equal to Op1.
26306 // ( a , a op b) || ( b , a op b)
26307 auto isOrXorPattern = [&]() {
26308 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26309 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26310 Src1 =
26311 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26312 Src2 = Op1;
26313 return true;
26314 }
26315 return false;
26316 };
26317
26318 if (isOrXorPattern()) {
26319 SDValue Neg;
26320 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26321 // We need a mask of all zeros or all ones with the same size as the other
26322 // operands.
26323 if (CmpSz > VT.getSizeInBits())
26324 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26325 else if (CmpSz < VT.getSizeInBits())
26326 Neg = DAG.getNode(ISD::AND, DL, VT,
26327 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26328 DAG.getConstant(1, DL, VT));
26329 else
26330 Neg = CmpOp0;
26331 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26332 Neg); // -(and (x, 0x1))
26333 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26334 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26335 }
26336 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26337 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26338 ((CondCode == X86::COND_S) || // smin(x, 0)
26339 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26340 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26341 //
26342 // If the comparison is testing for a positive value, we have to invert
26343 // the sign bit mask, so only do that transform if the target has a
26344 // bitwise 'and not' instruction (the invert is free).
26345 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
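      // Editorial example for i32: SAR x, 31 broadcasts the sign bit, giving
      // all-ones when x < 0 and zero otherwise; AND-ing with x implements
      // smin(x, 0), and the extra NOT (which wants ANDN) flips it into
      // smax(x, 0).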
26346 unsigned ShCt = VT.getSizeInBits() - 1;
26347 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26348 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26349 if (CondCode == X86::COND_G)
26350 Shift = DAG.getNOT(DL, Shift, VT);
26351 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26352 }
26353 }
26354
26355 // Look past (and (setcc_carry (cmp ...)), 1).
26356 if (Cond.getOpcode() == ISD::AND &&
26357 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26358 isOneConstant(Cond.getOperand(1)))
26359 Cond = Cond.getOperand(0);
26360
26361 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26362 // setting operand in place of the X86ISD::SETCC.
26363 unsigned CondOpcode = Cond.getOpcode();
26364 if (CondOpcode == X86ISD::SETCC ||
26365 CondOpcode == X86ISD::SETCC_CARRY) {
26366 CC = Cond.getOperand(0);
26367
26368 SDValue Cmp = Cond.getOperand(1);
26369 bool IllegalFPCMov = false;
26370 if (VT.isFloatingPoint() && !VT.isVector() &&
26371 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26372 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26373
26374 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26375 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26376 Cond = Cmp;
26377 AddTest = false;
26378 }
26379 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26380 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26381 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26382 SDValue Value;
26383 X86::CondCode X86Cond;
26384 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26385
26386 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26387 AddTest = false;
26388 }
26389
26390 if (AddTest) {
26391 // Look past the truncate if the high bits are known zero.
26392 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26393 Cond = Cond.getOperand(0);
26394
26395 // We know the result of AND is compared against zero. Try to match
26396 // it to BT.
26397 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26398 X86::CondCode X86CondCode;
26399 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26400 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26401 Cond = BT;
26402 AddTest = false;
26403 }
26404 }
26405 }
26406
26407 if (AddTest) {
26408 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26409 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26410 }
26411
26412 // a < b ? -1 : 0 -> RES = ~setcc_carry
26413 // a < b ? 0 : -1 -> RES = setcc_carry
26414 // a >= b ? -1 : 0 -> RES = setcc_carry
26415 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26416 if (Cond.getOpcode() == X86ISD::SUB) {
26417 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26418
26419 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26420 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26421 (isNullConstant(Op1) || isNullConstant(Op2))) {
26422 SDValue Res =
26423 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26424 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26425 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26426 return DAG.getNOT(DL, Res, Res.getValueType());
26427 return Res;
26428 }
26429 }
26430
26431 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26432 // widen the cmov and push the truncate through. This avoids introducing a new
26433 // branch during isel and doesn't add any extensions.
26434 if (Op.getValueType() == MVT::i8 &&
26435 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26436 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26437 if (T1.getValueType() == T2.getValueType() &&
26438 // Exclude CopyFromReg to avoid partial register stalls.
26439 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26440 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26441 CC, Cond);
26442 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26443 }
26444 }
26445
26446 // Or finally, promote i8 cmovs if we have CMOV,
26447 // or i16 cmovs if it won't prevent folding a load.
26448 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26449 // legal, but EmitLoweredSelect() can not deal with these extensions
26450 // being inserted between two CMOV's. (in i16 case too TBN)
26451 // https://bugs.llvm.org/show_bug.cgi?id=40974
26452 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26453 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26454 !X86::mayFoldLoad(Op2, Subtarget))) {
26455 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26456 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26457 SDValue Ops[] = { Op2, Op1, CC, Cond };
26458 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26459 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26460 }
26461
26462 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26463 // condition is true.
26464 SDValue Ops[] = { Op2, Op1, CC, Cond };
26465 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26466}
26467
26468static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26469 const X86Subtarget &Subtarget,
26470 SelectionDAG &DAG) {
26471 MVT VT = Op->getSimpleValueType(0);
26472 SDValue In = Op->getOperand(0);
26473 MVT InVT = In.getSimpleValueType();
26474 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26475 MVT VTElt = VT.getVectorElementType();
26476 SDLoc dl(Op);
26477
26478 unsigned NumElts = VT.getVectorNumElements();
26479
26480 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26481 MVT ExtVT = VT;
26482 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26483 // If v16i32 is to be avoided, we'll need to split and concatenate.
26484 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26485 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26486
26487 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26488 }
26489
26490 // Widen to 512-bits if VLX is not supported.
26491 MVT WideVT = ExtVT;
26492 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26493 NumElts *= 512 / ExtVT.getSizeInBits();
26494 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26495 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26496 In, DAG.getIntPtrConstant(0, dl));
26497 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26498 }
26499
26500 SDValue V;
26501 MVT WideEltVT = WideVT.getVectorElementType();
26502 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26503 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26504 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26505 } else {
26506 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26507 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26508 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26509 }
26510
26511 // Truncate if we had to extend i16/i8 above.
26512 if (VT != ExtVT) {
26513 WideVT = MVT::getVectorVT(VTElt, NumElts);
26514 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26515 }
26516
26517 // Extract back to 128/256-bit if we widened.
26518 if (WideVT != VT)
26519 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26520 DAG.getIntPtrConstant(0, dl));
26521
26522 return V;
26523}
26524
26525static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26526 SelectionDAG &DAG) {
26527 SDValue In = Op->getOperand(0);
26528 MVT InVT = In.getSimpleValueType();
26529
26530 if (InVT.getVectorElementType() == MVT::i1)
26531 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26532
26533 assert(Subtarget.hasAVX() && "Expected AVX support");
26534 return LowerAVXExtend(Op, DAG, Subtarget);
26535}
26536
26537// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26538// For sign extend this needs to handle all vector sizes and SSE4.1 and
26539// non-SSE4.1 targets. For zero extend this should only handle inputs of
26540// MVT::v64i8 when BWI is not supported, but AVX512 is.
26541static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26542 const X86Subtarget &Subtarget,
26543 SelectionDAG &DAG) {
26544 SDValue In = Op->getOperand(0);
26545 MVT VT = Op->getSimpleValueType(0);
26546 MVT InVT = In.getSimpleValueType();
26547
26548 MVT SVT = VT.getVectorElementType();
26549 MVT InSVT = InVT.getVectorElementType();
26550 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26551
26552 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26553 return SDValue();
26554 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26555 return SDValue();
26556 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26557 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26558 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26559 return SDValue();
26560
26561 SDLoc dl(Op);
26562 unsigned Opc = Op.getOpcode();
26563 unsigned NumElts = VT.getVectorNumElements();
26564
26565 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26566 // For 512-bit vectors, we need 128-bits or 256-bits.
26567 if (InVT.getSizeInBits() > 128) {
26568 // Input needs to be at least the same number of elements as output, and
26569 // at least 128-bits.
26570 int InSize = InSVT.getSizeInBits() * NumElts;
26571 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26572 InVT = In.getSimpleValueType();
26573 }
26574
26575 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26576 // so those cases are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26577 // need to be handled here for 256/512-bit results.
26578 if (Subtarget.hasInt256()) {
26579 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26580
26581 if (InVT.getVectorNumElements() != NumElts)
26582 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26583
26584 // FIXME: Apparently we create inreg operations that could be regular
26585 // extends.
26586 unsigned ExtOpc =
26587 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26588 : ISD::ZERO_EXTEND;
26589 return DAG.getNode(ExtOpc, dl, VT, In);
26590 }
26591
26592 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26593 if (Subtarget.hasAVX()) {
26594 assert(VT.is256BitVector() && "256-bit vector expected");
26595 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26596 int HalfNumElts = HalfVT.getVectorNumElements();
26597
26598 unsigned NumSrcElts = InVT.getVectorNumElements();
26599 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26600 for (int i = 0; i != HalfNumElts; ++i)
26601 HiMask[i] = HalfNumElts + i;
26602
26603 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26604 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26605 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26606 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26607 }
26608
26609 // We should only get here for sign extend.
26610 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26611 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26612
26613 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26614 SDValue Curr = In;
26615 SDValue SignExt = Curr;
26616
26617 // As SRAI is only available on i16/i32 types, we expand only up to i32
26618 // and handle i64 separately.
26619 if (InVT != MVT::v4i32) {
26620 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26621
26622 unsigned DestWidth = DestVT.getScalarSizeInBits();
26623 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26624
26625 unsigned InNumElts = InVT.getVectorNumElements();
26626 unsigned DestElts = DestVT.getVectorNumElements();
26627
26628 // Build a shuffle mask that takes each input element and places it in the
26629 // MSBs of the new element size.
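    // Editorial example (hedged): for v16i8 -> v4i32 the mask ends up as
    //   { -1,-1,-1,0,  -1,-1,-1,1,  -1,-1,-1,2,  -1,-1,-1,3 }
    // so input byte i lands in the top byte of dword i, and the VSRAI below
    // both moves it into place and replicates its sign across the element.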
26630 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26631 for (unsigned i = 0; i != DestElts; ++i)
26632 Mask[i * Scale + (Scale - 1)] = i;
26633
26634 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26635 Curr = DAG.getBitcast(DestVT, Curr);
26636
26637 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26638 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26639 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26640 }
26641
26642 if (VT == MVT::v2i64) {
26643 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26644 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26645 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26646 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26647 SignExt = DAG.getBitcast(VT, SignExt);
26648 }
26649
26650 return SignExt;
26651}
26652
26653static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26654 SelectionDAG &DAG) {
26655 MVT VT = Op->getSimpleValueType(0);
26656 SDValue In = Op->getOperand(0);
26657 MVT InVT = In.getSimpleValueType();
26658 SDLoc dl(Op);
26659
26660 if (InVT.getVectorElementType() == MVT::i1)
26661 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26662
26663 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26664 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26665        "Expected same number of elements");
26666 assert((VT.getVectorElementType() == MVT::i16 ||
26667         VT.getVectorElementType() == MVT::i32 ||
26668         VT.getVectorElementType() == MVT::i64) &&
26669        "Unexpected element type");
26670 assert((InVT.getVectorElementType() == MVT::i8 ||
26671         InVT.getVectorElementType() == MVT::i16 ||
26672         InVT.getVectorElementType() == MVT::i32) &&
26673        "Unexpected element type");
26674
26675 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26676 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26677 return splitVectorIntUnary(Op, DAG);
26678 }
26679
26680 if (Subtarget.hasInt256())
26681 return Op;
26682
26683 // Optimize vectors in AVX mode
26684 // Sign extend v8i16 to v8i32 and
26685 // v4i32 to v4i64
26686 //
26687 // Divide input vector into two parts
26688 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
26689 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
26690 // concat the vectors to original VT
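  // Editorial sketch for v8i16 -> v8i32 on AVX1 (instruction names are the
  // expected selection, not a guarantee): sign-extend the low half with
  // vpmovsxwd, shuffle {4,5,6,7,...} to bring the high half down, vpmovsxwd
  // that as well, then concatenate the two 128-bit halves back into one YMM.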
26691 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26692 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26693
26694 unsigned NumElems = InVT.getVectorNumElements();
26695 SmallVector<int,8> ShufMask(NumElems, -1);
26696 for (unsigned i = 0; i != NumElems/2; ++i)
26697 ShufMask[i] = i + NumElems/2;
26698
26699 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26700 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26701
26702 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26703}
26704
26705/// Change a vector store into a pair of half-size vector stores.
26706static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26707 SDValue StoredVal = Store->getValue();
26708 assert((StoredVal.getValueType().is256BitVector() ||
26709         StoredVal.getValueType().is512BitVector()) &&
26710        "Expecting 256/512-bit op");
26711
26712 // Splitting volatile memory ops is not allowed unless the operation was not
26713 // legal to begin with. Assume the input store is legal (this transform is
26714 // only used for targets with AVX). Note: It is possible that we have an
26715 // illegal type like v2i128, and so we could allow splitting a volatile store
26716 // in that case if that is important.
26717 if (!Store->isSimple())
26718 return SDValue();
26719
26720 SDLoc DL(Store);
26721 SDValue Value0, Value1;
26722 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26723 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26724 SDValue Ptr0 = Store->getBasePtr();
26725 SDValue Ptr1 =
26726 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26727 SDValue Ch0 =
26728 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26729 Store->getOriginalAlign(),
26730 Store->getMemOperand()->getFlags());
26731 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26732 Store->getPointerInfo().getWithOffset(HalfOffset),
26733 Store->getOriginalAlign(),
26734 Store->getMemOperand()->getFlags());
26735 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26736}
26737
26738/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26739/// type.
26740static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26741 SelectionDAG &DAG) {
26742 SDValue StoredVal = Store->getValue();
26743 assert(StoreVT.is128BitVector() &&
26744        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26745 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26746
26747 // Splitting volatile memory ops is not allowed unless the operation was not
26748 // legal to begin with. We are assuming the input op is legal (this transform
26749 // is only used for targets with AVX).
26750 if (!Store->isSimple())
26751 return SDValue();
26752
26753 MVT StoreSVT = StoreVT.getScalarType();
26754 unsigned NumElems = StoreVT.getVectorNumElements();
26755 unsigned ScalarSize = StoreSVT.getStoreSize();
26756
26757 SDLoc DL(Store);
26758 SmallVector<SDValue, 4> Stores;
26759 for (unsigned i = 0; i != NumElems; ++i) {
26760 unsigned Offset = i * ScalarSize;
26761 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26762 TypeSize::Fixed(Offset), DL);
26763 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26764 DAG.getIntPtrConstant(i, DL));
26765 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26766 Store->getPointerInfo().getWithOffset(Offset),
26767 Store->getOriginalAlign(),
26768 Store->getMemOperand()->getFlags());
26769 Stores.push_back(Ch);
26770 }
26771 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26772}
26773
26774static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26775 SelectionDAG &DAG) {
26776 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26777 SDLoc dl(St);
26778 SDValue StoredVal = St->getValue();
26779
26780 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26781 if (StoredVal.getValueType().isVector() &&
26782 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26783 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26784 assert(NumElts <= 8 && "Unexpected VT");
26785 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26786 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26787        "Expected AVX512F without AVX512DQI");
26788
26789 // We must pad with zeros to ensure we store zeroes to any unused bits.
26790 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26791 DAG.getUNDEF(MVT::v16i1), StoredVal,
26792 DAG.getIntPtrConstant(0, dl));
26793 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26794 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26795 // Make sure we store zeros in the extra bits.
26796 if (NumElts < 8)
26797 StoredVal = DAG.getZeroExtendInReg(
26798 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26799
26800 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26801 St->getPointerInfo(), St->getOriginalAlign(),
26802 St->getMemOperand()->getFlags());
26803 }
26804
26805 if (St->isTruncatingStore())
26806 return SDValue();
26807
26808 // If this is a 256-bit store of concatenated ops, we are better off splitting
26809 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26810 // and each half can execute independently. Some cores would split the op into
26811 // halves anyway, so the concat (vinsertf128) is purely an extra op.
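  // Editorial sketch of the transform: one 256-bit store of a concat becomes
  // two independent 128-bit stores at offsets 0 and 16 from the base pointer,
  // joined by a TokenFactor (see splitVectorStore above).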
26812 MVT StoreVT = StoredVal.getSimpleValueType();
26813 if (StoreVT.is256BitVector() ||
26814 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26815 !Subtarget.hasBWI())) {
26816 SmallVector<SDValue, 4> CatOps;
26817 if (StoredVal.hasOneUse() &&
26818 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26819 return splitVectorStore(St, DAG);
26820 return SDValue();
26821 }
26822
26823 if (StoreVT.is32BitVector())
26824 return SDValue();
26825
26826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26827 assert(StoreVT.is64BitVector() && "Unexpected VT");
26828 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26829            TargetLowering::TypeWidenVector &&
26830        "Unexpected type action!");
26831
26832 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26833 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26834 DAG.getUNDEF(StoreVT));
26835
26836 if (Subtarget.hasSSE2()) {
26837 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26838 // and store it.
26839 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26840 MVT CastVT = MVT::getVectorVT(StVT, 2);
26841 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26842 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26843 DAG.getIntPtrConstant(0, dl));
26844
26845 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26846 St->getPointerInfo(), St->getOriginalAlign(),
26847 St->getMemOperand()->getFlags());
26848 }
26849 assert(Subtarget.hasSSE1() && "Expected SSE");
26850 SDVTList Tys = DAG.getVTList(MVT::Other);
26851 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26852 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26853 St->getMemOperand());
26854}
26855
26856// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26857// may emit an illegal shuffle but the expansion is still better than scalar
26858// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26859 // we'll emit a shuffle and an arithmetic shift.
26860// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26861// TODO: It is possible to support ZExt by zeroing the undef values during
26862// the shuffle phase or after the shuffle.
26863static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26864 SelectionDAG &DAG) {
26865 MVT RegVT = Op.getSimpleValueType();
26866 assert(RegVT.isVector() && "We only custom lower vector loads.");
26867 assert(RegVT.isInteger() &&
26868        "We only custom lower integer vector loads.");
26869
26870 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26871 SDLoc dl(Ld);
26872
26873 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26874 if (RegVT.getVectorElementType() == MVT::i1) {
26875 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26876 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26877 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26878 "Expected AVX512F without AVX512DQI");
26879
26880 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26881 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26882 Ld->getMemOperand()->getFlags());
26883
26884 // Replace chain users with the new chain.
26885 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26886
26887 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26888 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26889 DAG.getBitcast(MVT::v16i1, Val),
26890 DAG.getIntPtrConstant(0, dl));
26891 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26892 }
26893
26894 return SDValue();
26895}
26896
26897/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26898/// each of which has no other use apart from the AND / OR.
26899static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26900 Opc = Op.getOpcode();
26901 if (Opc != ISD::OR && Opc != ISD::AND)
26902 return false;
26903 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26904 Op.getOperand(0).hasOneUse() &&
26905 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26906 Op.getOperand(1).hasOneUse());
26907}
26908
26909SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26910 SDValue Chain = Op.getOperand(0);
26911 SDValue Cond = Op.getOperand(1);
26912 SDValue Dest = Op.getOperand(2);
26913 SDLoc dl(Op);
26914
26915 // Bail out when we don't have native compare instructions.
26916 if (Cond.getOpcode() == ISD::SETCC &&
26917 Cond.getOperand(0).getValueType() != MVT::f128 &&
26918 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26919 SDValue LHS = Cond.getOperand(0);
26920 SDValue RHS = Cond.getOperand(1);
26921 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26922
26923 // Special case for
26924 // setcc([su]{add,sub,mul}o == 0)
26925 // setcc([su]{add,sub,mul}o != 1)
26926 if (ISD::isOverflowIntrOpRes(LHS) &&
26927 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26928 (isNullConstant(RHS) || isOneConstant(RHS))) {
26929 SDValue Value, Overflow;
26930 X86::CondCode X86Cond;
26931 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26932
26933 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26934 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26935
26936 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26937 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26938 Overflow);
26939 }
26940
26941 if (LHS.getSimpleValueType().isInteger()) {
26942 SDValue CCVal;
26943 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26944 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26945 EFLAGS);
26946 }
26947
26948 if (CC == ISD::SETOEQ) {
26949 // For FCMP_OEQ, we can emit
26950 // two branches instead of an explicit AND instruction with a
26951 // separate test. However, we only do this if this block doesn't
26952 // have a fall-through edge, because this requires an explicit
26953 // jmp when the condition is false.
26954 if (Op.getNode()->hasOneUse()) {
26955 SDNode *User = *Op.getNode()->use_begin();
26956 // Look for an unconditional branch following this conditional branch.
26957 // We need this because we need to reverse the successors in order
26958 // to implement FCMP_OEQ.
26959 if (User->getOpcode() == ISD::BR) {
26960 SDValue FalseBB = User->getOperand(1);
26961 SDNode *NewBR =
26962 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26963 assert(NewBR == User);
26964 (void)NewBR;
26965 Dest = FalseBB;
26966
26967 SDValue Cmp =
26968 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26969 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26970 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26971 CCVal, Cmp);
26972 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26973 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26974 Cmp);
26975 }
26976 }
26977 } else if (CC == ISD::SETUNE) {
26978 // For FCMP_UNE, we can emit
26979 // two branches instead of an explicit OR instruction with a
26980 // separate test.
26981 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26982 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26983 Chain =
26984 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26985 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26986 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26987 Cmp);
26988 } else {
26989 X86::CondCode X86Cond =
26990 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26991 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26992 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26993 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26994 Cmp);
26995 }
26996 }
26997
26998 if (ISD::isOverflowIntrOpRes(Cond)) {
26999 SDValue Value, Overflow;
27000 X86::CondCode X86Cond;
27001 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
27002
27003 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27004 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27005 Overflow);
27006 }
27007
27008 // Look past the truncate if the high bits are known zero.
27009 if (isTruncWithZeroHighBitsInput(Cond, DAG))
27010 Cond = Cond.getOperand(0);
27011
27012 EVT CondVT = Cond.getValueType();
27013
27014 // Add an AND with 1 if we don't already have one.
27015 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
27016 Cond =
27017 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
27018
27019 SDValue LHS = Cond;
27020 SDValue RHS = DAG.getConstant(0, dl, CondVT);
27021
27022 SDValue CCVal;
27023 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
27024 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27025 EFLAGS);
27026}
27027
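// A minimal standalone model (not from this file) of why FCMP_OEQ above needs
// two branches: UCOMISS/UCOMISD report "unordered" as ZF=1,PF=1 and "equal" as
// ZF=1,PF=0, so branching to the false block on COND_NE and then on COND_P
// covers both ways an ordered-equal compare can fail. The helper is hypothetical.
#include <cmath>
static bool branchOnOEQ(double LHS, double RHS) {
  bool ZF, PF;
  if (std::isnan(LHS) || std::isnan(RHS)) {
    ZF = true; PF = true;              // unordered compare result
  } else {
    ZF = (LHS == RHS); PF = false;     // ordered compare result
  }
  if (!ZF) return false;               // jne FalseBB (X86::COND_NE)
  if (PF)  return false;               // jp  FalseBB (X86::COND_P)
  return true;                         // fall through: ordered and equal
}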
27028// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
27029// Calls to _alloca are needed to probe the stack when allocating more than 4k
27030// bytes in one go. Touching the stack at 4K increments is necessary to ensure
27031// that the guard pages used by the OS virtual memory manager are allocated in
27032// correct sequence.
27033SDValue
27034X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
27035 SelectionDAG &DAG) const {
27036 MachineFunction &MF = DAG.getMachineFunction();
27037 bool SplitStack = MF.shouldSplitStack();
27038 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
27039 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
27040 SplitStack || EmitStackProbeCall;
27041 SDLoc dl(Op);
27042
27043 // Get the inputs.
27044 SDNode *Node = Op.getNode();
27045 SDValue Chain = Op.getOperand(0);
27046 SDValue Size = Op.getOperand(1);
27047 MaybeAlign Alignment(Op.getConstantOperandVal(2));
27048 EVT VT = Node->getValueType(0);
27049
27050 // Chain the dynamic stack allocation so that it doesn't modify the stack
27051 // pointer when other instructions are using the stack.
27052 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
27053
27054 bool Is64Bit = Subtarget.is64Bit();
27055 MVT SPTy = getPointerTy(DAG.getDataLayout());
27056
27057 SDValue Result;
27058 if (!Lower) {
27059 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27060 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
27061 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
27062 " not tell us which reg is the stack pointer!");
27063
27064 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27065 const Align StackAlign = TFI.getStackAlign();
27066 if (hasInlineStackProbe(MF)) {
27067 MachineRegisterInfo &MRI = MF.getRegInfo();
27068
27069 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27070 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27071 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27072 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27073 DAG.getRegister(Vreg, SPTy));
27074 } else {
27075 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27076 Chain = SP.getValue(1);
27077 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27078 }
27079 if (Alignment && *Alignment > StackAlign)
27080 Result =
27081 DAG.getNode(ISD::AND, dl, VT, Result,
27082 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27083 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27084 } else if (SplitStack) {
27085 MachineRegisterInfo &MRI = MF.getRegInfo();
27086
27087 if (Is64Bit) {
27088 // The 64-bit implementation of segmented stacks needs to clobber both r10
27089 // and r11. This makes it impossible to use it along with nested parameters.
27090 const Function &F = MF.getFunction();
27091 for (const auto &A : F.args()) {
27092 if (A.hasNestAttr())
27093 report_fatal_error("Cannot use segmented stacks with functions that "
27094 "have nested arguments.");
27095 }
27096 }
27097
27098 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27099 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27100 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27101 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27102 DAG.getRegister(Vreg, SPTy));
27103 } else {
27104 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27105 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27106 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27107
27108 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27109 Register SPReg = RegInfo->getStackRegister();
27110 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27111 Chain = SP.getValue(1);
27112
27113 if (Alignment) {
27114 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27115 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27116 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27117 }
27118
27119 Result = SP;
27120 }
27121
27122 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27123
27124 SDValue Ops[2] = {Result, Chain};
27125 return DAG.getMergeValues(Ops, dl);
27126}
27127
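// A minimal standalone sketch (not from this file) of the over-alignment step
// above: when the requested alignment exceeds the stack alignment, the new
// stack pointer is rounded down with an AND of ~(Alignment - 1), which clears
// the low log2(Alignment) bits. The helper is hypothetical and assumes a
// power-of-two alignment.
#include <cstdint>
static uint64_t alignStackPointerDown(uint64_t SP, uint64_t Alignment) {
  return SP & ~(Alignment - 1ULL); // mirrors the ISD::AND emitted above
}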
27128SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27129 MachineFunction &MF = DAG.getMachineFunction();
27130 auto PtrVT = getPointerTy(MF.getDataLayout());
27131 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27132
27133 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27134 SDLoc DL(Op);
27135
27136 if (!Subtarget.is64Bit() ||
27137 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27138 // vastart just stores the address of the VarArgsFrameIndex slot into the
27139 // memory location argument.
27140 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27141 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27142 MachinePointerInfo(SV));
27143 }
27144
27145 // __va_list_tag:
27146 // gp_offset (0 - 6 * 8)
27147 // fp_offset (48 - 48 + 8 * 16)
27148 // overflow_arg_area (point to parameters coming in memory).
27149 // reg_save_area
27150 SmallVector<SDValue, 8> MemOps;
27151 SDValue FIN = Op.getOperand(1);
27152 // Store gp_offset
27153 SDValue Store = DAG.getStore(
27154 Op.getOperand(0), DL,
27155 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27156 MachinePointerInfo(SV));
27157 MemOps.push_back(Store);
27158
27159 // Store fp_offset
27160 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27161 Store = DAG.getStore(
27162 Op.getOperand(0), DL,
27163 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27164 MachinePointerInfo(SV, 4));
27165 MemOps.push_back(Store);
27166
27167 // Store ptr to overflow_arg_area
27168 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27169 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27170 Store =
27171 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27172 MemOps.push_back(Store);
27173
27174 // Store ptr to reg_save_area.
27175 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27176 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27177 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27178 Store = DAG.getStore(
27179 Op.getOperand(0), DL, RSFIN, FIN,
27180 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27181 MemOps.push_back(Store);
27182 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27183}
27184
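// A standalone sketch (not from this file) of the SysV x86-64 __va_list_tag
// element that the four stores above populate; on LP64 the field offsets are
// 0, 4, 8 and 16, matching the MachinePointerInfo offsets used above.
#include <cstdint>
struct VaListTagSketch {
  uint32_t gp_offset;         // offset 0:  0 .. 6 * 8
  uint32_t fp_offset;         // offset 4:  48 .. 48 + 8 * 16
  void    *overflow_arg_area; // offset 8:  parameters passed in memory
  void    *reg_save_area;     // offset 16: spilled register save area
};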
27185SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27186 assert(Subtarget.is64Bit() &&
27187 "LowerVAARG only handles 64-bit va_arg!");
27188 assert(Op.getNumOperands() == 4);
27189
27190 MachineFunction &MF = DAG.getMachineFunction();
27191 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27192 // The Win64 ABI uses char* instead of a structure.
27193 return DAG.expandVAArg(Op.getNode());
27194
27195 SDValue Chain = Op.getOperand(0);
27196 SDValue SrcPtr = Op.getOperand(1);
27197 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27198 unsigned Align = Op.getConstantOperandVal(3);
27199 SDLoc dl(Op);
27200
27201 EVT ArgVT = Op.getNode()->getValueType(0);
27202 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27203 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27204 uint8_t ArgMode;
27205
27206 // Decide which area this value should be read from.
27207 // TODO: Implement the AMD64 ABI in its entirety. This simple
27208 // selection mechanism works only for the basic types.
27209 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27210 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27211 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27212 } else {
27213 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27214 "Unhandled argument type in LowerVAARG");
27215 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27216 }
27217
27218 if (ArgMode == 2) {
27219 // Make sure using fp_offset makes sense.
27220 assert(!Subtarget.useSoftFloat() &&
27221 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27222 Subtarget.hasSSE1());
27223 }
27224
27225 // Insert VAARG node into the DAG
27226 // VAARG returns two values: Variable Argument Address, Chain
27227 SDValue InstOps[] = {Chain, SrcPtr,
27228 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27229 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27230 DAG.getTargetConstant(Align, dl, MVT::i32)};
27231 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27232 SDValue VAARG = DAG.getMemIntrinsicNode(
27233 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27234 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27235 /*Alignment=*/std::nullopt,
27236 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27237 Chain = VAARG.getValue(1);
27238
27239 // Load the next argument and return it
27240 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27241}
27242
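// A minimal standalone sketch (not from this file) of the ArgMode selection
// above: SSE-class values up to 16 bytes are read via fp_offset (mode 2),
// while the remaining integer-class values up to 32 bytes are read via
// gp_offset (mode 1). The helper is hypothetical and ignores the unsupported
// cases the asserts above rule out (e.g. f80).
#include <cstdint>
static uint8_t pickVaArgMode(bool IsFloatingPoint, uint32_t ArgSizeBytes) {
  return (IsFloatingPoint && ArgSizeBytes <= 16) ? 2 : 1;
}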
27243static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27244 SelectionDAG &DAG) {
27245 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27246 // where a va_list is still an i8*.
27247 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27248 if (Subtarget.isCallingConvWin64(
27249 DAG.getMachineFunction().getFunction().getCallingConv()))
27250 // Probably a Win64 va_copy.
27251 return DAG.expandVACopy(Op.getNode());
27252
27253 SDValue Chain = Op.getOperand(0);
27254 SDValue DstPtr = Op.getOperand(1);
27255 SDValue SrcPtr = Op.getOperand(2);
27256 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27257 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27258 SDLoc DL(Op);
27259
27260 return DAG.getMemcpy(
27261 Chain, DL, DstPtr, SrcPtr,
27262 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27263 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27264 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27265}
27266
27267// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27268static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27269 switch (Opc) {
27270 case ISD::SHL:
27271 case X86ISD::VSHL:
27272 case X86ISD::VSHLI:
27273 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27274 case ISD::SRL:
27275 case X86ISD::VSRL:
27276 case X86ISD::VSRLI:
27277 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27278 case ISD::SRA:
27279 case X86ISD::VSRA:
27280 case X86ISD::VSRAI:
27281 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27282 }
27283 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27283)
;
27284}
27285
27286/// Handle vector element shifts where the shift amount is a constant.
27287/// Takes immediate version of shift as input.
27288static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27289 SDValue SrcOp, uint64_t ShiftAmt,
27290 SelectionDAG &DAG) {
27291 MVT ElementType = VT.getVectorElementType();
27292
27293 // Bitcast the source vector to the output type, this is mainly necessary for
27294 // vXi8/vXi64 shifts.
27295 if (VT != SrcOp.getSimpleValueType())
27296 SrcOp = DAG.getBitcast(VT, SrcOp);
27297
27298 // Fold this packed shift into its first operand if ShiftAmt is 0.
27299 if (ShiftAmt == 0)
27300 return SrcOp;
27301
27302 // Check for ShiftAmt >= element width
27303 if (ShiftAmt >= ElementType.getSizeInBits()) {
27304 if (Opc == X86ISD::VSRAI)
27305 ShiftAmt = ElementType.getSizeInBits() - 1;
27306 else
27307 return DAG.getConstant(0, dl, VT);
27308 }
27309
27310 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27311 && "Unknown target vector shift-by-constant node");
27312
27313 // Fold this packed vector shift into a build vector if SrcOp is a
27314 // vector of Constants or UNDEFs.
27315 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27316 unsigned ShiftOpc;
27317 switch (Opc) {
27318 default: llvm_unreachable("Unknown opcode!");
27319 case X86ISD::VSHLI:
27320 ShiftOpc = ISD::SHL;
27321 break;
27322 case X86ISD::VSRLI:
27323 ShiftOpc = ISD::SRL;
27324 break;
27325 case X86ISD::VSRAI:
27326 ShiftOpc = ISD::SRA;
27327 break;
27328 }
27329
27330 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27331 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27332 return C;
27333 }
27334
27335 return DAG.getNode(Opc, dl, VT, SrcOp,
27336 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27337}
27338
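// A standalone scalar model (not from this file) of the out-of-range handling
// above: x86 packed shifts with an amount >= the element width yield 0 for
// logical shifts and replicate the sign bit (amount clamped to width - 1) for
// arithmetic shifts, whereas a plain C shift by >= the bit width is undefined.
// The helpers model 32-bit lanes and are hypothetical.
#include <cstdint>
static int32_t psradModel(int32_t X, uint64_t Amt) {
  return X >> (Amt >= 32 ? 31 : Amt);  // VPSRAD: clamp to width - 1
}
static uint32_t psrldModel(uint32_t X, uint64_t Amt) {
  return Amt >= 32 ? 0u : X >> Amt;    // VPSRLD: out-of-range gives 0
}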
27339/// Handle vector element shifts by a splat shift amount
27340static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27341 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27342 const X86Subtarget &Subtarget,
27343 SelectionDAG &DAG) {
27344 MVT AmtVT = ShAmt.getSimpleValueType();
27345 assert(AmtVT.isVector() && "Vector shift type mismatch");
27346 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27347 "Illegal vector splat index");
27348
27349 // Move the splat element to the bottom element.
27350 if (ShAmtIdx != 0) {
27351 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27352 Mask[0] = ShAmtIdx;
27353 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27354 }
27355
27356 // Peek through any zext node if we can get back to a 128-bit source.
27357 if (AmtVT.getScalarSizeInBits() == 64 &&
27358 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27359 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27360 ShAmt.getOperand(0).getValueType().isSimple() &&
27361 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27362 ShAmt = ShAmt.getOperand(0);
27363 AmtVT = ShAmt.getSimpleValueType();
27364 }
27365
27366 // See if we can mask off the upper elements using the existing source node.
27367 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27368 // do this for vXi64 types.
27369 bool IsMasked = false;
27370 if (AmtVT.getScalarSizeInBits() < 64) {
27371 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27372 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27373 // If the shift amount has come from a scalar, then zero-extend the scalar
27374 // before moving to the vector.
27375 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27376 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27377 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27378 AmtVT = MVT::v4i32;
27379 IsMasked = true;
27380 } else if (ShAmt.getOpcode() == ISD::AND) {
27381 // See if the shift amount is already masked (e.g. for rotation modulo),
27382 // then we can zero-extend it by setting all the other mask elements to
27383 // zero.
27384 SmallVector<SDValue> MaskElts(
27385 AmtVT.getVectorNumElements(),
27386 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27387 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27388 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27389 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27390 {ShAmt.getOperand(1), Mask}))) {
27391 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27392 IsMasked = true;
27393 }
27394 }
27395 }
27396
27397 // Extract if the shift amount vector is larger than 128-bits.
27398 if (AmtVT.getSizeInBits() > 128) {
27399 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27400 AmtVT = ShAmt.getSimpleValueType();
27401 }
27402
27403 // Zero-extend bottom element to v2i64 vector type, either by extension or
27404 // shuffle masking.
27405 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27406 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27407 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27408 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27409 } else if (Subtarget.hasSSE41()) {
27410 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27411 MVT::v2i64, ShAmt);
27412 } else {
27413 SDValue ByteShift = DAG.getTargetConstant(
27414 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27415 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27416 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27417 ByteShift);
27418 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27419 ByteShift);
27420 }
27421 }
27422
27423 // Change opcode to non-immediate version.
27424 Opc = getTargetVShiftUniformOpcode(Opc, true);
27425
27426 // The return type has to be a 128-bit type with the same element
27427 // type as the input type.
27428 MVT EltVT = VT.getVectorElementType();
27429 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27430
27431 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27432 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27433}
27434
27435/// Return Mask with the necessary casting or extending
27436/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27437static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27438 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27439 const SDLoc &dl) {
27440
27441 if (isAllOnesConstant(Mask))
27442 return DAG.getConstant(1, dl, MaskVT);
27443 if (X86::isZeroNode(Mask))
27444 return DAG.getConstant(0, dl, MaskVT);
27445
27446 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27447
27448 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27449 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27450 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27451 // In 32-bit mode, bitcasting an i64 is illegal; extend/split it instead.
27452 SDValue Lo, Hi;
27453 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27454 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27455 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27456 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27457 } else {
27458 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27459 Mask.getSimpleValueType().getSizeInBits());
27460 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
27461 // are extracted by EXTRACT_SUBVECTOR.
27462 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27463 DAG.getBitcast(BitcastVT, Mask),
27464 DAG.getIntPtrConstant(0, dl));
27465 }
27466}
27467
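// A minimal standalone sketch (not from this file) of the 32-bit-mode path
// above: an i64 mask cannot be bitcast directly, so it is split into two i32
// halves that become the low and high 32 lanes of the v64i1 result. The
// helper is hypothetical.
#include <cstdint>
static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);       // lanes 0..31
  Hi = static_cast<uint32_t>(Mask >> 32); // lanes 32..63
}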
27468/// Return (and \p Op, \p Mask) for compare instructions or
27469/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27470/// necessary casting or extending for \p Mask when lowering masking intrinsics
27471static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27472 SDValue PreservedSrc,
27473 const X86Subtarget &Subtarget,
27474 SelectionDAG &DAG) {
27475 MVT VT = Op.getSimpleValueType();
27476 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27477 unsigned OpcodeSelect = ISD::VSELECT;
27478 SDLoc dl(Op);
27479
27480 if (isAllOnesConstant(Mask))
27481 return Op;
27482
27483 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27484
27485 if (PreservedSrc.isUndef())
27486 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27487 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27488}
27489
27490/// Creates an SDNode for a predicated scalar operation.
27491/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27492/// The mask comes in as MVT::i8 and should be transformed
27493/// to MVT::v1i1 while lowering masking intrinsics.
27494/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27495/// "X86select" instead of "vselect". We just can't create the "vselect" node
27496/// for a scalar instruction.
27497static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27498 SDValue PreservedSrc,
27499 const X86Subtarget &Subtarget,
27500 SelectionDAG &DAG) {
27501
27502 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27503 if (MaskConst->getZExtValue() & 0x1)
27504 return Op;
27505
27506 MVT VT = Op.getSimpleValueType();
27507 SDLoc dl(Op);
27508
27509 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27510 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27511 DAG.getBitcast(MVT::v8i1, Mask),
27512 DAG.getIntPtrConstant(0, dl));
27513 if (Op.getOpcode() == X86ISD::FSETCCM ||
27514 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27515 Op.getOpcode() == X86ISD::VFPCLASSS)
27516 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27517
27518 if (PreservedSrc.isUndef())
27519 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27520 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27521}
27522
27523static int getSEHRegistrationNodeSize(const Function *Fn) {
27524 if (!Fn->hasPersonalityFn())
27525 report_fatal_error(
27526 "querying registration node size for function without personality");
27527 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27528 // WinEHStatePass for the full struct definition.
27529 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27530 case EHPersonality::MSVC_X86SEH: return 24;
27531 case EHPersonality::MSVC_CXX: return 16;
27532 default: break;
27533 }
27534 report_fatal_error(
27535 "can only recover FP for 32-bit MSVC EH personality functions");
27536}
27537
27538/// When the MSVC runtime transfers control to us, either to an outlined
27539/// function or when returning to a parent frame after catching an exception, we
27540/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27541/// Here's the math:
27542/// RegNodeBase = EntryEBP - RegNodeSize
27543/// ParentFP = RegNodeBase - ParentFrameOffset
27544/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27545/// subtracting the offset (negative on x86) takes us back to the parent FP.
27546static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27547 SDValue EntryEBP) {
27548 MachineFunction &MF = DAG.getMachineFunction();
27549 SDLoc dl;
27550
27551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27552 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27553
27554 // It's possible that the parent function no longer has a personality function
27555 // if the exceptional code was optimized away, in which case we just return
27556 // the incoming EBP.
27557 if (!Fn->hasPersonalityFn())
27558 return EntryEBP;
27559
27560 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27561 // registration, or the .set_setframe offset.
27562 MCSymbol *OffsetSym =
27563 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27564 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27565 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27566 SDValue ParentFrameOffset =
27567 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27568
27569 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27570 // prologue to RBP in the parent function.
27571 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27572 if (Subtarget.is64Bit())
27573 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27574
27575 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27576 // RegNodeBase = EntryEBP - RegNodeSize
27577 // ParentFP = RegNodeBase - ParentFrameOffset
27578 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27579 DAG.getConstant(RegNodeSize, dl, PtrVT));
27580 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27581}
27582
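// A standalone sketch (not from this file) of the 32-bit recovery arithmetic
// described in the comment above: ParentFP = (EntryEBP - RegNodeSize) -
// ParentFrameOffset, where ParentFrameOffset is negative on x86. The helper
// is hypothetical.
#include <cstdint>
static uintptr_t recoverParentFP(uintptr_t EntryEBP, int RegNodeSize,
                                 intptr_t ParentFrameOffset) {
  uintptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}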
27583SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27584 SelectionDAG &DAG) const {
27585 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27586 auto isRoundModeCurDirection = [](SDValue Rnd) {
27587 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27588 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27589
27590 return false;
27591 };
27592 auto isRoundModeSAE = [](SDValue Rnd) {
27593 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27594 unsigned RC = C->getZExtValue();
27595 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27596 // Clear the NO_EXC bit and check remaining bits.
27597 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27598 // As a convenience we allow no other bits or explicitly
27599 // current direction.
27600 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27601 }
27602 }
27603
27604 return false;
27605 };
27606 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27607 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27608 RC = C->getZExtValue();
27609 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27610 // Clear the NO_EXC bit and check remaining bits.
27611 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27612 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27613 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27614 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27615 RC == X86::STATIC_ROUNDING::TO_ZERO;
27616 }
27617 }
27618
27619 return false;
27620 };
27621
27622 SDLoc dl(Op);
27623 unsigned IntNo = Op.getConstantOperandVal(0);
27624 MVT VT = Op.getSimpleValueType();
27625 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27626
27627 // Propagate flags from original node to transformed node(s).
27628 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27629
27630 if (IntrData) {
27631 switch(IntrData->Type) {
27632 case INTR_TYPE_1OP: {
27633 // We specify 2 possible opcodes for intrinsics with rounding modes.
27634 // First, we check if the intrinsic may have non-default rounding mode,
27635 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27636 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27637 if (IntrWithRoundingModeOpcode != 0) {
27638 SDValue Rnd = Op.getOperand(2);
27639 unsigned RC = 0;
27640 if (isRoundModeSAEToX(Rnd, RC))
27641 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27642 Op.getOperand(1),
27643 DAG.getTargetConstant(RC, dl, MVT::i32));
27644 if (!isRoundModeCurDirection(Rnd))
27645 return SDValue();
27646 }
27647 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27648 Op.getOperand(1));
27649 }
27650 case INTR_TYPE_1OP_SAE: {
27651 SDValue Sae = Op.getOperand(2);
27652
27653 unsigned Opc;
27654 if (isRoundModeCurDirection(Sae))
27655 Opc = IntrData->Opc0;
27656 else if (isRoundModeSAE(Sae))
27657 Opc = IntrData->Opc1;
27658 else
27659 return SDValue();
27660
27661 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27662 }
27663 case INTR_TYPE_2OP: {
27664 SDValue Src2 = Op.getOperand(2);
27665
27666 // We specify 2 possible opcodes for intrinsics with rounding modes.
27667 // First, we check if the intrinsic may have non-default rounding mode,
27668 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27669 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27670 if (IntrWithRoundingModeOpcode != 0) {
27671 SDValue Rnd = Op.getOperand(3);
27672 unsigned RC = 0;
27673 if (isRoundModeSAEToX(Rnd, RC))
27674 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27675 Op.getOperand(1), Src2,
27676 DAG.getTargetConstant(RC, dl, MVT::i32));
27677 if (!isRoundModeCurDirection(Rnd))
27678 return SDValue();
27679 }
27680
27681 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27682 Op.getOperand(1), Src2);
27683 }
27684 case INTR_TYPE_2OP_SAE: {
27685 SDValue Sae = Op.getOperand(3);
27686
27687 unsigned Opc;
27688 if (isRoundModeCurDirection(Sae))
27689 Opc = IntrData->Opc0;
27690 else if (isRoundModeSAE(Sae))
27691 Opc = IntrData->Opc1;
27692 else
27693 return SDValue();
27694
27695 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27696 Op.getOperand(2));
27697 }
27698 case INTR_TYPE_3OP:
27699 case INTR_TYPE_3OP_IMM8: {
27700 SDValue Src1 = Op.getOperand(1);
27701 SDValue Src2 = Op.getOperand(2);
27702 SDValue Src3 = Op.getOperand(3);
27703
27704 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27705 Src3.getValueType() != MVT::i8) {
27706 Src3 = DAG.getTargetConstant(
27707 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27708 }
27709
27710 // We specify 2 possible opcodes for intrinsics with rounding modes.
27711 // First, we check if the intrinsic may have non-default rounding mode,
27712 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27713 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27714 if (IntrWithRoundingModeOpcode != 0) {
27715 SDValue Rnd = Op.getOperand(4);
27716 unsigned RC = 0;
27717 if (isRoundModeSAEToX(Rnd, RC))
27718 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27719 Src1, Src2, Src3,
27720 DAG.getTargetConstant(RC, dl, MVT::i32));
27721 if (!isRoundModeCurDirection(Rnd))
27722 return SDValue();
27723 }
27724
27725 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27726 {Src1, Src2, Src3});
27727 }
27728 case INTR_TYPE_4OP_IMM8: {
27729 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27730 SDValue Src4 = Op.getOperand(4);
27731 if (Src4.getValueType() != MVT::i8) {
27732 Src4 = DAG.getTargetConstant(
27733 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27734 }
27735
27736 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27737 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27738 Src4);
27739 }
27740 case INTR_TYPE_1OP_MASK: {
27741 SDValue Src = Op.getOperand(1);
27742 SDValue PassThru = Op.getOperand(2);
27743 SDValue Mask = Op.getOperand(3);
27744 // We add rounding mode to the Node when
27745 // - RC Opcode is specified and
27746 // - RC is not "current direction".
27747 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27748 if (IntrWithRoundingModeOpcode != 0) {
27749 SDValue Rnd = Op.getOperand(4);
27750 unsigned RC = 0;
27751 if (isRoundModeSAEToX(Rnd, RC))
27752 return getVectorMaskingNode(
27753 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27754 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27755 Mask, PassThru, Subtarget, DAG);
27756 if (!isRoundModeCurDirection(Rnd))
27757 return SDValue();
27758 }
27759 return getVectorMaskingNode(
27760 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27761 Subtarget, DAG);
27762 }
27763 case INTR_TYPE_1OP_MASK_SAE: {
27764 SDValue Src = Op.getOperand(1);
27765 SDValue PassThru = Op.getOperand(2);
27766 SDValue Mask = Op.getOperand(3);
27767 SDValue Rnd = Op.getOperand(4);
27768
27769 unsigned Opc;
27770 if (isRoundModeCurDirection(Rnd))
27771 Opc = IntrData->Opc0;
27772 else if (isRoundModeSAE(Rnd))
27773 Opc = IntrData->Opc1;
27774 else
27775 return SDValue();
27776
27777 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27778 Subtarget, DAG);
27779 }
27780 case INTR_TYPE_SCALAR_MASK: {
27781 SDValue Src1 = Op.getOperand(1);
27782 SDValue Src2 = Op.getOperand(2);
27783 SDValue passThru = Op.getOperand(3);
27784 SDValue Mask = Op.getOperand(4);
27785 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27786 // There are 2 kinds of intrinsics in this group:
27787 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
27788 // (2) With rounding mode and sae - 7 operands.
27789 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27790 if (Op.getNumOperands() == (5U + HasRounding)) {
27791 if (HasRounding) {
27792 SDValue Rnd = Op.getOperand(5);
27793 unsigned RC = 0;
27794 if (isRoundModeSAEToX(Rnd, RC))
27795 return getScalarMaskingNode(
27796 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27797 DAG.getTargetConstant(RC, dl, MVT::i32)),
27798 Mask, passThru, Subtarget, DAG);
27799 if (!isRoundModeCurDirection(Rnd))
27800 return SDValue();
27801 }
27802 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27803 Src2),
27804 Mask, passThru, Subtarget, DAG);
27805 }
27806
27807 assert(Op.getNumOperands() == (6U + HasRounding) &&
27808 "Unexpected intrinsic form");
27809 SDValue RoundingMode = Op.getOperand(5);
27810 unsigned Opc = IntrData->Opc0;
27811 if (HasRounding) {
27812 SDValue Sae = Op.getOperand(6);
27813 if (isRoundModeSAE(Sae))
27814 Opc = IntrWithRoundingModeOpcode;
27815 else if (!isRoundModeCurDirection(Sae))
27816 return SDValue();
27817 }
27818 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27819 Src2, RoundingMode),
27820 Mask, passThru, Subtarget, DAG);
27821 }
27822 case INTR_TYPE_SCALAR_MASK_RND: {
27823 SDValue Src1 = Op.getOperand(1);
27824 SDValue Src2 = Op.getOperand(2);
27825 SDValue passThru = Op.getOperand(3);
27826 SDValue Mask = Op.getOperand(4);
27827 SDValue Rnd = Op.getOperand(5);
27828
27829 SDValue NewOp;
27830 unsigned RC = 0;
27831 if (isRoundModeCurDirection(Rnd))
27832 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27833 else if (isRoundModeSAEToX(Rnd, RC))
27834 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27835 DAG.getTargetConstant(RC, dl, MVT::i32));
27836 else
27837 return SDValue();
27838
27839 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27840 }
27841 case INTR_TYPE_SCALAR_MASK_SAE: {
27842 SDValue Src1 = Op.getOperand(1);
27843 SDValue Src2 = Op.getOperand(2);
27844 SDValue passThru = Op.getOperand(3);
27845 SDValue Mask = Op.getOperand(4);
27846 SDValue Sae = Op.getOperand(5);
27847 unsigned Opc;
27848 if (isRoundModeCurDirection(Sae))
27849 Opc = IntrData->Opc0;
27850 else if (isRoundModeSAE(Sae))
27851 Opc = IntrData->Opc1;
27852 else
27853 return SDValue();
27854
27855 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27856 Mask, passThru, Subtarget, DAG);
27857 }
27858 case INTR_TYPE_2OP_MASK: {
27859 SDValue Src1 = Op.getOperand(1);
27860 SDValue Src2 = Op.getOperand(2);
27861 SDValue PassThru = Op.getOperand(3);
27862 SDValue Mask = Op.getOperand(4);
27863 SDValue NewOp;
27864 if (IntrData->Opc1 != 0) {
27865 SDValue Rnd = Op.getOperand(5);
27866 unsigned RC = 0;
27867 if (isRoundModeSAEToX(Rnd, RC))
27868 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27869 DAG.getTargetConstant(RC, dl, MVT::i32));
27870 else if (!isRoundModeCurDirection(Rnd))
27871 return SDValue();
27872 }
27873 if (!NewOp)
27874 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27875 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27876 }
27877 case INTR_TYPE_2OP_MASK_SAE: {
27878 SDValue Src1 = Op.getOperand(1);
27879 SDValue Src2 = Op.getOperand(2);
27880 SDValue PassThru = Op.getOperand(3);
27881 SDValue Mask = Op.getOperand(4);
27882
27883 unsigned Opc = IntrData->Opc0;
27884 if (IntrData->Opc1 != 0) {
27885 SDValue Sae = Op.getOperand(5);
27886 if (isRoundModeSAE(Sae))
27887 Opc = IntrData->Opc1;
27888 else if (!isRoundModeCurDirection(Sae))
27889 return SDValue();
27890 }
27891
27892 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27893 Mask, PassThru, Subtarget, DAG);
27894 }
27895 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27896 SDValue Src1 = Op.getOperand(1);
27897 SDValue Src2 = Op.getOperand(2);
27898 SDValue Src3 = Op.getOperand(3);
27899 SDValue PassThru = Op.getOperand(4);
27900 SDValue Mask = Op.getOperand(5);
27901 SDValue Sae = Op.getOperand(6);
27902 unsigned Opc;
27903 if (isRoundModeCurDirection(Sae))
27904 Opc = IntrData->Opc0;
27905 else if (isRoundModeSAE(Sae))
27906 Opc = IntrData->Opc1;
27907 else
27908 return SDValue();
27909
27910 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27911 Mask, PassThru, Subtarget, DAG);
27912 }
27913 case INTR_TYPE_3OP_MASK_SAE: {
27914 SDValue Src1 = Op.getOperand(1);
27915 SDValue Src2 = Op.getOperand(2);
27916 SDValue Src3 = Op.getOperand(3);
27917 SDValue PassThru = Op.getOperand(4);
27918 SDValue Mask = Op.getOperand(5);
27919
27920 unsigned Opc = IntrData->Opc0;
27921 if (IntrData->Opc1 != 0) {
27922 SDValue Sae = Op.getOperand(6);
27923 if (isRoundModeSAE(Sae))
27924 Opc = IntrData->Opc1;
27925 else if (!isRoundModeCurDirection(Sae))
27926 return SDValue();
27927 }
27928 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27929 Mask, PassThru, Subtarget, DAG);
27930 }
27931 case BLENDV: {
27932 SDValue Src1 = Op.getOperand(1);
27933 SDValue Src2 = Op.getOperand(2);
27934 SDValue Src3 = Op.getOperand(3);
27935
27936 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27937 Src3 = DAG.getBitcast(MaskVT, Src3);
27938
27939 // Reverse the operands to match VSELECT order.
27940 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27941 }
27942 case VPERM_2OP : {
27943 SDValue Src1 = Op.getOperand(1);
27944 SDValue Src2 = Op.getOperand(2);
27945
27946 // Swap Src1 and Src2 in the node creation
27947 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
27948 }
27949 case CFMA_OP_MASKZ:
27950 case CFMA_OP_MASK: {
27951 SDValue Src1 = Op.getOperand(1);
27952 SDValue Src2 = Op.getOperand(2);
27953 SDValue Src3 = Op.getOperand(3);
27954 SDValue Mask = Op.getOperand(4);
27955 MVT VT = Op.getSimpleValueType();
27956
27957 SDValue PassThru = Src3;
27958 if (IntrData->Type == CFMA_OP_MASKZ)
27959 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27960
27961 // We add rounding mode to the Node when
27962 // - RC Opcode is specified and
27963 // - RC is not "current direction".
27964 SDValue NewOp;
27965 if (IntrData->Opc1 != 0) {
27966 SDValue Rnd = Op.getOperand(5);
27967 unsigned RC = 0;
27968 if (isRoundModeSAEToX(Rnd, RC))
27969 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27970 DAG.getTargetConstant(RC, dl, MVT::i32));
27971 else if (!isRoundModeCurDirection(Rnd))
27972 return SDValue();
27973 }
27974 if (!NewOp)
27975 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27976 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27977 }
27978 case IFMA_OP:
27979 // NOTE: We need to swizzle the operands to pass the multiply operands
27980 // first.
27981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27982 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27983 case FPCLASSS: {
27984 SDValue Src1 = Op.getOperand(1);
27985 SDValue Imm = Op.getOperand(2);
27986 SDValue Mask = Op.getOperand(3);
27987 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27988 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27989 Subtarget, DAG);
27990 // Need to fill with zeros to ensure the bitcast will produce zeroes
27991 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27992 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27993 DAG.getConstant(0, dl, MVT::v8i1),
27994 FPclassMask, DAG.getIntPtrConstant(0, dl));
27995 return DAG.getBitcast(MVT::i8, Ins);
27996 }
27997
27998 case CMP_MASK_CC: {
27999 MVT MaskVT = Op.getSimpleValueType();
28000 SDValue CC = Op.getOperand(3);
28001 SDValue Mask = Op.getOperand(4);
28002 // We specify 2 possible opcodes for intrinsics with rounding modes.
28003 // First, we check if the intrinsic may have non-default rounding mode,
28004 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
28005 if (IntrData->Opc1 != 0) {
28006 SDValue Sae = Op.getOperand(5);
28007 if (isRoundModeSAE(Sae))
28008 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
28009 Op.getOperand(2), CC, Mask, Sae);
28010 if (!isRoundModeCurDirection(Sae))
28011 return SDValue();
28012 }
28013 //default rounding mode
28014 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
28015 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
28016 }
28017 case CMP_MASK_SCALAR_CC: {
28018 SDValue Src1 = Op.getOperand(1);
28019 SDValue Src2 = Op.getOperand(2);
28020 SDValue CC = Op.getOperand(3);
28021 SDValue Mask = Op.getOperand(4);
28022
28023 SDValue Cmp;
28024 if (IntrData->Opc1 != 0) {
28025 SDValue Sae = Op.getOperand(5);
28026 if (isRoundModeSAE(Sae))
28027 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
28028 else if (!isRoundModeCurDirection(Sae))
28029 return SDValue();
28030 }
28031 //default rounding mode
28032 if (!Cmp.getNode())
28033 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
28034
28035 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
28036 Subtarget, DAG);
28037 // Need to fill with zeros to ensure the bitcast will produce zeroes
28038 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28039 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
28040 DAG.getConstant(0, dl, MVT::v8i1),
28041 CmpMask, DAG.getIntPtrConstant(0, dl));
28042 return DAG.getBitcast(MVT::i8, Ins);
28043 }
28044 case COMI: { // Comparison intrinsics
28045 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
28046 SDValue LHS = Op.getOperand(1);
28047 SDValue RHS = Op.getOperand(2);
28048 // Some conditions require the operands to be swapped.
28049 if (CC == ISD::SETLT || CC == ISD::SETLE)
28050 std::swap(LHS, RHS);
28051
28052 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
28053 SDValue SetCC;
28054 switch (CC) {
28055 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
28056 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
28057 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
28058 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
28059 break;
28060 }
28061 case ISD::SETNE: { // (ZF = 1 or PF = 1)
28062 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
28063 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
28064 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28065 break;
28066 }
28067 case ISD::SETGT: // (CF = 0 and ZF = 0)
28068 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28069 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28070 break;
28071 }
28072 case ISD::SETGE: // CF = 0
28073 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28074 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28075 break;
28076 default:
28077 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28077)
;
28078 }
28079 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28080 }
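// A rough user-level sketch of the EQ/NE handling above (the intrinsic name is
// an assumed builtin mapping, not taken from this file):
//   int r = _mm_comieq_sd(a, b);
// COMISD reports "equal" as ZF=1 with PF=0 and "unordered" as PF=1, so the
// lowering ANDs setcc(COND_E) with setcc(COND_NP) before zero-extending to i32.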
28081 case COMI_RM: { // Comparison intrinsics with Sae
28082 SDValue LHS = Op.getOperand(1);
28083 SDValue RHS = Op.getOperand(2);
28084 unsigned CondVal = Op.getConstantOperandVal(3);
28085 SDValue Sae = Op.getOperand(4);
28086
28087 SDValue FCmp;
28088 if (isRoundModeCurDirection(Sae))
28089 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28090 DAG.getTargetConstant(CondVal, dl, MVT::i8));
28091 else if (isRoundModeSAE(Sae))
28092 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28093 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28094 else
28095 return SDValue();
28096 // Need to fill with zeros to ensure the bitcast will produce zeroes
28097 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28098 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28099 DAG.getConstant(0, dl, MVT::v16i1),
28100 FCmp, DAG.getIntPtrConstant(0, dl));
28101 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28102 DAG.getBitcast(MVT::i16, Ins));
28103 }
28104 case VSHIFT: {
28105 SDValue SrcOp = Op.getOperand(1);
28106 SDValue ShAmt = Op.getOperand(2);
28107 assert(ShAmt.getValueType() == MVT::i32 &&
28108 "Unexpected VSHIFT amount type");
28109
28110 // Catch shift-by-constant.
28111 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28112 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28113 Op.getSimpleValueType(), SrcOp,
28114 CShAmt->getZExtValue(), DAG);
28115
28116 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28117 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28118 SrcOp, ShAmt, 0, Subtarget, DAG);
28119 }
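// Sketch of the two VSHIFT paths above, assuming the usual mapping of an
// immediate-shift builtin such as _mm_slli_epi32 onto this entry:
//   __m128i a = _mm_slli_epi32(v, 5);  // constant 5 -> getTargetVShiftByConstNode
//   __m128i b = _mm_slli_epi32(v, n);  // variable n -> SCALAR_TO_VECTOR to v4i32,
//                                      // then getTargetVShiftNode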
28120 case COMPRESS_EXPAND_IN_REG: {
28121 SDValue Mask = Op.getOperand(3);
28122 SDValue DataToCompress = Op.getOperand(1);
28123 SDValue PassThru = Op.getOperand(2);
28124 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28125 return Op.getOperand(1);
28126
28127 // Avoid false dependency.
28128 if (PassThru.isUndef())
28129 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28130
28131 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28132 Mask);
28133 }
28134 case FIXUPIMM:
28135 case FIXUPIMM_MASKZ: {
28136 SDValue Src1 = Op.getOperand(1);
28137 SDValue Src2 = Op.getOperand(2);
28138 SDValue Src3 = Op.getOperand(3);
28139 SDValue Imm = Op.getOperand(4);
28140 SDValue Mask = Op.getOperand(5);
28141 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28142 ? Src1
28143 : getZeroVector(VT, Subtarget, DAG, dl);
28144
28145 unsigned Opc = IntrData->Opc0;
28146 if (IntrData->Opc1 != 0) {
28147 SDValue Sae = Op.getOperand(6);
28148 if (isRoundModeSAE(Sae))
28149 Opc = IntrData->Opc1;
28150 else if (!isRoundModeCurDirection(Sae))
28151 return SDValue();
28152 }
28153
28154 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28155
28156 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28157 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28158
28159 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28160 }
28161 case ROUNDP: {
28162 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28163 // Clear the upper bits of the rounding immediate so that the legacy
28164 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28165 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28166 SDValue RoundingMode =
28167 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28168 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28169 Op.getOperand(1), RoundingMode);
28170 }
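// Example of the masking above (hypothetical immediate): a legacy rounding
// immediate of 0x4A becomes 0x4A & 0xf == 0xA, so only the low 4 rounding-mode
// bits reach VRNDSCALE and the upper bits that would trigger its scaling
// behavior stay zero.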
28171 case ROUNDS: {
28172 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28173 // Clear the upper bits of the rounding immediate so that the legacy
28174 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28175 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28176 SDValue RoundingMode =
28177 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28178 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28179 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28180 }
28181 case BEXTRI: {
28182 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28183
28184 uint64_t Imm = Op.getConstantOperandVal(2);
28185 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28186 Op.getValueType());
28187 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28188 Op.getOperand(1), Control);
28189 }
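// Sketch of the control masking above, assuming the usual __bextri_u32 builtin
// mapping (not taken from this file):
//   unsigned r = __bextri_u32(x, 0x0408); // start = 8 (bits 7:0), length = 4 (bits 15:8)
// Only the low 16 bits of the immediate are meaningful, so the lowering masks
// the control with 0xffff before building the BEXTRI node.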
28190 // ADC/ADCX/SBB
28191 case ADX: {
28192 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28193 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28194
28195 SDValue Res;
28196 // If the carry in is zero, then we should just use ADD/SUB instead of
28197 // ADC/SBB.
28198 if (isNullConstant(Op.getOperand(1))) {
28199 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28200 Op.getOperand(3));
28201 } else {
28202 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28203 DAG.getConstant(-1, dl, MVT::i8));
28204 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28205 Op.getOperand(3), GenCF.getValue(1));
28206 }
28207 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28208 SDValue Results[] = { SetCC, Res };
28209 return DAG.getMergeValues(Results, dl);
28210 }
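// User-level sketch of the carry-in special case above (intrinsic name is an
// assumed mapping, not taken from this file):
//   unsigned out;
//   _addcarry_u32(0, x, y, &out);  // constant 0 carry-in -> plain ADD
//   _addcarry_u32(c, x, y, &out);  // otherwise ADD(c, -1) recreates CF, then ADC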
28211 case CVTPD2PS_MASK:
28212 case CVTPD2DQ_MASK:
28213 case CVTQQ2PS_MASK:
28214 case TRUNCATE_TO_REG: {
28215 SDValue Src = Op.getOperand(1);
28216 SDValue PassThru = Op.getOperand(2);
28217 SDValue Mask = Op.getOperand(3);
28218
28219 if (isAllOnesConstant(Mask))
28220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28221
28222 MVT SrcVT = Src.getSimpleValueType();
28223 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28224 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28225 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28226 {Src, PassThru, Mask});
28227 }
28228 case CVTPS2PH_MASK: {
28229 SDValue Src = Op.getOperand(1);
28230 SDValue Rnd = Op.getOperand(2);
28231 SDValue PassThru = Op.getOperand(3);
28232 SDValue Mask = Op.getOperand(4);
28233
28234 unsigned RC = 0;
28235 unsigned Opc = IntrData->Opc0;
28236 bool SAE = Src.getValueType().is512BitVector() &&
28237 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28238 if (SAE) {
28239 Opc = X86ISD::CVTPS2PH_SAE;
28240 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28241 }
28242
28243 if (isAllOnesConstant(Mask))
28244 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28245
28246 if (SAE)
28247 Opc = X86ISD::MCVTPS2PH_SAE;
28248 else
28249 Opc = IntrData->Opc1;
28250 MVT SrcVT = Src.getSimpleValueType();
28251 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28252 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28253 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28254 }
28255 case CVTNEPS2BF16_MASK: {
28256 SDValue Src = Op.getOperand(1);
28257 SDValue PassThru = Op.getOperand(2);
28258 SDValue Mask = Op.getOperand(3);
28259
28260 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28261 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28262
28263 // Break false dependency.
28264 if (PassThru.isUndef())
28265 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28266
28267 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28268 Mask);
28269 }
28270 default:
28271 break;
28272 }
28273 }
28274
28275 switch (IntNo) {
28276 default: return SDValue(); // Don't custom lower most intrinsics.
28277
28278 // ptest and testp intrinsics. The intrinsics these come from are designed to
28279 // return an integer value rather than just an instruction, so lower them to
28280 // the ptest or testp pattern and a setcc for the result.
28281 case Intrinsic::x86_avx512_ktestc_b:
28282 case Intrinsic::x86_avx512_ktestc_w:
28283 case Intrinsic::x86_avx512_ktestc_d:
28284 case Intrinsic::x86_avx512_ktestc_q:
28285 case Intrinsic::x86_avx512_ktestz_b:
28286 case Intrinsic::x86_avx512_ktestz_w:
28287 case Intrinsic::x86_avx512_ktestz_d:
28288 case Intrinsic::x86_avx512_ktestz_q:
28289 case Intrinsic::x86_sse41_ptestz:
28290 case Intrinsic::x86_sse41_ptestc:
28291 case Intrinsic::x86_sse41_ptestnzc:
28292 case Intrinsic::x86_avx_ptestz_256:
28293 case Intrinsic::x86_avx_ptestc_256:
28294 case Intrinsic::x86_avx_ptestnzc_256:
28295 case Intrinsic::x86_avx_vtestz_ps:
28296 case Intrinsic::x86_avx_vtestc_ps:
28297 case Intrinsic::x86_avx_vtestnzc_ps:
28298 case Intrinsic::x86_avx_vtestz_pd:
28299 case Intrinsic::x86_avx_vtestc_pd:
28300 case Intrinsic::x86_avx_vtestnzc_pd:
28301 case Intrinsic::x86_avx_vtestz_ps_256:
28302 case Intrinsic::x86_avx_vtestc_ps_256:
28303 case Intrinsic::x86_avx_vtestnzc_ps_256:
28304 case Intrinsic::x86_avx_vtestz_pd_256:
28305 case Intrinsic::x86_avx_vtestc_pd_256:
28306 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28307 unsigned TestOpc = X86ISD::PTEST;
28308 X86::CondCode X86CC;
28309 switch (IntNo) {
28310 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28310)
;
28311 case Intrinsic::x86_avx512_ktestc_b:
28312 case Intrinsic::x86_avx512_ktestc_w:
28313 case Intrinsic::x86_avx512_ktestc_d:
28314 case Intrinsic::x86_avx512_ktestc_q:
28315 // CF = 1
28316 TestOpc = X86ISD::KTEST;
28317 X86CC = X86::COND_B;
28318 break;
28319 case Intrinsic::x86_avx512_ktestz_b:
28320 case Intrinsic::x86_avx512_ktestz_w:
28321 case Intrinsic::x86_avx512_ktestz_d:
28322 case Intrinsic::x86_avx512_ktestz_q:
28323 TestOpc = X86ISD::KTEST;
28324 X86CC = X86::COND_E;
28325 break;
28326 case Intrinsic::x86_avx_vtestz_ps:
28327 case Intrinsic::x86_avx_vtestz_pd:
28328 case Intrinsic::x86_avx_vtestz_ps_256:
28329 case Intrinsic::x86_avx_vtestz_pd_256:
28330 TestOpc = X86ISD::TESTP;
28331 [[fallthrough]];
28332 case Intrinsic::x86_sse41_ptestz:
28333 case Intrinsic::x86_avx_ptestz_256:
28334 // ZF = 1
28335 X86CC = X86::COND_E;
28336 break;
28337 case Intrinsic::x86_avx_vtestc_ps:
28338 case Intrinsic::x86_avx_vtestc_pd:
28339 case Intrinsic::x86_avx_vtestc_ps_256:
28340 case Intrinsic::x86_avx_vtestc_pd_256:
28341 TestOpc = X86ISD::TESTP;
28342 [[fallthrough]];
28343 case Intrinsic::x86_sse41_ptestc:
28344 case Intrinsic::x86_avx_ptestc_256:
28345 // CF = 1
28346 X86CC = X86::COND_B;
28347 break;
28348 case Intrinsic::x86_avx_vtestnzc_ps:
28349 case Intrinsic::x86_avx_vtestnzc_pd:
28350 case Intrinsic::x86_avx_vtestnzc_ps_256:
28351 case Intrinsic::x86_avx_vtestnzc_pd_256:
28352 TestOpc = X86ISD::TESTP;
28353 [[fallthrough]];
28354 case Intrinsic::x86_sse41_ptestnzc:
28355 case Intrinsic::x86_avx_ptestnzc_256:
28356 // ZF and CF = 0
28357 X86CC = X86::COND_A;
28358 break;
28359 }
28360
28361 SDValue LHS = Op.getOperand(1);
28362 SDValue RHS = Op.getOperand(2);
28363 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28364 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28365 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28366 }
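// For example, a zero test such as _mm_testz_si128(a, b) (assumed builtin
// mapping) becomes PTEST(a, b) followed by setcc(COND_E) on ZF and a
// ZERO_EXTEND to i32, matching the table above.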
28367
28368 case Intrinsic::x86_sse42_pcmpistria128:
28369 case Intrinsic::x86_sse42_pcmpestria128:
28370 case Intrinsic::x86_sse42_pcmpistric128:
28371 case Intrinsic::x86_sse42_pcmpestric128:
28372 case Intrinsic::x86_sse42_pcmpistrio128:
28373 case Intrinsic::x86_sse42_pcmpestrio128:
28374 case Intrinsic::x86_sse42_pcmpistris128:
28375 case Intrinsic::x86_sse42_pcmpestris128:
28376 case Intrinsic::x86_sse42_pcmpistriz128:
28377 case Intrinsic::x86_sse42_pcmpestriz128: {
28378 unsigned Opcode;
28379 X86::CondCode X86CC;
28380 switch (IntNo) {
28381 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28381)
; // Can't reach here.
28382 case Intrinsic::x86_sse42_pcmpistria128:
28383 Opcode = X86ISD::PCMPISTR;
28384 X86CC = X86::COND_A;
28385 break;
28386 case Intrinsic::x86_sse42_pcmpestria128:
28387 Opcode = X86ISD::PCMPESTR;
28388 X86CC = X86::COND_A;
28389 break;
28390 case Intrinsic::x86_sse42_pcmpistric128:
28391 Opcode = X86ISD::PCMPISTR;
28392 X86CC = X86::COND_B;
28393 break;
28394 case Intrinsic::x86_sse42_pcmpestric128:
28395 Opcode = X86ISD::PCMPESTR;
28396 X86CC = X86::COND_B;
28397 break;
28398 case Intrinsic::x86_sse42_pcmpistrio128:
28399 Opcode = X86ISD::PCMPISTR;
28400 X86CC = X86::COND_O;
28401 break;
28402 case Intrinsic::x86_sse42_pcmpestrio128:
28403 Opcode = X86ISD::PCMPESTR;
28404 X86CC = X86::COND_O;
28405 break;
28406 case Intrinsic::x86_sse42_pcmpistris128:
28407 Opcode = X86ISD::PCMPISTR;
28408 X86CC = X86::COND_S;
28409 break;
28410 case Intrinsic::x86_sse42_pcmpestris128:
28411 Opcode = X86ISD::PCMPESTR;
28412 X86CC = X86::COND_S;
28413 break;
28414 case Intrinsic::x86_sse42_pcmpistriz128:
28415 Opcode = X86ISD::PCMPISTR;
28416 X86CC = X86::COND_E;
28417 break;
28418 case Intrinsic::x86_sse42_pcmpestriz128:
28419 Opcode = X86ISD::PCMPESTR;
28420 X86CC = X86::COND_E;
28421 break;
28422 }
28423 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28424 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28425 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28426 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28427 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28428 }
28429
28430 case Intrinsic::x86_sse42_pcmpistri128:
28431 case Intrinsic::x86_sse42_pcmpestri128: {
28432 unsigned Opcode;
28433 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28434 Opcode = X86ISD::PCMPISTR;
28435 else
28436 Opcode = X86ISD::PCMPESTR;
28437
28438 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28439 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28440 return DAG.getNode(Opcode, dl, VTs, NewOps);
28441 }
28442
28443 case Intrinsic::x86_sse42_pcmpistrm128:
28444 case Intrinsic::x86_sse42_pcmpestrm128: {
28445 unsigned Opcode;
28446 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28447 Opcode = X86ISD::PCMPISTR;
28448 else
28449 Opcode = X86ISD::PCMPESTR;
28450
28451 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28452 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28453 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28454 }
28455
28456 case Intrinsic::eh_sjlj_lsda: {
28457 MachineFunction &MF = DAG.getMachineFunction();
28458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28459 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28460 auto &Context = MF.getMMI().getContext();
28461 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28462 Twine(MF.getFunctionNumber()));
28463 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28464 DAG.getMCSymbol(S, PtrVT));
28465 }
28466
28467 case Intrinsic::x86_seh_lsda: {
28468 // Compute the symbol for the LSDA. We know it'll get emitted later.
28469 MachineFunction &MF = DAG.getMachineFunction();
28470 SDValue Op1 = Op.getOperand(1);
28471 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28472 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28473 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28474
28475 // Generate a simple absolute symbol reference. This intrinsic is only
28476 // supported on 32-bit Windows, which isn't PIC.
28477 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28478 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28479 }
28480
28481 case Intrinsic::eh_recoverfp: {
28482 SDValue FnOp = Op.getOperand(1);
28483 SDValue IncomingFPOp = Op.getOperand(2);
28484 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28485 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28486 if (!Fn)
28487 report_fatal_error(
28488 "llvm.eh.recoverfp must take a function as the first argument");
28489 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28490 }
28491
28492 case Intrinsic::localaddress: {
28493 // Returns one of the stack, base, or frame pointer registers, depending on
28494 // which is used to reference local variables.
28495 MachineFunction &MF = DAG.getMachineFunction();
28496 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28497 unsigned Reg;
28498 if (RegInfo->hasBasePointer(MF))
28499 Reg = RegInfo->getBaseRegister();
28500 else { // Handles the SP or FP case.
28501 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28502 if (CantUseFP)
28503 Reg = RegInfo->getPtrSizedStackRegister(MF);
28504 else
28505 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28506 }
28507 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28508 }
28509 case Intrinsic::x86_avx512_vp2intersect_q_512:
28510 case Intrinsic::x86_avx512_vp2intersect_q_256:
28511 case Intrinsic::x86_avx512_vp2intersect_q_128:
28512 case Intrinsic::x86_avx512_vp2intersect_d_512:
28513 case Intrinsic::x86_avx512_vp2intersect_d_256:
28514 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28515 MVT MaskVT = Op.getSimpleValueType();
28516
28517 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28518 SDLoc DL(Op);
28519
28520 SDValue Operation =
28521 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28522 Op->getOperand(1), Op->getOperand(2));
28523
28524 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28525 MaskVT, Operation);
28526 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28527 MaskVT, Operation);
28528 return DAG.getMergeValues({Result0, Result1}, DL);
28529 }
28530 case Intrinsic::x86_mmx_pslli_w:
28531 case Intrinsic::x86_mmx_pslli_d:
28532 case Intrinsic::x86_mmx_pslli_q:
28533 case Intrinsic::x86_mmx_psrli_w:
28534 case Intrinsic::x86_mmx_psrli_d:
28535 case Intrinsic::x86_mmx_psrli_q:
28536 case Intrinsic::x86_mmx_psrai_w:
28537 case Intrinsic::x86_mmx_psrai_d: {
28538 SDLoc DL(Op);
28539 SDValue ShAmt = Op.getOperand(2);
28540 // If the argument is a constant, convert it to a target constant.
28541 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28542 // Clamp out-of-bounds shift amounts since they will otherwise be masked
28543 // to 8 bits, which may make them no longer out of bounds.
28544 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28545 if (ShiftAmount == 0)
28546 return Op.getOperand(1);
28547
28548 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28549 Op.getOperand(0), Op.getOperand(1),
28550 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28551 }
28552
28553 unsigned NewIntrinsic;
28554 switch (IntNo) {
28555 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28555)
; // Can't reach here.
28556 case Intrinsic::x86_mmx_pslli_w:
28557 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28558 break;
28559 case Intrinsic::x86_mmx_pslli_d:
28560 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28561 break;
28562 case Intrinsic::x86_mmx_pslli_q:
28563 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28564 break;
28565 case Intrinsic::x86_mmx_psrli_w:
28566 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28567 break;
28568 case Intrinsic::x86_mmx_psrli_d:
28569 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28570 break;
28571 case Intrinsic::x86_mmx_psrli_q:
28572 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28573 break;
28574 case Intrinsic::x86_mmx_psrai_w:
28575 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28576 break;
28577 case Intrinsic::x86_mmx_psrai_d:
28578 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28579 break;
28580 }
28581
28582 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
28583 // but the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
28584 // MMX register.
28585 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28586 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28587 DAG.getTargetConstant(NewIntrinsic, DL,
28588 getPointerTy(DAG.getDataLayout())),
28589 Op.getOperand(1), ShAmt);
28590 }
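// Sketch of the two MMX paths above (builtin name is an assumed mapping):
//   __m64 a = _mm_slli_pi16(m, 3);  // constant: clamped to <= 255 and kept as
//                                   // an immediate intrinsic (0 just returns m)
//   __m64 b = _mm_slli_pi16(m, n);  // variable: n is moved into an MMX register
//                                   // and the non-immediate psll_w form is used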
28591 case Intrinsic::thread_pointer: {
28592 if (Subtarget.isTargetELF()) {
28593 SDLoc dl(Op);
28594 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28595 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28596 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28597 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28598 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28599 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28600 }
28601 report_fatal_error(
28602 "Target OS doesn't support __builtin_thread_pointer() yet.");
28603 }
28604 }
28605}
28606
28607static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28608 SDValue Src, SDValue Mask, SDValue Base,
28609 SDValue Index, SDValue ScaleOp, SDValue Chain,
28610 const X86Subtarget &Subtarget) {
28611 SDLoc dl(Op);
28612 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28613 // Scale must be constant.
28614 if (!C)
28615 return SDValue();
28616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28617 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28618 TLI.getPointerTy(DAG.getDataLayout()));
28619 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28620 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28621 // If source is undef or we know it won't be used, use a zero vector
28622 // to break register dependency.
28623 // TODO: use undef instead and let BreakFalseDeps deal with it?
28624 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28625 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28626
28627 // Cast mask to an integer type.
28628 Mask = DAG.getBitcast(MaskVT, Mask);
28629
28630 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28631
28632 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28633 SDValue Res =
28634 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28635 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28636 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28637}
28638
28639static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28640 SDValue Src, SDValue Mask, SDValue Base,
28641 SDValue Index, SDValue ScaleOp, SDValue Chain,
28642 const X86Subtarget &Subtarget) {
28643 MVT VT = Op.getSimpleValueType();
28644 SDLoc dl(Op);
28645 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28646 // Scale must be constant.
28647 if (!C)
28648 return SDValue();
28649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28650 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28651 TLI.getPointerTy(DAG.getDataLayout()));
28652 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28653 VT.getVectorNumElements());
28654 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28655
28656 // We support two versions of the gather intrinsics. One with scalar mask and
28657 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28658 if (Mask.getValueType() != MaskVT)
28659 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28660
28661 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28662 // If source is undef or we know it won't be used, use a zero vector
28663 // to break register dependency.
28664 // TODO: use undef instead and let BreakFalseDeps deal with it?
28665 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28666 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28667
28668 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28669
28670 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28671 SDValue Res =
28672 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28673 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28674 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28675}
28676
28677static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28678 SDValue Src, SDValue Mask, SDValue Base,
28679 SDValue Index, SDValue ScaleOp, SDValue Chain,
28680 const X86Subtarget &Subtarget) {
28681 SDLoc dl(Op);
28682 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28683 // Scale must be constant.
28684 if (!C)
28685 return SDValue();
28686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28687 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28688 TLI.getPointerTy(DAG.getDataLayout()));
28689 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28690 Src.getSimpleValueType().getVectorNumElements());
28691 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28692
28693 // We support two versions of the scatter intrinsics. One with scalar mask and
28694 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28695 if (Mask.getValueType() != MaskVT)
28696 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28697
28698 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28699
28700 SDVTList VTs = DAG.getVTList(MVT::Other);
28701 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28702 SDValue Res =
28703 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28704 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28705 return Res;
28706}
28707
28708static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28709 SDValue Mask, SDValue Base, SDValue Index,
28710 SDValue ScaleOp, SDValue Chain,
28711 const X86Subtarget &Subtarget) {
28712 SDLoc dl(Op);
28713 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28714 // Scale must be constant.
28715 if (!C)
28716 return SDValue();
28717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28718 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28719 TLI.getPointerTy(DAG.getDataLayout()));
28720 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28721 SDValue Segment = DAG.getRegister(0, MVT::i32);
28722 MVT MaskVT =
28723 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28724 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28725 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28726 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28727 return SDValue(Res, 0);
28728}
28729
28730/// Handles the lowering of builtin intrinsics with chain that return their
28731/// value into registers EDX:EAX.
28732 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28733/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28734/// TargetOpcode.
28735/// Returns a Glue value which can be used to add extra copy-from-reg if the
28736 /// expanded intrinsic implicitly defines extra registers (i.e. not just
28737/// EDX:EAX).
28738static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28739 SelectionDAG &DAG,
28740 unsigned TargetOpcode,
28741 unsigned SrcReg,
28742 const X86Subtarget &Subtarget,
28743 SmallVectorImpl<SDValue> &Results) {
28744 SDValue Chain = N->getOperand(0);
28745 SDValue Glue;
28746
28747 if (SrcReg) {
28748 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28749 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28750 Glue = Chain.getValue(1);
28751 }
28752
28753 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28754 SDValue N1Ops[] = {Chain, Glue};
28755 SDNode *N1 = DAG.getMachineNode(
28756 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28757 Chain = SDValue(N1, 0);
28758
28759 // The expanded instruction returns its result in registers EDX:EAX.
28760 SDValue LO, HI;
28761 if (Subtarget.is64Bit()) {
28762 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28763 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28764 LO.getValue(2));
28765 } else {
28766 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28767 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28768 LO.getValue(2));
28769 }
28770 Chain = HI.getValue(1);
28771 Glue = HI.getValue(2);
28772
28773 if (Subtarget.is64Bit()) {
28774 // Merge the two 32-bit values into a 64-bit one.
28775 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28776 DAG.getConstant(32, DL, MVT::i8));
28777 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28778 Results.push_back(Chain);
28779 return Glue;
28780 }
28781
28782 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28783 SDValue Ops[] = { LO, HI };
28784 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28785 Results.push_back(Pair);
28786 Results.push_back(Chain);
28787 return Glue;
28788}
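// On 64-bit targets the helper above effectively computes
//   Result64 = LO | (HI << 32)
// from RAX/RDX, while 32-bit targets build an i64 BUILD_PAIR from EAX/EDX;
// this is just a restatement of the two branches in the code.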
28789
28790/// Handles the lowering of builtin intrinsics that read the time stamp counter
28791/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28792/// READCYCLECOUNTER nodes.
28793static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28794 SelectionDAG &DAG,
28795 const X86Subtarget &Subtarget,
28796 SmallVectorImpl<SDValue> &Results) {
28797 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28798 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28799 // and the EAX register is loaded with the low-order 32 bits.
28800 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28801 /* NoRegister */0, Subtarget,
28802 Results);
28803 if (Opcode != X86::RDTSCP)
28804 return;
28805
28806 SDValue Chain = Results[1];
28807 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
28808 // the ECX register. Add 'ecx' explicitly to the chain.
28809 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28810 Results[1] = ecx;
28811 Results.push_back(ecx.getValue(1));
28812}
28813
28814static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28815 SelectionDAG &DAG) {
28816 SmallVector<SDValue, 3> Results;
28817 SDLoc DL(Op);
28818 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28819 Results);
28820 return DAG.getMergeValues(Results, DL);
28821}
28822
28823static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28824 MachineFunction &MF = DAG.getMachineFunction();
28825 SDValue Chain = Op.getOperand(0);
28826 SDValue RegNode = Op.getOperand(2);
28827 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28828 if (!EHInfo)
28829 report_fatal_error("EH registrations only live in functions using WinEH");
28830
28831 // Cast the operand to an alloca, and remember the frame index.
28832 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28833 if (!FINode)
28834 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28835 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28836
28837 // Return the chain operand without making any DAG nodes.
28838 return Chain;
28839}
28840
28841static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28842 MachineFunction &MF = DAG.getMachineFunction();
28843 SDValue Chain = Op.getOperand(0);
28844 SDValue EHGuard = Op.getOperand(2);
28845 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28846 if (!EHInfo)
28847 report_fatal_error("EHGuard only live in functions using WinEH");
28848
28849 // Cast the operand to an alloca, and remember the frame index.
28850 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28851 if (!FINode)
28852 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28853 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28854
28855 // Return the chain operand without making any DAG nodes.
28856 return Chain;
28857}
28858
28859/// Emit Truncating Store with signed or unsigned saturation.
28860static SDValue
28861EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28862 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28863 SelectionDAG &DAG) {
28864 SDVTList VTs = DAG.getVTList(MVT::Other);
28865 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28866 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28867 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28868 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28869}
28870
28871/// Emit Masked Truncating Store with signed or unsigned saturation.
28872static SDValue
28873EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28874 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28875 MachineMemOperand *MMO, SelectionDAG &DAG) {
28876 SDVTList VTs = DAG.getVTList(MVT::Other);
28877 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28878 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28879 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28880}
28881
28882static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28883 SelectionDAG &DAG) {
28884 unsigned IntNo = Op.getConstantOperandVal(1);
28885 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28886 if (!IntrData) {
28887 switch (IntNo) {
28888
28889 case Intrinsic::swift_async_context_addr: {
28890 SDLoc dl(Op);
28891 auto &MF = DAG.getMachineFunction();
28892 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28893 if (Subtarget.is64Bit()) {
28894 MF.getFrameInfo().setFrameAddressIsTaken(true);
28895 X86FI->setHasSwiftAsyncContext(true);
28896 SDValue Chain = Op->getOperand(0);
28897 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28898 SDValue Result =
28899 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28900 DAG.getTargetConstant(8, dl, MVT::i32)),
28901 0);
28902 // Return { result, chain }.
28903 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28904 CopyRBP.getValue(1));
28905 } else {
28906 // 32-bit, so there is no special extended frame; create or reuse an
28907 // existing stack slot.
28908 if (!X86FI->getSwiftAsyncContextFrameIdx())
28909 X86FI->setSwiftAsyncContextFrameIdx(
28910 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28911 SDValue Result =
28912 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28913 // Return { result, chain }.
28914 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28915 Op->getOperand(0));
28916 }
28917 }
28918
28919 case llvm::Intrinsic::x86_seh_ehregnode:
28920 return MarkEHRegistrationNode(Op, DAG);
28921 case llvm::Intrinsic::x86_seh_ehguard:
28922 return MarkEHGuard(Op, DAG);
28923 case llvm::Intrinsic::x86_rdpkru: {
28924 SDLoc dl(Op);
28925 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28926 // Create a RDPKRU node and pass 0 to the ECX parameter.
28927 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28928 DAG.getConstant(0, dl, MVT::i32));
28929 }
28930 case llvm::Intrinsic::x86_wrpkru: {
28931 SDLoc dl(Op);
28932 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28933 // to the EDX and ECX parameters.
28934 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28935 Op.getOperand(0), Op.getOperand(2),
28936 DAG.getConstant(0, dl, MVT::i32),
28937 DAG.getConstant(0, dl, MVT::i32));
28938 }
28939 case llvm::Intrinsic::asan_check_memaccess: {
28940 // Mark this as adjustsStack because it will be lowered to a call.
28941 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28942 // Don't do anything here, we will expand these intrinsics out later.
28943 return Op;
28944 }
28945 case llvm::Intrinsic::x86_flags_read_u32:
28946 case llvm::Intrinsic::x86_flags_read_u64:
28947 case llvm::Intrinsic::x86_flags_write_u32:
28948 case llvm::Intrinsic::x86_flags_write_u64: {
28949 // We need a frame pointer because this will get lowered to a PUSH/POP
28950 // sequence.
28951 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28952 MFI.setHasCopyImplyingStackAdjustment(true);
28953 // Don't do anything here, we will expand these intrinsics out later
28954 // during FinalizeISel in EmitInstrWithCustomInserter.
28955 return Op;
28956 }
28957 case Intrinsic::x86_lwpins32:
28958 case Intrinsic::x86_lwpins64:
28959 case Intrinsic::x86_umwait:
28960 case Intrinsic::x86_tpause: {
28961 SDLoc dl(Op);
28962 SDValue Chain = Op->getOperand(0);
28963 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28964 unsigned Opcode;
28965
28966 switch (IntNo) {
28967 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28967)
;
28968 case Intrinsic::x86_umwait:
28969 Opcode = X86ISD::UMWAIT;
28970 break;
28971 case Intrinsic::x86_tpause:
28972 Opcode = X86ISD::TPAUSE;
28973 break;
28974 case Intrinsic::x86_lwpins32:
28975 case Intrinsic::x86_lwpins64:
28976 Opcode = X86ISD::LWPINS;
28977 break;
28978 }
28979
28980 SDValue Operation =
28981 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28982 Op->getOperand(3), Op->getOperand(4));
28983 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28984 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28985 Operation.getValue(1));
28986 }
28987 case Intrinsic::x86_enqcmd:
28988 case Intrinsic::x86_enqcmds: {
28989 SDLoc dl(Op);
28990 SDValue Chain = Op.getOperand(0);
28991 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28992 unsigned Opcode;
28993 switch (IntNo) {
28994 default: llvm_unreachable("Impossible intrinsic!")::llvm::llvm_unreachable_internal("Impossible intrinsic!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28994)
;
28995 case Intrinsic::x86_enqcmd:
28996 Opcode = X86ISD::ENQCMD;
28997 break;
28998 case Intrinsic::x86_enqcmds:
28999 Opcode = X86ISD::ENQCMDS;
29000 break;
29001 }
29002 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
29003 Op.getOperand(3));
29004 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
29005 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29006 Operation.getValue(1));
29007 }
29008 case Intrinsic::x86_aesenc128kl:
29009 case Intrinsic::x86_aesdec128kl:
29010 case Intrinsic::x86_aesenc256kl:
29011 case Intrinsic::x86_aesdec256kl: {
29012 SDLoc DL(Op);
29013 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
29014 SDValue Chain = Op.getOperand(0);
29015 unsigned Opcode;
29016
29017 switch (IntNo) {
29018 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29018)
;
29019 case Intrinsic::x86_aesenc128kl:
29020 Opcode = X86ISD::AESENC128KL;
29021 break;
29022 case Intrinsic::x86_aesdec128kl:
29023 Opcode = X86ISD::AESDEC128KL;
29024 break;
29025 case Intrinsic::x86_aesenc256kl:
29026 Opcode = X86ISD::AESENC256KL;
29027 break;
29028 case Intrinsic::x86_aesdec256kl:
29029 Opcode = X86ISD::AESDEC256KL;
29030 break;
29031 }
29032
29033 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29034 MachineMemOperand *MMO = MemIntr->getMemOperand();
29035 EVT MemVT = MemIntr->getMemoryVT();
29036 SDValue Operation = DAG.getMemIntrinsicNode(
29037 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
29038 MMO);
29039 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
29040
29041 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29042 {ZF, Operation.getValue(0), Operation.getValue(2)});
29043 }
29044 case Intrinsic::x86_aesencwide128kl:
29045 case Intrinsic::x86_aesdecwide128kl:
29046 case Intrinsic::x86_aesencwide256kl:
29047 case Intrinsic::x86_aesdecwide256kl: {
29048 SDLoc DL(Op);
29049 SDVTList VTs = DAG.getVTList(
29050 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
29051 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
29052 SDValue Chain = Op.getOperand(0);
29053 unsigned Opcode;
29054
29055 switch (IntNo) {
29056 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29056)
;
29057 case Intrinsic::x86_aesencwide128kl:
29058 Opcode = X86ISD::AESENCWIDE128KL;
29059 break;
29060 case Intrinsic::x86_aesdecwide128kl:
29061 Opcode = X86ISD::AESDECWIDE128KL;
29062 break;
29063 case Intrinsic::x86_aesencwide256kl:
29064 Opcode = X86ISD::AESENCWIDE256KL;
29065 break;
29066 case Intrinsic::x86_aesdecwide256kl:
29067 Opcode = X86ISD::AESDECWIDE256KL;
29068 break;
29069 }
29070
29071 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29072 MachineMemOperand *MMO = MemIntr->getMemOperand();
29073 EVT MemVT = MemIntr->getMemoryVT();
29074 SDValue Operation = DAG.getMemIntrinsicNode(
29075 Opcode, DL, VTs,
29076 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29077 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29078 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29079 MemVT, MMO);
29080 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29081
29082 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29083 {ZF, Operation.getValue(1), Operation.getValue(2),
29084 Operation.getValue(3), Operation.getValue(4),
29085 Operation.getValue(5), Operation.getValue(6),
29086 Operation.getValue(7), Operation.getValue(8),
29087 Operation.getValue(9)});
29088 }
29089 case Intrinsic::x86_testui: {
29090 SDLoc dl(Op);
29091 SDValue Chain = Op.getOperand(0);
29092 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29093 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29094 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29095 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29096 Operation.getValue(1));
29097 }
29098 case Intrinsic::x86_atomic_bts_rm:
29099 case Intrinsic::x86_atomic_btc_rm:
29100 case Intrinsic::x86_atomic_btr_rm: {
29101 SDLoc DL(Op);
29102 MVT VT = Op.getSimpleValueType();
29103 SDValue Chain = Op.getOperand(0);
29104 SDValue Op1 = Op.getOperand(2);
29105 SDValue Op2 = Op.getOperand(3);
29106 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29107 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29108 : X86ISD::LBTR_RM;
29109 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29110 SDValue Res =
29111 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29112 {Chain, Op1, Op2}, VT, MMO);
29113 Chain = Res.getValue(1);
29114 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29115 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29116 }
29117 case Intrinsic::x86_atomic_bts:
29118 case Intrinsic::x86_atomic_btc:
29119 case Intrinsic::x86_atomic_btr: {
29120 SDLoc DL(Op);
29121 MVT VT = Op.getSimpleValueType();
29122 SDValue Chain = Op.getOperand(0);
29123 SDValue Op1 = Op.getOperand(2);
29124 SDValue Op2 = Op.getOperand(3);
29125 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29126 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29127 : X86ISD::LBTR;
29128 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29129 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29130 SDValue Res =
29131 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29132 {Chain, Op1, Op2, Size}, VT, MMO);
29133 Chain = Res.getValue(1);
29134 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29135 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29136 if (Imm)
29137 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29138 DAG.getShiftAmountConstant(Imm, VT, DL));
29139 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29140 }
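// Sketch of the result shaping above: the locked bit-test returns the old bit
// in CF, so for a constant bit index Imm the final value is roughly
//   Res = (CF ? 1 : 0) << Imm;
// which places the tested bit back at its original position within VT.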
29141 case Intrinsic::x86_cmpccxadd32:
29142 case Intrinsic::x86_cmpccxadd64: {
29143 SDLoc DL(Op);
29144 SDValue Chain = Op.getOperand(0);
29145 SDValue Addr = Op.getOperand(2);
29146 SDValue Src1 = Op.getOperand(3);
29147 SDValue Src2 = Op.getOperand(4);
29148 SDValue CC = Op.getOperand(5);
29149 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29150 SDValue Operation = DAG.getMemIntrinsicNode(
29151 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29152 MVT::i32, MMO);
29153 return Operation;
29154 }
29155 case Intrinsic::x86_aadd32:
29156 case Intrinsic::x86_aadd64:
29157 case Intrinsic::x86_aand32:
29158 case Intrinsic::x86_aand64:
29159 case Intrinsic::x86_aor32:
29160 case Intrinsic::x86_aor64:
29161 case Intrinsic::x86_axor32:
29162 case Intrinsic::x86_axor64: {
29163 SDLoc DL(Op);
29164 SDValue Chain = Op.getOperand(0);
29165 SDValue Op1 = Op.getOperand(2);
29166 SDValue Op2 = Op.getOperand(3);
29167 MVT VT = Op2.getSimpleValueType();
29168 unsigned Opc = 0;
29169 switch (IntNo) {
29170 default:
29171 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29171)
;
29172 case Intrinsic::x86_aadd32:
29173 case Intrinsic::x86_aadd64:
29174 Opc = X86ISD::AADD;
29175 break;
29176 case Intrinsic::x86_aand32:
29177 case Intrinsic::x86_aand64:
29178 Opc = X86ISD::AAND;
29179 break;
29180 case Intrinsic::x86_aor32:
29181 case Intrinsic::x86_aor64:
29182 Opc = X86ISD::AOR;
29183 break;
29184 case Intrinsic::x86_axor32:
29185 case Intrinsic::x86_axor64:
29186 Opc = X86ISD::AXOR;
29187 break;
29188 }
29189 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29190 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29191 {Chain, Op1, Op2}, VT, MMO);
29192 }
29193 case Intrinsic::x86_atomic_add_cc:
29194 case Intrinsic::x86_atomic_sub_cc:
29195 case Intrinsic::x86_atomic_or_cc:
29196 case Intrinsic::x86_atomic_and_cc:
29197 case Intrinsic::x86_atomic_xor_cc: {
29198 SDLoc DL(Op);
29199 SDValue Chain = Op.getOperand(0);
29200 SDValue Op1 = Op.getOperand(2);
29201 SDValue Op2 = Op.getOperand(3);
29202 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29203 MVT VT = Op2.getSimpleValueType();
29204 unsigned Opc = 0;
29205 switch (IntNo) {
29206 default:
29207 llvm_unreachable("Unknown Intrinsic")::llvm::llvm_unreachable_internal("Unknown Intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29207)
;
29208 case Intrinsic::x86_atomic_add_cc:
29209 Opc = X86ISD::LADD;
29210 break;
29211 case Intrinsic::x86_atomic_sub_cc:
29212 Opc = X86ISD::LSUB;
29213 break;
29214 case Intrinsic::x86_atomic_or_cc:
29215 Opc = X86ISD::LOR;
29216 break;
29217 case Intrinsic::x86_atomic_and_cc:
29218 Opc = X86ISD::LAND;
29219 break;
29220 case Intrinsic::x86_atomic_xor_cc:
29221 Opc = X86ISD::LXOR;
29222 break;
29223 }
29224 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29225 SDValue LockArith =
29226 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29227 {Chain, Op1, Op2}, VT, MMO);
29228 Chain = LockArith.getValue(1);
29229 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29230 }
29231 }
29232 return SDValue();
29233 }
29234
29235 SDLoc dl(Op);
29236 switch(IntrData->Type) {
29237 default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29237)
;
29238 case RDSEED:
29239 case RDRAND: {
29240 // Emit the node with the right value type.
29241 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29242 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29243
29244 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29245 // Otherwise return the value from Rand, which is always 0, cast to i32.
29246 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29247 DAG.getConstant(1, dl, Op->getValueType(1)),
29248 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29249 SDValue(Result.getNode(), 1)};
29250 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29251
29252 // Return { result, isValid, chain }.
29253 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29254 SDValue(Result.getNode(), 2));
29255 }
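// Rough shape of the "isValid" computation above, e.g. for _rdrand32_step
// (assumed builtin mapping): the CMOV yields 1 when CF=1 (the hardware
// reported a valid random value) and otherwise falls back to the returned
// value, which is 0 on failure.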
29256 case GATHER_AVX2: {
29257 SDValue Chain = Op.getOperand(0);
29258 SDValue Src = Op.getOperand(2);
29259 SDValue Base = Op.getOperand(3);
29260 SDValue Index = Op.getOperand(4);
29261 SDValue Mask = Op.getOperand(5);
29262 SDValue Scale = Op.getOperand(6);
29263 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29264 Scale, Chain, Subtarget);
29265 }
29266 case GATHER: {
29267 //gather(v1, mask, index, base, scale);
29268 SDValue Chain = Op.getOperand(0);
29269 SDValue Src = Op.getOperand(2);
29270 SDValue Base = Op.getOperand(3);
29271 SDValue Index = Op.getOperand(4);
29272 SDValue Mask = Op.getOperand(5);
29273 SDValue Scale = Op.getOperand(6);
29274 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29275 Chain, Subtarget);
29276 }
29277 case SCATTER: {
29278 //scatter(base, mask, index, v1, scale);
29279 SDValue Chain = Op.getOperand(0);
29280 SDValue Base = Op.getOperand(2);
29281 SDValue Mask = Op.getOperand(3);
29282 SDValue Index = Op.getOperand(4);
29283 SDValue Src = Op.getOperand(5);
29284 SDValue Scale = Op.getOperand(6);
29285 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29286 Scale, Chain, Subtarget);
29287 }
29288 case PREFETCH: {
29289 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29290 assert((HintVal == 2 || HintVal == 3) &&
29291 "Wrong prefetch hint in intrinsic: should be 2 or 3");
29292 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29293 SDValue Chain = Op.getOperand(0);
29294 SDValue Mask = Op.getOperand(2);
29295 SDValue Index = Op.getOperand(3);
29296 SDValue Base = Op.getOperand(4);
29297 SDValue Scale = Op.getOperand(5);
29298 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29299 Subtarget);
29300 }
29301 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29302 case RDTSC: {
29303 SmallVector<SDValue, 2> Results;
29304 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29305 Results);
29306 return DAG.getMergeValues(Results, dl);
29307 }
29308 // Read Performance Monitoring Counters.
29309 case RDPMC:
29310 // Read Processor Register.
29311 case RDPRU:
29312 // Get Extended Control Register.
29313 case XGETBV: {
29314 SmallVector<SDValue, 2> Results;
29315
29316 // RDPMC uses ECX to select the index of the performance counter to read.
29317 // RDPRU uses ECX to select the processor register to read.
29318 // XGETBV uses ECX to select the index of the XCR register to return.
29319 // The result is stored into registers EDX:EAX.
29320 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29321 Subtarget, Results);
29322 return DAG.getMergeValues(Results, dl);
29323 }
29324 // XTEST intrinsics.
29325 case XTEST: {
29326 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29327 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29328
29329 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29330 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29331 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29332 Ret, SDValue(InTrans.getNode(), 1));
29333 }
29334 case TRUNCATE_TO_MEM_VI8:
29335 case TRUNCATE_TO_MEM_VI16:
29336 case TRUNCATE_TO_MEM_VI32: {
29337 SDValue Mask = Op.getOperand(4);
29338 SDValue DataToTruncate = Op.getOperand(3);
29339 SDValue Addr = Op.getOperand(2);
29340 SDValue Chain = Op.getOperand(0);
29341
29342 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29343 assert(MemIntr && "Expected MemIntrinsicSDNode!");
29344
29345 EVT MemVT = MemIntr->getMemoryVT();
29346
29347 uint16_t TruncationOp = IntrData->Opc0;
29348 switch (TruncationOp) {
29349 case X86ISD::VTRUNC: {
29350 if (isAllOnesConstant(Mask)) // return just a truncate store
29351 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29352 MemIntr->getMemOperand());
29353
29354 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29355 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29356 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29357
29358 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29359 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29360 true /* truncating */);
29361 }
29362 case X86ISD::VTRUNCUS:
29363 case X86ISD::VTRUNCS: {
29364 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29365 if (isAllOnesConstant(Mask))
29366 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29367 MemIntr->getMemOperand(), DAG);
29368
29369 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29370 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29371
29372 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29373 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29374 }
29375 default:
29376 llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 29376)
;
29377 }
29378 }
29379 }
29380}
29381
29382SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29383 SelectionDAG &DAG) const {
29384 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29385 MFI.setReturnAddressIsTaken(true);
29386
29387 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29388 return SDValue();
29389
29390 unsigned Depth = Op.getConstantOperandVal(0);
29391 SDLoc dl(Op);
29392 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29393
29394 if (Depth > 0) {
29395 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29396 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29397 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29398 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29399 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29400 MachinePointerInfo());
29401 }
29402
29403 // Just load the return address.
29404 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29405 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29406 MachinePointerInfo());
29407}
29408
29409SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29410 SelectionDAG &DAG) const {
29411 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29412 return getReturnAddressFrameIndex(DAG);
29413}
29414
29415SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29416 MachineFunction &MF = DAG.getMachineFunction();
29417 MachineFrameInfo &MFI = MF.getFrameInfo();
29418 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29420 EVT VT = Op.getValueType();
29421
29422 MFI.setFrameAddressIsTaken(true);
29423
29424 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29425 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29426 // is not possible to crawl up the stack without looking at the unwind codes
29427 // simultaneously.
29428 int FrameAddrIndex = FuncInfo->getFAIndex();
29429 if (!FrameAddrIndex) {
29430 // Set up a frame object for the return address.
29431 unsigned SlotSize = RegInfo->getSlotSize();
29432 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29433 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29434 FuncInfo->setFAIndex(FrameAddrIndex);
29435 }
29436 return DAG.getFrameIndex(FrameAddrIndex, VT);
29437 }
29438
29439 unsigned FrameReg =
29440 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29441 SDLoc dl(Op); // FIXME probably not meaningful
29442 unsigned Depth = Op.getConstantOperandVal(0);
29443 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29444 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29445 "Invalid Frame Register!");
29446 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29447 while (Depth--)
29448 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29449 MachinePointerInfo());
29450 return FrameAddr;
29451}
29452
29453// FIXME? Maybe this could be a TableGen attribute on some registers and
29454// this table could be generated automatically from RegInfo.
29455Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29456 const MachineFunction &MF) const {
29457 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29458
29459 Register Reg = StringSwitch<unsigned>(RegName)
29460 .Case("esp", X86::ESP)
29461 .Case("rsp", X86::RSP)
29462 .Case("ebp", X86::EBP)
29463 .Case("rbp", X86::RBP)
29464 .Default(0);
29465
29466 if (Reg == X86::EBP || Reg == X86::RBP) {
29467 if (!TFI.hasFP(MF))
29468 report_fatal_error("register " + StringRef(RegName) +
29469 " is allocatable: function has no frame pointer");
29470#ifndef NDEBUG
29471 else {
29472 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29473 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29474 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29475 "Invalid Frame Register!");
29476 }
29477#endif
29478 }
29479
29480 if (Reg)
29481 return Reg;
29482
29483 report_fatal_error("Invalid register name global variable");
29484}
29485
29486SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29487 SelectionDAG &DAG) const {
29488 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29489 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29490}
29491
29492Register X86TargetLowering::getExceptionPointerRegister(
29493 const Constant *PersonalityFn) const {
29494 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29495 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29496
29497 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29498}
29499
29500Register X86TargetLowering::getExceptionSelectorRegister(
29501 const Constant *PersonalityFn) const {
29502 // Funclet personalities don't use selectors (the runtime does the selection).
29503 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29504 return X86::NoRegister;
29505 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29506}
29507
29508bool X86TargetLowering::needsFixedCatchObjects() const {
29509 return Subtarget.isTargetWin64();
29510}
29511
29512SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29513 SDValue Chain = Op.getOperand(0);
29514 SDValue Offset = Op.getOperand(1);
29515 SDValue Handler = Op.getOperand(2);
29516 SDLoc dl (Op);
29517
29518 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29519 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29520 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29521 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29522 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29523 "Invalid Frame Register!");
29524 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29525 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29526
29527 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29528 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29529 dl));
29530 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29531 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29532 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29533
29534 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29535 DAG.getRegister(StoreAddrReg, PtrVT));
29536}
29537
29538SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29539 SelectionDAG &DAG) const {
29540 SDLoc DL(Op);
29541 // If the subtarget is not 64bit, we may need the global base reg
29542 // after isel expand pseudo, i.e., after CGBR pass ran.
29543 // Therefore, ask for the GlobalBaseReg now, so that the pass
29544 // inserts the code for us in case we need it.
29545 // Otherwise, we will end up in a situation where we will
29546 // reference a virtual register that is not defined!
29547 if (!Subtarget.is64Bit()) {
29548 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29549 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29550 }
29551 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29552 DAG.getVTList(MVT::i32, MVT::Other),
29553 Op.getOperand(0), Op.getOperand(1));
29554}
29555
29556SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29557 SelectionDAG &DAG) const {
29558 SDLoc DL(Op);
29559 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29560 Op.getOperand(0), Op.getOperand(1));
29561}
29562
29563SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29564 SelectionDAG &DAG) const {
29565 SDLoc DL(Op);
29566 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29567 Op.getOperand(0));
29568}
29569
29570static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29571 return Op.getOperand(0);
29572}
29573
29574SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29575 SelectionDAG &DAG) const {
29576 SDValue Root = Op.getOperand(0);
29577 SDValue Trmp = Op.getOperand(1); // trampoline
29578 SDValue FPtr = Op.getOperand(2); // nested function
29579 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29580 SDLoc dl (Op);
29581
29582 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29584
29585 if (Subtarget.is64Bit()) {
29586 SDValue OutChains[6];
29587
29588 // Large code-model.
29589 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29590 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29591
29592 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29593 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29594
29595 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29596
29597 // Load the pointer to the nested function into R11.
29598 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29599 SDValue Addr = Trmp;
29600 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29601 Addr, MachinePointerInfo(TrmpAddr));
29602
29603 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29604 DAG.getConstant(2, dl, MVT::i64));
29605 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29606 MachinePointerInfo(TrmpAddr, 2), Align(2));
29607
29608 // Load the 'nest' parameter value into R10.
29609 // R10 is specified in X86CallingConv.td
29610 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29612 DAG.getConstant(10, dl, MVT::i64));
29613 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29614 Addr, MachinePointerInfo(TrmpAddr, 10));
29615
29616 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29617 DAG.getConstant(12, dl, MVT::i64));
29618 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29619 MachinePointerInfo(TrmpAddr, 12), Align(2));
29620
29621 // Jump to the nested function.
29622 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29623 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29624 DAG.getConstant(20, dl, MVT::i64));
29625 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29626 Addr, MachinePointerInfo(TrmpAddr, 20));
29627
29628 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29629 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29630 DAG.getConstant(22, dl, MVT::i64));
29631 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29632 Addr, MachinePointerInfo(TrmpAddr, 22));
29633
29634 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29635 } else {
29636 const Function *Func =
29637 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29638 CallingConv::ID CC = Func->getCallingConv();
29639 unsigned NestReg;
29640
29641 switch (CC) {
29642 default:
29643 llvm_unreachable("Unsupported calling convention");
29644 case CallingConv::C:
29645 case CallingConv::X86_StdCall: {
29646 // Pass 'nest' parameter in ECX.
29647 // Must be kept in sync with X86CallingConv.td
29648 NestReg = X86::ECX;
29649
29650 // Check that ECX wasn't needed by an 'inreg' parameter.
29651 FunctionType *FTy = Func->getFunctionType();
29652 const AttributeList &Attrs = Func->getAttributes();
29653
29654 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29655 unsigned InRegCount = 0;
29656 unsigned Idx = 0;
29657
29658 for (FunctionType::param_iterator I = FTy->param_begin(),
29659 E = FTy->param_end(); I != E; ++I, ++Idx)
29660 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29661 const DataLayout &DL = DAG.getDataLayout();
29662 // FIXME: should only count parameters that are lowered to integers.
29663 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29664 }
29665
29666 if (InRegCount > 2) {
29667 report_fatal_error("Nest register in use - reduce number of inreg"
29668 " parameters!");
29669 }
29670 }
29671 break;
29672 }
29673 case CallingConv::X86_FastCall:
29674 case CallingConv::X86_ThisCall:
29675 case CallingConv::Fast:
29676 case CallingConv::Tail:
29677 case CallingConv::SwiftTail:
29678 // Pass 'nest' parameter in EAX.
29679 // Must be kept in sync with X86CallingConv.td
29680 NestReg = X86::EAX;
29681 break;
29682 }
29683
29684 SDValue OutChains[4];
29685 SDValue Addr, Disp;
29686
29687 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29688 DAG.getConstant(10, dl, MVT::i32));
29689 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29690
29691 // This is storing the opcode for MOV32ri.
29692 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29693 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29694 OutChains[0] =
29695 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29696 Trmp, MachinePointerInfo(TrmpAddr));
29697
29698 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29699 DAG.getConstant(1, dl, MVT::i32));
29700 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29701 MachinePointerInfo(TrmpAddr, 1), Align(1));
29702
29703 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29704 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29705 DAG.getConstant(5, dl, MVT::i32));
29706 OutChains[2] =
29707 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29708 MachinePointerInfo(TrmpAddr, 5), Align(1));
29709
29710 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29711 DAG.getConstant(6, dl, MVT::i32));
29712 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29713 MachinePointerInfo(TrmpAddr, 6), Align(1));
29714
29715 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29716 }
29717}
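// A concrete illustration of the 64-bit trampoline laid out above, using
// hypothetical operand values (FPtr == 0x1122334455667788 and
// Nest == 0x0000000011223344); the 23 stored bytes disassemble as:
//    0: 49 bb 88 77 66 55 44 33 22 11   movabsq $0x1122334455667788, %r11
//    a: 49 ba 44 33 22 11 00 00 00 00   movabsq $0x0000000011223344, %r10
//   14: 49 ff e3                        jmpq    *%r11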
29718
29719SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29720 SelectionDAG &DAG) const {
29721 /*
29722 The rounding mode is in bits 11:10 of FPSR, and has the following
29723 settings:
29724 00 Round to nearest
29725 01 Round to -inf
29726 10 Round to +inf
29727 11 Round to 0
29728
29729 GET_ROUNDING, on the other hand, expects the following:
29730 -1 Undefined
29731 0 Round to 0
29732 1 Round to nearest
29733 2 Round to +inf
29734 3 Round to -inf
29735
29736 To perform the conversion, we use a packed lookup table of the four 2-bit
29737 values that we can index by FPSR[11:10]
29738 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29739
29740 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29741 */
29742
29743 MachineFunction &MF = DAG.getMachineFunction();
29744 MVT VT = Op.getSimpleValueType();
29745 SDLoc DL(Op);
29746
29747 // Save FP Control Word to stack slot
29748 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29749 SDValue StackSlot =
29750 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29751
29752 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29753
29754 SDValue Chain = Op.getOperand(0);
29755 SDValue Ops[] = {Chain, StackSlot};
29756 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29757 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29758 Align(2), MachineMemOperand::MOStore);
29759
29760 // Load FP Control Word from stack slot
29761 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29762 Chain = CWD.getValue(1);
29763
29764 // Mask and turn the control bits into a shift for the lookup table.
29765 SDValue Shift =
29766 DAG.getNode(ISD::SRL, DL, MVT::i16,
29767 DAG.getNode(ISD::AND, DL, MVT::i16,
29768 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29769 DAG.getConstant(9, DL, MVT::i8));
29770 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29771
29772 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29773 SDValue RetVal =
29774 DAG.getNode(ISD::AND, DL, MVT::i32,
29775 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29776 DAG.getConstant(3, DL, MVT::i32));
29777
29778 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29779
29780 return DAG.getMergeValues({RetVal, Chain}, DL);
29781}
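// A minimal scalar sketch of the lookup-table conversion built above (the
// helper name is hypothetical and not part of this file):
static inline unsigned getRoundingFromFPCW(unsigned CWD) {
  // CWD & 0xc00 isolates RC (bits 11:10); >> 9 turns it into a 0/2/4/6 bit
  // shift into the packed table 0x2d == 0b00'10'11'01.
  return (0x2d >> ((CWD & 0xc00) >> 9)) & 3;
}
// e.g. RC=00 (nearest) -> 1, RC=01 (-inf) -> 3, RC=10 (+inf) -> 2,
//      RC=11 (toward zero) -> 0, matching the GET_ROUNDING encoding.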
29782
29783SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29784 SelectionDAG &DAG) const {
29785 MachineFunction &MF = DAG.getMachineFunction();
29786 SDLoc DL(Op);
29787 SDValue Chain = Op.getNode()->getOperand(0);
29788
29789 // FP control word may be set only from data in memory. So we need to allocate
29790 // stack space to save/load FP control word.
29791 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29792 SDValue StackSlot =
29793 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29794 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29795 MachineMemOperand *MMO =
29796 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29797
29798 // Store FP control word into memory.
29799 SDValue Ops[] = {Chain, StackSlot};
29800 Chain = DAG.getMemIntrinsicNode(
29801 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29802
29803 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29804 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29805 Chain = CWD.getValue(1);
29806 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29807 DAG.getConstant(0xf3ff, DL, MVT::i16));
29808
29809 // Calculate new rounding mode.
29810 SDValue NewRM = Op.getNode()->getOperand(1);
29811 SDValue RMBits;
29812 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29813 uint64_t RM = CVal->getZExtValue();
29814 int FieldVal;
29815 switch (static_cast<RoundingMode>(RM)) {
29816 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29817 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29818 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29819 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29820 default:
29821 llvm_unreachable("rounding mode is not supported by X86 hardware");
29822 }
29823 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29824 } else {
29825 // Need to convert argument into bits of control word:
29826 // 0 Round to 0 -> 11
29827 // 1 Round to nearest -> 00
29828 // 2 Round to +inf -> 10
29829 // 3 Round to -inf -> 01
29830 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29831 // To make the conversion, put all these values into a value 0xc9 and shift
29832 // it left depending on the rounding mode:
29833 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29834 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29835 // ...
29836 // (0xc9 << (2 * NewRM + 4)) & 0xc00
29837 SDValue ShiftValue =
29838 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29839 DAG.getNode(ISD::ADD, DL, MVT::i32,
29840 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29841 DAG.getConstant(1, DL, MVT::i8)),
29842 DAG.getConstant(4, DL, MVT::i32)));
29843 SDValue Shifted =
29844 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29845 ShiftValue);
29846 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29847 DAG.getConstant(0xc00, DL, MVT::i16));
29848 }
29849
29850 // Update rounding mode bits and store the new FP Control Word into stack.
29851 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29852 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29853
29854 // Load FP control word from the slot.
29855 SDValue OpsLD[] = {Chain, StackSlot};
29856 MachineMemOperand *MMOL =
29857 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29858 Chain = DAG.getMemIntrinsicNode(
29859 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29860
29861 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29862 // same way but in bits 14:13.
29863 if (Subtarget.hasSSE1()) {
29864 // Store MXCSR into memory.
29865 Chain = DAG.getNode(
29866 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29867 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29868 StackSlot);
29869
29870 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29871 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29872 Chain = CWD.getValue(1);
29873 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29874 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29875
29876 // Shift X87 RM bits from 11:10 to 14:13.
29877 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29878 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29879 DAG.getConstant(3, DL, MVT::i8));
29880
29881 // Update rounding mode bits and store the new FP Control Word into stack.
29882 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29883 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29884
29885 // Load MXCSR from the slot.
29886 Chain = DAG.getNode(
29887 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29888 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29889 StackSlot);
29890 }
29891
29892 return Chain;
29893}
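// A minimal scalar sketch of the non-constant path above (the helper name is
// hypothetical): derive the x87 RC bits (11:10) from the generic rounding
// mode argument, where 0 = toward zero, 1 = nearest, 2 = +inf, 3 = -inf.
static inline unsigned rmToX87RCBits(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00u; // yields 0xc00, 0x000, 0x800, 0x400
}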
29894
29895/// Lower a vector CTLZ using native supported vector CTLZ instruction.
29896//
29897// i8/i16 vector implemented using dword LZCNT vector instruction
29898// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29899 // split the vector, perform the operation on its Lo and Hi parts and
29900// concatenate the results.
29901static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29902 const X86Subtarget &Subtarget) {
29903 assert(Op.getOpcode() == ISD::CTLZ);
29904 SDLoc dl(Op);
29905 MVT VT = Op.getSimpleValueType();
29906 MVT EltVT = VT.getVectorElementType();
29907 unsigned NumElems = VT.getVectorNumElements();
29908
29909 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29910 "Unsupported element type");
29911
29912 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29913 if (NumElems > 16 ||
29914 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29915 return splitVectorIntUnary(Op, DAG);
29916
29917 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29918 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29919 "Unsupported value type for operation");
29920
29921 // Use native supported vector instruction vplzcntd.
29922 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29923 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29924 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29925 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29926
29927 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29928}
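// A scalar sketch of the per-element math above, assuming an i8 element and
// a 32-bit lzcnt (the helper name is hypothetical): the dword count
// over-counts by 32 - 8 = 24 leading zeros, which the trailing SUB removes.
static inline unsigned ctlz8ViaDword(unsigned X /* 0..255 */) {
  unsigned Lz32 = X ? (unsigned)__builtin_clz(X) : 32; // vplzcntd per element
  return Lz32 - (32 - 8);                              // Delta = 32 - EltBits
}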
29929
29930// Lower CTLZ using a PSHUFB lookup table implementation.
29931static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29932 const X86Subtarget &Subtarget,
29933 SelectionDAG &DAG) {
29934 MVT VT = Op.getSimpleValueType();
29935 int NumElts = VT.getVectorNumElements();
29936 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29937 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29938
29939 // Per-nibble leading zero PSHUFB lookup table.
29940 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29941 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29942 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29943 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29944
29945 SmallVector<SDValue, 64> LUTVec;
29946 for (int i = 0; i < NumBytes; ++i)
29947 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29948 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29949
29950 // Begin by bitcasting the input to byte vector, then split those bytes
29951 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29952 // If the hi input nibble is zero then we add both results together, otherwise
29953 // we just take the hi result (by masking the lo result to zero before the
29954 // add).
29955 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29956 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29957
29958 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29959 SDValue Lo = Op0;
29960 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29961 SDValue HiZ;
29962 if (CurrVT.is512BitVector()) {
29963 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29964 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29965 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29966 } else {
29967 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29968 }
29969
29970 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29971 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29972 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29973 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29974
29975 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29976 // of the current vector width in the same way we did for the nibbles.
29977 // If the upper half of the input element is zero then add the halves'
29978 // leading zero counts together, otherwise just use the upper half's.
29979 // Double the width of the result until we are at target width.
29980 while (CurrVT != VT) {
29981 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29982 int CurrNumElts = CurrVT.getVectorNumElements();
29983 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29984 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29985 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29986
29987 // Check if the upper half of the input element is zero.
29988 if (CurrVT.is512BitVector()) {
29989 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29990 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29991 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29992 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29993 } else {
29994 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29995 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29996 }
29997 HiZ = DAG.getBitcast(NextVT, HiZ);
29998
29999 // Move the upper/lower halves to the lower bits as we'll be extending to
30000 // NextVT. Mask the lower result to zero if HiZ is true and add the results
30001 // together.
30002 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
30003 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
30004 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
30005 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
30006 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
30007 CurrVT = NextVT;
30008 }
30009
30010 return Res;
30011}
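// A scalar model of the per-byte PSHUFB combine above (the helper name is
// hypothetical): when the high nibble is zero both nibble counts are added;
// otherwise the AND with HiZ zeroes the low count so only the high nibble's
// count survives.
static inline unsigned ctlz8ViaNibbleLUT(unsigned X /* 0..255 */) {
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Lo = LUT[X & 0xf], Hi = LUT[X >> 4];
  return (X >> 4) == 0 ? Lo + Hi : Hi;
}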
30012
30013static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
30014 const X86Subtarget &Subtarget,
30015 SelectionDAG &DAG) {
30016 MVT VT = Op.getSimpleValueType();
30017
30018 if (Subtarget.hasCDI() &&
30019 // vXi8 vectors need to be promoted to 512-bits for vXi32.
30020 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
30021 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
30022
30023 // Decompose 256-bit ops into smaller 128-bit ops.
30024 if (VT.is256BitVector() && !Subtarget.hasInt256())
30025 return splitVectorIntUnary(Op, DAG);
30026
30027 // Decompose 512-bit ops into smaller 256-bit ops.
30028 if (VT.is512BitVector() && !Subtarget.hasBWI())
30029 return splitVectorIntUnary(Op, DAG);
30030
30031 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
30032 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
30033}
30034
30035static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
30036 SelectionDAG &DAG) {
30037 MVT VT = Op.getSimpleValueType();
30038 MVT OpVT = VT;
30039 unsigned NumBits = VT.getSizeInBits();
30040 SDLoc dl(Op);
30041 unsigned Opc = Op.getOpcode();
30042
30043 if (VT.isVector())
30044 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
30045
30046 Op = Op.getOperand(0);
30047 if (VT == MVT::i8) {
30048 // Zero extend to i32 since there is not an i8 bsr.
30049 OpVT = MVT::i32;
30050 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
30051 }
30052
30053 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
30054 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
30055 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
30056
30057 if (Opc == ISD::CTLZ) {
30058 // If src is zero (i.e. bsr sets ZF), returns NumBits.
30059 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
30060 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30061 Op.getValue(1)};
30062 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
30063 }
30064
30065 // Finally xor with NumBits-1.
30066 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30067 DAG.getConstant(NumBits - 1, dl, OpVT));
30068
30069 if (VT == MVT::i8)
30070 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30071 return Op;
30072}
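// A scalar sketch of the BSR-based lowering above (the helper name is
// hypothetical): for non-zero inputs CTLZ == 31 - BSR, and since BSR < 32
// the subtraction equals XOR with 31; the CMOV first patches the zero case
// to 2*32 - 1 == 63, which XORs down to the expected result of 32.
static inline unsigned ctlz32ViaBSR(unsigned X) {
  unsigned BSR = X ? 31u - (unsigned)__builtin_clz(X) : 63u; // CMOV on ZF
  return BSR ^ 31u;
}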
30073
30074static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30075 SelectionDAG &DAG) {
30076 MVT VT = Op.getSimpleValueType();
30077 unsigned NumBits = VT.getScalarSizeInBits();
30078 SDValue N0 = Op.getOperand(0);
30079 SDLoc dl(Op);
30080
30081 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30082 "Only scalar CTTZ requires custom lowering");
30083
30084 // Issue a bsf (scan bits forward) which also sets EFLAGS.
30085 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30086 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30087
30088 // If src is known never zero we can skip the CMOV.
30089 if (DAG.isKnownNeverZero(N0))
30090 return Op;
30091
30092 // If src is zero (i.e. bsf sets ZF), returns NumBits.
30093 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30094 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30095 Op.getValue(1)};
30096 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30097}
30098
30099static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30100 const X86Subtarget &Subtarget) {
30101 MVT VT = Op.getSimpleValueType();
30102 if (VT == MVT::i16 || VT == MVT::i32)
30103 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30104
30105 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30106 return splitVectorIntBinary(Op, DAG);
30107
30108 assert(Op.getSimpleValueType().is256BitVector() &&
30109 Op.getSimpleValueType().isInteger() &&
30110 "Only handle AVX 256-bit vector integer operation");
30111 return splitVectorIntBinary(Op, DAG);
30112}
30113
30114static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30115 const X86Subtarget &Subtarget) {
30116 MVT VT = Op.getSimpleValueType();
30117 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30118 unsigned Opcode = Op.getOpcode();
30119 SDLoc DL(Op);
30120
30121 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30122 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30123 assert(Op.getSimpleValueType().isInteger() &&
30124 "Only handle AVX vector integer operation");
30125 return splitVectorIntBinary(Op, DAG);
30126 }
30127
30128 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30129 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30130 EVT SetCCResultType =
30131 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30132
30133 unsigned BitWidth = VT.getScalarSizeInBits();
30134 if (Opcode == ISD::USUBSAT) {
30135 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30136 // Handle a special-case with a bit-hack instead of cmp+select:
30137 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30138 // If the target can use VPTERNLOG, DAGToDAG will match this as
30139 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30140 // "broadcast" constant load.
30141 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30142 if (C && C->getAPIntValue().isSignMask()) {
30143 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30144 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30145 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30146 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30147 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30148 }
30149 }
30150 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30151 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30152 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30153 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30154 // TODO: Move this to DAGCombiner?
30155 if (SetCCResultType == VT &&
30156 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30157 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30158 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30159 }
30160 }
30161
30162 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30163 (!VT.isVector() || VT == MVT::v2i64)) {
30164 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30165 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30166 SDValue Zero = DAG.getConstant(0, DL, VT);
30167 SDValue Result =
30168 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30169 DAG.getVTList(VT, SetCCResultType), X, Y);
30170 SDValue SumDiff = Result.getValue(0);
30171 SDValue Overflow = Result.getValue(1);
30172 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30173 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30174 SDValue SumNeg =
30175 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30176 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30177 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30178 }
30179
30180 // Use default expansion.
30181 return SDValue();
30182}
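// A scalar check of the usubsat bit-hack above (illustrative only, 8-bit
// case): with Y equal to the sign mask 0x80, usubsat(X, Y) == (X ^ Y) & (X s>> 7).
static inline unsigned usubsat8SignMask(unsigned X /* 0..255 */) {
  unsigned Xor = (X ^ 0x80u) & 0xffu;         // X - 128 modulo 256
  unsigned Sra = (X & 0x80u) ? 0xffu : 0x00u; // arithmetic shift right by BW-1
  return Xor & Sra;                           // 0 whenever X < 128
}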
30183
30184static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30185 SelectionDAG &DAG) {
30186 MVT VT = Op.getSimpleValueType();
30187 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30188 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30189 // 8-bit integer abs to NEG and CMOV.
30190 SDLoc DL(Op);
30191 SDValue N0 = Op.getOperand(0);
30192 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30193 DAG.getConstant(0, DL, VT), N0);
30194 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30195 SDValue(Neg.getNode(), 1)};
30196 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30197 }
30198
30199 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30200 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30201 SDLoc DL(Op);
30202 SDValue Src = Op.getOperand(0);
30203 SDValue Sub =
30204 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30205 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30206 }
30207
30208 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30209 assert(VT.isInteger() &&
30210 "Only handle AVX 256-bit vector integer operation");
30211 return splitVectorIntUnary(Op, DAG);
30212 }
30213
30214 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30215 return splitVectorIntUnary(Op, DAG);
30216
30217 // Default to expand.
30218 return SDValue();
30219}
30220
30221static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30222 SelectionDAG &DAG) {
30223 MVT VT = Op.getSimpleValueType();
30224
30225 // For AVX1 cases, split to use legal ops.
30226 if (VT.is256BitVector() && !Subtarget.hasInt256())
30227 return splitVectorIntBinary(Op, DAG);
30228
30229 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30230 return splitVectorIntBinary(Op, DAG);
30231
30232 // Default to expand.
30233 return SDValue();
30234}
30235
30236static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30237 SelectionDAG &DAG) {
30238 MVT VT = Op.getSimpleValueType();
30239
30240 // For AVX1 cases, split to use legal ops.
30241 if (VT.is256BitVector() && !Subtarget.hasInt256())
30242 return splitVectorIntBinary(Op, DAG);
30243
30244 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30245 return splitVectorIntBinary(Op, DAG);
30246
30247 // Default to expand.
30248 return SDValue();
30249}
30250
30251static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
30252 SelectionDAG &DAG) {
30253 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
30254 "Expected FMAXIMUM or FMINIMUM opcode");
30255 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30256 EVT VT = Op.getValueType();
30257 SDValue X = Op.getOperand(0);
30258 SDValue Y = Op.getOperand(1);
30259 SDLoc DL(Op);
30260 uint64_t SizeInBits = VT.getFixedSizeInBits();
30261 APInt PreferredZero = APInt::getZero(SizeInBits);
30262 EVT IVT = MVT::getIntegerVT(SizeInBits);
30263 X86ISD::NodeType MinMaxOp;
30264 if (Op.getOpcode() == ISD::FMAXIMUM) {
30265 MinMaxOp = X86ISD::FMAX;
30266 } else {
30267 PreferredZero.setSignBit();
30268 MinMaxOp = X86ISD::FMIN;
30269 }
30270 EVT SetCCType =
30271 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30272
30273 // The tables below show the expected result of Max in cases of NaN and
30274 // signed zeros.
30275 //
30276 // Y Y
30277 // Num xNaN +0 -0
30278 // --------------- ---------------
30279 // Num | Max | Y | +0 | +0 | +0 |
30280 // X --------------- X ---------------
30281 // xNaN | X | X/Y | -0 | +0 | -0 |
30282 // --------------- ---------------
30283 //
30284 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
30285 // reordering.
30286 //
30287 // We check if any of operands is NaN and return NaN. Then we check if any of
30288 // operands is zero or negative zero (for fmaximum and fminimum respectively)
30289 // to ensure the correct zero is returned.
30290 auto IsPreferredZero = [PreferredZero](SDValue Op) {
30291 Op = peekThroughBitcasts(Op);
30292 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
30293 return CstOp->getValueAPF().bitcastToAPInt() == PreferredZero;
30294 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
30295 return CstOp->getAPIntValue() == PreferredZero;
30296 return false;
30297 };
30298
30299 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
30300 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
30301 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
30302 Op->getFlags().hasNoSignedZeros() ||
30303 DAG.isKnownNeverZeroFloat(X) ||
30304 DAG.isKnownNeverZeroFloat(Y);
30305 SDValue NewX, NewY;
30306 if (IgnoreSignedZero || IsPreferredZero(Y)) {
30307 // Operands are already in right order or order does not matter.
30308 NewX = X;
30309 NewY = Y;
30310 } else if (IsPreferredZero(X)) {
30311 NewX = Y;
30312 NewY = X;
30313 } else if ((VT == MVT::f16 || Subtarget.hasDQI()) &&
30314 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
30315 if (IsXNeverNaN)
30316 std::swap(X, Y);
30317 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
30318 // to an xmm register.
30319 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
30320 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
30321 // Bits of classes:
30322 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
30323 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
30324 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
30325 DL, MVT::i32);
30326 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
30327 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
30328 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
30329 DAG.getIntPtrConstant(0, DL));
30330 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
30331 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
30332 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
30333 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30334 } else {
30335 SDValue IsXSigned;
30336 if (Subtarget.is64Bit() || VT != MVT::f64) {
30337 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
30338 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
30339 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
30340 } else {
30341 assert(VT == MVT::f64);
30342 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
30343 DAG.getConstantFP(0, DL, MVT::v2f64), X,
30344 DAG.getIntPtrConstant(0, DL));
30345 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
30346 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
30347 DAG.getIntPtrConstant(1, DL));
30348 Hi = DAG.getBitcast(MVT::i32, Hi);
30349 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
30350 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
30351 *DAG.getContext(), MVT::i32);
30352 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
30353 }
30354 if (MinMaxOp == X86ISD::FMAX) {
30355 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30356 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30357 } else {
30358 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30359 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30360 }
30361 }
30362
30363 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
30364 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
30365
30366 // If we did not reorder the operands for signed-zero handling, but we still
30367 // need to handle NaN and we know that the second operand is not NaN, put it
30368 // in the first operand so we will not need to post-process NaN after max/min.
30369 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
30370 std::swap(NewX, NewY);
30371
30372 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30373
30374 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
30375 return MinMax;
30376
30377 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
30378 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
30379}
30380
30381static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30382 SelectionDAG &DAG) {
30383 MVT VT = Op.getSimpleValueType();
30384
30385 // For AVX1 cases, split to use legal ops.
30386 if (VT.is256BitVector() && !Subtarget.hasInt256())
30387 return splitVectorIntBinary(Op, DAG);
30388
30389 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30390 return splitVectorIntBinary(Op, DAG);
30391
30392 SDLoc dl(Op);
30393 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30395
30396 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
30397 if (VT.isScalarInteger()) {
30398 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
30399 MVT WideVT = MVT::getIntegerVT(WideBits);
30400 if (TLI.isTypeLegal(WideVT)) {
30401 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
30402 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
30403 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30404 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30405 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30406 LHS = DAG.getNode(ExtOpc, dl, WideVT, LHS);
30407 RHS = DAG.getNode(ExtOpc, dl, WideVT, RHS);
30408 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
30409 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
30410 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
30411 }
30412 }
30413
30414 // Default to expand.
30415 return SDValue();
30416}
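// A scalar sketch of the widened expansion above for the signed i8 case (the
// helper name is hypothetical): extending to i32 makes the subtraction exact,
// so abs of the wide difference truncates back to the correct result.
static inline unsigned abds8ViaWiden(int L /* sext'd i8 */, int R /* sext'd i8 */) {
  int Diff = L - R;                                   // sext + sub, cannot wrap
  return (unsigned)(Diff < 0 ? -Diff : Diff) & 0xffu; // abs + trunc
}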
30417
30418static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30419 SelectionDAG &DAG) {
30420 SDLoc dl(Op);
30421 MVT VT = Op.getSimpleValueType();
30422
30423 // Decompose 256-bit ops into 128-bit ops.
30424 if (VT.is256BitVector() && !Subtarget.hasInt256())
30425 return splitVectorIntBinary(Op, DAG);
30426
30427 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30428 return splitVectorIntBinary(Op, DAG);
30429
30430 SDValue A = Op.getOperand(0);
30431 SDValue B = Op.getOperand(1);
30432
30433 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30434 // vector pairs, multiply and truncate.
30435 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30436 unsigned NumElts = VT.getVectorNumElements();
30437
30438 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30439 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30440 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30441 return DAG.getNode(
30442 ISD::TRUNCATE, dl, VT,
30443 DAG.getNode(ISD::MUL, dl, ExVT,
30444 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30445 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30446 }
30447
30448 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30449
30450 // Extract the lo/hi parts to any extend to i16.
30451 // We're going to mask off the low byte of each result element of the
30452 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30453 // element.
30454 SDValue Undef = DAG.getUNDEF(VT);
30455 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30456 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30457
30458 SDValue BLo, BHi;
30459 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30460 // If the RHS is a constant, manually unpackl/unpackh.
30461 SmallVector<SDValue, 16> LoOps, HiOps;
30462 for (unsigned i = 0; i != NumElts; i += 16) {
30463 for (unsigned j = 0; j != 8; ++j) {
30464 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30465 MVT::i16));
30466 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30467 MVT::i16));
30468 }
30469 }
30470
30471 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30472 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30473 } else {
30474 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30475 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30476 }
30477
30478 // Multiply, mask the lower 8bits of the lo/hi results and pack.
30479 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30480 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30481 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30482 }
30483
30484 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30485 if (VT == MVT::v4i32) {
30486 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30487 "Should not custom lower when pmulld is available!");
30488
30489 // Extract the odd parts.
30490 static const int UnpackMask[] = { 1, -1, 3, -1 };
30491 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30492 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30493
30494 // Multiply the even parts.
30495 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30496 DAG.getBitcast(MVT::v2i64, A),
30497 DAG.getBitcast(MVT::v2i64, B));
30498 // Now multiply odd parts.
30499 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30500 DAG.getBitcast(MVT::v2i64, Aodds),
30501 DAG.getBitcast(MVT::v2i64, Bodds));
30502
30503 Evens = DAG.getBitcast(VT, Evens);
30504 Odds = DAG.getBitcast(VT, Odds);
30505
30506 // Merge the two vectors back together with a shuffle. This expands into 2
30507 // shuffles.
30508 static const int ShufMask[] = { 0, 4, 2, 6 };
30509 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30510 }
30511
30512 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30513 "Only know how to lower V2I64/V4I64/V8I64 multiply");
30514 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30515
30516 // Ahi = psrlqi(a, 32);
30517 // Bhi = psrlqi(b, 32);
30518 //
30519 // AloBlo = pmuludq(a, b);
30520 // AloBhi = pmuludq(a, Bhi);
30521 // AhiBlo = pmuludq(Ahi, b);
30522 //
30523 // Hi = psllqi(AloBhi + AhiBlo, 32);
30524 // return AloBlo + Hi;
30525 KnownBits AKnown = DAG.computeKnownBits(A);
30526 KnownBits BKnown = DAG.computeKnownBits(B);
30527
30528 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30529 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30530 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30531
30532 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30533 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30534 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30535
30536 SDValue Zero = DAG.getConstant(0, dl, VT);
30537
30538 // Only multiply lo/hi halves that aren't known to be zero.
30539 SDValue AloBlo = Zero;
30540 if (!ALoIsZero && !BLoIsZero)
30541 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30542
30543 SDValue AloBhi = Zero;
30544 if (!ALoIsZero && !BHiIsZero) {
30545 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30546 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30547 }
30548
30549 SDValue AhiBlo = Zero;
30550 if (!AHiIsZero && !BLoIsZero) {
30551 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30552 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30553 }
30554
30555 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30556 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30557
30558 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30559}
30560
30561static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30562 MVT VT, bool IsSigned,
30563 const X86Subtarget &Subtarget,
30564 SelectionDAG &DAG,
30565 SDValue *Low = nullptr) {
30566 unsigned NumElts = VT.getVectorNumElements();
30567
30568 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30569 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30570 // lane results back together.
30571
30572 // We'll take different approaches for signed and unsigned.
30573 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30574 // and use pmullw to calculate the full 16-bit product.
30575 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30576 // shift them left into the upper byte of each word. This allows us to use
30577 // pmulhw to calculate the full 16-bit product. This trick means we don't
30578 // need to sign extend the bytes to use pmullw.
30579
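// Illustrative annotation: worked example for one signed byte pair, a = -3 and
// b = 5. Unpacking with zeros in the low byte gives the i16 values a<<8 = -768
// and b<<8 = 1280; pmulhw returns the high 16 bits of their 32-bit product,
// (-768 * 1280) >> 16 = -15 = a*b, so the bytes never need explicit sign
// extension before the multiply.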
30580 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30581 SDValue Zero = DAG.getConstant(0, dl, VT);
30582
30583 SDValue ALo, AHi;
30584 if (IsSigned) {
30585 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30586 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30587 } else {
30588 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30589 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30590 }
30591
30592 SDValue BLo, BHi;
30593 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30594 // If the RHS is a constant, manually unpackl/unpackh and extend.
30595 SmallVector<SDValue, 16> LoOps, HiOps;
30596 for (unsigned i = 0; i != NumElts; i += 16) {
30597 for (unsigned j = 0; j != 8; ++j) {
30598 SDValue LoOp = B.getOperand(i + j);
30599 SDValue HiOp = B.getOperand(i + j + 8);
30600
30601 if (IsSigned) {
30602 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30603 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30604 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30605 DAG.getConstant(8, dl, MVT::i16));
30606 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30607 DAG.getConstant(8, dl, MVT::i16));
30608 } else {
30609 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30610 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30611 }
30612
30613 LoOps.push_back(LoOp);
30614 HiOps.push_back(HiOp);
30615 }
30616 }
30617
30618 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30619 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30620 } else if (IsSigned) {
30621 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30622 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30623 } else {
30624 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30625 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30626 }
30627
30628 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
30629 // pack back to vXi8.
30630 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30631 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30632 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30633
30634 if (Low)
30635 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30636
30637 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30638}
30639
30640static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30641 SelectionDAG &DAG) {
30642 SDLoc dl(Op);
30643 MVT VT = Op.getSimpleValueType();
30644 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30645 unsigned NumElts = VT.getVectorNumElements();
30646 SDValue A = Op.getOperand(0);
30647 SDValue B = Op.getOperand(1);
30648
30649 // Decompose 256-bit ops into 128-bit ops.
30650 if (VT.is256BitVector() && !Subtarget.hasInt256())
30651 return splitVectorIntBinary(Op, DAG);
30652
30653 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30654 return splitVectorIntBinary(Op, DAG);
30655
30656 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30657 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30658        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30659        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30660
30661 // PMULxD operations multiply each even value (starting at 0) of LHS with
30662 // the related value of RHS and produce a widened result.
30663 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30664 // => <2 x i64> <ae|cg>
30665 //
30666 // In other words, to have all the results, we need to perform two PMULxD:
30667 // 1. one with the even values.
30668 // 2. one with the odd values.
30669 // To achieve #2, we need to place the odd values at an even position.
30670 //
30671 // Place the odd value at an even position (basically, shift all values 1
30672 // step to the left):
30673 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30674 9, -1, 11, -1, 13, -1, 15, -1};
30675 // <a|b|c|d> => <b|undef|d|undef>
30676 SDValue Odd0 =
30677 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30678 // <e|f|g|h> => <f|undef|h|undef>
30679 SDValue Odd1 =
30680 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30681
30682 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30683 // ints.
30684 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30685 unsigned Opcode =
30686 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30687 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30688 // => <2 x i64> <ae|cg>
30689 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30690 DAG.getBitcast(MulVT, A),
30691 DAG.getBitcast(MulVT, B)));
30692 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30693 // => <2 x i64> <bf|dh>
30694 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30695 DAG.getBitcast(MulVT, Odd0),
30696 DAG.getBitcast(MulVT, Odd1)));
30697
30698 // Shuffle it back into the right order.
30699 SmallVector<int, 16> ShufMask(NumElts);
30700 for (int i = 0; i != (int)NumElts; ++i)
30701 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30702
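// Illustrative annotation: for NumElts == 4 the formula above yields
// ShufMask = {1, 5, 3, 7}. Mul1 viewed as v4i32 is <lo(ae)|hi(ae)|lo(cg)|hi(cg)>
// and Mul2 is <lo(bf)|hi(bf)|lo(dh)|hi(dh)>, so the shuffle collects the high
// halves <hi(ae)|hi(bf)|hi(cg)|hi(dh)>, which is exactly MULH(A, B).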
30703 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30704
30705 // If we have a signed multiply but no PMULDQ fix up the result of an
30706 // unsigned multiply.
30707 if (IsSigned && !Subtarget.hasSSE41()) {
30708 SDValue Zero = DAG.getConstant(0, dl, VT);
30709 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30710 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30711 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30712 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30713
30714 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30715 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30716 }
30717
30718 return Res;
30719 }
30720
30721 // Only i8 vectors should need custom lowering after this.
30722 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30723         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30724        "Unsupported vector type");
30725
30726 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30727 // logical shift down the upper half and pack back to i8.
30728
30729 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30730 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30731
30732 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30733 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30734 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30735 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30736 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30737 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30738 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30739 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30740 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30741 }
30742
30743 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30744}
30745
30746// Custom lowering for SMULO/UMULO.
30747static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30748 SelectionDAG &DAG) {
30749 MVT VT = Op.getSimpleValueType();
30750
30751 // Scalars defer to LowerXALUO.
30752 if (!VT.isVector())
30753 return LowerXALUO(Op, DAG);
30754
30755 SDLoc dl(Op);
30756 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30757 SDValue A = Op.getOperand(0);
30758 SDValue B = Op.getOperand(1);
30759 EVT OvfVT = Op->getValueType(1);
30760
30761 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30762 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30763 // Extract the LHS Lo/Hi vectors
30764 SDValue LHSLo, LHSHi;
30765 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30766
30767 // Extract the RHS Lo/Hi vectors
30768 SDValue RHSLo, RHSHi;
30769 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30770
30771 EVT LoOvfVT, HiOvfVT;
30772 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30773 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30774 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30775
30776 // Issue the split operations.
30777 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30778 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30779
30780 // Join the separate data results and the overflow results.
30781 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30782 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30783 Hi.getValue(1));
30784
30785 return DAG.getMergeValues({Res, Ovf}, dl);
30786 }
30787
30788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30789 EVT SetccVT =
30790 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30791
30792 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30793 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30794 unsigned NumElts = VT.getVectorNumElements();
30795 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30796 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30797 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30798 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30799 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30800
30801 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30802
30803 SDValue Ovf;
30804 if (IsSigned) {
30805 SDValue High, LowSign;
30806 if (OvfVT.getVectorElementType() == MVT::i1 &&
30807 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30808 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30809 // Shift the high down filling with sign bits.
30810 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30811 // Fill all 16 bits with the sign bit from the low.
30812 LowSign =
30813 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30814 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30815 15, DAG);
30816 SetccVT = OvfVT;
30817 if (!Subtarget.hasBWI()) {
30818 // We can't do a vXi16 compare so sign extend to v16i32.
30819 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30820 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30821 }
30822 } else {
30823 // Otherwise do the compare at vXi8.
30824 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30825 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30826 LowSign =
30827 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30828 }
30829
30830 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30831 } else {
30832 SDValue High =
30833 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30834 if (OvfVT.getVectorElementType() == MVT::i1 &&
30835 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30836 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30837 SetccVT = OvfVT;
30838 if (!Subtarget.hasBWI()) {
30839 // We can't do a vXi16 compare so sign extend to v16i32.
30840 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30841 }
30842 } else {
30843 // Otherwise do the compare at vXi8.
30844 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30845 }
30846
30847 Ovf =
30848 DAG.getSetCC(dl, SetccVT, High,
30849 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30850 }
30851
30852 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30853
30854 return DAG.getMergeValues({Low, Ovf}, dl);
30855 }
30856
30857 SDValue Low;
30858 SDValue High =
30859 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30860
30861 SDValue Ovf;
30862 if (IsSigned) {
30863 // SMULO overflows if the high bits don't match the sign of the low.
30864 SDValue LowSign =
30865 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30866 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30867 } else {
30868 // UMULO overflows if the high bits are non-zero.
30869 Ovf =
30870 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30871 }
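// Illustrative annotation: overflow examples for one i8 lane. UMULO with
// a = 20, b = 13 gives the 16-bit product 260 = 0x0104, so High = 0x01 != 0
// and the lane overflows. SMULO with a = 20, b = -13 gives -260 = 0xFEFC:
// Low = 0xFC is negative, so its sign-replicated value 0xFF differs from
// High = 0xFE and the lane overflows as well.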
30872
30873 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30874
30875 return DAG.getMergeValues({Low, Ovf}, dl);
30876}
30877
30878SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30879 assert(Subtarget.isTargetWin64() && "Unexpected target");
30880 EVT VT = Op.getValueType();
30881 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30882        "Unexpected return type for lowering");
30883
30884 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30885 SmallVector<SDValue> Result;
30886 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30887 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30888 }
30889
30890 RTLIB::Libcall LC;
30891 bool isSigned;
30892 switch (Op->getOpcode()) {
30893 default: llvm_unreachable("Unexpected request for libcall!");
30894 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30895 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30896 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30897 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30898 }
30899
30900 SDLoc dl(Op);
30901 SDValue InChain = DAG.getEntryNode();
30902
30903 TargetLowering::ArgListTy Args;
30904 TargetLowering::ArgListEntry Entry;
30905 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30906 EVT ArgVT = Op->getOperand(i).getValueType();
30907 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30908        "Unexpected argument type for lowering");
30909 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30910 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30911 MachinePointerInfo MPI =
30912 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30913 Entry.Node = StackPtr;
30914 InChain =
30915 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30916 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30917 Entry.Ty = PointerType::get(ArgTy,0);
30918 Entry.IsSExt = false;
30919 Entry.IsZExt = false;
30920 Args.push_back(Entry);
30921 }
30922
30923 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30924 getPointerTy(DAG.getDataLayout()));
30925
30926 TargetLowering::CallLoweringInfo CLI(DAG);
30927 CLI.setDebugLoc(dl)
30928 .setChain(InChain)
30929 .setLibCallee(
30930 getLibcallCallingConv(LC),
30931 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30932 std::move(Args))
30933 .setInRegister()
30934 .setSExtResult(isSigned)
30935 .setZExtResult(!isSigned);
30936
30937 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30938 return DAG.getBitcast(VT, CallInfo.first);
30939}
30940
30941SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30942 SelectionDAG &DAG,
30943 SDValue &Chain) const {
30944 assert(Subtarget.isTargetWin64() && "Unexpected target");
30945 EVT VT = Op.getValueType();
30946 bool IsStrict = Op->isStrictFPOpcode();
30947
30948 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30949 EVT ArgVT = Arg.getValueType();
30950
30951 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30952        "Unexpected return type for lowering");
30953
30954 RTLIB::Libcall LC;
30955 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30956 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30957 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30958 else
30959 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30960 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30961
30962 SDLoc dl(Op);
30963 MakeLibCallOptions CallOptions;
30964 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30965
30966 SDValue Result;
30967 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30968 // expected VT (i128).
30969 std::tie(Result, Chain) =
30970 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30971 Result = DAG.getBitcast(VT, Result);
30972 return Result;
30973}
30974
30975SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30976 SelectionDAG &DAG) const {
30977 assert(Subtarget.isTargetWin64() && "Unexpected target");
30978 EVT VT = Op.getValueType();
30979 bool IsStrict = Op->isStrictFPOpcode();
30980
30981 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30982 EVT ArgVT = Arg.getValueType();
30983
30984 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30985        "Unexpected argument type for lowering");
30986
30987 RTLIB::Libcall LC;
30988 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30989 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30990 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30991 else
30992 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30993 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30994
30995 SDLoc dl(Op);
30996 MakeLibCallOptions CallOptions;
30997 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30998
30999 // Pass the i128 argument as an indirect argument on the stack.
31000 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
31001 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31002 MachinePointerInfo MPI =
31003 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31004 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
31005
31006 SDValue Result;
31007 std::tie(Result, Chain) =
31008 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
31009 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
31010}
31011
31012// Return true if the required (according to Opcode) shift-imm form is natively
31013// supported by the Subtarget
31014static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
31015 unsigned Opcode) {
31016 if (!VT.isSimple())
31017 return false;
31018
31019 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31020 return false;
31021
31022 if (VT.getScalarSizeInBits() < 16)
31023 return false;
31024
31025 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
31026 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
31027 return true;
31028
31029 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
31030 (VT.is256BitVector() && Subtarget.hasInt256());
31031
31032 bool AShift = LShift && (Subtarget.hasAVX512() ||
31033 (VT != MVT::v2i64 && VT != MVT::v4i64));
31034 return (Opcode == ISD::SRA) ? AShift : LShift;
31035}
31036
31037// The shift amount is a variable, but it is the same for all vector lanes.
31038// These instructions are defined together with shift-immediate.
31039static
31040bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
31041 unsigned Opcode) {
31042 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
31043}
31044
31045// Return true if the required (according to Opcode) variable-shift form is
31046// natively supported by the Subtarget
31047static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
31048 unsigned Opcode) {
31049 if (!VT.isSimple())
31050 return false;
31051
31052 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31053 return false;
31054
31055 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
31056 return false;
31057
31058 // vXi16 supported only on AVX-512, BWI
31059 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
31060 return false;
31061
31062 if (Subtarget.hasAVX512() &&
31063 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
31064 return true;
31065
31066 bool LShift = VT.is128BitVector() || VT.is256BitVector();
31067 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
31068 return (Opcode == ISD::SRA) ? AShift : LShift;
31069}
31070
31071static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
31072 const X86Subtarget &Subtarget) {
31073 MVT VT = Op.getSimpleValueType();
31074 SDLoc dl(Op);
31075 SDValue R = Op.getOperand(0);
31076 SDValue Amt = Op.getOperand(1);
31077 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
31078
31079 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
31080 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
31081 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
31082 SDValue Ex = DAG.getBitcast(ExVT, R);
31083
31084 // ashr(R, 63) === cmp_slt(R, 0)
31085 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
31086 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
31087        "Unsupported PCMPGT op");
31088 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
31089 }
31090
31091 if (ShiftAmt >= 32) {
31092 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
31093 SDValue Upper =
31094 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
31095 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31096 ShiftAmt - 32, DAG);
31097 if (VT == MVT::v2i64)
31098 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
31099 if (VT == MVT::v4i64)
31100 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31101 {9, 1, 11, 3, 13, 5, 15, 7});
31102 } else {
31103 // SRA upper i32, SRL whole i64 and select lower i32.
31104 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31105 ShiftAmt, DAG);
31106 SDValue Lower =
31107 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
31108 Lower = DAG.getBitcast(ExVT, Lower);
31109 if (VT == MVT::v2i64)
31110 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
31111 if (VT == MVT::v4i64)
31112 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31113 {8, 1, 10, 3, 12, 5, 14, 7});
31114 }
31115 return DAG.getBitcast(VT, Ex);
31116 };
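// Illustrative annotation: the lambda above emulates a 64-bit arithmetic shift
// with i32 operations. E.g. ShiftAmt == 40 (>= 32): both VSRAI shifts (by 31
// and by 40 - 32 = 8) are applied to the v4i32 view, and the shuffle keeps,
// per 64-bit lane, the shifted upper i32 as the new low half and its sign
// splat (the sra-by-31 result) as the new high half.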
31117
31118 // Optimize shl/srl/sra with constant shift amount.
31119 APInt APIntShiftAmt;
31120 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
31121 return SDValue();
31122
31123 // If the shift amount is out of range, return undef.
31124 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
31125 return DAG.getUNDEF(VT);
31126
31127 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
31128
31129 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
31130 // Hardware support for vector shifts is sparse which makes us scalarize the
31131 // vector operations in many cases. Also, on sandybridge ADD is faster than
31132 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
31133 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31134 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31135 // must be 0). (add undef, undef) however can be any value. To make this
31136 // safe, we must freeze R to ensure that register allocation uses the same
31137 // register for an undefined value. This ensures that the result will
31138 // still be even and preserves the original semantics.
31139 R = DAG.getFreeze(R);
31140 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31141 }
31142
31143 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
31144 }
31145
31146 // i64 SRA needs to be performed as partial shifts.
31147 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
31148 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
31149 Op.getOpcode() == ISD::SRA)
31150 return ArithmeticShiftRight64(ShiftAmt);
31151
31152 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31153 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
31154 unsigned NumElts = VT.getVectorNumElements();
31155 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31156
31157 // Simple i8 add case
31158 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31159 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31160 // must be 0). (add undef, undef) however can be any value. To make this
31161 // safe, we must freeze R to ensure that register allocation uses the same
31162 // register for an undefined value. This ensures that the result will
31163 // still be even and preserves the original semantics.
31164 R = DAG.getFreeze(R);
31165 return DAG.getNode(ISD::ADD, dl, VT, R, R);
31166 }
31167
31168 // ashr(R, 7) === cmp_slt(R, 0)
31169 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
31170 SDValue Zeros = DAG.getConstant(0, dl, VT);
31171 if (VT.is512BitVector()) {
31172 assert(VT == MVT::v64i8 && "Unexpected element type!");
31173 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
31174 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
31175 }
31176 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
31177 }
31178
31179 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
31180 if (VT == MVT::v16i8 && Subtarget.hasXOP())
31181 return SDValue();
31182
31183 if (Op.getOpcode() == ISD::SHL) {
31184 // Make a large shift.
31185 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
31186 ShiftAmt, DAG);
31187 SHL = DAG.getBitcast(VT, SHL);
31188 // Zero out the rightmost bits.
31189 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31190 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31191 }
31192 if (Op.getOpcode() == ISD::SRL) {
31193 // Make a large shift.
31194 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31195 ShiftAmt, DAG);
31196 SRL = DAG.getBitcast(VT, SRL);
31197 // Zero out the leftmost bits.
31198 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31199 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31200 }
31201 if (Op.getOpcode() == ISD::SRA) {
31202 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
31203 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31204
31205 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31206 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31207 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31208 return Res;
31209 }
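// Illustrative annotation: the xor/sub identity above re-applies the sign.
// E.g. ShiftAmt == 2 and R == 0x80 (-128): lshr gives 0x20, Mask = 128 >> 2
// = 0x20, the xor gives 0x00 and the subtract yields 0xE0 == -32, matching
// -128 >> 2 performed arithmetically.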
31210 llvm_unreachable("Unknown shift opcode.");
31211 }
31212
31213 return SDValue();
31214}
31215
31216static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31217 const X86Subtarget &Subtarget) {
31218 MVT VT = Op.getSimpleValueType();
31219 SDLoc dl(Op);
31220 SDValue R = Op.getOperand(0);
31221 SDValue Amt = Op.getOperand(1);
31222 unsigned Opcode = Op.getOpcode();
31223 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31224
31225 int BaseShAmtIdx = -1;
31226 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31227 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31228 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31229 Subtarget, DAG);
31230
31231 // vXi8 shifts - shift as v8i16 + mask result.
31232 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31233 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31234 VT == MVT::v64i8) &&
31235 !Subtarget.hasXOP()) {
31236 unsigned NumElts = VT.getVectorNumElements();
31237 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31238 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31239 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31240 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31241
31242 // Create the mask using vXi16 shifts. For shift-rights we need to move
31243 // the upper byte down before splatting the vXi8 mask.
31244 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31245 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31246 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31247 if (Opcode != ISD::SHL)
31248 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31249 8, DAG);
31250 BitMask = DAG.getBitcast(VT, BitMask);
31251 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31252 SmallVector<int, 64>(NumElts, 0));
31253
31254 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31255 DAG.getBitcast(ExtVT, R), BaseShAmt,
31256 BaseShAmtIdx, Subtarget, DAG);
31257 Res = DAG.getBitcast(VT, Res);
31258 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31259
31260 if (Opcode == ISD::SRA) {
31261 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31262 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31263 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31264 SignMask =
31265 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31266 BaseShAmtIdx, Subtarget, DAG);
31267 SignMask = DAG.getBitcast(VT, SignMask);
31268 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31269 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31270 }
31271 return Res;
31272 }
31273 }
31274 }
31275
31276 return SDValue();
31277}
31278
31279// Convert a shift/rotate left amount to a multiplication scale factor.
31280static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31281 const X86Subtarget &Subtarget,
31282 SelectionDAG &DAG) {
31283 MVT VT = Amt.getSimpleValueType();
31284 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31285 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31286 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31287 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31288 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31289 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31290 return SDValue();
31291
31292 MVT SVT = VT.getVectorElementType();
31293 unsigned SVTBits = SVT.getSizeInBits();
31294 unsigned NumElems = VT.getVectorNumElements();
31295
31296 APInt UndefElts;
31297 SmallVector<APInt> EltBits;
31298 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31299 APInt One(SVTBits, 1);
31300 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31301 for (unsigned I = 0; I != NumElems; ++I) {
31302 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31303 continue;
31304 uint64_t ShAmt = EltBits[I].getZExtValue();
31305 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31306 }
31307 return DAG.getBuildVector(VT, dl, Elts);
31308 }
31309
31310 // If the target doesn't support variable shifts, use either FP conversion
31311 // or integer multiplication to avoid shifting each element individually.
31312 if (VT == MVT::v4i32) {
31313 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31314 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31315 DAG.getConstant(0x3f800000U, dl, VT));
31316 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31317 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31318 }
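// Illustrative annotation: the v4i32 path above builds the float 2^Amt
// directly. Shifting Amt into the exponent field (bit 23) and adding
// 0x3f800000 (the bit pattern of 1.0f) produces a float whose exponent is
// bias + Amt, i.e. 2^Amt, which FP_TO_SINT converts back to the integer scale
// 1 << Amt. E.g. Amt == 5: (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f.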
31319
31320 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31321 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31322 SDValue Z = DAG.getConstant(0, dl, VT);
31323 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31324 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31325 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31326 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31327 if (Subtarget.hasSSE41())
31328 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31329 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31330 }
31331
31332 return SDValue();
31333}
31334
31335static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31336 SelectionDAG &DAG) {
31337 MVT VT = Op.getSimpleValueType();
31338 SDLoc dl(Op);
31339 SDValue R = Op.getOperand(0);
31340 SDValue Amt = Op.getOperand(1);
31341 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31342 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31343
31344 unsigned Opc = Op.getOpcode();
31345 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31346 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31347
31348 assert(VT.isVector() && "Custom lowering only for vector shifts!");
31349 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31350
31351 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31352 return V;
31353
31354 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31355 return V;
31356
31357 if (supportedVectorVarShift(VT, Subtarget, Opc))
31358 return Op;
31359
31360 // i64 vector arithmetic shift can be emulated with the transform:
31361 // M = lshr(SIGN_MASK, Amt)
31362 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31363 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31364 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31365 Opc == ISD::SRA) {
31366 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31367 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31368 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31369 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31370 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31371 return R;
31372 }
31373
31374 // XOP has 128-bit variable logical/arithmetic shifts.
31375 // +ve/-ve Amt = shift left/right.
31376 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31377 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31378 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31379 SDValue Zero = DAG.getConstant(0, dl, VT);
31380 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31381 }
31382 if (Opc == ISD::SHL || Opc == ISD::SRL)
31383 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31384 if (Opc == ISD::SRA)
31385 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31386 }
31387
31388 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31389 // shifts per-lane and then shuffle the partial results back together.
31390 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31391 // Splat the shift amounts so the scalar shifts above will catch it.
31392 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31393 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31394 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31395 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31396 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31397 }
31398
31399 // If possible, lower this shift as a sequence of two shifts by
31400 // constant plus a BLENDing shuffle instead of scalarizing it.
31401 // Example:
31402 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31403 //
31404 // Could be rewritten as:
31405 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31406 //
31407 // The advantage is that the two shifts from the example would be
31408 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31409 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31410 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31411 SDValue Amt1, Amt2;
31412 unsigned NumElts = VT.getVectorNumElements();
31413 SmallVector<int, 8> ShuffleMask;
31414 for (unsigned i = 0; i != NumElts; ++i) {
31415 SDValue A = Amt->getOperand(i);
31416 if (A.isUndef()) {
31417 ShuffleMask.push_back(SM_SentinelUndef);
31418 continue;
31419 }
31420 if (!Amt1 || Amt1 == A) {
31421 ShuffleMask.push_back(i);
31422 Amt1 = A;
31423 continue;
31424 }
31425 if (!Amt2 || Amt2 == A) {
31426 ShuffleMask.push_back(i + NumElts);
31427 Amt2 = A;
31428 continue;
31429 }
31430 break;
31431 }
31432
31433 // Only perform this blend if we can perform it without loading a mask.
31434 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31435 (VT != MVT::v16i16 ||
31436 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31437 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31438 canWidenShuffleElements(ShuffleMask))) {
31439 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31440 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31441 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31442 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31443 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31444 Cst1->getZExtValue(), DAG);
31445 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31446 Cst2->getZExtValue(), DAG);
31447 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31448 }
31449 }
31450 }
31451
31452 // If possible, lower this packed shift into a vector multiply instead of
31453 // expanding it into a sequence of scalar shifts.
31454 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31455 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31456 Subtarget.canExtendTo512BW())))
31457 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31458 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31459
31460 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31461 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31462 if (Opc == ISD::SRL && ConstantAmt &&
31463 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31464 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31465 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31466 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31467 SDValue Zero = DAG.getConstant(0, dl, VT);
31468 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31469 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31470 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31471 }
31472 }
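// Illustrative annotation: the MULHU rewrite above relies on
//   srl(x, a) == mulhu(x, 1 << (16 - a))  for 16-bit lanes and 1 <= a <= 15,
// e.g. a == 4: mulhu(x, 0x1000) == (x * 0x1000) >> 16 == x >> 4. The select
// handles a == 0, where 16 - a == 16 has no in-range scale factor.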
31473
31474 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31475 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31476 // TODO: Special case handling for shift by 0/1, really we can afford either
31477 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31478 if (Opc == ISD::SRA && ConstantAmt &&
31479 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31480 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31481 !Subtarget.hasAVX512()) ||
31482 DAG.isKnownNeverZero(Amt))) {
31483 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31484 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31485 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31486 SDValue Amt0 =
31487 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31488 SDValue Amt1 =
31489 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31490 SDValue Sra1 =
31491 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31492 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31493 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31494 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31495 }
31496 }
31497
31498 // v4i32 Non Uniform Shifts.
31499 // If the shift amount is constant we can shift each lane using the SSE2
31500 // immediate shifts, else we need to zero-extend each lane to the lower i64
31501 // and shift using the SSE2 variable shifts.
31502 // The separate results can then be blended together.
31503 if (VT == MVT::v4i32) {
31504 SDValue Amt0, Amt1, Amt2, Amt3;
31505 if (ConstantAmt) {
31506 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31507 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31508 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31509 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31510 } else {
31511 // The SSE2 shifts use the lower i64 as the same shift amount for
31512 // all lanes and the upper i64 is ignored. On AVX we're better off
31513 // just zero-extending, but for SSE just duplicating the top 16-bits is
31514 // cheaper and has the same effect for out of range values.
31515 if (Subtarget.hasAVX()) {
31516 SDValue Z = DAG.getConstant(0, dl, VT);
31517 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31518 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31519 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31520 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31521 } else {
31522 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31523 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31524 {4, 5, 6, 7, -1, -1, -1, -1});
31525 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31526 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31527 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31528 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31529 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31530 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31531 }
31532 }
31533
31534 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31535 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31536 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31537 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31538 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31539
31540 // Merge the shifted lane results optimally with/without PBLENDW.
31541 // TODO - ideally shuffle combining would handle this.
31542 if (Subtarget.hasSSE41()) {
31543 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31544 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31545 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31546 }
31547 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31548 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31549 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31550 }
31551
31552 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31553 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31554 // make the existing SSE solution better.
31555 // NOTE: We honor preferred vector width before promoting to 512-bits.
31556 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31557 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31558 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31559 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31560 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31561 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31562        "Unexpected vector type");
31563 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31564 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31565 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31566 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31567 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31568 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31569 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31570 }
31571
31572 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31573 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31574 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31575 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31576 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31577 !Subtarget.hasXOP()) {
31578 int NumElts = VT.getVectorNumElements();
31579 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31580
31581 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31582 // isn't legal).
31583 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31584 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31585 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31586 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31587 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31588        "Constant build vector expected");
31589
31590 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31591 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31592 : DAG.getZExtOrTrunc(R, dl, ExVT);
31593 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31594 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31595 return DAG.getZExtOrTrunc(R, dl, VT);
31596 }
31597
31598 SmallVector<SDValue, 16> LoAmt, HiAmt;
31599 for (int i = 0; i != NumElts; i += 16) {
31600 for (int j = 0; j != 8; ++j) {
31601 LoAmt.push_back(Amt.getOperand(i + j));
31602 HiAmt.push_back(Amt.getOperand(i + j + 8));
31603 }
31604 }
31605
31606 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31607 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31608 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31609
31610 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31611 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31612 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31613 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31614 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31615 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31616 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31617 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31618 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31619 }
31620
31621 if (VT == MVT::v16i8 ||
31622 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31623 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31624 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31625
31626 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31627 if (VT.is512BitVector()) {
31628 // On AVX512BW targets we make use of the fact that VSELECT lowers
31629 // to a masked blend which selects bytes based just on the sign bit
31630 // extracted to a mask.
31631 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31632 V0 = DAG.getBitcast(VT, V0);
31633 V1 = DAG.getBitcast(VT, V1);
31634 Sel = DAG.getBitcast(VT, Sel);
31635 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31636 ISD::SETGT);
31637 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31638 } else if (Subtarget.hasSSE41()) {
31639 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31640 // on the sign bit.
31641 V0 = DAG.getBitcast(VT, V0);
31642 V1 = DAG.getBitcast(VT, V1);
31643 Sel = DAG.getBitcast(VT, Sel);
31644 return DAG.getBitcast(SelVT,
31645 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31646 }
31647 // On pre-SSE41 targets we test for the sign bit by comparing to
31648 // zero - a negative value will set all bits of the lanes to true
31649 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31650 SDValue Z = DAG.getConstant(0, dl, SelVT);
31651 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31652 return DAG.getSelect(dl, SelVT, C, V0, V1);
31653 };
31654
31655 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31656 // We can safely do this using i16 shifts as we're only interested in
31657 // the 3 lower bits of each byte.
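      // Worked example (illustrative): for a byte amount a = 5 (0b00000101),
      // a << 5 = 0xA0, so the sign bit is set and the shift-by-4 result is
      // taken; after the first 'a += a', bit 1 (here 0) sits in the sign
      // position and r is kept; after the second 'a += a', bit 0 (here 1)
      // selects the shift-by-1 result, giving a total shift of 4 + 1 = 5.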
31658 Amt = DAG.getBitcast(ExtVT, Amt);
31659 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31660 Amt = DAG.getBitcast(VT, Amt);
31661
31662 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31663 // r = VSELECT(r, shift(r, 4), a);
31664 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31665 R = SignBitSelect(VT, Amt, M, R);
31666
31667 // a += a
31668 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31669
31670 // r = VSELECT(r, shift(r, 2), a);
31671 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31672 R = SignBitSelect(VT, Amt, M, R);
31673
31674 // a += a
31675 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31676
31677 // return VSELECT(r, shift(r, 1), a);
31678 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31679 R = SignBitSelect(VT, Amt, M, R);
31680 return R;
31681 }
31682
31683 if (Opc == ISD::SRA) {
31684 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31685 // so we can correctly sign extend. We don't care what happens to the
31686 // lower byte.
31687 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31688 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31689 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31690 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31691 ALo = DAG.getBitcast(ExtVT, ALo);
31692 AHi = DAG.getBitcast(ExtVT, AHi);
31693 RLo = DAG.getBitcast(ExtVT, RLo);
31694 RHi = DAG.getBitcast(ExtVT, RHi);
31695
31696 // r = VSELECT(r, shift(r, 4), a);
31697 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31698 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31699 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31700 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31701
31702 // a += a
31703 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31704 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31705
31706 // r = VSELECT(r, shift(r, 2), a);
31707 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31708 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31709 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31710 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31711
31712 // a += a
31713 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31714 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31715
31716 // r = VSELECT(r, shift(r, 1), a);
31717 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31718 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31719 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31720 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31721
31722 // Logical shift the result back to the lower byte, leaving a zero upper
31723 // byte meaning that we can safely pack with PACKUSWB.
31724 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31725 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31726 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31727 }
31728 }
31729
31730 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31731 MVT ExtVT = MVT::v8i32;
31732 SDValue Z = DAG.getConstant(0, dl, VT);
31733 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31734 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31735 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31736 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31737 ALo = DAG.getBitcast(ExtVT, ALo);
31738 AHi = DAG.getBitcast(ExtVT, AHi);
31739 RLo = DAG.getBitcast(ExtVT, RLo);
31740 RHi = DAG.getBitcast(ExtVT, RHi);
31741 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31742 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31743 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31744 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31745 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31746 }
31747
31748 if (VT == MVT::v8i16) {
31749 // If we have a constant shift amount, the non-SSE41 path is best as
31750 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31751 bool UseSSE41 = Subtarget.hasSSE41() &&
31752 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31753
31754 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31755 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31756 // the sign bit.
31757 if (UseSSE41) {
31758 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31759 V0 = DAG.getBitcast(ExtVT, V0);
31760 V1 = DAG.getBitcast(ExtVT, V1);
31761 Sel = DAG.getBitcast(ExtVT, Sel);
31762 return DAG.getBitcast(
31763 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31764 }
31765 // On pre-SSE41 targets we splat the sign bit - a negative value will
31766 // set all bits of the lanes to true and VSELECT uses that in
31767 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31768 SDValue C =
31769 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31770 return DAG.getSelect(dl, VT, C, V0, V1);
31771 };
31772
31773 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31774 if (UseSSE41) {
31775 // On SSE41 targets we need to replicate the shift mask in both
31776 // bytes for PBLENDVB.
31777 Amt = DAG.getNode(
31778 ISD::OR, dl, VT,
31779 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31780 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31781 } else {
31782 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31783 }
31784
31785 // r = VSELECT(r, shift(r, 8), a);
31786 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31787 R = SignBitSelect(Amt, M, R);
31788
31789 // a += a
31790 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31791
31792 // r = VSELECT(r, shift(r, 4), a);
31793 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31794 R = SignBitSelect(Amt, M, R);
31795
31796 // a += a
31797 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31798
31799 // r = VSELECT(r, shift(r, 2), a);
31800 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31801 R = SignBitSelect(Amt, M, R);
31802
31803 // a += a
31804 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31805
31806 // return VSELECT(r, shift(r, 1), a);
31807 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31808 R = SignBitSelect(Amt, M, R);
31809 return R;
31810 }
31811
31812 // Decompose 256-bit shifts into 128-bit shifts.
31813 if (VT.is256BitVector())
31814 return splitVectorIntBinary(Op, DAG);
31815
31816 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31817 return splitVectorIntBinary(Op, DAG);
31818
31819 return SDValue();
31820}
31821
31822static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31823 SelectionDAG &DAG) {
31824 MVT VT = Op.getSimpleValueType();
31825 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31826        "Unexpected funnel shift opcode!");
31827
31828 SDLoc DL(Op);
31829 SDValue Op0 = Op.getOperand(0);
31830 SDValue Op1 = Op.getOperand(1);
31831 SDValue Amt = Op.getOperand(2);
31832 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31833 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31834
31835 if (VT.isVector()) {
31836 APInt APIntShiftAmt;
31837 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31838
31839 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31840 if (IsFSHR)
31841 std::swap(Op0, Op1);
31842
31843 if (IsCstSplat) {
31844 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31845 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31846 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31847 {Op0, Op1, Imm}, DAG, Subtarget);
31848 }
31849 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31850 {Op0, Op1, Amt}, DAG, Subtarget);
31851 }
31852 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31853         VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31854         VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31855        "Unexpected funnel shift type!");
31856
31857 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31858 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
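      // Worked example (illustrative, i8 elements): fshl(0xAB, 0xCD, 3) forms
      // the i16 value 0xABCD via the unpack, 0xABCD << 3 = 0x5E68 (mod 2^16),
      // and 0x5E68 >> 8 = 0x5E, matching ((0xAB << 3) | (0xCD >> 5)) & 0xFF.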
31859 if (IsCstSplat)
31860 return SDValue();
31861
31862 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31863 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31864 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31865
31866 // Constant vXi16 funnel shifts can be efficiently handled by default.
31867 if (IsCst && EltSizeInBits == 16)
31868 return SDValue();
31869
31870 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31871 unsigned NumElts = VT.getVectorNumElements();
31872 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31873 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31874
31875 // Split 256-bit integers on XOP/pre-AVX2 targets.
31876 // Split 512-bit integers on non 512-bit BWI targets.
31877 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31878 !Subtarget.hasAVX2())) ||
31879 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31880 EltSizeInBits < 32)) {
31881 // Pre-mask the amount modulo using the wider vector.
31882 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31883 return splitVectorOp(Op, DAG);
31884 }
31885
31886 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31887 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31888 int ScalarAmtIdx = -1;
31889 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31890 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31891 if (EltSizeInBits == 16)
31892 return SDValue();
31893
31894 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31895 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31896 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31897 ScalarAmtIdx, Subtarget, DAG);
31898 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31899 ScalarAmtIdx, Subtarget, DAG);
31900 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31901 }
31902 }
31903
31904 MVT WideSVT = MVT::getIntegerVT(
31905 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31906 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31907
31908 // If per-element shifts are legal, fallback to generic expansion.
31909 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31910 return SDValue();
31911
31912 // Attempt to fold as:
31913 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31914 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31915 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31916 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31917 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31918 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31919 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31920 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31921 EltSizeInBits, DAG);
31922 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31923 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31924 if (!IsFSHR)
31925 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31926 EltSizeInBits, DAG);
31927 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31928 }
31929
31930 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31931 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31932 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31933 SDValue Z = DAG.getConstant(0, DL, VT);
31934 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31935 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31936 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31937 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31938 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31939 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31940 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31941 }
31942
31943 // Fallback to generic expansion.
31944 return SDValue();
31945 }
31946 assert(
31947     (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31948     "Unexpected funnel shift type!");
31949
31950 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31951 bool OptForSize = DAG.shouldOptForSize();
31952 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31953
31954 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31955 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31956 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31957 !isa<ConstantSDNode>(Amt)) {
31958 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31959 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31960 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31961 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31962 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31963 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31964 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31965 if (IsFSHR) {
31966 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31967 } else {
31968 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31969 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31970 }
31971 return DAG.getZExtOrTrunc(Res, DL, VT);
31972 }
31973
31974 if (VT == MVT::i8 || ExpandFunnel)
31975 return SDValue();
31976
31977 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31978 if (VT == MVT::i16) {
31979 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31980 DAG.getConstant(15, DL, Amt.getValueType()));
31981 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31982 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31983 }
31984
31985 return Op;
31986}
31987
31988static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31989 SelectionDAG &DAG) {
31990 MVT VT = Op.getSimpleValueType();
31991 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31992
31993 SDLoc DL(Op);
31994 SDValue R = Op.getOperand(0);
31995 SDValue Amt = Op.getOperand(1);
31996 unsigned Opcode = Op.getOpcode();
31997 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31998 int NumElts = VT.getVectorNumElements();
31999 bool IsROTL = Opcode == ISD::ROTL;
32000
32001 // Check for constant splat rotation amount.
32002 APInt CstSplatValue;
32003 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
32004
32005 // Check for splat rotate by zero.
32006 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
32007 return R;
32008
32009 // AVX512 implicitly uses modulo rotation amounts.
32010 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
32011 // Attempt to rotate by immediate.
32012 if (IsCstSplat) {
32013 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
32014 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32015 return DAG.getNode(RotOpc, DL, VT, R,
32016 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32017 }
32018
32019 // Else, fall-back on VPROLV/VPRORV.
32020 return Op;
32021 }
32022
32023 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
32024 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
32025 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32026 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32027 }
32028
32029 SDValue Z = DAG.getConstant(0, DL, VT);
32030
32031 if (!IsROTL) {
32032 // If the ISD::ROTR amount is constant, we're always better off converting to
32033 // ISD::ROTL.
32034 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
32035 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
32036
32037 // XOP targets always prefer ISD::ROTL.
32038 if (Subtarget.hasXOP())
32039 return DAG.getNode(ISD::ROTL, DL, VT, R,
32040 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
32041 }
32042
32043 // Split 256-bit integers on XOP/pre-AVX2 targets.
32044 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
32045 return splitVectorIntBinary(Op, DAG);
32046
32047 // XOP has 128-bit vector variable + immediate rotates.
32048 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
32049 // XOP implicitly uses modulo rotation amounts.
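      // E.g. the variable forms (VPROTB/W/D/Q) take a signed per-element count,
      // so a count of -3 is a rotate right by 3 - hence the ISD::ROTR ->
      // ISD::ROTL conversion with a negated amount above.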
32050 if (Subtarget.hasXOP()) {
32051 assert(IsROTL && "Only ROTL expected");
32052 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
32053
32054 // Attempt to rotate by immediate.
32055 if (IsCstSplat) {
32056 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32057 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
32058 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32059 }
32060
32061 // Use general rotate by variable (per-element).
32062 return Op;
32063 }
32064
32065 // Rotate by a uniform constant - expand back to shifts.
32066 if (IsCstSplat)
32067 return SDValue();
32068
32069 // Split 512-bit integers on non 512-bit BWI targets.
32070 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
32071 return splitVectorIntBinary(Op, DAG);
32072
32073 assert(
32074     (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
32075      ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
32076       Subtarget.hasAVX2()) ||
32077      ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
32078     "Only vXi32/vXi16/vXi8 vector rotates supported");
32079
32080 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
32081 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
32082
32083 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
32084 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32085
32086 // Attempt to fold as unpack(x,x) << zext(splat(y)):
32087 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32088 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
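      // Worked example (illustrative, i8 elements): rotl(0xB4, 3) unpacks to
      // the i16 value 0xB4B4, 0xB4B4 << 3 = 0xA5A0 (mod 2^16), and
      // 0xA5A0 >> 8 = 0xA5 = rotl(0xB4, 3).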
32089 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
32090 int BaseRotAmtIdx = -1;
32091 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
32092 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
32093 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32094 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32095 }
32096 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
32097 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32098 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32099 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
32100 BaseRotAmtIdx, Subtarget, DAG);
32101 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
32102 BaseRotAmtIdx, Subtarget, DAG);
32103 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32104 }
32105 }
32106
32107 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
32108 // the amount bit.
32109 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
32110 if (EltSizeInBits == 8) {
32111 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32112 MVT WideVT =
32113 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
32114 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
32115
32116 // Attempt to fold as:
32117 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
32118 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
32119 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
32120 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
32121 // If we're rotating by constant, just use default promotion.
32122 if (IsConstAmt)
32123 return SDValue();
32124 // See if we can perform this by widening to vXi16 or vXi32.
32125 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
32126 R = DAG.getNode(
32127 ISD::OR, DL, WideVT, R,
32128 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
32129 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
32130 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
32131 if (IsROTL)
32132 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
32133 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
32134 }
32135
32136 // Attempt to fold as unpack(x,x) << zext(y):
32137 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32138 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
32139 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
32140 // See if we can perform this by unpacking to lo/hi vXi16.
32141 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32142 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32143 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
32144 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
32145 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
32146 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
32147 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32148 }
32149 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
32150
32151 // We don't need ModuloAmt here as we just peek at individual bits.
32152 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
32153 if (Subtarget.hasSSE41()) {
32154 // On SSE41 targets we can use PBLENDVB which selects bytes based just
32155 // on the sign bit.
32156 V0 = DAG.getBitcast(VT, V0);
32157 V1 = DAG.getBitcast(VT, V1);
32158 Sel = DAG.getBitcast(VT, Sel);
32159 return DAG.getBitcast(SelVT,
32160 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
32161 }
32162 // On pre-SSE41 targets we test for the sign bit by comparing to
32163 // zero - a negative value will set all bits of the lanes to true
32164 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
32165 SDValue Z = DAG.getConstant(0, DL, SelVT);
32166 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
32167 return DAG.getSelect(DL, SelVT, C, V0, V1);
32168 };
32169
32170 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
32171 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
32172 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32173 IsROTL = true;
32174 }
32175
32176 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
32177 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
32178
32179 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
32180 // We can safely do this using i16 shifts as we're only interested in
32181 // the 3 lower bits of each byte.
32182 Amt = DAG.getBitcast(ExtVT, Amt);
32183 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
32184 Amt = DAG.getBitcast(VT, Amt);
32185
32186 // r = VSELECT(r, rot(r, 4), a);
32187 SDValue M;
32188 M = DAG.getNode(
32189 ISD::OR, DL, VT,
32190 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32191 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32192 R = SignBitSelect(VT, Amt, M, R);
32193
32194 // a += a
32195 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32196
32197 // r = VSELECT(r, rot(r, 2), a);
32198 M = DAG.getNode(
32199 ISD::OR, DL, VT,
32200 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32201 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32202 R = SignBitSelect(VT, Amt, M, R);
32203
32204 // a += a
32205 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32206
32207 // return VSELECT(r, rot(r, 1), a);
32208 M = DAG.getNode(
32209 ISD::OR, DL, VT,
32210 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32211 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32212 return SignBitSelect(VT, Amt, M, R);
32213 }
32214
32215 bool IsSplatAmt = DAG.isSplatValue(Amt);
32216 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32217 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32218 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32219
32220 // Fallback for splats + all supported variable shifts.
32221 // Fallback for non-constant AVX2 vXi16 as well.
32222 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32223 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32224 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32225 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32226 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32227 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32228 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32229 }
32230
32231 // Everything below assumes ISD::ROTL.
32232 if (!IsROTL) {
32233 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32234 IsROTL = true;
32235 }
32236
32237 // ISD::ROT* uses modulo rotate amounts.
32238 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32239
32240 assert(IsROTL && "Only ROTL supported");
32241
32242 // As with shifts, attempt to convert the rotation amount to a multiplication
32243 // factor; otherwise fall back to general expansion.
32244 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32245 if (!Scale)
32246 return SDValue();
32247
32248 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
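      // Worked example (illustrative): rotl(0x1234, 4) with Scale = 1 << 4:
      // the low half of the product is 0x2340, the unsigned-high half is
      // 0x0001, and OR'ing them gives 0x2341 = rotl(0x1234, 4).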
32249 if (EltSizeInBits == 16) {
32250 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32251 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32252 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32253 }
32254
32255 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32256 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32257 // that can then be OR'd with the lower 32-bits.
32258 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32259 static const int OddMask[] = {1, -1, 3, -1};
32260 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32261 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32262
32263 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32264 DAG.getBitcast(MVT::v2i64, R),
32265 DAG.getBitcast(MVT::v2i64, Scale));
32266 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32267 DAG.getBitcast(MVT::v2i64, R13),
32268 DAG.getBitcast(MVT::v2i64, Scale13));
32269 Res02 = DAG.getBitcast(VT, Res02);
32270 Res13 = DAG.getBitcast(VT, Res13);
32271
32272 return DAG.getNode(ISD::OR, DL, VT,
32273 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32274 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32275}
32276
32277/// Returns true if the operand type is exactly twice the native width, and
32278/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32279/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32280/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32281bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32282 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32283
32284 if (OpWidth == 64)
32285 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32286 if (OpWidth == 128)
32287 return Subtarget.canUseCMPXCHG16B();
32288
32289 return false;
32290}
32291
32292TargetLoweringBase::AtomicExpansionKind
32293X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32294 Type *MemType = SI->getValueOperand()->getType();
32295
32296 bool NoImplicitFloatOps =
32297 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32298 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32299 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32300 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32301 return AtomicExpansionKind::None;
32302
32303 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32304 : AtomicExpansionKind::None;
32305}
32306
32307// Note: this turns large loads into lock cmpxchg8b/16b.
32308// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32309TargetLowering::AtomicExpansionKind
32310X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32311 Type *MemType = LI->getType();
32312
32313 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32314 // can use movq to do the load. If we have X87 we can load into an 80-bit
32315 // X87 register and store it to a stack temporary.
32316 bool NoImplicitFloatOps =
32317 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32318 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32319 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32320 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32321 return AtomicExpansionKind::None;
32322
32323 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32324 : AtomicExpansionKind::None;
32325}
32326
32327enum BitTestKind : unsigned {
32328 UndefBit,
32329 ConstantBit,
32330 NotConstantBit,
32331 ShiftBit,
32332 NotShiftBit
32333};
32334
32335static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32336 using namespace llvm::PatternMatch;
32337 BitTestKind BTK = UndefBit;
32338 auto *C = dyn_cast<ConstantInt>(V);
32339 if (C) {
32340 // Check if V is a power of 2 or the bitwise NOT of a power of 2.
32341 if (isPowerOf2_64(C->getZExtValue()))
32342 BTK = ConstantBit;
32343 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32344 BTK = NotConstantBit;
32345 return {V, BTK};
32346 }
32347
32348 // Check if V is some power of 2 pattern known to be non-zero
32349 auto *I = dyn_cast<Instruction>(V);
32350 if (I) {
32351 bool Not = false;
32352 // Check if we have a NOT
32353 Value *PeekI;
32354 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32355 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32356 Not = true;
32357 I = dyn_cast<Instruction>(PeekI);
32358
32359 // If I is constant, it will fold and we can evaluate later. If it's an
32360 // argument or something of that nature, we can't analyze.
32361 if (I == nullptr)
32362 return {nullptr, UndefBit};
32363 }
32364 // We can only use 1 << X without more sophisticated analysis. C << X where
32365 // C is a power of 2 but not 1 can result in zero which cannot be translated
32366 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
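      // E.g. on i8, 2 << 7 == 0, so (2 << X) is not guaranteed to have a bit
      // set, whereas every well-defined 1 << X sets exactly one bit.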
32367 if (I->getOpcode() == Instruction::Shl) {
32368 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32369 // -X` and some other provable power of 2 patterns that we can use CTZ on
32370 // may be profitable.
32371 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32372 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32373 // be provably a non-zero power of 2.
32374 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32375 // transformable to bittest.
32376 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32377 if (!ShiftVal)
32378 return {nullptr, UndefBit};
32379 if (ShiftVal->equalsInt(1))
32380 BTK = Not ? NotShiftBit : ShiftBit;
32381
32382 if (BTK == UndefBit)
32383 return {nullptr, UndefBit};
32384
32385 Value *BitV = I->getOperand(1);
32386
32387 Value *AndOp;
32388 const APInt *AndC;
32389 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32390 // Read past a shiftmask instruction to find count
32391 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32392 BitV = AndOp;
32393 }
32394 return {BitV, BTK};
32395 }
32396 }
32397 return {nullptr, UndefBit};
32398}
32399
32400TargetLowering::AtomicExpansionKind
32401X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32402 using namespace llvm::PatternMatch;
32403 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32404 // prefix to a normal instruction for these operations.
32405 if (AI->use_empty())
32406 return AtomicExpansionKind::None;
32407
32408 if (AI->getOperation() == AtomicRMWInst::Xor) {
32409 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
32410 // preferable to both `cmpxchg` and `btc`.
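      // E.g. for i32, A ^ 0x80000000 and A + 0x80000000 both flip only bit 31,
      // since the carry out of the top bit is discarded.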
32411 if (match(AI->getOperand(1), m_SignMask()))
32412 return AtomicExpansionKind::None;
32413 }
32414
32415 // If the atomicrmw's result is used by a single bit AND, we may use
32416 // bts/btr/btc instructions for these operations.
32417 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32418 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32419 // (depending on CC). This pattern can only use bts/btr/btc but we don't
32420 // detect it.
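      // Illustrative IR (hypothetical names) of a pattern this catches:
      //   %old = atomicrmw or ptr %p, i32 %mask   ; with %mask == shl i32 1, %n
      //   %bit = and i32 %old, %mask
      // which can be lowered to a single `lock bts`, with the carry flag
      // providing the value of the tested bit.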
32421 Instruction *I = AI->user_back();
32422 auto BitChange = FindSingleBitChange(AI->getValOperand());
32423 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32424 I->getOpcode() != Instruction::And ||
32425 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32426 AI->getParent() != I->getParent())
32427 return AtomicExpansionKind::CmpXChg;
32428
32429 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32430
32431 // This is a redundant AND, it should get cleaned up elsewhere.
32432 if (AI == I->getOperand(OtherIdx))
32433 return AtomicExpansionKind::CmpXChg;
32434
32435 // The following instruction must be an AND of a single bit.
32436 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32437 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32438 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32439 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32440 return AtomicExpansionKind::CmpXChg;
32441 }
32442 if (AI->getOperation() == AtomicRMWInst::And) {
32443 return ~C1->getValue() == C2->getValue()
32444 ? AtomicExpansionKind::BitTestIntrinsic
32445 : AtomicExpansionKind::CmpXChg;
32446 }
32447 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32448 : AtomicExpansionKind::CmpXChg;
32449 }
32450
32451 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32452
32453 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32454 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32455 return AtomicExpansionKind::CmpXChg;
32456
32457 assert(BitChange.first != nullptr && BitTested.first != nullptr);
32458
32459 // If shift amounts are not the same we can't use BitTestIntrinsic.
32460 if (BitChange.first != BitTested.first)
32461 return AtomicExpansionKind::CmpXChg;
32462
32463 // For atomic AND, the value must clear exactly one bit (all bits set but
32464 // one) and the tested bit must be the one that is unset in the mask.
32465 if (AI->getOperation() == AtomicRMWInst::And)
32466 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32467 ? AtomicExpansionKind::BitTestIntrinsic
32468 : AtomicExpansionKind::CmpXChg;
32469
32470 // For atomic XOR/OR, we must be setting and testing the same bit.
32471 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32472 ? AtomicExpansionKind::BitTestIntrinsic
32473 : AtomicExpansionKind::CmpXChg;
32474}
32475
32476void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32477 IRBuilder<> Builder(AI);
32478 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32479 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32480 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32481 switch (AI->getOperation()) {
32482 default:
32483 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"llvm/lib/Target/X86/X86ISelLowering.cpp", 32483)
;
32484 case AtomicRMWInst::Or:
32485 IID_C = Intrinsic::x86_atomic_bts;
32486 IID_I = Intrinsic::x86_atomic_bts_rm;
32487 break;
32488 case AtomicRMWInst::Xor:
32489 IID_C = Intrinsic::x86_atomic_btc;
32490 IID_I = Intrinsic::x86_atomic_btc_rm;
32491 break;
32492 case AtomicRMWInst::And:
32493 IID_C = Intrinsic::x86_atomic_btr;
32494 IID_I = Intrinsic::x86_atomic_btr_rm;
32495 break;
32496 }
32497 Instruction *I = AI->user_back();
32498 LLVMContext &Ctx = AI->getContext();
32499 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32500 Type::getInt8PtrTy(Ctx));
32501 Function *BitTest = nullptr;
32502 Value *Result = nullptr;
32503 auto BitTested = FindSingleBitChange(AI->getValOperand());
32504 assert(BitTested.first != nullptr);
32505
32506 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32507 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32508
32509 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32510
32511 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32512 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32513 } else {
32514 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32515
32516 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32517
32518 Value *SI = BitTested.first;
32519 assert(SI != nullptr);
32520
32521 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
32522 // to mask it.
32523 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32524 Value *BitPos =
32525 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
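      // E.g. for an i32 operand, a shift amount of 37 is masked to 37 & 31 == 5,
      // so the BT{S|R|C} only ever touches the addressed dword.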
32526 // Todo(1): In many cases it may be provable that SI is less than
32527 // ShiftBits in which case this mask is unnecessary
32528 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32529 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32530 // favor of just a raw BT{S|R|C}.
32531
32532 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32533 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32534
32535 // If the result is only used for zero/non-zero status then we don't need to
32536 // shift the value back. Otherwise do so.
32537 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32538 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32539 if (ICmp->isEquality()) {
32540 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32541 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32542 if (C0 || C1) {
32543 assert(C0 == nullptr || C1 == nullptr);
32544 if ((C0 ? C0 : C1)->isZero())
32545 continue;
32546 }
32547 }
32548 }
32549 Result = Builder.CreateShl(Result, BitPos);
32550 break;
32551 }
32552 }
32553
32554 I->replaceAllUsesWith(Result);
32555 I->eraseFromParent();
32556 AI->eraseFromParent();
32557}
32558
32559static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32560 using namespace llvm::PatternMatch;
32561 if (!AI->hasOneUse())
32562 return false;
32563
32564 Value *Op = AI->getOperand(1);
32565 ICmpInst::Predicate Pred;
32566 Instruction *I = AI->user_back();
32567 AtomicRMWInst::BinOp Opc = AI->getOperation();
32568 if (Opc == AtomicRMWInst::Add) {
32569 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32570 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32571 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32572 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32573 return Pred == CmpInst::ICMP_SLT;
32574 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32575 return Pred == CmpInst::ICMP_SGT;
32576 }
32577 return false;
32578 }
32579 if (Opc == AtomicRMWInst::Sub) {
32580 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32581 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32582 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32583 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32584 return Pred == CmpInst::ICMP_SLT;
32585 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32586 return Pred == CmpInst::ICMP_SGT;
32587 }
32588 return false;
32589 }
32590 if ((Opc == AtomicRMWInst::Or &&
32591 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32592 (Opc == AtomicRMWInst::And &&
32593 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32594 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32595 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32596 Pred == CmpInst::ICMP_SLT;
32597 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32598 return Pred == CmpInst::ICMP_SGT;
32599 return false;
32600 }
32601 if (Opc == AtomicRMWInst::Xor) {
32602 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32603 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32604 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32605 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32606 return Pred == CmpInst::ICMP_SLT;
32607 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32608 return Pred == CmpInst::ICMP_SGT;
32609 }
32610 return false;
32611 }
32612
32613 return false;
32614}
32615
32616void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32617 AtomicRMWInst *AI) const {
32618 IRBuilder<> Builder(AI);
32619 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32620 Instruction *TempI = nullptr;
32621 LLVMContext &Ctx = AI->getContext();
32622 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32623 if (!ICI) {
32624 TempI = AI->user_back();
32625 assert(TempI->hasOneUse() && "Must have one use");
32626 ICI = cast<ICmpInst>(TempI->user_back());
32627 }
32628 X86::CondCode CC = X86::COND_INVALID;
32629 ICmpInst::Predicate Pred = ICI->getPredicate();
32630 switch (Pred) {
32631 default:
32632 llvm_unreachable("Not supported Pred")::llvm::llvm_unreachable_internal("Not supported Pred", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32632)
;
32633 case CmpInst::ICMP_EQ:
32634 CC = X86::COND_E;
32635 break;
32636 case CmpInst::ICMP_NE:
32637 CC = X86::COND_NE;
32638 break;
32639 case CmpInst::ICMP_SLT:
32640 CC = X86::COND_S;
32641 break;
32642 case CmpInst::ICMP_SGT:
32643 CC = X86::COND_NS;
32644 break;
32645 }
32646 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32647 switch (AI->getOperation()) {
32648 default:
32649 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"llvm/lib/Target/X86/X86ISelLowering.cpp", 32649)
;
32650 case AtomicRMWInst::Add:
32651 IID = Intrinsic::x86_atomic_add_cc;
32652 break;
32653 case AtomicRMWInst::Sub:
32654 IID = Intrinsic::x86_atomic_sub_cc;
32655 break;
32656 case AtomicRMWInst::Or:
32657 IID = Intrinsic::x86_atomic_or_cc;
32658 break;
32659 case AtomicRMWInst::And:
32660 IID = Intrinsic::x86_atomic_and_cc;
32661 break;
32662 case AtomicRMWInst::Xor:
32663 IID = Intrinsic::x86_atomic_xor_cc;
32664 break;
32665 }
32666 Function *CmpArith =
32667 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32668 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32669 Type::getInt8PtrTy(Ctx));
32670 Value *Call = Builder.CreateCall(
32671 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32672 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32673 ICI->replaceAllUsesWith(Result);
32674 ICI->eraseFromParent();
32675 if (TempI)
32676 TempI->eraseFromParent();
32677 AI->eraseFromParent();
32678}
32679
32680TargetLowering::AtomicExpansionKind
32681X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32682 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32683 Type *MemType = AI->getType();
32684
32685 // If the operand is too big, we must see if cmpxchg8/16b is available
32686 // and default to library calls otherwise.
32687 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32688 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32689 : AtomicExpansionKind::None;
32690 }
32691
32692 AtomicRMWInst::BinOp Op = AI->getOperation();
32693 switch (Op) {
32694 case AtomicRMWInst::Xchg:
32695 return AtomicExpansionKind::None;
32696 case AtomicRMWInst::Add:
32697 case AtomicRMWInst::Sub:
32698 if (shouldExpandCmpArithRMWInIR(AI))
32699 return AtomicExpansionKind::CmpArithIntrinsic;
32700 // It's better to use xadd, xsub or xchg for these in other cases.
32701 return AtomicExpansionKind::None;
32702 case AtomicRMWInst::Or:
32703 case AtomicRMWInst::And:
32704 case AtomicRMWInst::Xor:
32705 if (shouldExpandCmpArithRMWInIR(AI))
32706 return AtomicExpansionKind::CmpArithIntrinsic;
32707 return shouldExpandLogicAtomicRMWInIR(AI);
32708 case AtomicRMWInst::Nand:
32709 case AtomicRMWInst::Max:
32710 case AtomicRMWInst::Min:
32711 case AtomicRMWInst::UMax:
32712 case AtomicRMWInst::UMin:
32713 case AtomicRMWInst::FAdd:
32714 case AtomicRMWInst::FSub:
32715 case AtomicRMWInst::FMax:
32716 case AtomicRMWInst::FMin:
32717 case AtomicRMWInst::UIncWrap:
32718 case AtomicRMWInst::UDecWrap:
32719 default:
32720 // These always require a non-trivial set of data operations on x86. We must
32721 // use a cmpxchg loop.
32722 return AtomicExpansionKind::CmpXChg;
32723 }
32724}
32725
32726LoadInst *
32727X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32728 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32729 Type *MemType = AI->getType();
32730 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32731 // there is no benefit in turning such RMWs into loads, and it is actually
32732 // harmful as it introduces an mfence.
32733 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32734 return nullptr;
32735
32736 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32737 // lowering available in lowerAtomicArith.
32738 // TODO: push more cases through this path.
32739 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32740 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32741 AI->use_empty())
32742 return nullptr;
32743
32744 IRBuilder<> Builder(AI);
32745 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32746 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32747 auto SSID = AI->getSyncScopeID();
32748 // We must restrict the ordering to avoid generating loads with Release or
32749 // ReleaseAcquire orderings.
32750 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32751
32752 // Before the load we need a fence. Here is an example lifted from
32753 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32754 // is required:
32755 // Thread 0:
32756 // x.store(1, relaxed);
32757 // r1 = y.fetch_add(0, release);
32758 // Thread 1:
32759 // y.fetch_add(42, acquire);
32760 // r2 = x.load(relaxed);
32761 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
 32762 // lowered to just a load without a fence. An mfence flushes the store buffer,
 32763 // making the optimization clearly correct.
 32764 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not clear
 32765 // otherwise; we might be able to be more aggressive on relaxed idempotent
32766 // rmw. In practice, they do not look useful, so we don't try to be
32767 // especially clever.
32768 if (SSID == SyncScope::SingleThread)
32769 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32770 // the IR level, so we must wrap it in an intrinsic.
32771 return nullptr;
32772
32773 if (!Subtarget.hasMFence())
32774 // FIXME: it might make sense to use a locked operation here but on a
32775 // different cache-line to prevent cache-line bouncing. In practice it
32776 // is probably a small win, and x86 processors without mfence are rare
32777 // enough that we do not bother.
32778 return nullptr;
32779
32780 Function *MFence =
32781 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32782 Builder.CreateCall(MFence, {});
32783
32784 // Finally we can emit the atomic load.
32785 LoadInst *Loaded = Builder.CreateAlignedLoad(
32786 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32787 Loaded->setAtomic(Order, SSID);
32788 AI->replaceAllUsesWith(Loaded);
32789 AI->eraseFromParent();
32790 return Loaded;
32791}
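A source-level sketch of the idea, assuming an x86-64 compiler where std::atomic_thread_fence(seq_cst) is emitted as MFENCE (the helper name is illustrative only): an idempotent RMW such as fetch_or(0) becomes a fence followed by an atomic load, which preserves the ordering shown in the HPL-2012-68 example above.

    #include <atomic>
    #include <cstdint>

    static uint32_t idempotent_or_as_fenced_load(const std::atomic<uint32_t> &A) {
      // The fence plays the role of the x86_sse2_mfence call emitted above.
      std::atomic_thread_fence(std::memory_order_seq_cst);
      return A.load(std::memory_order_acquire);
    }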
32792
32793bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32794 if (!SI.isUnordered())
32795 return false;
32796 return ExperimentalUnorderedISEL;
32797}
32798bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32799 if (!LI.isUnordered())
32800 return false;
32801 return ExperimentalUnorderedISEL;
32802}
32803
32804
32805/// Emit a locked operation on a stack location which does not change any
32806/// memory location, but does involve a lock prefix. Location is chosen to be
32807/// a) very likely accessed only by a single thread to minimize cache traffic,
32808/// and b) definitely dereferenceable. Returns the new Chain result.
32809static SDValue emitLockedStackOp(SelectionDAG &DAG,
32810 const X86Subtarget &Subtarget, SDValue Chain,
32811 const SDLoc &DL) {
32812 // Implementation notes:
32813 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32814 // operations issued by the current processor. As such, the location
32815 // referenced is not relevant for the ordering properties of the instruction.
 32816 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32817 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32818 // 2) Using an immediate operand appears to be the best encoding choice
32819 // here since it doesn't require an extra register.
32820 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32821 // is small enough it might just be measurement noise.)
32822 // 4) When choosing offsets, there are several contributing factors:
32823 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32824 // line aligned stack object to improve this case.)
32825 // b) To minimize our chances of introducing a false dependence, we prefer
32826 // to offset the stack usage from TOS slightly.
32827 // c) To minimize concerns about cross thread stack usage - in particular,
32828 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32829 // captures state in the TOS frame and accesses it from many threads -
32830 // we want to use an offset such that the offset is in a distinct cache
32831 // line from the TOS frame.
32832 //
32833 // For a general discussion of the tradeoffs and benchmark results, see:
32834 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32835
32836 auto &MF = DAG.getMachineFunction();
32837 auto &TFL = *Subtarget.getFrameLowering();
32838 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32839
32840 if (Subtarget.is64Bit()) {
32841 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32842 SDValue Ops[] = {
32843 DAG.getRegister(X86::RSP, MVT::i64), // Base
32844 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32845 DAG.getRegister(0, MVT::i64), // Index
32846 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32847 DAG.getRegister(0, MVT::i16), // Segment.
32848 Zero,
32849 Chain};
32850 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32851 MVT::Other, Ops);
32852 return SDValue(Res, 1);
32853 }
32854
32855 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32856 SDValue Ops[] = {
32857 DAG.getRegister(X86::ESP, MVT::i32), // Base
32858 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32859 DAG.getRegister(0, MVT::i32), // Index
32860 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32861 DAG.getRegister(0, MVT::i16), // Segment.
32862 Zero,
32863 Chain
32864 };
32865 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32866 MVT::Other, Ops);
32867 return SDValue(Res, 1);
32868}
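A sketch of the instruction this node selects, assuming x86-64 and GCC/Clang extended inline assembly (the wrapper name is made up): a LOCK'd OR of an immediate zero against a stack slot leaves memory unchanged, and only its full-barrier effect is wanted.

    static inline void locked_stack_fence() {
      // Offset -64 mirrors the red-zone case above; offset 0 would be used without one.
      __asm__ __volatile__("lock; orl $0, -64(%%rsp)" ::: "memory", "cc");
    }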
32869
32870static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32871 SelectionDAG &DAG) {
32872 SDLoc dl(Op);
32873 AtomicOrdering FenceOrdering =
32874 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32875 SyncScope::ID FenceSSID =
32876 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32877
32878 // The only fence that needs an instruction is a sequentially-consistent
32879 // cross-thread fence.
32880 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32881 FenceSSID == SyncScope::System) {
32882 if (Subtarget.hasMFence())
32883 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32884
32885 SDValue Chain = Op.getOperand(0);
32886 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32887 }
32888
32889 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32890 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32891}
32892
32893static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32894 SelectionDAG &DAG) {
32895 MVT T = Op.getSimpleValueType();
32896 SDLoc DL(Op);
32897 unsigned Reg = 0;
32898 unsigned size = 0;
32899 switch(T.SimpleTy) {
32900 default: llvm_unreachable("Invalid value type!");
32901 case MVT::i8: Reg = X86::AL; size = 1; break;
32902 case MVT::i16: Reg = X86::AX; size = 2; break;
32903 case MVT::i32: Reg = X86::EAX; size = 4; break;
32904 case MVT::i64:
32905 assert(Subtarget.is64Bit() && "Node not type legal!");
32906 Reg = X86::RAX; size = 8;
32907 break;
32908 }
32909 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32910 Op.getOperand(2), SDValue());
32911 SDValue Ops[] = { cpIn.getValue(0),
32912 Op.getOperand(1),
32913 Op.getOperand(3),
32914 DAG.getTargetConstant(size, DL, MVT::i8),
32915 cpIn.getValue(1) };
32916 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32917 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32918 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32919 Ops, T, MMO);
32920
32921 SDValue cpOut =
32922 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32923 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32924 MVT::i32, cpOut.getValue(2));
32925 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32926
32927 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32928 cpOut, Success, EFLAGS.getValue(1));
32929}
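A sketch of the pattern this lowering models for MVT::i32, assuming GCC/Clang inline assembly (cmpxchg_u32 is a hypothetical helper): LOCK CMPXCHG takes the expected value in the accumulator register chosen per width in the switch above, writes the previous memory value back to it, and reports success in ZF, which getSETCC(X86::COND_E, ...) turns into a boolean.

    #include <cstdint>

    static bool cmpxchg_u32(uint32_t *Ptr, uint32_t *Expected, uint32_t Desired) {
      uint32_t Prev;
      __asm__ __volatile__("lock; cmpxchgl %2, %1"
                           : "=a"(Prev), "+m"(*Ptr)        // EAX holds expected/previous
                           : "r"(Desired), "0"(*Expected)
                           : "memory", "cc");
      bool Ok = (Prev == *Expected);                        // mirrors the ZF / COND_E check
      *Expected = Prev;
      return Ok;
    }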
32930
32931// Create MOVMSKB, taking into account whether we need to split for AVX1.
32932static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32933 const X86Subtarget &Subtarget) {
32934 MVT InVT = V.getSimpleValueType();
32935
32936 if (InVT == MVT::v64i8) {
32937 SDValue Lo, Hi;
32938 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32939 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32940 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32941 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32942 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32943 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32944 DAG.getConstant(32, DL, MVT::i8));
32945 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32946 }
32947 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32948 SDValue Lo, Hi;
32949 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32950 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32951 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32952 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32953 DAG.getConstant(16, DL, MVT::i8));
32954 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32955 }
32956
32957 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32958}
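A scalar model (plain C++, hypothetical helper names) of PMOVMSKB and of the splitting strategy above: each byte contributes its sign bit to one bit of the mask, and a 64-byte vector is handled as two 32-byte halves whose masks are OR'd after shifting the high half by 32.

    #include <cstdint>

    static uint32_t movmskb32(const int8_t *V) {
      uint32_t M = 0;
      for (int I = 0; I < 32; ++I)
        M |= uint32_t(uint8_t(V[I]) >> 7) << I;   // sign bit of byte I -> mask bit I
      return M;
    }

    static uint64_t movmskb64(const int8_t *V) {
      uint64_t Lo = movmskb32(V);
      uint64_t Hi = movmskb32(V + 32);
      return Lo | (Hi << 32);                     // matches the v64i8 path above
    }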
32959
32960static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32961 SelectionDAG &DAG) {
32962 SDValue Src = Op.getOperand(0);
32963 MVT SrcVT = Src.getSimpleValueType();
32964 MVT DstVT = Op.getSimpleValueType();
32965
32966 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32967 // half to v32i1 and concatenating the result.
32968 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32969 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32970 assert(Subtarget.hasBWI() && "Expected BWI target");
32971 SDLoc dl(Op);
32972 SDValue Lo, Hi;
32973 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32974 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32975 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32976 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32977 }
32978
32979 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32980 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32981 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32982 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32983 SDLoc DL(Op);
32984 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32985 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32986 return DAG.getZExtOrTrunc(V, DL, DstVT);
32987 }
32988
32989 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32990         SrcVT == MVT::i64) && "Unexpected VT!");
32991
32992 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32993 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32994 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32995 // This conversion needs to be expanded.
32996 return SDValue();
32997
32998 SDLoc dl(Op);
32999 if (SrcVT.isVector()) {
 33000 // Widen the input vector in the case of MVT::v2i32.
33001 // Example: from MVT::v2i32 to MVT::v4i32.
33002 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
33003 SrcVT.getVectorNumElements() * 2);
33004 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
33005 DAG.getUNDEF(SrcVT));
33006 } else {
33007 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
33008        "Unexpected source type in LowerBITCAST");
33009 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
33010 }
33011
33012 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
33013 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
33014
33015 if (DstVT == MVT::x86mmx)
33016 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
33017
33018 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
33019 DAG.getIntPtrConstant(0, dl));
33020}
33021
33022/// Compute the horizontal sum of bytes in V for the elements of VT.
33023///
33024/// Requires V to be a byte vector and VT to be an integer vector type with
33025/// wider elements than V's type. The width of the elements of VT determines
33026/// how many bytes of V are summed horizontally to produce each element of the
33027/// result.
33028static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
33029 const X86Subtarget &Subtarget,
33030 SelectionDAG &DAG) {
33031 SDLoc DL(V);
33032 MVT ByteVecVT = V.getSimpleValueType();
33033 MVT EltVT = VT.getVectorElementType();
33034 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
33035        "Expected value to have byte element type.");
33036 assert(EltVT != MVT::i8 &&
33037        "Horizontal byte sum only makes sense for wider elements!");
33038 unsigned VecSize = VT.getSizeInBits();
33039 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
33040
 33041 // The PSADBW instruction horizontally adds all bytes and leaves the result in
 33042 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
33043 if (EltVT == MVT::i64) {
33044 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
33045 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33046 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
33047 return DAG.getBitcast(VT, V);
33048 }
33049
33050 if (EltVT == MVT::i32) {
33051 // We unpack the low half and high half into i32s interleaved with zeros so
33052 // that we can use PSADBW to horizontally sum them. The most useful part of
33053 // this is that it lines up the results of two PSADBW instructions to be
 33054 // two v2i64 vectors which, when concatenated, are the 4 population counts. We can
33055 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
33056 SDValue Zeros = DAG.getConstant(0, DL, VT);
33057 SDValue V32 = DAG.getBitcast(VT, V);
33058 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
33059 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
33060
33061 // Do the horizontal sums into two v2i64s.
33062 Zeros = DAG.getConstant(0, DL, ByteVecVT);
33063 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33064 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33065 DAG.getBitcast(ByteVecVT, Low), Zeros);
33066 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33067 DAG.getBitcast(ByteVecVT, High), Zeros);
33068
33069 // Merge them together.
33070 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
33071 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
33072 DAG.getBitcast(ShortVecVT, Low),
33073 DAG.getBitcast(ShortVecVT, High));
33074
33075 return DAG.getBitcast(VT, V);
33076 }
33077
33078 // The only element type left is i16.
33079 assert(EltVT == MVT::i16 && "Unknown how to handle type");
33080
 33081 // To obtain the pop count for each i16 element starting from the pop count for
 33082 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
 33083 // right by 8. It is important to shift as i16s because an i8 vector shift isn't
33084 // directly supported.
33085 SDValue ShifterV = DAG.getConstant(8, DL, VT);
33086 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33087 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
33088 DAG.getBitcast(ByteVecVT, V));
33089 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33090}
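A scalar model of the two tricks used above (plain C++, helper names invented here): PSADBW against zero sums the eight bytes of each 64-bit lane, and the i16 path recovers a per-i16 count from two per-byte counts by shift, byte-wise add, shift.

    #include <cstdint>

    static uint64_t psadbw_lane(uint64_t Lane) {
      uint64_t Sum = 0;
      for (int I = 0; I < 8; ++I)
        Sum += (Lane >> (8 * I)) & 0xFF;   // byte sum of one 64-bit lane
      return Sum;
    }

    static uint16_t i16_sum_of_byte_counts(uint16_t Counts) {
      // Counts holds two per-byte pop counts (each <= 8, so no byte overflow).
      uint16_t Shl = uint16_t(Counts << 8);
      uint16_t LoByte = uint16_t((Shl + Counts) & 0x00FF);              // = low count
      uint16_t HiByte = uint16_t(((Shl >> 8) + (Counts >> 8)) & 0x00FF); // = low + high
      uint16_t Added = uint16_t((HiByte << 8) | LoByte);
      return uint16_t(Added >> 8);   // the SRL-by-8 step above
    }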
33091
33092static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
33093 const X86Subtarget &Subtarget,
33094 SelectionDAG &DAG) {
33095 MVT VT = Op.getSimpleValueType();
33096 MVT EltVT = VT.getVectorElementType();
33097 int NumElts = VT.getVectorNumElements();
33098 (void)EltVT;
33099 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
33100
33101 // Implement a lookup table in register by using an algorithm based on:
33102 // http://wm.ite.pl/articles/sse-popcount.html
33103 //
33104 // The general idea is that every lower byte nibble in the input vector is an
 33105 // index into an in-register pre-computed pop count table. We then split up the
 33106 // input vector into two new ones: (1) a vector with only the shifted-right
 33107 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
 33108 // masked-out higher ones) for each byte. PSHUFB is used separately with both
 33109 // to index the in-register table. Next, both are added and the result is an
 33110 // i8 vector where each element contains the pop count for its input byte.
33111 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
33112 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
33113 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
33114 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
33115
33116 SmallVector<SDValue, 64> LUTVec;
33117 for (int i = 0; i < NumElts; ++i)
33118 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
33119 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
33120 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
33121
33122 // High nibbles
33123 SDValue FourV = DAG.getConstant(4, DL, VT);
33124 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
33125
33126 // Low nibbles
33127 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
33128
 33129 // The input vector is used as the shuffle mask that indexes elements into the
 33130 // LUT. After counting low and high nibbles, add the vectors to obtain the
33131 // final pop count per i8 element.
33132 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
33133 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
33134 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
33135}
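The same lookup applied to a single byte in plain C++ (the vector code performs this lane-wise via PSHUFB; the helper name is illustrative):

    #include <cstdint>

    static uint8_t popcnt8_lut(uint8_t X) {
      static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4};
      return uint8_t(LUT[X & 0x0F] + LUT[X >> 4]);   // low-nibble + high-nibble counts
    }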
33136
33137// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
33138// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
33139static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33140 SelectionDAG &DAG) {
33141 MVT VT = Op.getSimpleValueType();
33142 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
33143        "Unknown CTPOP type to handle");
33144 SDLoc DL(Op.getNode());
33145 SDValue Op0 = Op.getOperand(0);
33146
33147 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
33148 if (Subtarget.hasVPOPCNTDQ()) {
33149 unsigned NumElems = VT.getVectorNumElements();
33150 assert((VT.getVectorElementType() == MVT::i8 ||
33151         VT.getVectorElementType() == MVT::i16) && "Unexpected type");
33152 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
33153 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
33154 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
33155 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
33156 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
33157 }
33158 }
33159
33160 // Decompose 256-bit ops into smaller 128-bit ops.
33161 if (VT.is256BitVector() && !Subtarget.hasInt256())
33162 return splitVectorIntUnary(Op, DAG);
33163
33164 // Decompose 512-bit ops into smaller 256-bit ops.
33165 if (VT.is512BitVector() && !Subtarget.hasBWI())
33166 return splitVectorIntUnary(Op, DAG);
33167
33168 // For element types greater than i8, do vXi8 pop counts and a bytesum.
33169 if (VT.getScalarType() != MVT::i8) {
33170 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
33171 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
33172 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
33173 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
33174 }
33175
33176 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
33177 if (!Subtarget.hasSSSE3())
33178 return SDValue();
33179
33180 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
33181}
33182
33183static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33184 SelectionDAG &DAG) {
33185 assert(Op.getSimpleValueType().isVector() &&
33186        "We only do custom lowering for vector population count.");
33187 return LowerVectorCTPOP(Op, Subtarget, DAG);
33188}
33189
33190static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
33191 MVT VT = Op.getSimpleValueType();
33192 SDValue In = Op.getOperand(0);
33193 SDLoc DL(Op);
33194
 33195 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
33196 // perform the BITREVERSE.
33197 if (!VT.isVector()) {
33198 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33199 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33200 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33202 DAG.getIntPtrConstant(0, DL));
33203 }
33204
33205 int NumElts = VT.getVectorNumElements();
33206 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33207
33208 // Decompose 256-bit ops into smaller 128-bit ops.
33209 if (VT.is256BitVector())
33210 return splitVectorIntUnary(Op, DAG);
33211
33212 assert(VT.is128BitVector() &&
33213        "Only 128-bit vector bitreverse lowering supported.");
33214
33215 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33216 // perform the BSWAP in the shuffle.
 33217 // It's best to shuffle using the second operand, as this will implicitly allow
33218 // memory folding for multiple vectors.
33219 SmallVector<SDValue, 16> MaskElts;
33220 for (int i = 0; i != NumElts; ++i) {
33221 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33222 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33223 int PermuteByte = SourceByte | (2 << 5);
33224 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33225 }
33226 }
33227
33228 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33229 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33230 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33231 Res, Mask);
33232 return DAG.getBitcast(VT, Res);
33233}
33234
33235static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33236 SelectionDAG &DAG) {
33237 MVT VT = Op.getSimpleValueType();
33238
33239 if (Subtarget.hasXOP() && !VT.is512BitVector())
33240 return LowerBITREVERSE_XOP(Op, DAG);
33241
33242 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33243
33244 SDValue In = Op.getOperand(0);
33245 SDLoc DL(Op);
33246
33247 assert(VT.getScalarType() == MVT::i8 &&
33248        "Only byte vector BITREVERSE supported");
33249
33250 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33251 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33252 return splitVectorIntUnary(Op, DAG);
33253
33254 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33255 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33256 return splitVectorIntUnary(Op, DAG);
33257
33258 unsigned NumElts = VT.getVectorNumElements();
33259
33260 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33261 if (Subtarget.hasGFNI()) {
33262 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33263 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33264 Matrix = DAG.getBitcast(VT, Matrix);
33265 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33266 DAG.getTargetConstant(0, DL, MVT::i8));
33267 }
33268
 33269 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
 33270 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
 33271 // 0-15 value (moved to the other nibble).
33272 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33273 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33274 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33275
33276 const int LoLUT[16] = {
33277 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33278 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33279 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33280 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33281 const int HiLUT[16] = {
33282 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33283 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33284 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33285 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33286
33287 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33288 for (unsigned i = 0; i < NumElts; ++i) {
33289 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33290 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33291 }
33292
33293 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33294 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33295 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33296 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33297 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33298}
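A scalar model of the PSHUFB path (plain C++, hypothetical helper): the low-nibble table already holds the reversed value shifted into the high nibble and the high-nibble table holds it in the low nibble, so a single OR combines them. The GFNI path reaches the same result in one instruction because the 0x8040201008040201 bit matrix is the mirrored identity, which the affine transform uses to reverse each byte's bits.

    #include <cstdint>

    static uint8_t bitreverse8(uint8_t X) {
      static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                        0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
      static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                        0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
      return uint8_t(LoLUT[X & 0x0F] | HiLUT[X >> 4]);
    }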
33299
33300static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33301 SelectionDAG &DAG) {
33302 SDLoc DL(Op);
33303 SDValue X = Op.getOperand(0);
33304 MVT VT = Op.getSimpleValueType();
33305
33306 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33307 if (VT == MVT::i8 ||
33308 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33309 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33310 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33311 DAG.getConstant(0, DL, MVT::i8));
33312 // Copy the inverse of the parity flag into a register with setcc.
33313 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33314 // Extend to the original type.
33315 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33316 }
33317
33318 // If we have POPCNT, use the default expansion.
33319 if (Subtarget.hasPOPCNT())
33320 return SDValue();
33321
33322 if (VT == MVT::i64) {
 33323 // Xor the high and low 32-bit halves together using a 32-bit operation.
33324 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33325 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33326 DAG.getConstant(32, DL, MVT::i8)));
33327 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33328 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33329 }
33330
33331 if (VT != MVT::i16) {
33332 // Xor the high and low 16-bits together using a 32-bit operation.
33333 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33334 DAG.getConstant(16, DL, MVT::i8));
33335 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33336 } else {
33337 // If the input is 16-bits, we need to extend to use an i32 shift below.
33338 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33339 }
33340
 33341 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
33342 // This should allow an h-reg to be used to save a shift.
33343 SDValue Hi = DAG.getNode(
33344 ISD::TRUNCATE, DL, MVT::i8,
33345 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33346 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33347 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33348 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33349
33350 // Copy the inverse of the parity flag into a register with setcc.
33351 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33352 // Extend to the original type.
33353 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33354}
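The same folding in scalar form (plain C++, hypothetical helper): XOR the halves down to a single byte, whose parity is what the hardware reads from PF via SETNP.

    #include <cstdint>

    static unsigned parity64(uint64_t X) {
      uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // fold 64 -> 32
      V ^= V >> 16;                                 // fold 32 -> 16
      uint8_t B = uint8_t(V ^ (V >> 8));            // fold 16 -> 8; on x86 this XOR sets PF
      B ^= B >> 4;
      B ^= B >> 2;
      B ^= B >> 1;
      return B & 1u;                                // SETNP yields this same bit
    }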
33355
33356static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33357 const X86Subtarget &Subtarget) {
33358 unsigned NewOpc = 0;
33359 switch (N->getOpcode()) {
33360 case ISD::ATOMIC_LOAD_ADD:
33361 NewOpc = X86ISD::LADD;
33362 break;
33363 case ISD::ATOMIC_LOAD_SUB:
33364 NewOpc = X86ISD::LSUB;
33365 break;
33366 case ISD::ATOMIC_LOAD_OR:
33367 NewOpc = X86ISD::LOR;
33368 break;
33369 case ISD::ATOMIC_LOAD_XOR:
33370 NewOpc = X86ISD::LXOR;
33371 break;
33372 case ISD::ATOMIC_LOAD_AND:
33373 NewOpc = X86ISD::LAND;
33374 break;
33375 default:
33376 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33377 }
33378
33379 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33380
33381 return DAG.getMemIntrinsicNode(
33382 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33383 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33384 /*MemVT=*/N->getSimpleValueType(0), MMO);
33385}
33386
33387/// Lower atomic_load_ops into LOCK-prefixed operations.
33388static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33389 const X86Subtarget &Subtarget) {
33390 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33391 SDValue Chain = N->getOperand(0);
33392 SDValue LHS = N->getOperand(1);
33393 SDValue RHS = N->getOperand(2);
33394 unsigned Opc = N->getOpcode();
33395 MVT VT = N->getSimpleValueType(0);
33396 SDLoc DL(N);
33397
33398 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33399 // can only be lowered when the result is unused. They should have already
33400 // been transformed into a cmpxchg loop in AtomicExpand.
33401 if (N->hasAnyUseOfValue(0)) {
33402 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33403 // select LXADD if LOCK_SUB can't be selected.
33404 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
33405 // can use LXADD as opposed to cmpxchg.
33406 if (Opc == ISD::ATOMIC_LOAD_SUB ||
33407 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
33408 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33409 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
33410 AN->getMemOperand());
33411 }
33412 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33413        "Used AtomicRMW ops other than Add should have been expanded!");
33414 return N;
33415 }
33416
 33417 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33418 // The core idea here is that since the memory location isn't actually
33419 // changing, all we need is a lowering for the *ordering* impacts of the
 33420 // atomicrmw. As such, we can choose a different operation and memory
33421 // location to minimize impact on other code.
33422 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33423 // On X86, the only ordering which actually requires an instruction is
 33424 // seq_cst that isn't SingleThread; everything else just needs to be preserved
 33425 // during codegen and then dropped. Note that we expect (but don't assume)
33426 // that orderings other than seq_cst and acq_rel have been canonicalized to
33427 // a store or load.
33428 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33429 AN->getSyncScopeID() == SyncScope::System) {
33430 // Prefer a locked operation against a stack location to minimize cache
33431 // traffic. This assumes that stack locations are very likely to be
33432 // accessed only by the owning thread.
33433 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33434 assert(!N->hasAnyUseOfValue(0));
33435 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33436 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33437 DAG.getUNDEF(VT), NewChain);
33438 }
33439 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33440 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33441 assert(!N->hasAnyUseOfValue(0));
33442 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33443 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33444 DAG.getUNDEF(VT), NewChain);
33445 }
33446
33447 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33448 // RAUW the chain, but don't worry about the result, as it's unused.
33449 assert(!N->hasAnyUseOfValue(0));
33450 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33451 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33452 DAG.getUNDEF(VT), LockOp.getValue(1));
33453}
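A source-level sketch of the two rewrites above for the result-is-used case (std::atomic calls, illustrative helper names): a subtraction becomes an addition of the negated operand so XADD can be selected, and xor with the sign bit equals adding the sign bit because the carry out of the top bit is discarded.

    #include <atomic>
    #include <cstdint>

    static uint32_t fetch_sub_via_add(std::atomic<uint32_t> &A, uint32_t V) {
      return A.fetch_add(0u - V);        // selectable as LOCK XADD with -V
    }

    static uint32_t fetch_xor_signbit_via_add(std::atomic<uint32_t> &A) {
      return A.fetch_add(0x80000000u);   // same result as fetch_xor(0x80000000u)
    }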
33454
33455static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33456 const X86Subtarget &Subtarget) {
33457 auto *Node = cast<AtomicSDNode>(Op.getNode());
33458 SDLoc dl(Node);
33459 EVT VT = Node->getMemoryVT();
33460
33461 bool IsSeqCst =
33462 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33463 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33464
33465 // If this store is not sequentially consistent and the type is legal
33466 // we can just keep it.
33467 if (!IsSeqCst && IsTypeLegal)
33468 return Op;
33469
33470 if (VT == MVT::i64 && !IsTypeLegal) {
33471 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33472 // is enabled.
33473 bool NoImplicitFloatOps =
33474 DAG.getMachineFunction().getFunction().hasFnAttribute(
33475 Attribute::NoImplicitFloat);
33476 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33477 SDValue Chain;
33478 if (Subtarget.hasSSE1()) {
33479 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33480 Node->getOperand(2));
33481 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33482 SclToVec = DAG.getBitcast(StVT, SclToVec);
33483 SDVTList Tys = DAG.getVTList(MVT::Other);
33484 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33485 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33486 MVT::i64, Node->getMemOperand());
33487 } else if (Subtarget.hasX87()) {
33488 // First load this into an 80-bit X87 register using a stack temporary.
33489 // This will put the whole integer into the significand.
33490 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33491 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33492 MachinePointerInfo MPI =
33493 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33494 Chain =
33495 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33496 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33497 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33498 SDValue LdOps[] = {Chain, StackPtr};
33499 SDValue Value = DAG.getMemIntrinsicNode(
33500 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33501 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33502 Chain = Value.getValue(1);
33503
33504 // Now use an FIST to do the atomic store.
33505 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33506 Chain =
33507 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33508 StoreOps, MVT::i64, Node->getMemOperand());
33509 }
33510
33511 if (Chain) {
33512 // If this is a sequentially consistent store, also emit an appropriate
33513 // barrier.
33514 if (IsSeqCst)
33515 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33516
33517 return Chain;
33518 }
33519 }
33520 }
33521
33522 // Convert seq_cst store -> xchg
33523 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33524 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33525 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33526 Node->getMemoryVT(),
33527 Node->getOperand(0),
33528 Node->getOperand(1), Node->getOperand(2),
33529 Node->getMemOperand());
33530 return Swap.getValue(1);
33531}
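A sketch of the two store shapes chosen above on x86-64 (plain std::atomic code; the exact instruction depends on the compiler): a seq_cst store of a legal type needs the implicit full barrier of XCHG or a trailing fence, while weaker orderings keep a plain MOV.

    #include <atomic>

    static void store_seq_cst(std::atomic<long> &A, long V) {
      A.store(V, std::memory_order_seq_cst);   // typically XCHG, or MOV + fence
    }

    static void store_release(std::atomic<long> &A, long V) {
      A.store(V, std::memory_order_release);   // plain MOV; kept as-is above
    }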
33532
33533static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33534 SDNode *N = Op.getNode();
33535 MVT VT = N->getSimpleValueType(0);
33536 unsigned Opc = Op.getOpcode();
33537
33538 // Let legalize expand this if it isn't a legal type yet.
33539 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33540 return SDValue();
33541
33542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33543 SDLoc DL(N);
33544
33545 // Set the carry flag.
33546 SDValue Carry = Op.getOperand(2);
33547 EVT CarryVT = Carry.getValueType();
33548 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33549 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33550
33551 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33552 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33553 Op.getOperand(0), Op.getOperand(1),
33554 Carry.getValue(1));
33555
33556 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33557 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33558 Sum.getValue(1), DL, DAG);
33559 if (N->getValueType(1) == MVT::i1)
33560 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33561
33562 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33563}
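A sketch of the carry re-materialization, assuming an x86-64 compiler that provides _addcarry_u64: adding all-ones to a non-zero boolean carry sets CF, after which ADC (or SBB for the subtract forms) consumes it, which is what the X86ISD::ADD/ADC nodes above model. The helper name is invented here.

    #include <cstdint>
    #include <x86intrin.h>

    static uint64_t add_with_carry(uint64_t A, uint64_t B, unsigned char CarryIn,
                                   unsigned char *CarryOut) {
      unsigned long long Sum;
      *CarryOut = _addcarry_u64(CarryIn, A, B, &Sum);  // CF in, sum and CF out
      return Sum;
    }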
33564
33565static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33566 SelectionDAG &DAG) {
33567 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33568
33569 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33570 // which returns the values as { float, float } (in XMM0) or
33571 // { double, double } (which is returned in XMM0, XMM1).
33572 SDLoc dl(Op);
33573 SDValue Arg = Op.getOperand(0);
33574 EVT ArgVT = Arg.getValueType();
33575 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33576
33577 TargetLowering::ArgListTy Args;
33578 TargetLowering::ArgListEntry Entry;
33579
33580 Entry.Node = Arg;
33581 Entry.Ty = ArgTy;
33582 Entry.IsSExt = false;
33583 Entry.IsZExt = false;
33584 Args.push_back(Entry);
33585
33586 bool isF64 = ArgVT == MVT::f64;
33587 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33588 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33589 // the results are returned via SRet in memory.
33590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33591 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33592 const char *LibcallName = TLI.getLibcallName(LC);
33593 SDValue Callee =
33594 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33595
33596 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33597 : (Type *)FixedVectorType::get(ArgTy, 4);
33598
33599 TargetLowering::CallLoweringInfo CLI(DAG);
33600 CLI.setDebugLoc(dl)
33601 .setChain(DAG.getEntryNode())
33602 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33603
33604 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33605
33606 if (isF64)
33607 // Returned in xmm0 and xmm1.
33608 return CallResult.first;
33609
 33610 // Returned in bits 0:31 and 32:63 of xmm0.
33611 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33612 CallResult.first, DAG.getIntPtrConstant(0, dl));
33613 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33614 CallResult.first, DAG.getIntPtrConstant(1, dl));
33615 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33616 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33617}
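For illustration only: on Darwin x86-64 the call above targets __sincos_stret, which returns the {sin, cos} pair in XMM registers; a portable equivalent of what the caller ultimately receives is simply both values computed together (the helper name is invented here).

    #include <cmath>
    #include <utility>

    static std::pair<double, double> sincos_pair(double X) {
      return {std::sin(X), std::cos(X)};   // sin in the first slot, cos in the second
    }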
33618
33619/// Widen a vector input to a vector of NVT. The
33620/// input vector must have the same element type as NVT.
33621static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33622 bool FillWithZeroes = false) {
33623 // Check if InOp already has the right width.
33624 MVT InVT = InOp.getSimpleValueType();
33625 if (InVT == NVT)
33626 return InOp;
33627
33628 if (InOp.isUndef())
33629 return DAG.getUNDEF(NVT);
33630
33631 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33632        "input and widen element type must match");
33633
33634 unsigned InNumElts = InVT.getVectorNumElements();
33635 unsigned WidenNumElts = NVT.getVectorNumElements();
33636 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33637        "Unexpected request for vector widening");
33638
33639 SDLoc dl(InOp);
33640 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33641 InOp.getNumOperands() == 2) {
33642 SDValue N1 = InOp.getOperand(1);
33643 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33644 N1.isUndef()) {
33645 InOp = InOp.getOperand(0);
33646 InVT = InOp.getSimpleValueType();
33647 InNumElts = InVT.getVectorNumElements();
33648 }
33649 }
33650 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33651 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33652 SmallVector<SDValue, 16> Ops;
33653 for (unsigned i = 0; i < InNumElts; ++i)
33654 Ops.push_back(InOp.getOperand(i));
33655
33656 EVT EltVT = InOp.getOperand(0).getValueType();
33657
33658 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33659 DAG.getUNDEF(EltVT);
33660 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33661 Ops.push_back(FillVal);
33662 return DAG.getBuildVector(NVT, dl, Ops);
33663 }
33664 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33665 DAG.getUNDEF(NVT);
33666 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33667 InOp, DAG.getIntPtrConstant(0, dl));
33668}
33669
33670static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33671 SelectionDAG &DAG) {
33672 assert(Subtarget.hasAVX512() &&
33673        "MGATHER/MSCATTER are supported on AVX-512 arch only");
33674
33675 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33676 SDValue Src = N->getValue();
33677 MVT VT = Src.getSimpleValueType();
33678 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33679 SDLoc dl(Op);
33680
33681 SDValue Scale = N->getScale();
33682 SDValue Index = N->getIndex();
33683 SDValue Mask = N->getMask();
33684 SDValue Chain = N->getChain();
33685 SDValue BasePtr = N->getBasePtr();
33686
33687 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33688 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33689 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33690 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33692 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33693 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33694 SDVTList VTs = DAG.getVTList(MVT::Other);
33695 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33696 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33697 N->getMemoryVT(), N->getMemOperand());
33698 }
33699 return SDValue();
33700 }
33701
33702 MVT IndexVT = Index.getSimpleValueType();
33703
33704 // If the index is v2i32, we're being called by type legalization and we
33705 // should just let the default handling take care of it.
33706 if (IndexVT == MVT::v2i32)
33707 return SDValue();
33708
 33709 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33710 // need to widen until one is.
33711 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33712 !Index.getSimpleValueType().is512BitVector()) {
33713 // Determine how much we need to widen by to get a 512-bit type.
33714 unsigned Factor = std::min(512/VT.getSizeInBits(),
33715 512/IndexVT.getSizeInBits());
33716 unsigned NumElts = VT.getVectorNumElements() * Factor;
33717
33718 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33719 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33720 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33721
33722 Src = ExtendToType(Src, VT, DAG);
33723 Index = ExtendToType(Index, IndexVT, DAG);
33724 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33725 }
33726
33727 SDVTList VTs = DAG.getVTList(MVT::Other);
33728 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33729 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33730 N->getMemoryVT(), N->getMemOperand());
33731}
33732
33733static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33734 SelectionDAG &DAG) {
33735
33736 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33737 MVT VT = Op.getSimpleValueType();
33738 MVT ScalarVT = VT.getScalarType();
33739 SDValue Mask = N->getMask();
33740 MVT MaskVT = Mask.getSimpleValueType();
33741 SDValue PassThru = N->getPassThru();
33742 SDLoc dl(Op);
33743
33744 // Handle AVX masked loads which don't support passthru other than 0.
33745 if (MaskVT.getVectorElementType() != MVT::i1) {
33746 // We also allow undef in the isel pattern.
33747 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33748 return Op;
33749
33750 SDValue NewLoad = DAG.getMaskedLoad(
33751 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33752 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33753 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33754 N->isExpandingLoad());
33755 // Emit a blend.
33756 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33757 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33758 }
33759
33760 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33761        "Expanding masked load is supported on AVX-512 target only!");
33762
33763   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33764          "Expanding masked load is supported for 32 and 64-bit types only!");
33765
33766   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33767          "Cannot lower masked load op.");
33768
33769   assert((ScalarVT.getSizeInBits() >= 32 ||
33770           (Subtarget.hasBWI() &&
33771            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33772          "Unsupported masked load op.");
33773
33774 // This operation is legal for targets with VLX, but without
33775   // VLX the vector should be widened to 512 bits.
33776 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33777 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33778 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33779
33780 // Mask element has to be i1.
33781   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33782          "Unexpected mask type");
33783
33784 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33785
33786 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33787 SDValue NewLoad = DAG.getMaskedLoad(
33788 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33789 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33790 N->getExtensionType(), N->isExpandingLoad());
33791
33792 SDValue Extract =
33793 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33794 DAG.getIntPtrConstant(0, dl));
33795 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33796 return DAG.getMergeValues(RetOps, dl);
33797}
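
The non-i1-mask path at the top of LowerMLOAD relies on AVX masked loads zeroing the disabled lanes, so a non-zero passthru is recovered with a separate blend. The element-wise model below is a minimal sketch of that two-step semantics, not part of the LLVM source; maskedLoadModel is a hypothetical name used only for illustration.

#include <cstddef>
#include <vector>

std::vector<int> maskedLoadModel(const std::vector<int> &Mem,
                                 const std::vector<bool> &Mask,
                                 const std::vector<int> &PassThru) {
  // Step 1: masked load with an all-zero passthru, which is what the AVX
  // VMASKMOV-style load provides natively.
  std::vector<int> Loaded(Mem.size(), 0);
  for (std::size_t I = 0; I < Mem.size(); ++I)
    if (Mask[I])
      Loaded[I] = Mem[I];
  // Step 2: blend (the VSELECT above) to reinstate the requested passthru
  // values in the disabled lanes.
  std::vector<int> Result(Mem.size());
  for (std::size_t I = 0; I < Mem.size(); ++I)
    Result[I] = Mask[I] ? Loaded[I] : PassThru[I];
  return Result;
}
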
33798
33799static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33800 SelectionDAG &DAG) {
33801 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33802 SDValue DataToStore = N->getValue();
33803 MVT VT = DataToStore.getSimpleValueType();
33804 MVT ScalarVT = VT.getScalarType();
33805 SDValue Mask = N->getMask();
33806 SDLoc dl(Op);
33807
33808   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33809          "Expanding masked load is supported on AVX-512 target only!");
33810
33811   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33812          "Expanding masked load is supported for 32 and 64-bit types only!");
33813
33814   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33815          "Cannot lower masked store op.");
33816
33817   assert((ScalarVT.getSizeInBits() >= 32 ||
33818           (Subtarget.hasBWI() &&
33819            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33820          "Unsupported masked store op.");
33821
33822 // This operation is legal for targets with VLX, but without
33823   // VLX the vector should be widened to 512 bits.
33824 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33825 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33826
33827 // Mask element has to be i1.
33828   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33829          "Unexpected mask type");
33830
33831 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33832
33833 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33834 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33835 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33836 N->getOffset(), Mask, N->getMemoryVT(),
33837 N->getMemOperand(), N->getAddressingMode(),
33838 N->isTruncatingStore(), N->isCompressingStore());
33839}
33840
33841static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33842 SelectionDAG &DAG) {
33843   assert(Subtarget.hasAVX2() &&
33844          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33845
33846 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33847 SDLoc dl(Op);
33848 MVT VT = Op.getSimpleValueType();
33849 SDValue Index = N->getIndex();
33850 SDValue Mask = N->getMask();
33851 SDValue PassThru = N->getPassThru();
33852 MVT IndexVT = Index.getSimpleValueType();
33853
33854   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33855
33856 // If the index is v2i32, we're being called by type legalization.
33857 if (IndexVT == MVT::v2i32)
33858 return SDValue();
33859
33860   // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33861 // need to widen until one is.
33862 MVT OrigVT = VT;
33863 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33864 !IndexVT.is512BitVector()) {
33865 // Determine how much we need to widen by to get a 512-bit type.
33866 unsigned Factor = std::min(512/VT.getSizeInBits(),
33867 512/IndexVT.getSizeInBits());
33868
33869 unsigned NumElts = VT.getVectorNumElements() * Factor;
33870
33871 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33872 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33873 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33874
33875 PassThru = ExtendToType(PassThru, VT, DAG);
33876 Index = ExtendToType(Index, IndexVT, DAG);
33877 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33878 }
33879
33880 // Break dependency on the data register.
33881 if (PassThru.isUndef())
33882 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33883
33884 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33885 N->getScale() };
33886 SDValue NewGather = DAG.getMemIntrinsicNode(
33887 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33888 N->getMemOperand());
33889 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33890 NewGather, DAG.getIntPtrConstant(0, dl));
33891 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33892}
33893
33894static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33895 SDLoc dl(Op);
33896 SDValue Src = Op.getOperand(0);
33897 MVT DstVT = Op.getSimpleValueType();
33898
33899 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33900 unsigned SrcAS = N->getSrcAddressSpace();
33901
33902   assert(SrcAS != N->getDestAddressSpace() &&
33903          "addrspacecast must be between different address spaces");
33904
33905 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33906 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33907 } else if (DstVT == MVT::i64) {
33908 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33909 } else if (DstVT == MVT::i32) {
33910 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33911 } else {
33912 report_fatal_error("Bad address space in addrspacecast");
33913 }
33914 return Op;
33915}
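
The addrspacecast lowering above reduces to three integer conversions. The scalar model below is a minimal sketch under the assumption that PTR32_UPTR denotes the unsigned 32-bit pointer address space; it is not part of the LLVM source and the function names are hypothetical.

#include <cstdint>

// 32-bit pointer widened to 64 bits: the unsigned address space zero-extends,
// the remaining 32-bit address space sign-extends.
std::uint64_t extendPtr32(std::uint32_t Ptr, bool IsUnsignedPtr32) {
  if (IsUnsignedPtr32)
    return static_cast<std::uint64_t>(Ptr);                          // ZERO_EXTEND
  return static_cast<std::uint64_t>(static_cast<std::int32_t>(Ptr)); // SIGN_EXTEND
}

// 64-bit pointer narrowed to a 32-bit address space: plain truncation.
std::uint32_t truncPtr64(std::uint64_t Ptr) {
  return static_cast<std::uint32_t>(Ptr); // TRUNCATE
}
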
33916
33917SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33918 SelectionDAG &DAG) const {
33919 // TODO: Eventually, the lowering of these nodes should be informed by or
33920 // deferred to the GC strategy for the function in which they appear. For
33921 // now, however, they must be lowered to something. Since they are logically
33922 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33923 // require special handling for these nodes), lower them as literal NOOPs for
33924 // the time being.
33925 SmallVector<SDValue, 2> Ops;
33926 Ops.push_back(Op.getOperand(0));
33927 if (Op->getGluedNode())
33928 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33929
33930 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33931 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33932}
33933
33934// Custom split CVTPS2PH with wide types.
33935static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33936 SDLoc dl(Op);
33937 EVT VT = Op.getValueType();
33938 SDValue Lo, Hi;
33939 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33940 EVT LoVT, HiVT;
33941 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33942 SDValue RC = Op.getOperand(1);
33943 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33944 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33945 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33946}
33947
33948static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33949 unsigned OpNo) {
33950 const APInt Operand(32, OpNo);
33951 std::string OpNoStr = llvm::toString(Operand, 10, false);
33952 std::string Str(" $");
33953
33954 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33955 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33956
33957 auto I = StringRef::npos;
33958 for (auto &AsmStr : AsmStrs) {
33959     // Match the OpNo string exactly so we don't match a sub-string,
33960     // e.g. "$12" contains "$1".
33961 if (AsmStr.endswith(OpNoStr1))
33962 I = AsmStr.size() - OpNoStr1.size();
33963
33964 // Get the index of operand in AsmStr.
33965 if (I == StringRef::npos)
33966 I = AsmStr.find(OpNoStr1 + ",");
33967 if (I == StringRef::npos)
33968 I = AsmStr.find(OpNoStr2);
33969
33970 if (I == StringRef::npos)
33971 continue;
33972
33973     assert(I > 0 && "Unexpected inline asm string!");
33974     // Remove the operand string and label (if it exists).
33975 // For example:
33976 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33977 // ==>
33978 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33979 // ==>
33980 // "call dword ptr "
33981 auto TmpStr = AsmStr.substr(0, I);
33982 I = TmpStr.rfind(':');
33983 if (I != StringRef::npos)
33984 TmpStr = TmpStr.substr(I + 1);
33985 return TmpStr.take_while(llvm::isAlpha);
33986 }
33987
33988 return StringRef();
33989}
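
getInstrStrFromOpNo locates the operand placeholder in the inline-asm string, strips any label up to the last ':', and keeps the leading alphabetic run as the mnemonic. The sketch below re-expresses just that extraction with std::string_view so it stays self-contained (the real code uses llvm::StringRef); the function name is hypothetical. Given the example string from the comment above and the position where the operand placeholder begins, it returns "call".

#include <cctype>
#include <cstddef>
#include <string_view>

std::string_view mnemonicBeforeOperand(std::string_view AsmStr,
                                       std::size_t OperandPos) {
  // Drop the operand placeholder and everything after it.
  std::string_view Head = AsmStr.substr(0, OperandPos);
  // Drop an optional label such as ".L__MSASMLABEL_.${:uid}__l:".
  if (std::size_t Colon = Head.rfind(':'); Colon != std::string_view::npos)
    Head = Head.substr(Colon + 1);
  // Keep the leading alphabetic run, e.g. "call" from "call dword ptr ".
  std::size_t Len = 0;
  while (Len < Head.size() &&
         std::isalpha(static_cast<unsigned char>(Head[Len])))
    ++Len;
  return Head.substr(0, Len);
}
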
33990
33991bool X86TargetLowering::isInlineAsmTargetBranch(
33992 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33993 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33994 // changed from indirect TargetLowering::C_Memory to direct
33995 // TargetLowering::C_Address.
33996 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33997 // location.
33998 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33999 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
34000}
34001
34002/// Provide custom lowering hooks for some operations.
34003SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
34004 switch (Op.getOpcode()) {
34005   default: llvm_unreachable("Should not custom lower this!");
34006 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
34007 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
34008 return LowerCMP_SWAP(Op, Subtarget, DAG);
34009 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
34010 case ISD::ATOMIC_LOAD_ADD:
34011 case ISD::ATOMIC_LOAD_SUB:
34012 case ISD::ATOMIC_LOAD_OR:
34013 case ISD::ATOMIC_LOAD_XOR:
34014 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
34015 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
34016 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
34017 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
34018 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
34019 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
34020 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
34021 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
34022 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
34023 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
34024 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
34025 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
34026 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
34027 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
34028 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
34029 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
34030 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
34031 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
34032 case ISD::SHL_PARTS:
34033 case ISD::SRA_PARTS:
34034 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
34035 case ISD::FSHL:
34036 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
34037 case ISD::STRICT_SINT_TO_FP:
34038 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
34039 case ISD::STRICT_UINT_TO_FP:
34040 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
34041 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
34042 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
34043 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
34044 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
34045 case ISD::ZERO_EXTEND_VECTOR_INREG:
34046 case ISD::SIGN_EXTEND_VECTOR_INREG:
34047 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
34048 case ISD::FP_TO_SINT:
34049 case ISD::STRICT_FP_TO_SINT:
34050 case ISD::FP_TO_UINT:
34051 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
34052 case ISD::FP_TO_SINT_SAT:
34053 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
34054 case ISD::FP_EXTEND:
34055 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
34056 case ISD::FP_ROUND:
34057 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
34058 case ISD::FP16_TO_FP:
34059 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
34060 case ISD::FP_TO_FP16:
34061 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
34062 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
34063 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
34064 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
34065 case ISD::FADD:
34066 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
34067 case ISD::FROUND: return LowerFROUND(Op, DAG);
34068 case ISD::FABS:
34069 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
34070 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
34071 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
34072 case ISD::LRINT:
34073 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
34074 case ISD::SETCC:
34075 case ISD::STRICT_FSETCC:
34076 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
34077 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
34078 case ISD::SELECT: return LowerSELECT(Op, DAG);
34079 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
34080 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
34081 case ISD::VASTART: return LowerVASTART(Op, DAG);
34082 case ISD::VAARG: return LowerVAARG(Op, DAG);
34083 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
34084 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
34085 case ISD::INTRINSIC_VOID:
34086 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
34087 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
34088 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
34089 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
34090 case ISD::FRAME_TO_ARGS_OFFSET:
34091 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
34092 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
34093 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
34094 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
34095 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
34096 case ISD::EH_SJLJ_SETUP_DISPATCH:
34097 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
34098 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
34099 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
34100 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
34101 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
34102 case ISD::CTLZ:
34103 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
34104 case ISD::CTTZ:
34105 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
34106 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
34107 case ISD::MULHS:
34108 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
34109 case ISD::ROTL:
34110 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
34111 case ISD::SRA:
34112 case ISD::SRL:
34113 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
34114 case ISD::SADDO:
34115 case ISD::UADDO:
34116 case ISD::SSUBO:
34117 case ISD::USUBO: return LowerXALUO(Op, DAG);
34118 case ISD::SMULO:
34119 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
34120 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
34121 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
34122 case ISD::SADDO_CARRY:
34123 case ISD::SSUBO_CARRY:
34124 case ISD::UADDO_CARRY:
34125 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
34126 case ISD::ADD:
34127 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
34128 case ISD::UADDSAT:
34129 case ISD::SADDSAT:
34130 case ISD::USUBSAT:
34131 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
34132 case ISD::SMAX:
34133 case ISD::SMIN:
34134 case ISD::UMAX:
34135 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
34136 case ISD::FMINIMUM:
34137 case ISD::FMAXIMUM:
34138 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
34139 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
34140 case ISD::ABDS:
34141 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
34142 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
34143 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
34144 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
34145 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
34146 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
34147 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
34148 case ISD::GC_TRANSITION_START:
34149 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
34150 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
34151 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
34152 }
34153}
34154
34155/// Replace a node with an illegal result type with a new node built out of
34156/// custom code.
34157void X86TargetLowering::ReplaceNodeResults(SDNode *N,
34158 SmallVectorImpl<SDValue>&Results,
34159 SelectionDAG &DAG) const {
34160 SDLoc dl(N);
34161 switch (N->getOpcode()) {
34162 default:
34163#ifndef NDEBUG
34164 dbgs() << "ReplaceNodeResults: ";
34165 N->dump(&DAG);
34166#endif
34167     llvm_unreachable("Do not know how to custom type legalize this operation!");
34168 case X86ISD::CVTPH2PS: {
34169 EVT VT = N->getValueType(0);
34170 SDValue Lo, Hi;
34171 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34172 EVT LoVT, HiVT;
34173 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34174 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
34175 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
34176 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34177 Results.push_back(Res);
34178 return;
34179 }
34180 case X86ISD::STRICT_CVTPH2PS: {
34181 EVT VT = N->getValueType(0);
34182 SDValue Lo, Hi;
34183 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
34184 EVT LoVT, HiVT;
34185 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34186 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
34187 {N->getOperand(0), Lo});
34188 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
34189 {N->getOperand(0), Hi});
34190 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34191 Lo.getValue(1), Hi.getValue(1));
34192 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34193 Results.push_back(Res);
34194 Results.push_back(Chain);
34195 return;
34196 }
34197 case X86ISD::CVTPS2PH:
34198 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
34199 return;
34200 case ISD::CTPOP: {
34201     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34202 // Use a v2i64 if possible.
34203 bool NoImplicitFloatOps =
34204 DAG.getMachineFunction().getFunction().hasFnAttribute(
34205 Attribute::NoImplicitFloat);
34206 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34207 SDValue Wide =
34208 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34209 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34210 // Bit count should fit in 32-bits, extract it as that and then zero
34211 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34212 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34213 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34214 DAG.getIntPtrConstant(0, dl));
34215 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34216 Results.push_back(Wide);
34217 }
34218 return;
34219 }
34220 case ISD::MUL: {
34221 EVT VT = N->getValueType(0);
34222     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34223            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34224 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34225 // elements are needed.
34226 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34227 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34228 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34229 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34230 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34231 unsigned NumConcats = 16 / VT.getVectorNumElements();
34232 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34233 ConcatOps[0] = Res;
34234 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34235 Results.push_back(Res);
34236 return;
34237 }
34238 case ISD::SMULO:
34239 case ISD::UMULO: {
34240 EVT VT = N->getValueType(0);
34241     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34242            VT == MVT::v2i32 && "Unexpected VT!");
34243 bool IsSigned = N->getOpcode() == ISD::SMULO;
34244 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34245 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34246 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34247 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34248 // Extract the high 32 bits from each result using PSHUFD.
34249 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34250 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34251 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34252 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34253 DAG.getIntPtrConstant(0, dl));
34254
34255 // Truncate the low bits of the result. This will become PSHUFD.
34256 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34257
34258 SDValue HiCmp;
34259 if (IsSigned) {
34260 // SMULO overflows if the high bits don't match the sign of the low.
34261 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34262 } else {
34263 // UMULO overflows if the high bits are non-zero.
34264 HiCmp = DAG.getConstant(0, dl, VT);
34265 }
34266 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34267
34268     // Widen the result by padding with undef.
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(VT));
34271 Results.push_back(Res);
34272 Results.push_back(Ovf);
34273 return;
34274 }
34275 case X86ISD::VPMADDWD: {
34276 // Legalize types for X86ISD::VPMADDWD by widening.
34277     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34278
34279 EVT VT = N->getValueType(0);
34280 EVT InVT = N->getOperand(0).getValueType();
34281     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34282            "Expected a VT that divides into 128 bits.");
34283     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34284            "Unexpected type action!");
34285 unsigned NumConcat = 128 / InVT.getSizeInBits();
34286
34287 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34288 InVT.getVectorElementType(),
34289 NumConcat * InVT.getVectorNumElements());
34290 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34291 VT.getVectorElementType(),
34292 NumConcat * VT.getVectorNumElements());
34293
34294 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34295 Ops[0] = N->getOperand(0);
34296 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34297 Ops[0] = N->getOperand(1);
34298 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34299
34300 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34301 Results.push_back(Res);
34302 return;
34303 }
34304 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34305 case X86ISD::FMINC:
34306 case X86ISD::FMIN:
34307 case X86ISD::FMAXC:
34308 case X86ISD::FMAX: {
34309 EVT VT = N->getValueType(0);
34310     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34311 SDValue UNDEF = DAG.getUNDEF(VT);
34312 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34313 N->getOperand(0), UNDEF);
34314 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34315 N->getOperand(1), UNDEF);
34316 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34317 return;
34318 }
34319 case ISD::SDIV:
34320 case ISD::UDIV:
34321 case ISD::SREM:
34322 case ISD::UREM: {
34323 EVT VT = N->getValueType(0);
34324 if (VT.isVector()) {
34325       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34326              "Unexpected type action!");
34327 // If this RHS is a constant splat vector we can widen this and let
34328 // division/remainder by constant optimize it.
34329 // TODO: Can we do something for non-splat?
34330 APInt SplatVal;
34331 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34332 unsigned NumConcats = 128 / VT.getSizeInBits();
34333 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34334 Ops0[0] = N->getOperand(0);
34335 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34336 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34337 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34338 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34339 Results.push_back(Res);
34340 }
34341 return;
34342 }
34343
34344 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34345 Results.push_back(V);
34346 return;
34347 }
34348 case ISD::TRUNCATE: {
34349 MVT VT = N->getSimpleValueType(0);
34350 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34351 return;
34352
34353 // The generic legalizer will try to widen the input type to the same
34354 // number of elements as the widened result type. But this isn't always
34355 // the best thing so do some custom legalization to avoid some cases.
34356 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34357 SDValue In = N->getOperand(0);
34358 EVT InVT = In.getValueType();
34359
34360 unsigned InBits = InVT.getSizeInBits();
34361 if (128 % InBits == 0) {
34362       // 128-bit and smaller inputs should avoid truncate altogether and
34363 // just use a build_vector that will become a shuffle.
34364 // TODO: Widen and use a shuffle directly?
34365 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34366 EVT EltVT = VT.getVectorElementType();
34367 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34368 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34369 // Use the original element count so we don't do more scalar opts than
34370 // necessary.
34371 unsigned MinElts = VT.getVectorNumElements();
34372 for (unsigned i=0; i < MinElts; ++i) {
34373 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34374 DAG.getIntPtrConstant(i, dl));
34375 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34376 }
34377 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34378 return;
34379 }
34380 // With AVX512 there are some cases that can use a target specific
34381 // truncate node to go from 256/512 to less than 128 with zeros in the
34382 // upper elements of the 128 bit result.
34383 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34384 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34385 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34386 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34387 return;
34388 }
34389 // There's one case we can widen to 512 bits and use VTRUNC.
34390 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34391 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34392 DAG.getUNDEF(MVT::v4i64));
34393 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34394 return;
34395 }
34396 }
34397 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34398 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34399 isTypeLegal(MVT::v4i64)) {
34400 // Input needs to be split and output needs to widened. Let's use two
34401 // VTRUNCs, and shuffle their results together into the wider type.
34402 SDValue Lo, Hi;
34403 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34404
34405 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34406 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34407 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34408 { 0, 1, 2, 3, 16, 17, 18, 19,
34409 -1, -1, -1, -1, -1, -1, -1, -1 });
34410 Results.push_back(Res);
34411 return;
34412 }
34413
34414 return;
34415 }
34416 case ISD::ANY_EXTEND:
34417 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34418 // It's intended to custom handle the input type.
34419     assert(N->getValueType(0) == MVT::v8i8 &&
34420            "Do not know how to legalize this Node");
34421 return;
34422 case ISD::SIGN_EXTEND:
34423 case ISD::ZERO_EXTEND: {
34424 EVT VT = N->getValueType(0);
34425 SDValue In = N->getOperand(0);
34426 EVT InVT = In.getValueType();
34427 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34428 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34429       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34430              "Unexpected type action!");
34431       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34432 // Custom split this so we can extend i8/i16->i32 invec. This is better
34433 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34434 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34435 // we allow the sra from the extend to i32 to be shared by the split.
34436 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34437
34438 // Fill a vector with sign bits for each element.
34439 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34440 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34441
34442 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34443 // to v2i64.
34444 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34445 {0, 4, 1, 5});
34446 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34447 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34448 {2, 6, 3, 7});
34449 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34450
34451 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34452 Results.push_back(Res);
34453 return;
34454 }
34455
34456 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34457 if (!InVT.is128BitVector()) {
34458 // Not a 128 bit vector, but maybe type legalization will promote
34459 // it to 128 bits.
34460 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34461 return;
34462 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34463 if (!InVT.is128BitVector())
34464 return;
34465
34466 // Promote the input to 128 bits. Type legalization will turn this into
34467 // zext_inreg/sext_inreg.
34468 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34469 }
34470
34471 // Perform custom splitting instead of the two stage extend we would get
34472 // by default.
34473 EVT LoVT, HiVT;
34474 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34475       assert(isTypeLegal(LoVT) && "Split VT not legal?");
34476
34477 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34478
34479 // We need to shift the input over by half the number of elements.
34480 unsigned NumElts = InVT.getVectorNumElements();
34481 unsigned HalfNumElts = NumElts / 2;
34482 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34483 for (unsigned i = 0; i != HalfNumElts; ++i)
34484 ShufMask[i] = i + HalfNumElts;
34485
34486 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34487 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34488
34489 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34490 Results.push_back(Res);
34491 }
34492 return;
34493 }
34494 case ISD::FP_TO_SINT:
34495 case ISD::STRICT_FP_TO_SINT:
34496 case ISD::FP_TO_UINT:
34497 case ISD::STRICT_FP_TO_UINT: {
34498 bool IsStrict = N->isStrictFPOpcode();
34499 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34500 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34501 EVT VT = N->getValueType(0);
34502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34503 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34504 EVT SrcVT = Src.getValueType();
34505
34506 SDValue Res;
34507 if (isSoftFP16(SrcVT)) {
34508 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34509 if (IsStrict) {
34510 Res =
34511 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34512 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34513 {NVT, MVT::Other}, {Chain, Src})});
34514 Chain = Res.getValue(1);
34515 } else {
34516 Res = DAG.getNode(N->getOpcode(), dl, VT,
34517 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34518 }
34519 Results.push_back(Res);
34520 if (IsStrict)
34521 Results.push_back(Chain);
34522
34523 return;
34524 }
34525
34526 if (VT.isVector() && Subtarget.hasFP16() &&
34527 SrcVT.getVectorElementType() == MVT::f16) {
34528 EVT EleVT = VT.getVectorElementType();
34529 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34530
34531 if (SrcVT != MVT::v8f16) {
34532 SDValue Tmp =
34533 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34534 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34535 Ops[0] = Src;
34536 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34537 }
34538
34539 if (IsStrict) {
34540 unsigned Opc =
34541 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34542 Res =
34543 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34544 Chain = Res.getValue(1);
34545 } else {
34546 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34547 Res = DAG.getNode(Opc, dl, ResVT, Src);
34548 }
34549
34550 // TODO: Need to add exception check code for strict FP.
34551 if (EleVT.getSizeInBits() < 16) {
34552 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34553 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34554
34555 // Now widen to 128 bits.
34556 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34557 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34558 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34559 ConcatOps[0] = Res;
34560 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34561 }
34562
34563 Results.push_back(Res);
34564 if (IsStrict)
34565 Results.push_back(Chain);
34566
34567 return;
34568 }
34569
34570 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34571       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34572              "Unexpected type action!");
34573
34574 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34575 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34576 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34577 VT.getVectorNumElements());
34578 SDValue Res;
34579 SDValue Chain;
34580 if (IsStrict) {
34581 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34582 {N->getOperand(0), Src});
34583 Chain = Res.getValue(1);
34584 } else
34585 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34586
34587 // Preserve what we know about the size of the original result. If the
34588 // result is v2i32, we have to manually widen the assert.
34589 if (PromoteVT == MVT::v2i32)
34590 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34591 DAG.getUNDEF(MVT::v2i32));
34592
34593 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34594 Res.getValueType(), Res,
34595 DAG.getValueType(VT.getVectorElementType()));
34596
34597 if (PromoteVT == MVT::v2i32)
34598 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34599 DAG.getIntPtrConstant(0, dl));
34600
34601 // Truncate back to the original width.
34602 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34603
34604 // Now widen to 128 bits.
34605 unsigned NumConcats = 128 / VT.getSizeInBits();
34606 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34607 VT.getVectorNumElements() * NumConcats);
34608 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34609 ConcatOps[0] = Res;
34610 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34611 Results.push_back(Res);
34612 if (IsStrict)
34613 Results.push_back(Chain);
34614 return;
34615 }
34616
34617
34618 if (VT == MVT::v2i32) {
34619       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34620              "Strict unsigned conversion requires AVX512");
34621       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34622       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34623              "Unexpected type action!");
34624 if (Src.getValueType() == MVT::v2f64) {
34625 if (!IsSigned && !Subtarget.hasAVX512()) {
34626 SDValue Res =
34627 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34628 Results.push_back(Res);
34629 return;
34630 }
34631
34632 unsigned Opc;
34633 if (IsStrict)
34634 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34635 else
34636 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34637
34638         // If we have VLX we can emit a target-specific FP_TO_UINT node.
34639 if (!IsSigned && !Subtarget.hasVLX()) {
34640 // Otherwise we can defer to the generic legalizer which will widen
34641 // the input as well. This will be further widened during op
34642 // legalization to v8i32<-v8f64.
34643 // For strict nodes we'll need to widen ourselves.
34644 // FIXME: Fix the type legalizer to safely widen strict nodes?
34645 if (!IsStrict)
34646 return;
34647 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34648 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34649 Opc = N->getOpcode();
34650 }
34651 SDValue Res;
34652 SDValue Chain;
34653 if (IsStrict) {
34654 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34655 {N->getOperand(0), Src});
34656 Chain = Res.getValue(1);
34657 } else {
34658 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34659 }
34660 Results.push_back(Res);
34661 if (IsStrict)
34662 Results.push_back(Chain);
34663 return;
34664 }
34665
34666 // Custom widen strict v2f32->v2i32 by padding with zeros.
34667 // FIXME: Should generic type legalizer do this?
34668 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34669 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34670 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34671 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34672 {N->getOperand(0), Src});
34673 Results.push_back(Res);
34674 Results.push_back(Res.getValue(1));
34675 return;
34676 }
34677
34678 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34679 // so early out here.
34680 return;
34681 }
34682
34683 assert(!VT.isVector() && "Vectors should have been handled above!");
34684
34685 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34686 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34687 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34688 assert(!Subtarget.is64Bit() && "i64 should be legal");
34689 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34690 // If we use a 128-bit result we might need to use a target specific node.
34691 unsigned SrcElts =
34692 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34693 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34694 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34695 unsigned Opc = N->getOpcode();
34696 if (NumElts != SrcElts) {
34697 if (IsStrict)
34698 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34699 else
34700 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34701 }
34702
34703 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34704 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34705 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34706 ZeroIdx);
34707 SDValue Chain;
34708 if (IsStrict) {
34709 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34710 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34711 Chain = Res.getValue(1);
34712 } else
34713 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34714 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34715 Results.push_back(Res);
34716 if (IsStrict)
34717 Results.push_back(Chain);
34718 return;
34719 }
34720
34721 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34722 SDValue Chain;
34723 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34724 Results.push_back(V);
34725 if (IsStrict)
34726 Results.push_back(Chain);
34727 return;
34728 }
34729
34730 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34731 Results.push_back(V);
34732 if (IsStrict)
34733 Results.push_back(Chain);
34734 }
34735 return;
34736 }
34737 case ISD::LRINT:
34738 case ISD::LLRINT: {
34739 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34740 Results.push_back(V);
34741 return;
34742 }
34743
34744 case ISD::SINT_TO_FP:
34745 case ISD::STRICT_SINT_TO_FP:
34746 case ISD::UINT_TO_FP:
34747 case ISD::STRICT_UINT_TO_FP: {
34748 bool IsStrict = N->isStrictFPOpcode();
34749 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34750 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34751 EVT VT = N->getValueType(0);
34752 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34753 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34754 Subtarget.hasVLX()) {
34755 if (Src.getValueType().getVectorElementType() == MVT::i16)
34756 return;
34757
34758 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34759 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34760 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34761 : DAG.getUNDEF(MVT::v2i32));
34762 if (IsStrict) {
34763 unsigned Opc =
34764 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34765 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34766 {N->getOperand(0), Src});
34767 Results.push_back(Res);
34768 Results.push_back(Res.getValue(1));
34769 } else {
34770 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34771 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34772 }
34773 return;
34774 }
34775 if (VT != MVT::v2f32)
34776 return;
34777 EVT SrcVT = Src.getValueType();
34778 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34779 if (IsStrict) {
34780 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34781 : X86ISD::STRICT_CVTUI2P;
34782 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34783 {N->getOperand(0), Src});
34784 Results.push_back(Res);
34785 Results.push_back(Res.getValue(1));
34786 } else {
34787 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34788 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34789 }
34790 return;
34791 }
34792 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34793 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34794 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34795 SDValue One = DAG.getConstant(1, dl, SrcVT);
34796 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34797 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34798 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34799 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34800 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34801 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34802 for (int i = 0; i != 2; ++i) {
34803 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34804 SignSrc, DAG.getIntPtrConstant(i, dl));
34805 if (IsStrict)
34806 SignCvts[i] =
34807 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34808 {N->getOperand(0), Elt});
34809 else
34810 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34811 }
34812 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34813 SDValue Slow, Chain;
34814 if (IsStrict) {
34815 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34816 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34817 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34818 {Chain, SignCvt, SignCvt});
34819 Chain = Slow.getValue(1);
34820 } else {
34821 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34822 }
34823 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34824 IsNeg =
34825 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34826 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34827 Results.push_back(Cvt);
34828 if (IsStrict)
34829 Results.push_back(Chain);
34830 return;
34831 }
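The shift/select sequence above is the usual way to convert unsigned 64-bit values when only a signed int64 -> float conversion is available. A minimal scalar sketch of the same idea, with illustrative names that are not part of this file:

  #include <cstdint>

  float U64ToF32ViaSigned(uint64_t X) {
    if ((int64_t)X >= 0)               // High bit clear: signed convert is exact.
      return (float)(int64_t)X;
    // High bit set: halve with a sticky low bit so the final rounding is
    // preserved, convert the now non-negative value, then double it.
    uint64_t Halved = (X >> 1) | (X & 1);
    return (float)(int64_t)Halved * 2.0f;
  }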
34832
34833 if (SrcVT != MVT::v2i32)
34834 return;
34835
34836 if (IsSigned || Subtarget.hasAVX512()) {
34837 if (!IsStrict)
34838 return;
34839
34840 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34841 // FIXME: Should generic type legalizer do this?
34842 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34843 DAG.getConstant(0, dl, MVT::v2i32));
34844 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34845 {N->getOperand(0), Src});
34846 Results.push_back(Res);
34847 Results.push_back(Res.getValue(1));
34848 return;
34849 }
34850
34851 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34852 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34853 SDValue VBias = DAG.getConstantFP(
34854 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34855 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34856 DAG.getBitcast(MVT::v2i64, VBias));
34857 Or = DAG.getBitcast(MVT::v2f64, Or);
34858 if (IsStrict) {
34859 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34860 {N->getOperand(0), Or, VBias});
34861 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34862 {MVT::v4f32, MVT::Other},
34863 {Sub.getValue(1), Sub});
34864 Results.push_back(Res);
34865 Results.push_back(Res.getValue(1));
34866 } else {
34867 // TODO: Are there any fast-math-flags to propagate here?
34868 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34869 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34870 }
34871 return;
34872 }
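The ZExtIn/VBias sequence above is the classic exponent-bias trick for unsigned 32-bit to double conversion. A scalar sketch under the same assumption (names are illustrative, not from this file):

  #include <cstdint>
  #include <cstring>

  double U32ToF64ViaBias(uint32_t X) {
    // 0x4330000000000000 is the bit pattern of 2^52; its low 32 mantissa bits
    // are free, so OR-ing in X yields a double equal to 2^52 + X exactly.
    uint64_t Bits = 0x4330000000000000ULL | (uint64_t)X;
    double D;
    std::memcpy(&D, &Bits, sizeof(D));
    return D - 4503599627370496.0;     // subtract 2^52 to recover X
  }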
34873 case ISD::STRICT_FP_ROUND:
34874 case ISD::FP_ROUND: {
34875 bool IsStrict = N->isStrictFPOpcode();
34876 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34877 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34878 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34879 EVT SrcVT = Src.getValueType();
34880 EVT VT = N->getValueType(0);
34881 SDValue V;
34882 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34883 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34884 : DAG.getUNDEF(MVT::v2f32);
34885 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34886 }
34887 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34888 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34889 if (SrcVT.getVectorElementType() != MVT::f32)
34890 return;
34891
34892 if (IsStrict)
34893 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34894 {Chain, Src, Rnd});
34895 else
34896 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34897
34898 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34899 if (IsStrict)
34900 Results.push_back(V.getValue(1));
34901 return;
34902 }
34903 if (!isTypeLegal(Src.getValueType()))
34904 return;
34905 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34906 if (IsStrict)
34907 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34908 {Chain, Src});
34909 else
34910 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34911 Results.push_back(V);
34912 if (IsStrict)
34913 Results.push_back(V.getValue(1));
34914 return;
34915 }
34916 case ISD::FP_EXTEND:
34917 case ISD::STRICT_FP_EXTEND: {
34918 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34919 // No other ValueType for FP_EXTEND should reach this point.
34920 assert(N->getValueType(0) == MVT::v2f32 &&
34921 "Do not know how to legalize this Node");
34922 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34923 return;
34924 bool IsStrict = N->isStrictFPOpcode();
34925 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34926 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34927 : DAG.getUNDEF(MVT::v2f16);
34928 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34929 if (IsStrict)
34930 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34931 {N->getOperand(0), V});
34932 else
34933 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34934 Results.push_back(V);
34935 if (IsStrict)
34936 Results.push_back(V.getValue(1));
34937 return;
34938 }
34939 case ISD::INTRINSIC_W_CHAIN: {
34940 unsigned IntNo = N->getConstantOperandVal(1);
34941 switch (IntNo) {
34942 default : llvm_unreachable("Do not know how to custom type "
34943 "legalize this intrinsic operation!");
34944 case Intrinsic::x86_rdtsc:
34945 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34946 Results);
34947 case Intrinsic::x86_rdtscp:
34948 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34949 Results);
34950 case Intrinsic::x86_rdpmc:
34951 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34952 Results);
34953 return;
34954 case Intrinsic::x86_rdpru:
34955 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34956 Results);
34957 return;
34958 case Intrinsic::x86_xgetbv:
34959 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34960 Results);
34961 return;
34962 }
34963 }
34964 case ISD::READCYCLECOUNTER: {
34965 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34966 }
34967 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34968 EVT T = N->getValueType(0);
34969 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34970 bool Regs64bit = T == MVT::i128;
34971 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34972 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34973 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34974 SDValue cpInL, cpInH;
34975 std::tie(cpInL, cpInH) =
34976 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34977 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34978 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34979 cpInH =
34980 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34981 cpInH, cpInL.getValue(1));
34982 SDValue swapInL, swapInH;
34983 std::tie(swapInL, swapInH) =
34984 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34985 swapInH =
34986 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34987 swapInH, cpInH.getValue(1));
34988
34989 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34990 // until later. So we keep the RBX input in a vreg and use a custom
34991 // inserter.
34992 // Since RBX will be a reserved register, the register allocator will not
34993 // make sure its value is properly saved and restored around this
34994 // live-range.
34995 SDValue Result;
34996 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34997 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34998 if (Regs64bit) {
34999 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
35000 swapInH.getValue(1)};
35001 Result =
35002 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
35003 } else {
35004 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
35005 swapInH.getValue(1));
35006 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
35007 swapInL.getValue(1)};
35008 Result =
35009 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
35010 }
35011
35012 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
35013 Regs64bit ? X86::RAX : X86::EAX,
35014 HalfT, Result.getValue(1));
35015 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
35016 Regs64bit ? X86::RDX : X86::EDX,
35017 HalfT, cpOutL.getValue(2));
35018 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
35019
35020 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
35021 MVT::i32, cpOutH.getValue(2));
35022 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
35023 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
35024
35025 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
35026 Results.push_back(Success);
35027 Results.push_back(EFLAGS.getValue(1));
35028 return;
35029 }
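For orientation, the register setup above (expected value in EDX:EAX / RDX:RAX, replacement in ECX:EBX / RCX:RBX) is the CMPXCHG8B/CMPXCHG16B convention. A hedged source-level sketch of the operation being lowered, using names that are not from this file:

  #include <atomic>
  #include <cstdint>

  // On 32-bit x86 this typically compiles to LOCK CMPXCHG8B with the expected
  // value in EDX:EAX and the replacement in ECX:EBX.
  bool CAS64(std::atomic<uint64_t> &A, uint64_t &Expected, uint64_t Desired) {
    return A.compare_exchange_strong(Expected, Desired);
  }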
35030 case ISD::ATOMIC_LOAD: {
35031 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35032 bool NoImplicitFloatOps =
35033 DAG.getMachineFunction().getFunction().hasFnAttribute(
35034 Attribute::NoImplicitFloat);
35035 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
35036 auto *Node = cast<AtomicSDNode>(N);
35037 if (Subtarget.hasSSE1()) {
35038 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
35039 // Then extract the lower 64-bits.
35040 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
35041 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
35042 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35043 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35044 MVT::i64, Node->getMemOperand());
35045 if (Subtarget.hasSSE2()) {
35046 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
35047 DAG.getIntPtrConstant(0, dl));
35048 Results.push_back(Res);
35049 Results.push_back(Ld.getValue(1));
35050 return;
35051 }
35052 // We use an alternative sequence for SSE1 that extracts as v2f32 and
35053 // then casts to i64. This avoids a 128-bit stack temporary being
35054 // created by type legalization if we were to cast v4f32->v2i64.
35055 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
35056 DAG.getIntPtrConstant(0, dl));
35057 Res = DAG.getBitcast(MVT::i64, Res);
35058 Results.push_back(Res);
35059 Results.push_back(Ld.getValue(1));
35060 return;
35061 }
35062 if (Subtarget.hasX87()) {
35063 // First load this into an 80-bit X87 register. This will put the whole
35064 // integer into the significand.
35065 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
35066 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35067 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
35068 dl, Tys, Ops, MVT::i64,
35069 Node->getMemOperand());
35070 SDValue Chain = Result.getValue(1);
35071
35072 // Now store the X87 register to a stack temporary and convert to i64.
35073 // This store is not atomic and doesn't need to be.
35074 // FIXME: We don't need a stack temporary if the result of the load
35075 // is already being stored. We could just directly store there.
35076 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
35077 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
35078 MachinePointerInfo MPI =
35079 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
35080 SDValue StoreOps[] = { Chain, Result, StackPtr };
35081 Chain = DAG.getMemIntrinsicNode(
35082 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
35083 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
35084
35085 // Finally load the value back from the stack temporary and return it.
35086 // This load is not atomic and doesn't need to be.
35087 // This load will be further type legalized.
35088 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
35089 Results.push_back(Result);
35090 Results.push_back(Result.getValue(1));
35091 return;
35092 }
35093 }
35094 // TODO: Use MOVLPS when SSE1 is available?
35095 // Delegate to generic TypeLegalization. Situations we can really handle
35096 // should have already been dealt with by AtomicExpandPass.cpp.
35097 break;
35098 }
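A note on why the SSE and x87 paths above exist (sketch with illustrative names): on 32-bit x86 a 64-bit load done as two 32-bit moves is not atomic, so the lowering uses a single 8-byte access (MOVQ with SSE2, or FILD/FISTP through a stack slot on x87-only targets). At the source level this is roughly what sits behind:

  #include <atomic>
  #include <cstdint>

  uint64_t AtomicLoad64(const std::atomic<uint64_t> &V) {
    // Expected to become a single 8-byte load on i386-class targets rather
    // than a pair of 32-bit loads.
    return V.load(std::memory_order_seq_cst);
  }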
35099 case ISD::ATOMIC_SWAP:
35100 case ISD::ATOMIC_LOAD_ADD:
35101 case ISD::ATOMIC_LOAD_SUB:
35102 case ISD::ATOMIC_LOAD_AND:
35103 case ISD::ATOMIC_LOAD_OR:
35104 case ISD::ATOMIC_LOAD_XOR:
35105 case ISD::ATOMIC_LOAD_NAND:
35106 case ISD::ATOMIC_LOAD_MIN:
35107 case ISD::ATOMIC_LOAD_MAX:
35108 case ISD::ATOMIC_LOAD_UMIN:
35109 case ISD::ATOMIC_LOAD_UMAX:
35110 // Delegate to generic TypeLegalization. Situations we can really handle
35111 // should have already been dealt with by AtomicExpandPass.cpp.
35112 break;
35113
35114 case ISD::BITCAST: {
35115 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
35116 EVT DstVT = N->getValueType(0);
35117 EVT SrcVT = N->getOperand(0).getValueType();
35118
35119 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
35120 // we can split using the k-register rather than memory.
35121 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
35122 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
35123 SDValue Lo, Hi;
35124 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
35125 Lo = DAG.getBitcast(MVT::i32, Lo);
35126 Hi = DAG.getBitcast(MVT::i32, Hi);
35127 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
35128 Results.push_back(Res);
35129 return;
35130 }
35131
35132 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
35133 // FIXME: Use v4f32 for SSE1?
35134 assert(Subtarget.hasSSE2() && "Requires SSE2");
35135 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
35136 "Unexpected type action!");
35137 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
35138 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
35139 N->getOperand(0));
35140 Res = DAG.getBitcast(WideVT, Res);
35141 Results.push_back(Res);
35142 return;
35143 }
35144
35145 return;
35146 }
35147 case ISD::MGATHER: {
35148 EVT VT = N->getValueType(0);
35149 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
35150 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
35151 auto *Gather = cast<MaskedGatherSDNode>(N);
35152 SDValue Index = Gather->getIndex();
35153 if (Index.getValueType() != MVT::v2i64)
35154 return;
35155 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35156 "Unexpected type action!");
35157 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35158 SDValue Mask = Gather->getMask();
35159 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
35160 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
35161 Gather->getPassThru(),
35162 DAG.getUNDEF(VT));
35163 if (!Subtarget.hasVLX()) {
35164 // We need to widen the mask, but the instruction will only use 2
35165 // of its elements. So we can use undef.
35166 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
35167 DAG.getUNDEF(MVT::v2i1));
35168 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
35169 }
35170 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
35171 Gather->getBasePtr(), Index, Gather->getScale() };
35172 SDValue Res = DAG.getMemIntrinsicNode(
35173 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
35174 Gather->getMemoryVT(), Gather->getMemOperand());
35175 Results.push_back(Res);
35176 Results.push_back(Res.getValue(1));
35177 return;
35178 }
35179 return;
35180 }
35181 case ISD::LOAD: {
35182 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
35183 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
35184 // cast since type legalization will try to use an i64 load.
35185 MVT VT = N->getSimpleValueType(0);
35186 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
35187 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35188 "Unexpected type action!");
35189 if (!ISD::isNON_EXTLoad(N))
35190 return;
35191 auto *Ld = cast<LoadSDNode>(N);
35192 if (Subtarget.hasSSE2()) {
35193 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
35194 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
35195 Ld->getPointerInfo(), Ld->getOriginalAlign(),
35196 Ld->getMemOperand()->getFlags());
35197 SDValue Chain = Res.getValue(1);
35198 MVT VecVT = MVT::getVectorVT(LdVT, 2);
35199 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
35200 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35201 Res = DAG.getBitcast(WideVT, Res);
35202 Results.push_back(Res);
35203 Results.push_back(Chain);
35204 return;
35205 }
35206 assert(Subtarget.hasSSE1() && "Expected SSE");
35207 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35208 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35209 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35210 MVT::i64, Ld->getMemOperand());
35211 Results.push_back(Res);
35212 Results.push_back(Res.getValue(1));
35213 return;
35214 }
35215 case ISD::ADDRSPACECAST: {
35216 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35217 Results.push_back(V);
35218 return;
35219 }
35220 case ISD::BITREVERSE: {
35221 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35222 assert(Subtarget.hasXOP() && "Expected XOP");
35223 // We can use VPPERM by copying to a vector register and back. We'll need
35224 // to move the scalar in two i32 pieces.
35225 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35226 return;
35227 }
35228 case ISD::EXTRACT_VECTOR_ELT: {
35229 // f16 = extract vXf16 %vec, i64 %idx
35230 assert(N->getSimpleValueType(0) == MVT::f16 &&
35231 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35232 assert(Subtarget.hasFP16() && "Expected FP16");
35233 SDValue VecOp = N->getOperand(0);
35234 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35235 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35236 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35237 N->getOperand(1));
35238 Split = DAG.getBitcast(MVT::f16, Split);
35239 Results.push_back(Split);
35240 return;
35241 }
35242 }
35243}
35244
35245const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35246 switch ((X86ISD::NodeType)Opcode) {
35247 case X86ISD::FIRST_NUMBER: break;
35248#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35249 NODE_NAME_CASE(BSF)
35250 NODE_NAME_CASE(BSR)
35251 NODE_NAME_CASE(FSHL)
35252 NODE_NAME_CASE(FSHR)
35253 NODE_NAME_CASE(FAND)
35254 NODE_NAME_CASE(FANDN)
35255 NODE_NAME_CASE(FOR)
35256 NODE_NAME_CASE(FXOR)
35257 NODE_NAME_CASE(FILD)
35258 NODE_NAME_CASE(FIST)
35259 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35260 NODE_NAME_CASE(FLD)
35261 NODE_NAME_CASE(FST)
35262 NODE_NAME_CASE(CALL)
35263 NODE_NAME_CASE(CALL_RVMARKER)
35264 NODE_NAME_CASE(BT)
35265 NODE_NAME_CASE(CMP)
35266 NODE_NAME_CASE(FCMP)
35267 NODE_NAME_CASE(STRICT_FCMP)
35268 NODE_NAME_CASE(STRICT_FCMPS)
35269 NODE_NAME_CASE(COMI)
35270 NODE_NAME_CASE(UCOMI)
35271 NODE_NAME_CASE(CMPM)
35272 NODE_NAME_CASE(CMPMM)
35273 NODE_NAME_CASE(STRICT_CMPM)
35274 NODE_NAME_CASE(CMPMM_SAE)
35275 NODE_NAME_CASE(SETCC)
35276 NODE_NAME_CASE(SETCC_CARRY)
35277 NODE_NAME_CASE(FSETCC)
35278 NODE_NAME_CASE(FSETCCM)
35279 NODE_NAME_CASE(FSETCCM_SAE)
35280 NODE_NAME_CASE(CMOV)
35281 NODE_NAME_CASE(BRCOND)
35282 NODE_NAME_CASE(RET_GLUE)
35283 NODE_NAME_CASE(IRET)
35284 NODE_NAME_CASE(REP_STOS)
35285 NODE_NAME_CASE(REP_MOVS)
35286 NODE_NAME_CASE(GlobalBaseReg)
35287 NODE_NAME_CASE(Wrapper)
35288 NODE_NAME_CASE(WrapperRIP)
35289 NODE_NAME_CASE(MOVQ2DQ)
35290 NODE_NAME_CASE(MOVDQ2Q)
35291 NODE_NAME_CASE(MMX_MOVD2W)
35292 NODE_NAME_CASE(MMX_MOVW2D)
35293 NODE_NAME_CASE(PEXTRB)
35294 NODE_NAME_CASE(PEXTRW)
35295 NODE_NAME_CASE(INSERTPS)
35296 NODE_NAME_CASE(PINSRB)
35297 NODE_NAME_CASE(PINSRW)
35298 NODE_NAME_CASE(PSHUFB)
35299 NODE_NAME_CASE(ANDNP)
35300 NODE_NAME_CASE(BLENDI)
35301 NODE_NAME_CASE(BLENDV)
35302 NODE_NAME_CASE(HADD)
35303 NODE_NAME_CASE(HSUB)
35304 NODE_NAME_CASE(FHADD)
35305 NODE_NAME_CASE(FHSUB)
35306 NODE_NAME_CASE(CONFLICT)
35307 NODE_NAME_CASE(FMAX)
35308 NODE_NAME_CASE(FMAXS)
35309 NODE_NAME_CASE(FMAX_SAE)
35310 NODE_NAME_CASE(FMAXS_SAE)
35311 NODE_NAME_CASE(FMIN)
35312 NODE_NAME_CASE(FMINS)
35313 NODE_NAME_CASE(FMIN_SAE)
35314 NODE_NAME_CASE(FMINS_SAE)
35315 NODE_NAME_CASE(FMAXC)
35316 NODE_NAME_CASE(FMINC)
35317 NODE_NAME_CASE(FRSQRT)
35318 NODE_NAME_CASE(FRCP)
35319 NODE_NAME_CASE(EXTRQI)
35320 NODE_NAME_CASE(INSERTQI)
35321 NODE_NAME_CASE(TLSADDR)
35322 NODE_NAME_CASE(TLSBASEADDR)
35323 NODE_NAME_CASE(TLSCALL)
35324 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35325 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35326 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35327 NODE_NAME_CASE(EH_RETURN)
35328 NODE_NAME_CASE(TC_RETURN)
35329 NODE_NAME_CASE(FNSTCW16m)
35330 NODE_NAME_CASE(FLDCW16m)
35331 NODE_NAME_CASE(LCMPXCHG_DAG)
35332 NODE_NAME_CASE(LCMPXCHG8_DAG)
35333 NODE_NAME_CASE(LCMPXCHG16_DAG)
35334 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35335 NODE_NAME_CASE(LADD)
35336 NODE_NAME_CASE(LSUB)
35337 NODE_NAME_CASE(LOR)
35338 NODE_NAME_CASE(LXOR)
35339 NODE_NAME_CASE(LAND)
35340 NODE_NAME_CASE(LBTS)
35341 NODE_NAME_CASE(LBTC)
35342 NODE_NAME_CASE(LBTR)
35343 NODE_NAME_CASE(LBTS_RM)
35344 NODE_NAME_CASE(LBTC_RM)
35345 NODE_NAME_CASE(LBTR_RM)
35346 NODE_NAME_CASE(AADD)
35347 NODE_NAME_CASE(AOR)
35348 NODE_NAME_CASE(AXOR)
35349 NODE_NAME_CASE(AAND)
35350 NODE_NAME_CASE(VZEXT_MOVL)
35351 NODE_NAME_CASE(VZEXT_LOAD)
35352 NODE_NAME_CASE(VEXTRACT_STORE)
35353 NODE_NAME_CASE(VTRUNC)
35354 NODE_NAME_CASE(VTRUNCS)
35355 NODE_NAME_CASE(VTRUNCUS)
35356 NODE_NAME_CASE(VMTRUNC)
35357 NODE_NAME_CASE(VMTRUNCS)
35358 NODE_NAME_CASE(VMTRUNCUS)
35359 NODE_NAME_CASE(VTRUNCSTORES)
35360 NODE_NAME_CASE(VTRUNCSTOREUS)
35361 NODE_NAME_CASE(VMTRUNCSTORES)
35362 NODE_NAME_CASE(VMTRUNCSTOREUS)
35363 NODE_NAME_CASE(VFPEXT)
35364 NODE_NAME_CASE(STRICT_VFPEXT)
35365 NODE_NAME_CASE(VFPEXT_SAE)
35366 NODE_NAME_CASE(VFPEXTS)
35367 NODE_NAME_CASE(VFPEXTS_SAE)
35368 NODE_NAME_CASE(VFPROUND)
35369 NODE_NAME_CASE(STRICT_VFPROUND)
35370 NODE_NAME_CASE(VMFPROUND)
35371 NODE_NAME_CASE(VFPROUND_RND)
35372 NODE_NAME_CASE(VFPROUNDS)
35373 NODE_NAME_CASE(VFPROUNDS_RND)
35374 NODE_NAME_CASE(VSHLDQ)
35375 NODE_NAME_CASE(VSRLDQ)
35376 NODE_NAME_CASE(VSHL)
35377 NODE_NAME_CASE(VSRL)
35378 NODE_NAME_CASE(VSRA)
35379 NODE_NAME_CASE(VSHLI)
35380 NODE_NAME_CASE(VSRLI)
35381 NODE_NAME_CASE(VSRAI)
35382 NODE_NAME_CASE(VSHLV)
35383 NODE_NAME_CASE(VSRLV)
35384 NODE_NAME_CASE(VSRAV)
35385 NODE_NAME_CASE(VROTLI)
35386 NODE_NAME_CASE(VROTRI)
35387 NODE_NAME_CASE(VPPERM)
35388 NODE_NAME_CASE(CMPP)
35389 NODE_NAME_CASE(STRICT_CMPP)
35390 NODE_NAME_CASE(PCMPEQ)
35391 NODE_NAME_CASE(PCMPGT)
35392 NODE_NAME_CASE(PHMINPOS)
35393 NODE_NAME_CASE(ADD)
35394 NODE_NAME_CASE(SUB)
35395 NODE_NAME_CASE(ADC)
35396 NODE_NAME_CASE(SBB)
35397 NODE_NAME_CASE(SMUL)
35398 NODE_NAME_CASE(UMUL)
35399 NODE_NAME_CASE(OR)
35400 NODE_NAME_CASE(XOR)
35401 NODE_NAME_CASE(AND)
35402 NODE_NAME_CASE(BEXTR)
35403 NODE_NAME_CASE(BEXTRI)
35404 NODE_NAME_CASE(BZHI)
35405 NODE_NAME_CASE(PDEP)
35406 NODE_NAME_CASE(PEXT)
35407 NODE_NAME_CASE(MUL_IMM)
35408 NODE_NAME_CASE(MOVMSK)
35409 NODE_NAME_CASE(PTEST)
35410 NODE_NAME_CASE(TESTP)
35411 NODE_NAME_CASE(KORTEST)
35412 NODE_NAME_CASE(KTEST)
35413 NODE_NAME_CASE(KADD)
35414 NODE_NAME_CASE(KSHIFTL)
35415 NODE_NAME_CASE(KSHIFTR)
35416 NODE_NAME_CASE(PACKSS)
35417 NODE_NAME_CASE(PACKUS)
35418 NODE_NAME_CASE(PALIGNR)
35419 NODE_NAME_CASE(VALIGN)
35420 NODE_NAME_CASE(VSHLD)
35421 NODE_NAME_CASE(VSHRD)
35422 NODE_NAME_CASE(VSHLDV)
35423 NODE_NAME_CASE(VSHRDV)
35424 NODE_NAME_CASE(PSHUFD)
35425 NODE_NAME_CASE(PSHUFHW)
35426 NODE_NAME_CASE(PSHUFLW)
35427 NODE_NAME_CASE(SHUFP)
35428 NODE_NAME_CASE(SHUF128)
35429 NODE_NAME_CASE(MOVLHPS)
35430 NODE_NAME_CASE(MOVHLPS)
35431 NODE_NAME_CASE(MOVDDUP)
35432 NODE_NAME_CASE(MOVSHDUP)
35433 NODE_NAME_CASE(MOVSLDUP)
35434 NODE_NAME_CASE(MOVSD)
35435 NODE_NAME_CASE(MOVSS)
35436 NODE_NAME_CASE(MOVSH)
35437 NODE_NAME_CASE(UNPCKL)
35438 NODE_NAME_CASE(UNPCKH)
35439 NODE_NAME_CASE(VBROADCAST)
35440 NODE_NAME_CASE(VBROADCAST_LOAD)
35441 NODE_NAME_CASE(VBROADCASTM)
35442 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35443 NODE_NAME_CASE(VPERMILPV)
35444 NODE_NAME_CASE(VPERMILPI)
35445 NODE_NAME_CASE(VPERM2X128)
35446 NODE_NAME_CASE(VPERMV)
35447 NODE_NAME_CASE(VPERMV3)
35448 NODE_NAME_CASE(VPERMI)
35449 NODE_NAME_CASE(VPTERNLOG)
35450 NODE_NAME_CASE(VFIXUPIMM)
35451 NODE_NAME_CASE(VFIXUPIMM_SAE)
35452 NODE_NAME_CASE(VFIXUPIMMS)
35453 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35454 NODE_NAME_CASE(VRANGE)
35455 NODE_NAME_CASE(VRANGE_SAE)
35456 NODE_NAME_CASE(VRANGES)
35457 NODE_NAME_CASE(VRANGES_SAE)
35458 NODE_NAME_CASE(PMULUDQ)
35459 NODE_NAME_CASE(PMULDQ)
35460 NODE_NAME_CASE(PSADBW)
35461 NODE_NAME_CASE(DBPSADBW)
35462 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35463 NODE_NAME_CASE(VAARG_64)
35464 NODE_NAME_CASE(VAARG_X32)
35465 NODE_NAME_CASE(DYN_ALLOCA)
35466 NODE_NAME_CASE(MFENCE)
35467 NODE_NAME_CASE(SEG_ALLOCA)
35468 NODE_NAME_CASE(PROBED_ALLOCA)
35469 NODE_NAME_CASE(RDRAND)
35470 NODE_NAME_CASE(RDSEED)
35471 NODE_NAME_CASE(RDPKRU)
35472 NODE_NAME_CASE(WRPKRU)
35473 NODE_NAME_CASE(VPMADDUBSW)
35474 NODE_NAME_CASE(VPMADDWD)
35475 NODE_NAME_CASE(VPSHA)
35476 NODE_NAME_CASE(VPSHL)
35477 NODE_NAME_CASE(VPCOM)
35478 NODE_NAME_CASE(VPCOMU)
35479 NODE_NAME_CASE(VPERMIL2)
35480 NODE_NAME_CASE(FMSUB)
35481 NODE_NAME_CASE(STRICT_FMSUB)
35482 NODE_NAME_CASE(FNMADD)
35483 NODE_NAME_CASE(STRICT_FNMADD)
35484 NODE_NAME_CASE(FNMSUB)
35485 NODE_NAME_CASE(STRICT_FNMSUB)
35486 NODE_NAME_CASE(FMADDSUB)
35487 NODE_NAME_CASE(FMSUBADD)
35488 NODE_NAME_CASE(FMADD_RND)
35489 NODE_NAME_CASE(FNMADD_RND)
35490 NODE_NAME_CASE(FMSUB_RND)
35491 NODE_NAME_CASE(FNMSUB_RND)
35492 NODE_NAME_CASE(FMADDSUB_RND)
35493 NODE_NAME_CASE(FMSUBADD_RND)
35494 NODE_NAME_CASE(VFMADDC)
35495 NODE_NAME_CASE(VFMADDC_RND)
35496 NODE_NAME_CASE(VFCMADDC)
35497 NODE_NAME_CASE(VFCMADDC_RND)
35498 NODE_NAME_CASE(VFMULC)
35499 NODE_NAME_CASE(VFMULC_RND)
35500 NODE_NAME_CASE(VFCMULC)
35501 NODE_NAME_CASE(VFCMULC_RND)
35502 NODE_NAME_CASE(VFMULCSH)
35503 NODE_NAME_CASE(VFMULCSH_RND)
35504 NODE_NAME_CASE(VFCMULCSH)
35505 NODE_NAME_CASE(VFCMULCSH_RND)
35506 NODE_NAME_CASE(VFMADDCSH)
35507 NODE_NAME_CASE(VFMADDCSH_RND)
35508 NODE_NAME_CASE(VFCMADDCSH)
35509 NODE_NAME_CASE(VFCMADDCSH_RND)
35510 NODE_NAME_CASE(VPMADD52H)
35511 NODE_NAME_CASE(VPMADD52L)
35512 NODE_NAME_CASE(VRNDSCALE)
35513 NODE_NAME_CASE(STRICT_VRNDSCALE)
35514 NODE_NAME_CASE(VRNDSCALE_SAE)
35515 NODE_NAME_CASE(VRNDSCALES)
35516 NODE_NAME_CASE(VRNDSCALES_SAE)
35517 NODE_NAME_CASE(VREDUCE)
35518 NODE_NAME_CASE(VREDUCE_SAE)
35519 NODE_NAME_CASE(VREDUCES)
35520 NODE_NAME_CASE(VREDUCES_SAE)
35521 NODE_NAME_CASE(VGETMANT)
35522 NODE_NAME_CASE(VGETMANT_SAE)
35523 NODE_NAME_CASE(VGETMANTS)
35524 NODE_NAME_CASE(VGETMANTS_SAE)
35525 NODE_NAME_CASE(PCMPESTR)
35526 NODE_NAME_CASE(PCMPISTR)
35527 NODE_NAME_CASE(XTEST)
35528 NODE_NAME_CASE(COMPRESS)
35529 NODE_NAME_CASE(EXPAND)
35530 NODE_NAME_CASE(SELECTS)
35531 NODE_NAME_CASE(ADDSUB)
35532 NODE_NAME_CASE(RCP14)
35533 NODE_NAME_CASE(RCP14S)
35534 NODE_NAME_CASE(RCP28)
35535 NODE_NAME_CASE(RCP28_SAE)
35536 NODE_NAME_CASE(RCP28S)
35537 NODE_NAME_CASE(RCP28S_SAE)
35538 NODE_NAME_CASE(EXP2)
35539 NODE_NAME_CASE(EXP2_SAE)
35540 NODE_NAME_CASE(RSQRT14)
35541 NODE_NAME_CASE(RSQRT14S)
35542 NODE_NAME_CASE(RSQRT28)
35543 NODE_NAME_CASE(RSQRT28_SAE)
35544 NODE_NAME_CASE(RSQRT28S)
35545 NODE_NAME_CASE(RSQRT28S_SAE)
35546 NODE_NAME_CASE(FADD_RND)
35547 NODE_NAME_CASE(FADDS)
35548 NODE_NAME_CASE(FADDS_RND)
35549 NODE_NAME_CASE(FSUB_RND)
35550 NODE_NAME_CASE(FSUBS)
35551 NODE_NAME_CASE(FSUBS_RND)
35552 NODE_NAME_CASE(FMUL_RND)
35553 NODE_NAME_CASE(FMULS)
35554 NODE_NAME_CASE(FMULS_RND)
35555 NODE_NAME_CASE(FDIV_RND)
35556 NODE_NAME_CASE(FDIVS)
35557 NODE_NAME_CASE(FDIVS_RND)
35558 NODE_NAME_CASE(FSQRT_RND)
35559 NODE_NAME_CASE(FSQRTS)
35560 NODE_NAME_CASE(FSQRTS_RND)
35561 NODE_NAME_CASE(FGETEXP)
35562 NODE_NAME_CASE(FGETEXP_SAE)
35563 NODE_NAME_CASE(FGETEXPS)
35564 NODE_NAME_CASE(FGETEXPS_SAE)
35565 NODE_NAME_CASE(SCALEF)
35566 NODE_NAME_CASE(SCALEF_RND)
35567 NODE_NAME_CASE(SCALEFS)
35568 NODE_NAME_CASE(SCALEFS_RND)
35569 NODE_NAME_CASE(MULHRS)
35570 NODE_NAME_CASE(SINT_TO_FP_RND)
35571 NODE_NAME_CASE(UINT_TO_FP_RND)
35572 NODE_NAME_CASE(CVTTP2SI)
35573 NODE_NAME_CASE(CVTTP2UI)
35574 NODE_NAME_CASE(STRICT_CVTTP2SI)
35575 NODE_NAME_CASE(STRICT_CVTTP2UI)
35576 NODE_NAME_CASE(MCVTTP2SI)
35577 NODE_NAME_CASE(MCVTTP2UI)
35578 NODE_NAME_CASE(CVTTP2SI_SAE)
35579 NODE_NAME_CASE(CVTTP2UI_SAE)
35580 NODE_NAME_CASE(CVTTS2SI)
35581 NODE_NAME_CASE(CVTTS2UI)
35582 NODE_NAME_CASE(CVTTS2SI_SAE)
35583 NODE_NAME_CASE(CVTTS2UI_SAE)
35584 NODE_NAME_CASE(CVTSI2P)
35585 NODE_NAME_CASE(CVTUI2P)
35586 NODE_NAME_CASE(STRICT_CVTSI2P)
35587 NODE_NAME_CASE(STRICT_CVTUI2P)
35588 NODE_NAME_CASE(MCVTSI2P)
35589 NODE_NAME_CASE(MCVTUI2P)
35590 NODE_NAME_CASE(VFPCLASS)
35591 NODE_NAME_CASE(VFPCLASSS)
35592 NODE_NAME_CASE(MULTISHIFT)
35593 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35594 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35595 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35596 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35597 NODE_NAME_CASE(CVTPS2PH)
35598 NODE_NAME_CASE(STRICT_CVTPS2PH)
35599 NODE_NAME_CASE(CVTPS2PH_SAE)
35600 NODE_NAME_CASE(MCVTPS2PH)
35601 NODE_NAME_CASE(MCVTPS2PH_SAE)
35602 NODE_NAME_CASE(CVTPH2PS)
35603 NODE_NAME_CASE(STRICT_CVTPH2PS)
35604 NODE_NAME_CASE(CVTPH2PS_SAE)
35605 NODE_NAME_CASE(CVTP2SI)
35606 NODE_NAME_CASE(CVTP2UI)
35607 NODE_NAME_CASE(MCVTP2SI)
35608 NODE_NAME_CASE(MCVTP2UI)
35609 NODE_NAME_CASE(CVTP2SI_RND)
35610 NODE_NAME_CASE(CVTP2UI_RND)
35611 NODE_NAME_CASE(CVTS2SI)
35612 NODE_NAME_CASE(CVTS2UI)
35613 NODE_NAME_CASE(CVTS2SI_RND)
35614 NODE_NAME_CASE(CVTS2UI_RND)
35615 NODE_NAME_CASE(CVTNE2PS2BF16)
35616 NODE_NAME_CASE(CVTNEPS2BF16)
35617 NODE_NAME_CASE(MCVTNEPS2BF16)
35618 NODE_NAME_CASE(DPBF16PS)
35619 NODE_NAME_CASE(LWPINS)
35620 NODE_NAME_CASE(MGATHER)
35621 NODE_NAME_CASE(MSCATTER)
35622 NODE_NAME_CASE(VPDPBUSD)
35623 NODE_NAME_CASE(VPDPBUSDS)
35624 NODE_NAME_CASE(VPDPWSSD)
35625 NODE_NAME_CASE(VPDPWSSDS)
35626 NODE_NAME_CASE(VPSHUFBITQMB)
35627 NODE_NAME_CASE(GF2P8MULB)
35628 NODE_NAME_CASE(GF2P8AFFINEQB)
35629 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35630 NODE_NAME_CASE(NT_CALL)
35631 NODE_NAME_CASE(NT_BRIND)
35632 NODE_NAME_CASE(UMWAIT)
35633 NODE_NAME_CASE(TPAUSE)
35634 NODE_NAME_CASE(ENQCMD)
35635 NODE_NAME_CASE(ENQCMDS)
35636 NODE_NAME_CASE(VP2INTERSECT)
35637 NODE_NAME_CASE(VPDPBSUD)
35638 NODE_NAME_CASE(VPDPBSUDS)
35639 NODE_NAME_CASE(VPDPBUUD)
35640 NODE_NAME_CASE(VPDPBUUDS)
35641 NODE_NAME_CASE(VPDPBSSD)
35642 NODE_NAME_CASE(VPDPBSSDS)
35643 NODE_NAME_CASE(AESENC128KL)
35644 NODE_NAME_CASE(AESDEC128KL)
35645 NODE_NAME_CASE(AESENC256KL)
35646 NODE_NAME_CASE(AESDEC256KL)
35647 NODE_NAME_CASE(AESENCWIDE128KL)
35648 NODE_NAME_CASE(AESDECWIDE128KL)
35649 NODE_NAME_CASE(AESENCWIDE256KL)
35650 NODE_NAME_CASE(AESDECWIDE256KL)
35651 NODE_NAME_CASE(CMPCCXADD)
35652 NODE_NAME_CASE(TESTUI)
35653 NODE_NAME_CASE(FP80_ADD)
35654 NODE_NAME_CASE(STRICT_FP80_ADD)
35655 }
35656 return nullptr;
35657#undef NODE_NAME_CASE
35658}
35659
35660/// Return true if the addressing mode represented by AM is legal for this
35661/// target, for a load/store of the specified type.
35662bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35663 const AddrMode &AM, Type *Ty,
35664 unsigned AS,
35665 Instruction *I) const {
35666 // X86 supports extremely general addressing modes.
35667 CodeModel::Model M = getTargetMachine().getCodeModel();
35668
35669 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35670 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35671 return false;
35672
35673 if (AM.BaseGV) {
35674 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35675
35676 // If a reference to this global requires an extra load, we can't fold it.
35677 if (isGlobalStubReference(GVFlags))
35678 return false;
35679
35680 // If BaseGV requires a register for the PIC base, we cannot also have a
35681 // BaseReg specified.
35682 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35683 return false;
35684
35685 // If lower 4G is not available, then we must use rip-relative addressing.
35686 if ((M != CodeModel::Small || isPositionIndependent()) &&
35687 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35688 return false;
35689 }
35690
35691 switch (AM.Scale) {
35692 case 0:
35693 case 1:
35694 case 2:
35695 case 4:
35696 case 8:
35697 // These scales always work.
35698 break;
35699 case 3:
35700 case 5:
35701 case 9:
35702 // These scales are formed with basereg+scalereg. Only accept if there is
35703 // no basereg yet.
35704 if (AM.HasBaseReg)
35705 return false;
35706 break;
35707 default: // Other stuff never works.
35708 return false;
35709 }
35710
35711 return true;
35712}
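The scale switch above follows what an x86 SIB byte can encode (hedged summary): hardware scales are 1, 2, 4 and 8, while 3, 5 and 9 are only reachable by reusing the scaled register as the base, e.g. x*9 == x + x*8, which is why those scales are rejected once a base register is already present. Illustrative one-liner, not from this file:

  constexpr unsigned long long MulBy9(unsigned long long X) { return X + X * 8; } // LEA-style base + index*8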
35713
35714bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35715 unsigned Bits = Ty->getScalarSizeInBits();
35716
35717 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35718 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35719 if (Subtarget.hasXOP() &&
35720 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35721 return false;
35722
35723 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35724 // shifts just as cheap as scalar ones.
35725 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35726 return false;
35727
35728 // AVX512BW has shifts such as vpsllvw.
35729 if (Subtarget.hasBWI() && Bits == 16)
35730 return false;
35731
35732 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35733 // fully general vector.
35734 return true;
35735}
35736
35737bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35738 switch (Opcode) {
35739 // These are non-commutative binops.
35740 // TODO: Add more X86ISD opcodes once we have test coverage.
35741 case X86ISD::ANDNP:
35742 case X86ISD::PCMPGT:
35743 case X86ISD::FMAX:
35744 case X86ISD::FMIN:
35745 case X86ISD::FANDN:
35746 case X86ISD::VPSHA:
35747 case X86ISD::VPSHL:
35748 case X86ISD::VSHLV:
35749 case X86ISD::VSRLV:
35750 case X86ISD::VSRAV:
35751 return true;
35752 }
35753
35754 return TargetLoweringBase::isBinOp(Opcode);
35755}
35756
35757bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35758 switch (Opcode) {
35759 // TODO: Add more X86ISD opcodes once we have test coverage.
35760 case X86ISD::PCMPEQ:
35761 case X86ISD::PMULDQ:
35762 case X86ISD::PMULUDQ:
35763 case X86ISD::FMAXC:
35764 case X86ISD::FMINC:
35765 case X86ISD::FAND:
35766 case X86ISD::FOR:
35767 case X86ISD::FXOR:
35768 return true;
35769 }
35770
35771 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35772}
35773
35774bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35775 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35776 return false;
35777 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35778 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35779 return NumBits1 > NumBits2;
35780}
35781
35782bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35783 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35784 return false;
35785
35786 if (!isTypeLegal(EVT::getEVT(Ty1)))
35787 return false;
35788
35789 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35790
35791 // Assuming the caller doesn't have a zeroext or signext return parameter,
35792 // truncation all the way down to i1 is valid.
35793 return true;
35794}
35795
35796bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35797 return isInt<32>(Imm);
35798}
35799
35800bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35801 // Can also use sub to handle negated immediates.
35802 return isInt<32>(Imm);
35803}
35804
35805bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35806 return isInt<32>(Imm);
35807}
35808
35809bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35810 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35811 return false;
35812 unsigned NumBits1 = VT1.getSizeInBits();
35813 unsigned NumBits2 = VT2.getSizeInBits();
35814 return NumBits1 > NumBits2;
35815}
35816
35817bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35818 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35819 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35820}
35821
35822bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35823 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35824 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35825}
35826
35827bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35828 EVT VT1 = Val.getValueType();
35829 if (isZExtFree(VT1, VT2))
35830 return true;
35831
35832 if (Val.getOpcode() != ISD::LOAD)
35833 return false;
35834
35835 if (!VT1.isSimple() || !VT1.isInteger() ||
35836 !VT2.isSimple() || !VT2.isInteger())
35837 return false;
35838
35839 switch (VT1.getSimpleVT().SimpleTy) {
35840 default: break;
35841 case MVT::i8:
35842 case MVT::i16:
35843 case MVT::i32:
35844 // X86 has 8, 16, and 32-bit zero-extending loads.
35845 return true;
35846 }
35847
35848 return false;
35849}
35850
35851bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35852 SmallVectorImpl<Use *> &Ops) const {
35853 using namespace llvm::PatternMatch;
35854
35855 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35856 if (!VTy)
35857 return false;
35858
35859 if (I->getOpcode() == Instruction::Mul &&
35860 VTy->getElementType()->isIntegerTy(64)) {
35861 for (auto &Op : I->operands()) {
35862 // Make sure we are not already sinking this operand
35863 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35864 continue;
35865
35866 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35867 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35868 if (Subtarget.hasSSE41() &&
35869 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35870 m_SpecificInt(32)))) {
35871 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35872 Ops.push_back(&Op);
35873 } else if (Subtarget.hasSSE2() &&
35874 match(Op.get(),
35875 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35876 Ops.push_back(&Op);
35877 }
35878 }
35879
35880 return !Ops.empty();
35881 }
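Per 64-bit lane, the two match() patterns above correspond to the scalar identities sketched below (illustrative names, not from this file); recognizing them is what lets the 64-bit multiply select PMULDQ or PMULUDQ:

  #include <cstdint>

  // m_AShr(m_Shl(x, 32), 32): keep the low 32 bits, sign-extended.
  int64_t SextInReg32(int64_t X) { return (int64_t)(int32_t)(uint32_t)X; }
  // m_And(x, 0xffffffff): keep the low 32 bits, zero-extended.
  uint64_t ZextInReg32(uint64_t X) { return X & 0xffffffffULL; }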
35882
35883 // A uniform shift amount in a vector shift or funnel shift may be much
35884 // cheaper than a generic variable vector shift, so make that pattern visible
35885 // to SDAG by sinking the shuffle instruction next to the shift.
35886 int ShiftAmountOpNum = -1;
35887 if (I->isShift())
35888 ShiftAmountOpNum = 1;
35889 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35890 if (II->getIntrinsicID() == Intrinsic::fshl ||
35891 II->getIntrinsicID() == Intrinsic::fshr)
35892 ShiftAmountOpNum = 2;
35893 }
35894
35895 if (ShiftAmountOpNum == -1)
35896 return false;
35897
35898 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35899 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35900 isVectorShiftByScalarCheap(I->getType())) {
35901 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35902 return true;
35903 }
35904
35905 return false;
35906}
35907
35908bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35909 if (!Subtarget.is64Bit())
35910 return false;
35911 return TargetLowering::shouldConvertPhiType(From, To);
35912}
35913
35914bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35915 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35916 return false;
35917
35918 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35919
35920 // There is no extending load for vXi1.
35921 if (SrcVT.getScalarType() == MVT::i1)
35922 return false;
35923
35924 return true;
35925}
35926
35927bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35928 EVT VT) const {
35929 if (!Subtarget.hasAnyFMA())
35930 return false;
35931
35932 VT = VT.getScalarType();
35933
35934 if (!VT.isSimple())
35935 return false;
35936
35937 switch (VT.getSimpleVT().SimpleTy) {
35938 case MVT::f16:
35939 return Subtarget.hasFP16();
35940 case MVT::f32:
35941 case MVT::f64:
35942 return true;
35943 default:
35944 break;
35945 }
35946
35947 return false;
35948}
35949
35950bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35951 // i16 instructions are longer (0x66 prefix) and potentially slower.
35952 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35953}
35954
35955bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35956 EVT VT) const {
35957 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35958 // benefit. The transform may also be profitable for scalar code.
35959 if (!Subtarget.hasAVX512())
35960 return false;
35961 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35962 return false;
35963 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35964 return false;
35965
35966 return true;
35967}
35968
35969/// Targets can use this to indicate that they only support *some*
35970/// VECTOR_SHUFFLE operations, those with specific masks.
35971/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35972/// are assumed to be legal.
35973bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35974 if (!VT.isSimple())
35975 return false;
35976
35977 // Not for i1 vectors
35978 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35979 return false;
35980
35981 // Very little shuffling can be done for 64-bit vectors right now.
35982 if (VT.getSimpleVT().getSizeInBits() == 64)
35983 return false;
35984
35985 // We only care that the types being shuffled are legal. The lowering can
35986 // handle any possible shuffle mask that results.
35987 return isTypeLegal(VT.getSimpleVT());
35988}
35989
35990bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35991 EVT VT) const {
35992 // Don't convert an 'and' into a shuffle that we don't directly support.
35993 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35994 if (!Subtarget.hasAVX2())
35995 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35996 return false;
35997
35998 // Just delegate to the generic legality, clear masks aren't special.
35999 return isShuffleMaskLegal(Mask, VT);
36000}
36001
36002bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
36003 // If the subtarget is using thunks, we need to not generate jump tables.
36004 if (Subtarget.useIndirectThunkBranches())
36005 return false;
36006
36007 // Otherwise, fallback on the generic logic.
36008 return TargetLowering::areJTsAllowed(Fn);
36009}
36010
36011MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
36012 EVT ConditionVT) const {
36013 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
36014 // zero-extensions.
36015 if (ConditionVT.getSizeInBits() < 32)
36016 return MVT::i32;
36017 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
36018 ConditionVT);
36019}
36020
36021//===----------------------------------------------------------------------===//
36022// X86 Scheduler Hooks
36023//===----------------------------------------------------------------------===//
36024
36025// Returns true if EFLAGS is consumed after this iterator in the rest of the
36026// basic block or any successors of the basic block.
36027static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
36028 MachineBasicBlock *BB) {
36029 // Scan forward through BB for a use/def of EFLAGS.
36030 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
36031 if (mi.readsRegister(X86::EFLAGS))
36032 return true;
36033 // If we found a def, we can stop searching.
36034 if (mi.definesRegister(X86::EFLAGS))
36035 return false;
36036 }
36037
36038 // If we hit the end of the block, check whether EFLAGS is live into a
36039 // successor.
36040 for (MachineBasicBlock *Succ : BB->successors())
36041 if (Succ->isLiveIn(X86::EFLAGS))
36042 return true;
36043
36044 return false;
36045}
36046
36047/// Utility function to emit xbegin specifying the start of an RTM region.
36048static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
36049 const TargetInstrInfo *TII) {
36050 const DebugLoc &DL = MI.getDebugLoc();
36051
36052 const BasicBlock *BB = MBB->getBasicBlock();
36053 MachineFunction::iterator I = ++MBB->getIterator();
36054
36055 // For the v = xbegin(), we generate
36056 //
36057 // thisMBB:
36058 // xbegin fallMBB
36059 //
36060 // mainMBB:
36061 // s0 = -1
36062 //
36063 // fallBB:
36064 // eax = # XABORT_DEF
36065 // s1 = eax
36066 //
36067 // sinkMBB:
36068 // v = phi(s0/mainBB, s1/fallBB)
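 //
 // As a rough C-level sketch (using the <immintrin.h> RTM intrinsics, which
 // are not part of this file), the generated code behaves like:
 //
 //   unsigned v = _xbegin();       // ~0u (_XBEGIN_STARTED) if the transaction
 //   if (v == _XBEGIN_STARTED) {   // started, otherwise the abort status
 //     ...                         // transactional region
 //   }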
36069
36070 MachineBasicBlock *thisMBB = MBB;
36071 MachineFunction *MF = MBB->getParent();
36072 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36073 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36074 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36075 MF->insert(I, mainMBB);
36076 MF->insert(I, fallMBB);
36077 MF->insert(I, sinkMBB);
36078
36079 if (isEFLAGSLiveAfter(MI, MBB)) {
36080 mainMBB->addLiveIn(X86::EFLAGS);
36081 fallMBB->addLiveIn(X86::EFLAGS);
36082 sinkMBB->addLiveIn(X86::EFLAGS);
36083 }
36084
36085 // Transfer the remainder of BB and its successor edges to sinkMBB.
36086 sinkMBB->splice(sinkMBB->begin(), MBB,
36087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36089
36090 MachineRegisterInfo &MRI = MF->getRegInfo();
36091 Register DstReg = MI.getOperand(0).getReg();
36092 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36093 Register mainDstReg = MRI.createVirtualRegister(RC);
36094 Register fallDstReg = MRI.createVirtualRegister(RC);
36095
36096 // thisMBB:
36097 // xbegin fallMBB
36098 // # fallthrough to mainMBB
36099 // # on abort, branch to fallMBB
36100 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
36101 thisMBB->addSuccessor(mainMBB);
36102 thisMBB->addSuccessor(fallMBB);
36103
36104 // mainMBB:
36105 // mainDstReg := -1
36106 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
36107 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36108 mainMBB->addSuccessor(sinkMBB);
36109
36110 // fallMBB:
36111 // ; pseudo instruction to model hardware's definition from XABORT
36112 // EAX := XABORT_DEF
36113 // fallDstReg := EAX
36114 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
36115 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
36116 .addReg(X86::EAX);
36117 fallMBB->addSuccessor(sinkMBB);
36118
36119 // sinkMBB:
36120 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
36121 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
36122 .addReg(mainDstReg).addMBB(mainMBB)
36123 .addReg(fallDstReg).addMBB(fallMBB);
36124
36125 MI.eraseFromParent();
36126 return sinkMBB;
36127}
36128
36129MachineBasicBlock *
36130X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
36131 MachineBasicBlock *MBB) const {
36132 // Emit va_arg instruction on X86-64.
36133
36134 // Operands to this pseudo-instruction:
36135 // 0 ) Output : destination address (reg)
36136 // 1-5) Input : va_list address (addr, i64mem)
36137 // 6 ) ArgSize : Size (in bytes) of vararg type
36138 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
36139 // 8 ) Align : Alignment of type
36140 // 9 ) EFLAGS (implicit-def)
36141
36142 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
36143 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
36144
36145 Register DestReg = MI.getOperand(0).getReg();
36146 MachineOperand &Base = MI.getOperand(1);
36147 MachineOperand &Scale = MI.getOperand(2);
36148 MachineOperand &Index = MI.getOperand(3);
36149 MachineOperand &Disp = MI.getOperand(4);
36150 MachineOperand &Segment = MI.getOperand(5);
36151 unsigned ArgSize = MI.getOperand(6).getImm();
36152 unsigned ArgMode = MI.getOperand(7).getImm();
36153 Align Alignment = Align(MI.getOperand(8).getImm());
36154
36155 MachineFunction *MF = MBB->getParent();
36156
36157 // Memory Reference
36158 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
36159
36160 MachineMemOperand *OldMMO = MI.memoperands().front();
36161
36162 // Clone the MMO into two separate MMOs for loading and storing
36163 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
36164 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
36165 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
36166 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
36167
36168 // Machine Information
36169 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36170 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
36171 const TargetRegisterClass *AddrRegClass =
36172 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
36173 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
36174 const DebugLoc &DL = MI.getDebugLoc();
36175
36176 // struct va_list {
36177 // i32 gp_offset
36178 // i32 fp_offset
36179 // i64 overflow_area (address)
36180 // i64 reg_save_area (address)
36181 // }
36182 // sizeof(va_list) = 24
36183 // alignment(va_list) = 8
36184
36185 unsigned TotalNumIntRegs = 6;
36186 unsigned TotalNumXMMRegs = 8;
36187 bool UseGPOffset = (ArgMode == 1);
36188 bool UseFPOffset = (ArgMode == 2);
36189 unsigned MaxOffset = TotalNumIntRegs * 8 +
36190 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
36191
36192 /* Align ArgSize to a multiple of 8 */
36193 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
36194 bool NeedsAlign = (Alignment > 8);
36195
36196 MachineBasicBlock *thisMBB = MBB;
36197 MachineBasicBlock *overflowMBB;
36198 MachineBasicBlock *offsetMBB;
36199 MachineBasicBlock *endMBB;
36200
36201 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
36202 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
36203 unsigned OffsetReg = 0;
36204
36205 if (!UseGPOffset && !UseFPOffset) {
36206 // If we only pull from the overflow region, we don't create a branch.
36207 // We don't need to alter control flow.
36208 OffsetDestReg = 0; // unused
36209 OverflowDestReg = DestReg;
36210
36211 offsetMBB = nullptr;
36212 overflowMBB = thisMBB;
36213 endMBB = thisMBB;
36214 } else {
36215 // First emit code to check if gp_offset (or fp_offset) is below the bound.
36216 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36217 // If not, pull from overflow_area. (branch to overflowMBB)
36218 //
36219 // thisMBB
36220 // | .
36221 // | .
36222 // offsetMBB overflowMBB
36223 // | .
36224 // | .
36225 // endMBB
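 //
 // As a C-level sketch of the logic emitted below (ap, addr and align_up are
 // hypothetical names; the real code works on machine operands directly):
 //
 //   unsigned off = UseFPOffset ? ap->fp_offset : ap->gp_offset;
 //   if (off < MaxOffset + 8 - ArgSizeA8) {                 // offsetMBB
 //     addr = ap->reg_save_area + off;
 //     off += UseFPOffset ? 16 : 8;
 //     if (UseFPOffset) ap->fp_offset = off; else ap->gp_offset = off;
 //   } else {                                               // overflowMBB
 //     addr = align_up(ap->overflow_arg_area, Alignment);   // only if NeedsAlign
 //     ap->overflow_arg_area = addr + ArgSizeA8;
 //   }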
36226
36227 // Registers for the PHI in endMBB
36228 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36229 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36230
36231 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36232 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36233 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36234 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36235
36236 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36237
36238 // Insert the new basic blocks
36239 MF->insert(MBBIter, offsetMBB);
36240 MF->insert(MBBIter, overflowMBB);
36241 MF->insert(MBBIter, endMBB);
36242
36243 // Transfer the remainder of MBB and its successor edges to endMBB.
36244 endMBB->splice(endMBB->begin(), thisMBB,
36245 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36246 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36247
36248 // Make offsetMBB and overflowMBB successors of thisMBB
36249 thisMBB->addSuccessor(offsetMBB);
36250 thisMBB->addSuccessor(overflowMBB);
36251
36252 // endMBB is a successor of both offsetMBB and overflowMBB
36253 offsetMBB->addSuccessor(endMBB);
36254 overflowMBB->addSuccessor(endMBB);
36255
36256 // Load the offset value into a register
36257 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36258 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36259 .add(Base)
36260 .add(Scale)
36261 .add(Index)
36262 .addDisp(Disp, UseFPOffset ? 4 : 0)
36263 .add(Segment)
36264 .setMemRefs(LoadOnlyMMO);
36265
36266 // Check if there is enough room left to pull this argument.
36267 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36268 .addReg(OffsetReg)
36269 .addImm(MaxOffset + 8 - ArgSizeA8);
36270
36271 // Branch to "overflowMBB" if offset >= max
36272 // Fall through to "offsetMBB" otherwise
36273 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36274 .addMBB(overflowMBB).addImm(X86::COND_AE);
36275 }
36276
36277 // In offsetMBB, emit code to use the reg_save_area.
36278 if (offsetMBB) {
36279 assert(OffsetReg != 0);
36280
36281 // Read the reg_save_area address.
36282 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36283 BuildMI(
36284 offsetMBB, DL,
36285 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36286 RegSaveReg)
36287 .add(Base)
36288 .add(Scale)
36289 .add(Index)
36290 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36291 .add(Segment)
36292 .setMemRefs(LoadOnlyMMO);
36293
36294 if (Subtarget.isTarget64BitLP64()) {
36295 // Zero-extend the offset
36296 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36297 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36298 .addImm(0)
36299 .addReg(OffsetReg)
36300 .addImm(X86::sub_32bit);
36301
36302 // Add the offset to the reg_save_area to get the final address.
36303 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36304 .addReg(OffsetReg64)
36305 .addReg(RegSaveReg);
36306 } else {
36307 // Add the offset to the reg_save_area to get the final address.
36308 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36309 .addReg(OffsetReg)
36310 .addReg(RegSaveReg);
36311 }
36312
36313 // Compute the offset for the next argument
36314 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36315 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36316 .addReg(OffsetReg)
36317 .addImm(UseFPOffset ? 16 : 8);
36318
36319 // Store it back into the va_list.
36320 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36321 .add(Base)
36322 .add(Scale)
36323 .add(Index)
36324 .addDisp(Disp, UseFPOffset ? 4 : 0)
36325 .add(Segment)
36326 .addReg(NextOffsetReg)
36327 .setMemRefs(StoreOnlyMMO);
36328
36329 // Jump to endMBB
36330 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36331 .addMBB(endMBB);
36332 }
36333
36334 //
36335 // Emit code to use overflow area
36336 //
36337
36338 // Load the overflow_area address into a register.
36339 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36340 BuildMI(overflowMBB, DL,
36341 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36342 OverflowAddrReg)
36343 .add(Base)
36344 .add(Scale)
36345 .add(Index)
36346 .addDisp(Disp, 8)
36347 .add(Segment)
36348 .setMemRefs(LoadOnlyMMO);
36349
36350 // If we need to align it, do so. Otherwise, just copy the address
36351 // to OverflowDestReg.
36352 if (NeedsAlign) {
36353 // Align the overflow address
36354 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36355
36356 // aligned_addr = (addr + (align-1)) & ~(align-1)
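 // For example, with addr = 0x1004 and align = 16 (illustrative values only):
 //   (0x1004 + 15) & ~15 == 0x1013 & ~15 == 0x1010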
36357 BuildMI(
36358 overflowMBB, DL,
36359 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36360 TmpReg)
36361 .addReg(OverflowAddrReg)
36362 .addImm(Alignment.value() - 1);
36363
36364 BuildMI(
36365 overflowMBB, DL,
36366 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36367 OverflowDestReg)
36368 .addReg(TmpReg)
36369 .addImm(~(uint64_t)(Alignment.value() - 1));
36370 } else {
36371 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36372 .addReg(OverflowAddrReg);
36373 }
36374
36375 // Compute the next overflow address after this argument.
36376 // (the overflow address should be kept 8-byte aligned)
36377 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36378 BuildMI(
36379 overflowMBB, DL,
36380 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36381 NextAddrReg)
36382 .addReg(OverflowDestReg)
36383 .addImm(ArgSizeA8);
36384
36385 // Store the new overflow address.
36386 BuildMI(overflowMBB, DL,
36387 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36388 .add(Base)
36389 .add(Scale)
36390 .add(Index)
36391 .addDisp(Disp, 8)
36392 .add(Segment)
36393 .addReg(NextAddrReg)
36394 .setMemRefs(StoreOnlyMMO);
36395
36396 // If we branched, emit the PHI to the front of endMBB.
36397 if (offsetMBB) {
36398 BuildMI(*endMBB, endMBB->begin(), DL,
36399 TII->get(X86::PHI), DestReg)
36400 .addReg(OffsetDestReg).addMBB(offsetMBB)
36401 .addReg(OverflowDestReg).addMBB(overflowMBB);
36402 }
36403
36404 // Erase the pseudo instruction
36405 MI.eraseFromParent();
36406
36407 return endMBB;
36408}
36409
36410// The EFLAGS operand of SelectItr might be missing a kill marker
36411// because there were multiple uses of EFLAGS, and ISel didn't know
36412// which to mark. Figure out whether SelectItr should have had a
36413// kill marker, and set it if it should. Returns the correct kill
36414// marker value.
36415static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36416 MachineBasicBlock* BB,
36417 const TargetRegisterInfo* TRI) {
36418 if (isEFLAGSLiveAfter(SelectItr, BB))
36419 return false;
36420
36421 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36422 // out. SelectMI should have a kill flag on EFLAGS.
36423 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36424 return true;
36425}
36426
36427// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36428// together with other CMOV pseudo-opcodes into a single basic-block with
36429// conditional jump around it.
36430static bool isCMOVPseudo(MachineInstr &MI) {
36431 switch (MI.getOpcode()) {
36432 case X86::CMOV_FR16:
36433 case X86::CMOV_FR16X:
36434 case X86::CMOV_FR32:
36435 case X86::CMOV_FR32X:
36436 case X86::CMOV_FR64:
36437 case X86::CMOV_FR64X:
36438 case X86::CMOV_GR8:
36439 case X86::CMOV_GR16:
36440 case X86::CMOV_GR32:
36441 case X86::CMOV_RFP32:
36442 case X86::CMOV_RFP64:
36443 case X86::CMOV_RFP80:
36444 case X86::CMOV_VR64:
36445 case X86::CMOV_VR128:
36446 case X86::CMOV_VR128X:
36447 case X86::CMOV_VR256:
36448 case X86::CMOV_VR256X:
36449 case X86::CMOV_VR512:
36450 case X86::CMOV_VK1:
36451 case X86::CMOV_VK2:
36452 case X86::CMOV_VK4:
36453 case X86::CMOV_VK8:
36454 case X86::CMOV_VK16:
36455 case X86::CMOV_VK32:
36456 case X86::CMOV_VK64:
36457 return true;
36458
36459 default:
36460 return false;
36461 }
36462}
36463
36464// Helper function, which inserts PHI functions into SinkMBB:
36465// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36466// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36467// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
36468// the last PHI function inserted.
36469static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36470 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36471 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36472 MachineBasicBlock *SinkMBB) {
36473 MachineFunction *MF = TrueMBB->getParent();
36474 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36475 const DebugLoc &DL = MIItBegin->getDebugLoc();
36476
36477 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36478 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36479
36480 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36481
36482 // As we are creating the PHIs, we have to be careful if there is more than
36483 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36484 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36485 // That also means that PHI construction must work forward from earlier to
36486 // later, and that the code must maintain a mapping from each earlier PHI's
36487 // destination register to the registers that went into that PHI.
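 // For example (hypothetical vregs, both CMOVs using the same condition):
 //   %t2 = CMOV %a, %b, cc   -->   %t2 = PHI [%a, FalseMBB], [%b, TrueMBB]
 //   %t3 = CMOV %t2, %c, cc  -->   %t3 = PHI [%a, FalseMBB], [%c, TrueMBB]
 // The second PHI uses %a (RegRewriteTable[%t2].first) rather than %t2.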
36488 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36489 MachineInstrBuilder MIB;
36490
36491 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36492 Register DestReg = MIIt->getOperand(0).getReg();
36493 Register Op1Reg = MIIt->getOperand(1).getReg();
36494 Register Op2Reg = MIIt->getOperand(2).getReg();
36495
36496 // If this CMOV we are generating is the opposite condition from
36497 // the jump we generated, then we have to swap the operands for the
36498 // PHI that is going to be generated.
36499 if (MIIt->getOperand(3).getImm() == OppCC)
36500 std::swap(Op1Reg, Op2Reg);
36501
36502 if (RegRewriteTable.contains(Op1Reg))
36503 Op1Reg = RegRewriteTable[Op1Reg].first;
36504
36505 if (RegRewriteTable.contains(Op2Reg))
36506 Op2Reg = RegRewriteTable[Op2Reg].second;
36507
36508 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36509 .addReg(Op1Reg)
36510 .addMBB(FalseMBB)
36511 .addReg(Op2Reg)
36512 .addMBB(TrueMBB);
36513
36514 // Add this PHI to the rewrite table.
36515 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36516 }
36517
36518 return MIB;
36519}
36520
36521// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36522MachineBasicBlock *
36523X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36524 MachineInstr &SecondCascadedCMOV,
36525 MachineBasicBlock *ThisMBB) const {
36526 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36527 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36528
36529 // We lower cascaded CMOVs such as
36530 //
36531 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36532 //
36533 // to two successive branches.
36534 //
36535 // Without this, we would add a PHI between the two jumps, which ends up
36536 // creating a few copies all around. For instance, for
36537 //
36538 // (sitofp (zext (fcmp une)))
36539 //
36540 // we would generate:
36541 //
36542 // ucomiss %xmm1, %xmm0
36543 // movss <1.0f>, %xmm0
36544 // movaps %xmm0, %xmm1
36545 // jne .LBB5_2
36546 // xorps %xmm1, %xmm1
36547 // .LBB5_2:
36548 // jp .LBB5_4
36549 // movaps %xmm1, %xmm0
36550 // .LBB5_4:
36551 // retq
36552 //
36553 // because this custom-inserter would have generated:
36554 //
36555 // A
36556 // | \
36557 // | B
36558 // | /
36559 // C
36560 // | \
36561 // | D
36562 // | /
36563 // E
36564 //
36565 // A: X = ...; Y = ...
36566 // B: empty
36567 // C: Z = PHI [X, A], [Y, B]
36568 // D: empty
36569 // E: PHI [X, C], [Z, D]
36570 //
36571 // If we lower both CMOVs in a single step, we can instead generate:
36572 //
36573 // A
36574 // | \
36575 // | C
36576 // | /|
36577 // |/ |
36578 // | |
36579 // | D
36580 // | /
36581 // E
36582 //
36583 // A: X = ...; Y = ...
36584 // D: empty
36585 // E: PHI [X, A], [X, C], [Y, D]
36586 //
36587 // Which, in our sitofp/fcmp example, gives us something like:
36588 //
36589 // ucomiss %xmm1, %xmm0
36590 // movss <1.0f>, %xmm0
36591 // jne .LBB5_4
36592 // jp .LBB5_4
36593 // xorps %xmm0, %xmm0
36594 // .LBB5_4:
36595 // retq
36596 //
36597
36598 // We lower cascaded CMOV into two successive branches to the same block.
36599 // EFLAGS is used by both, so mark it as live in the second.
36600 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36601 MachineFunction *F = ThisMBB->getParent();
36602 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36603 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36604 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36605
36606 MachineFunction::iterator It = ++ThisMBB->getIterator();
36607 F->insert(It, FirstInsertedMBB);
36608 F->insert(It, SecondInsertedMBB);
36609 F->insert(It, SinkMBB);
36610
36611 // For a cascaded CMOV, we lower it to two successive branches to
36612 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36613 // the FirstInsertedMBB.
36614 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36615
36616 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36617 // live into the sink and copy blocks.
36618 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36619 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36620 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36621 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36622 SinkMBB->addLiveIn(X86::EFLAGS);
36623 }
36624
36625 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36626 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36627 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36628 ThisMBB->end());
36629 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36630
36631 // Fallthrough block for ThisMBB.
36632 ThisMBB->addSuccessor(FirstInsertedMBB);
36633 // The true block target of the first branch is always SinkMBB.
36634 ThisMBB->addSuccessor(SinkMBB);
36635 // Fallthrough block for FirstInsertedMBB.
36636 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36637 // The true block for the branch of FirstInsertedMBB.
36638 FirstInsertedMBB->addSuccessor(SinkMBB);
36639 // This is fallthrough.
36640 SecondInsertedMBB->addSuccessor(SinkMBB);
36641
36642 // Create the conditional branch instructions.
36643 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36644 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36645
36646 X86::CondCode SecondCC =
36647 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36648 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36649
36650 // SinkMBB:
36651 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36652 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36653 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36654 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36655 MachineInstrBuilder MIB =
36656 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36657 .addReg(Op1Reg)
36658 .addMBB(SecondInsertedMBB)
36659 .addReg(Op2Reg)
36660 .addMBB(ThisMBB);
36661
36662 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36663 // (the True operand of the SELECT_CC/CMOV nodes).
36664 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36665
36666 // Now remove the CMOVs.
36667 FirstCMOV.eraseFromParent();
36668 SecondCascadedCMOV.eraseFromParent();
36669
36670 return SinkMBB;
36671}
36672
36673MachineBasicBlock *
36674X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36675 MachineBasicBlock *ThisMBB) const {
36676 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36677 const DebugLoc &DL = MI.getDebugLoc();
36678
36679 // To "insert" a SELECT_CC instruction, we actually have to insert the
36680 // diamond control-flow pattern. The incoming instruction knows the
36681 // destination vreg to set, the condition code register to branch on, the
36682 // true/false values to select between and a branch opcode to use.
36683
36684 // ThisMBB:
36685 // ...
36686 // TrueVal = ...
36687 // cmpTY ccX, r1, r2
36688 // bCC copy1MBB
36689 // fallthrough --> FalseMBB
36690
36691 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36692 // as described above, by inserting a BB, and then making a PHI at the join
36693 // point to select the true and false operands of the CMOV in the PHI.
36694 //
36695 // The code also handles two different cases of multiple CMOV opcodes
36696 // in a row.
36697 //
36698 // Case 1:
36699 // In this case, there are multiple CMOVs in a row, all of which are based on
36700 // the same condition setting (or the exact opposite condition setting).
36701 // In this case we can lower all the CMOVs using a single inserted BB, and
36702 // then make a number of PHIs at the join point to model the CMOVs. The only
36703 // trickiness here is that in a case like:
36704 //
36705 // t2 = CMOV cond1 t1, f1
36706 // t3 = CMOV cond1 t2, f2
36707 //
36708 // when rewriting this into PHIs, we have to perform some renaming on the
36709 // temps since you cannot have a PHI operand refer to a PHI result earlier
36710 // in the same block. The "simple" but wrong lowering would be:
36711 //
36712 // t2 = PHI t1(BB1), f1(BB2)
36713 // t3 = PHI t2(BB1), f2(BB2)
36714 //
36715 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36716 // renaming is to note that on the path through BB1, t2 is really just a
36717 // copy of t1, and do that renaming, properly generating:
36718 //
36719 // t2 = PHI t1(BB1), f1(BB2)
36720 // t3 = PHI t1(BB1), f2(BB2)
36721 //
36722 // Case 2:
36723 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36724 // function - EmitLoweredCascadedSelect.
36725
36726 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36727 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36728 MachineInstr *LastCMOV = &MI;
36729 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36730
36731 // Check for case 1, where there are multiple CMOVs with the same condition
36732 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36733 // number of jumps the most.
36734
36735 if (isCMOVPseudo(MI)) {
36736 // See if we have a string of CMOVS with the same condition. Skip over
36737 // intervening debug insts.
36738 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36739 (NextMIIt->getOperand(3).getImm() == CC ||
36740 NextMIIt->getOperand(3).getImm() == OppCC)) {
36741 LastCMOV = &*NextMIIt;
36742 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36743 }
36744 }
36745
36746 // Check for case 2, but only if we didn't already find case 1, as
36747 // indicated by LastCMOV == &MI.
36748 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36749 NextMIIt->getOpcode() == MI.getOpcode() &&
36750 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36751 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36752 NextMIIt->getOperand(1).isKill()) {
36753 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36754 }
36755
36756 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36757 MachineFunction *F = ThisMBB->getParent();
36758 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36759 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36760
36761 MachineFunction::iterator It = ++ThisMBB->getIterator();
36762 F->insert(It, FalseMBB);
36763 F->insert(It, SinkMBB);
36764
36765 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36766 // live into the sink and copy blocks.
36767 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36768 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36769 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36770 FalseMBB->addLiveIn(X86::EFLAGS);
36771 SinkMBB->addLiveIn(X86::EFLAGS);
36772 }
36773
36774 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36775 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36776 MachineBasicBlock::iterator(LastCMOV));
36777 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36778 if (MI.isDebugInstr())
36779 SinkMBB->push_back(MI.removeFromParent());
36780
36781 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36782 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36783 std::next(MachineBasicBlock::iterator(LastCMOV)),
36784 ThisMBB->end());
36785 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36786
36787 // Fallthrough block for ThisMBB.
36788 ThisMBB->addSuccessor(FalseMBB);
36789 // The true block target of the first (or only) branch is always SinkMBB.
36790 ThisMBB->addSuccessor(SinkMBB);
36791 // Fallthrough block for FalseMBB.
36792 FalseMBB->addSuccessor(SinkMBB);
36793
36794 // Create the conditional branch instruction.
36795 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36796
36797 // SinkMBB:
36798 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36799 // ...
36800 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36801 MachineBasicBlock::iterator MIItEnd =
36802 std::next(MachineBasicBlock::iterator(LastCMOV));
36803 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36804
36805 // Now remove the CMOV(s).
36806 ThisMBB->erase(MIItBegin, MIItEnd);
36807
36808 return SinkMBB;
36809}
36810
36811static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36812 if (IsLP64) {
36813 if (isInt<8>(Imm))
36814 return X86::SUB64ri8;
36815 return X86::SUB64ri32;
36816 } else {
36817 if (isInt<8>(Imm))
36818 return X86::SUB32ri8;
36819 return X86::SUB32ri;
36820 }
36821}
36822
36823MachineBasicBlock *
36824X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36825 MachineBasicBlock *MBB) const {
36826 MachineFunction *MF = MBB->getParent();
36827 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36828 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36829 const DebugLoc &DL = MI.getDebugLoc();
36830 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36831
36832 const unsigned ProbeSize = getStackProbeSize(*MF);
36833
36834 MachineRegisterInfo &MRI = MF->getRegInfo();
36835 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36836 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36837 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36838
36839 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36840 MF->insert(MBBIter, testMBB);
36841 MF->insert(MBBIter, blockMBB);
36842 MF->insert(MBBIter, tailMBB);
36843
36844 Register sizeVReg = MI.getOperand(1).getReg();
36845
36846 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36847
36848 Register TmpStackPtr = MRI.createVirtualRegister(
36849 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36850 Register FinalStackPtr = MRI.createVirtualRegister(
36851 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36852
36853 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36854 .addReg(physSPReg);
36855 {
36856 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36857 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36858 .addReg(TmpStackPtr)
36859 .addReg(sizeVReg);
36860 }
36861
36862 // testMBB: compare the current stack pointer against the final stack pointer.
36863
36864 BuildMI(testMBB, DL,
36865 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36866 .addReg(FinalStackPtr)
36867 .addReg(physSPReg);
36868
36869 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36870 .addMBB(tailMBB)
36871 .addImm(X86::COND_GE);
36872 testMBB->addSuccessor(blockMBB);
36873 testMBB->addSuccessor(tailMBB);
36874
36875 // Touch the block, then extend it. This is the opposite order from a static
36876 // probe, where we allocate and then touch, and it avoids having to probe the
36877 // tail of the static alloca. Possible scenarios are:
36878 //
36879 // + ---- <- ------------ <- ------------- <- ------------ +
36880 // | |
36881 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36882 // | |
36883 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36884 //
36885 // The property we want to enforce is to never have more than [page alloc] between two probes.
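 //
 // As a rough C-level sketch of the loop built below (hypothetical names):
 //
 //   char *sp = current_sp, *final_sp = sp - alloc_size; // set up before testMBB
 //   while (sp > final_sp) {          // testMBB: CMP + JCC to tailMBB
 //     *(volatile char *)sp ^= 0;     // blockMBB: touch the current page,
 //     sp -= ProbeSize;               //           then move down one page
 //   }
 //   result = final_sp;               // tailMBB: the expected stack pointer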
36886
36887 const unsigned XORMIOpc =
36888 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36889 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36890 .addImm(0);
36891
36892 BuildMI(blockMBB, DL,
36893 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36894 .addReg(physSPReg)
36895 .addImm(ProbeSize);
36896
36897
36898 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36899 blockMBB->addSuccessor(testMBB);
36900
36901 // Replace original instruction by the expected stack ptr
36902 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36903 .addReg(FinalStackPtr);
36904
36905 tailMBB->splice(tailMBB->end(), MBB,
36906 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36907 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36908 MBB->addSuccessor(testMBB);
36909
36910 // Delete the original pseudo instruction.
36911 MI.eraseFromParent();
36912
36913 // And we're done.
36914 return tailMBB;
36915}
36916
36917MachineBasicBlock *
36918X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36919 MachineBasicBlock *BB) const {
36920 MachineFunction *MF = BB->getParent();
36921 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36922 const DebugLoc &DL = MI.getDebugLoc();
36923 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36924
36925 assert(MF->shouldSplitStack());
36926
36927 const bool Is64Bit = Subtarget.is64Bit();
36928 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36929
36930 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36931 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36932
36933 // BB:
36934 // ... [Till the alloca]
36935 // If the stacklet is not large enough, jump to mallocMBB
36936 //
36937 // bumpMBB:
36938 // Allocate by subtracting from RSP
36939 // Jump to continueMBB
36940 //
36941 // mallocMBB:
36942 // Allocate by call to runtime
36943 //
36944 // continueMBB:
36945 // ...
36946 // [rest of original BB]
36947 //
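 // As a rough C-level sketch (hypothetical names; the stacklet limit actually
 // lives at a fixed TLS offset, see TlsOffset above):
 //
 //   char *sp_after = stack_ptr - size;
 //   if (stacklet_limit > sp_after)                       // CMP + JG
 //     result = __morestack_allocate_stack_space(size);   // mallocMBB
 //   else
 //     result = stack_ptr = sp_after;                     // bumpMBB
 //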
36948
36949 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36950 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36951 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36952
36953 MachineRegisterInfo &MRI = MF->getRegInfo();
36954 const TargetRegisterClass *AddrRegClass =
36955 getRegClassFor(getPointerTy(MF->getDataLayout()));
36956
36957 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36958 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36959 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36960 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36961 sizeVReg = MI.getOperand(1).getReg(),
36962 physSPReg =
36963 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36964
36965 MachineFunction::iterator MBBIter = ++BB->getIterator();
36966
36967 MF->insert(MBBIter, bumpMBB);
36968 MF->insert(MBBIter, mallocMBB);
36969 MF->insert(MBBIter, continueMBB);
36970
36971 continueMBB->splice(continueMBB->begin(), BB,
36972 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36973 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36974
36975 // Add code to the main basic block to check if the stack limit has been hit,
36976 // and if so, jump to mallocMBB otherwise to bumpMBB.
36977 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36978 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36979 .addReg(tmpSPVReg).addReg(sizeVReg);
36980 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36981 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36982 .addReg(SPLimitVReg);
36983 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36984
36985 // bumpMBB simply decreases the stack pointer, since we know the current
36986 // stacklet has enough space.
36987 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36988 .addReg(SPLimitVReg);
36989 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36990 .addReg(SPLimitVReg);
36991 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36992
36993 // Calls into a routine in libgcc to allocate more space from the heap.
36994 const uint32_t *RegMask =
36995 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36996 if (IsLP64) {
36997 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36998 .addReg(sizeVReg);
36999 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37000 .addExternalSymbol("__morestack_allocate_stack_space")
37001 .addRegMask(RegMask)
37002 .addReg(X86::RDI, RegState::Implicit)
37003 .addReg(X86::RAX, RegState::ImplicitDefine);
37004 } else if (Is64Bit) {
37005 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
37006 .addReg(sizeVReg);
37007 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37008 .addExternalSymbol("__morestack_allocate_stack_space")
37009 .addRegMask(RegMask)
37010 .addReg(X86::EDI, RegState::Implicit)
37011 .addReg(X86::EAX, RegState::ImplicitDefine);
37012 } else {
37013 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
37014 .addImm(12);
37015 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
37016 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
37017 .addExternalSymbol("__morestack_allocate_stack_space")
37018 .addRegMask(RegMask)
37019 .addReg(X86::EAX, RegState::ImplicitDefine);
37020 }
37021
37022 if (!Is64Bit)
37023 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
37024 .addImm(16);
37025
37026 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
37027 .addReg(IsLP64 ? X86::RAX : X86::EAX);
37028 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
37029
37030 // Set up the CFG correctly.
37031 BB->addSuccessor(bumpMBB);
37032 BB->addSuccessor(mallocMBB);
37033 mallocMBB->addSuccessor(continueMBB);
37034 bumpMBB->addSuccessor(continueMBB);
37035
37036 // Take care of the PHI nodes.
37037 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
37038 MI.getOperand(0).getReg())
37039 .addReg(mallocPtrVReg)
37040 .addMBB(mallocMBB)
37041 .addReg(bumpSPPtrVReg)
37042 .addMBB(bumpMBB);
37043
37044 // Delete the original pseudo instruction.
37045 MI.eraseFromParent();
37046
37047 // And we're done.
37048 return continueMBB;
37049}
37050
37051MachineBasicBlock *
37052X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
37053 MachineBasicBlock *BB) const {
37054 MachineFunction *MF = BB->getParent();
37055 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37056 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
37057 const DebugLoc &DL = MI.getDebugLoc();
37058
37059 assert(!isAsynchronousEHPersonality(
37060 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
37061 "SEH does not use catchret!");
37062
37063 // Only 32-bit EH needs to worry about manually restoring stack pointers.
37064 if (!Subtarget.is32Bit())
37065 return BB;
37066
37067 // C++ EH creates a new target block to hold the restore code, and wires up
37068 // the new block to the return destination with a normal JMP_4.
37069 MachineBasicBlock *RestoreMBB =
37070 MF->CreateMachineBasicBlock(BB->getBasicBlock());
37071 assert(BB->succ_size() == 1);
37072 MF->insert(std::next(BB->getIterator()), RestoreMBB);
37073 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
37074 BB->addSuccessor(RestoreMBB);
37075 MI.getOperand(0).setMBB(RestoreMBB);
37076
37077 // Marking this as an EH pad but not a funclet entry block causes PEI to
37078 // restore stack pointers in the block.
37079 RestoreMBB->setIsEHPad(true);
37080
37081 auto RestoreMBBI = RestoreMBB->begin();
37082 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
37083 return BB;
37084}
37085
37086MachineBasicBlock *
37087X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
37088 MachineBasicBlock *BB) const {
37089 // So, here we replace TLSADDR with the sequence:
37090 // adjust_stackdown -> TLSADDR -> adjust_stackup.
37091 // We need this because TLSADDR is lowered into a call
37092 // inside MC; without the two markers, shrink-wrapping
37093 // may push the prologue/epilogue past them.
37094 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37095 const DebugLoc &DL = MI.getDebugLoc();
37096 MachineFunction &MF = *BB->getParent();
37097
37098 // Emit CALLSEQ_START right before the instruction.
37099 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37100 MachineInstrBuilder CallseqStart =
37101 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37102 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37103
37104 // Emit CALLSEQ_END right after the instruction.
37105 // We don't call erase from parent because we want to keep the
37106 // original instruction around.
37107 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37108 MachineInstrBuilder CallseqEnd =
37109 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
37110 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37111
37112 return BB;
37113}
37114
37115MachineBasicBlock *
37116X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
37117 MachineBasicBlock *BB) const {
37118 // This is pretty easy. We're taking the value that we received from
37119 // our load from the relocation, sticking it in either RDI (x86-64)
37120 // or EAX and doing an indirect call. The return value will then
37121 // be in the normal return register.
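 // Roughly, for the 64-bit case this becomes (AT&T syntax; _var is a
 // hypothetical TLS global, shown only for illustration):
 //
 //   movq  _var@TLVP(%rip), %rdi   ## load the TLV descriptor address
 //   callq *(%rdi)                 ## call its handler; result lands in %rax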
37122 MachineFunction *F = BB->getParent();
37123 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37124 const DebugLoc &DL = MI.getDebugLoc();
37125
37126 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
37127 assert(MI.getOperand(3).isGlobal() && "This should be a global");
37128
37129 // Get a register mask for the lowered call.
37130 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
37131 // proper register mask.
37132 const uint32_t *RegMask =
37133 Subtarget.is64Bit() ?
37134 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
37135 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
37136 if (Subtarget.is64Bit()) {
37137 MachineInstrBuilder MIB =
37138 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
37139 .addReg(X86::RIP)
37140 .addImm(0)
37141 .addReg(0)
37142 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37143 MI.getOperand(3).getTargetFlags())
37144 .addReg(0);
37145 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
37146 addDirectMem(MIB, X86::RDI);
37147 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
37148 } else if (!isPositionIndependent()) {
37149 MachineInstrBuilder MIB =
37150 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37151 .addReg(0)
37152 .addImm(0)
37153 .addReg(0)
37154 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37155 MI.getOperand(3).getTargetFlags())
37156 .addReg(0);
37157 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37158 addDirectMem(MIB, X86::EAX);
37159 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37160 } else {
37161 MachineInstrBuilder MIB =
37162 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37163 .addReg(TII->getGlobalBaseReg(F))
37164 .addImm(0)
37165 .addReg(0)
37166 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37167 MI.getOperand(3).getTargetFlags())
37168 .addReg(0);
37169 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37170 addDirectMem(MIB, X86::EAX);
37171 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37172 }
37173
37174 MI.eraseFromParent(); // The pseudo instruction is gone now.
37175 return BB;
37176}
37177
37178static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
37179 switch (RPOpc) {
37180 case X86::INDIRECT_THUNK_CALL32:
37181 return X86::CALLpcrel32;
37182 case X86::INDIRECT_THUNK_CALL64:
37183 return X86::CALL64pcrel32;
37184 case X86::INDIRECT_THUNK_TCRETURN32:
37185 return X86::TCRETURNdi;
37186 case X86::INDIRECT_THUNK_TCRETURN64:
37187 return X86::TCRETURNdi64;
37188 }
37189 llvm_unreachable("not indirect thunk opcode");
37190}
37191
37192static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
37193 unsigned Reg) {
37194 if (Subtarget.useRetpolineExternalThunk()) {
37195 // When using an external thunk for retpolines, we pick names that match the
37196 // names GCC happens to use as well. This helps simplify the implementation
37197 // of the thunks for kernels where they have no easy ability to create
37198 // aliases and are doing non-trivial configuration of the thunk's body. For
37199 // example, the Linux kernel will do boot-time hot patching of the thunk
37200 // bodies and cannot easily export aliases of these to loaded modules.
37201 //
37202 // Note that at any point in the future, we may need to change the semantics
37203 // of how we implement retpolines and at that time will likely change the
37204 // name of the called thunk. Essentially, there is no hard guarantee that
37205 // LLVM will generate calls to specific thunks, we merely make a best-effort
37206 // attempt to help out kernels and other systems where duplicating the
37207 // thunks is costly.
37208 switch (Reg) {
37209 case X86::EAX:
37210 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37211 return "__x86_indirect_thunk_eax";
37212 case X86::ECX:
37213 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37214 return "__x86_indirect_thunk_ecx";
37215 case X86::EDX:
37216 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37217 return "__x86_indirect_thunk_edx";
37218 case X86::EDI:
37219 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37220 return "__x86_indirect_thunk_edi";
37221 case X86::R11:
37222 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37223 return "__x86_indirect_thunk_r11";
37224 }
37225 llvm_unreachable("unexpected reg for external indirect thunk");
37226 }
37227
37228 if (Subtarget.useRetpolineIndirectCalls() ||
37229 Subtarget.useRetpolineIndirectBranches()) {
37230 // When targeting an internal COMDAT thunk use an LLVM-specific name.
37231 switch (Reg) {
37232 case X86::EAX:
37233 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37234 return "__llvm_retpoline_eax";
37235 case X86::ECX:
37236 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37237 return "__llvm_retpoline_ecx";
37238 case X86::EDX:
37239 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37240 return "__llvm_retpoline_edx";
37241 case X86::EDI:
37242 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37243 return "__llvm_retpoline_edi";
37244 case X86::R11:
37245 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37246 return "__llvm_retpoline_r11";
37247 }
37248 llvm_unreachable("unexpected reg for retpoline");
37249 }
37250
37251 if (Subtarget.useLVIControlFlowIntegrity()) {
37252 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37253 return "__llvm_lvi_thunk_r11";
37254 }
37255 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
37256}
37257
37258MachineBasicBlock *
37259X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37260 MachineBasicBlock *BB) const {
37261 // Copy the virtual register into the R11 physical register and
37262 // call the retpoline thunk.
37263 const DebugLoc &DL = MI.getDebugLoc();
37264 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37265 Register CalleeVReg = MI.getOperand(0).getReg();
37266 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37267
37268 // Find an available scratch register to hold the callee. On 64-bit, we can
37269 // just use R11, but we scan for uses anyway to ensure we don't generate
37270 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37271 // already a register use operand to the call to hold the callee. If none
37272 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37273 // register and ESI is the base pointer to realigned stack frames with VLAs.
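 // For example, in the 64-bit case the rewritten call ends up looking like
 // (illustrative assembly; the exact symbol comes from getIndirectThunkSymbol):
 //
 //   movq  %<callee>, %r11
 //   callq __x86_indirect_thunk_r11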
37274 SmallVector<unsigned, 3> AvailableRegs;
37275 if (Subtarget.is64Bit())
37276 AvailableRegs.push_back(X86::R11);
37277 else
37278 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37279
37280 // Zero out any registers that are already used.
37281 for (const auto &MO : MI.operands()) {
37282 if (MO.isReg() && MO.isUse())
37283 for (unsigned &Reg : AvailableRegs)
37284 if (Reg == MO.getReg())
37285 Reg = 0;
37286 }
37287
37288 // Choose the first remaining non-zero available register.
37289 unsigned AvailableReg = 0;
37290 for (unsigned MaybeReg : AvailableRegs) {
37291 if (MaybeReg) {
37292 AvailableReg = MaybeReg;
37293 break;
37294 }
37295 }
37296 if (!AvailableReg)
37297 report_fatal_error("calling convention incompatible with retpoline, no "
37298 "available registers");
37299
37300 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37301
37302 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37303 .addReg(CalleeVReg);
37304 MI.getOperand(0).ChangeToES(Symbol);
37305 MI.setDesc(TII->get(Opc));
37306 MachineInstrBuilder(*BB->getParent(), &MI)
37307 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37308 return BB;
37309}
37310
37311/// SetJmp implies future control flow change upon calling the corresponding
37312/// LongJmp.
37313/// Instead of using the 'return' instruction, the long jump fixes the stack and
37314/// performs an indirect branch. To do so it uses the registers that were stored
37315/// in the jump buffer (when calling SetJmp).
37316/// In case the shadow stack is enabled we need to fix it as well, because some
37317/// return addresses will be skipped.
37318/// The function will save the SSP for future fixing in the function
37319/// emitLongJmpShadowStackFix.
37320/// \sa emitLongJmpShadowStackFix
37321/// \param [in] MI The temporary Machine Instruction for the builtin.
37322/// \param [in] MBB The Machine Basic Block that will be modified.
37323void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37324 MachineBasicBlock *MBB) const {
37325 const DebugLoc &DL = MI.getDebugLoc();
37326 MachineFunction *MF = MBB->getParent();
37327 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37328 MachineRegisterInfo &MRI = MF->getRegInfo();
37329 MachineInstrBuilder MIB;
37330
37331 // Memory Reference.
37332 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37333 MI.memoperands_end());
37334
37335 // Initialize a register with zero.
37336 MVT PVT = getPointerTy(MF->getDataLayout());
37337 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37338 Register ZReg = MRI.createVirtualRegister(PtrRC);
37339 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37340 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37341 .addDef(ZReg)
37342 .addReg(ZReg, RegState::Undef)
37343 .addReg(ZReg, RegState::Undef);
37344
37345 // Read the current SSP Register value to the zeroed register.
37346 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37347 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37348 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37349
37350 // Write the SSP register value to offset 3 in input memory buffer.
37351 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37352 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37353 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37354 const unsigned MemOpndSlot = 1;
37355 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37356 if (i == X86::AddrDisp)
37357 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37358 else
37359 MIB.add(MI.getOperand(MemOpndSlot + i));
37360 }
37361 MIB.addReg(SSPCopyReg);
37362 MIB.setMemRefs(MMOs);
37363}
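// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// pointer-sized buffer layout implied by the offsets used here and in the
// longjmp expansion below (slot 0 is reloaded as the frame pointer,
// LabelOffset = 1, SPOffset = 2 and SSPOffset = 3 store sizes):
//
//   struct SjLjBuffer {      // name and field names are illustrative only
//     void *FramePtr;        // slot 0: frame pointer
//     void *Label;           // slot 1: resume address (restoreMBB / IP)
//     void *StackPtr;        // slot 2: stack pointer
//     void *ShadowStackPtr;  // slot 3: SSP written by the code above
//   };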
37364
37365MachineBasicBlock *
37366X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37367 MachineBasicBlock *MBB) const {
37368 const DebugLoc &DL = MI.getDebugLoc();
37369 MachineFunction *MF = MBB->getParent();
37370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37371 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37372 MachineRegisterInfo &MRI = MF->getRegInfo();
37373
37374 const BasicBlock *BB = MBB->getBasicBlock();
37375 MachineFunction::iterator I = ++MBB->getIterator();
37376
37377 // Memory Reference
37378 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37379 MI.memoperands_end());
37380
37381 unsigned DstReg;
37382 unsigned MemOpndSlot = 0;
37383
37384 unsigned CurOp = 0;
37385
37386 DstReg = MI.getOperand(CurOp++).getReg();
37387 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37388 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!")(static_cast <bool> (TRI->isTypeLegalForClass(*RC, MVT
::i32) && "Invalid destination!") ? void (0) : __assert_fail
("TRI->isTypeLegalForClass(*RC, MVT::i32) && \"Invalid destination!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37388, __extension__
__PRETTY_FUNCTION__))
;
37389 (void)TRI;
37390 Register mainDstReg = MRI.createVirtualRegister(RC);
37391 Register restoreDstReg = MRI.createVirtualRegister(RC);
37392
37393 MemOpndSlot = CurOp;
37394
37395 MVT PVT = getPointerTy(MF->getDataLayout());
37396 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37397 "Invalid Pointer Size!");
37398
37399 // For v = setjmp(buf), we generate
37400 //
37401 // thisMBB:
37402 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37403 // SjLjSetup restoreMBB
37404 //
37405 // mainMBB:
37406 // v_main = 0
37407 //
37408 // sinkMBB:
37409 // v = phi(main, restore)
37410 //
37411 // restoreMBB:
37412 // if base pointer being used, load it from frame
37413 // v_restore = 1
37414
37415 MachineBasicBlock *thisMBB = MBB;
37416 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37417 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37418 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37419 MF->insert(I, mainMBB);
37420 MF->insert(I, sinkMBB);
37421 MF->push_back(restoreMBB);
37422 restoreMBB->setMachineBlockAddressTaken();
37423
37424 MachineInstrBuilder MIB;
37425
37426 // Transfer the remainder of BB and its successor edges to sinkMBB.
37427 sinkMBB->splice(sinkMBB->begin(), MBB,
37428 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37429 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37430
37431 // thisMBB:
37432 unsigned PtrStoreOpc = 0;
37433 unsigned LabelReg = 0;
37434 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37435 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37436 !isPositionIndependent();
37437
37438 // Prepare IP either in reg or imm.
37439 if (!UseImmLabel) {
37440 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37441 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37442 LabelReg = MRI.createVirtualRegister(PtrRC);
37443 if (Subtarget.is64Bit()) {
37444 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37445 .addReg(X86::RIP)
37446 .addImm(0)
37447 .addReg(0)
37448 .addMBB(restoreMBB)
37449 .addReg(0);
37450 } else {
37451 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37453 .addReg(XII->getGlobalBaseReg(MF))
37454 .addImm(0)
37455 .addReg(0)
37456 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37457 .addReg(0);
37458 }
37459 } else
37460 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37461 // Store IP
37462 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37463 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37464 if (i == X86::AddrDisp)
37465 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37466 else
37467 MIB.add(MI.getOperand(MemOpndSlot + i));
37468 }
37469 if (!UseImmLabel)
37470 MIB.addReg(LabelReg);
37471 else
37472 MIB.addMBB(restoreMBB);
37473 MIB.setMemRefs(MMOs);
37474
37475 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37476 emitSetJmpShadowStackFix(MI, thisMBB);
37477 }
37478
37479 // Setup
37480 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37481 .addMBB(restoreMBB);
37482
37483 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37484 MIB.addRegMask(RegInfo->getNoPreservedMask());
37485 thisMBB->addSuccessor(mainMBB);
37486 thisMBB->addSuccessor(restoreMBB);
37487
37488 // mainMBB:
37489 // EAX = 0
37490 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37491 mainMBB->addSuccessor(sinkMBB);
37492
37493 // sinkMBB:
37494 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37495 TII->get(X86::PHI), DstReg)
37496 .addReg(mainDstReg).addMBB(mainMBB)
37497 .addReg(restoreDstReg).addMBB(restoreMBB);
37498
37499 // restoreMBB:
37500 if (RegInfo->hasBasePointer(*MF)) {
37501 const bool Uses64BitFramePtr =
37502 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37503 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37504 X86FI->setRestoreBasePointer(MF);
37505 Register FramePtr = RegInfo->getFrameRegister(*MF);
37506 Register BasePtr = RegInfo->getBaseRegister();
37507 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37508 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37509 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37510 .setMIFlag(MachineInstr::FrameSetup);
37511 }
37512 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37513 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37514 restoreMBB->addSuccessor(sinkMBB);
37515
37516 MI.eraseFromParent();
37517 return sinkMBB;
37518}
37519
37520/// Fix the shadow stack using the previously saved SSP pointer.
37521/// \sa emitSetJmpShadowStackFix
37522/// \param [in] MI The temporary Machine Instruction for the builtin.
37523/// \param [in] MBB The Machine Basic Block that will be modified.
37524/// \return The sink MBB that will perform the future indirect branch.
37525MachineBasicBlock *
37526X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37527 MachineBasicBlock *MBB) const {
37528 const DebugLoc &DL = MI.getDebugLoc();
37529 MachineFunction *MF = MBB->getParent();
37530 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37531 MachineRegisterInfo &MRI = MF->getRegInfo();
37532
37533 // Memory Reference
37534 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37535 MI.memoperands_end());
37536
37537 MVT PVT = getPointerTy(MF->getDataLayout());
37538 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37539
37540 // checkSspMBB:
37541 // xor vreg1, vreg1
37542 // rdssp vreg1
37543 // test vreg1, vreg1
37544 // je sinkMBB # Jump if Shadow Stack is not supported
37545 // fallMBB:
37546 // mov buf+24/12(%rip), vreg2
37547 // sub vreg1, vreg2
37548 // jbe sinkMBB # No need to fix the Shadow Stack
37549 // fixShadowMBB:
37550 // shr 3/2, vreg2
37551 // incssp vreg2 # fix the SSP according to the lower 8 bits
37552 // shr 8, vreg2
37553 // je sinkMBB
37554 // fixShadowLoopPrepareMBB:
37555 // shl vreg2
37556 // mov 128, vreg3
37557 // fixShadowLoopMBB:
37558 // incssp vreg3
37559 // dec vreg2
37560 // jne fixShadowLoopMBB # Iterate until you finish fixing
37561 // # the Shadow Stack
37562 // sinkMBB:
37563
37564 MachineFunction::iterator I = ++MBB->getIterator();
37565 const BasicBlock *BB = MBB->getBasicBlock();
37566
37567 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37568 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37569 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37570 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37571 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37572 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37573 MF->insert(I, checkSspMBB);
37574 MF->insert(I, fallMBB);
37575 MF->insert(I, fixShadowMBB);
37576 MF->insert(I, fixShadowLoopPrepareMBB);
37577 MF->insert(I, fixShadowLoopMBB);
37578 MF->insert(I, sinkMBB);
37579
37580 // Transfer the remainder of BB and its successor edges to sinkMBB.
37581 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37582 MBB->end());
37583 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37584
37585 MBB->addSuccessor(checkSspMBB);
37586
37587 // Initialize a register with zero.
37588 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37589 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37590
37591 if (PVT == MVT::i64) {
37592 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37593 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37594 .addImm(0)
37595 .addReg(ZReg)
37596 .addImm(X86::sub_32bit);
37597 ZReg = TmpZReg;
37598 }
37599
37600 // Read the current SSP Register value to the zeroed register.
37601 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37602 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37603 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37604
37605 // Check whether the result of the SSP register is zero and jump directly
37606 // to the sink.
37607 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37608 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37609 .addReg(SSPCopyReg)
37610 .addReg(SSPCopyReg);
37611 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37612 checkSspMBB->addSuccessor(sinkMBB);
37613 checkSspMBB->addSuccessor(fallMBB);
37614
37615 // Reload the previously saved SSP register value.
37616 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37617 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37618 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37619 MachineInstrBuilder MIB =
37620 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37621 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37622 const MachineOperand &MO = MI.getOperand(i);
37623 if (i == X86::AddrDisp)
37624 MIB.addDisp(MO, SPPOffset);
37625 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37626 // preserve kill flags.
37627 MIB.addReg(MO.getReg());
37628 else
37629 MIB.add(MO);
37630 }
37631 MIB.setMemRefs(MMOs);
37632
37633 // Subtract the current SSP from the previous SSP.
37634 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37635 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37636 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37637 .addReg(PrevSSPReg)
37638 .addReg(SSPCopyReg);
37639
37640 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37641 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37642 fallMBB->addSuccessor(sinkMBB);
37643 fallMBB->addSuccessor(fixShadowMBB);
37644
37645 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37646 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37647 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37648 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37649 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37650 .addReg(SspSubReg)
37651 .addImm(Offset);
37652
37653 // Increase the SSP using only the lower 8 bits of the delta.
37654 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37655 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37656
37657 // Reset the lower 8 bits.
37658 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37659 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37660 .addReg(SspFirstShrReg)
37661 .addImm(8);
37662
37663 // Jump if the result of the shift is zero.
37664 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37665 fixShadowMBB->addSuccessor(sinkMBB);
37666 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37667
37668 // Do a single shift left.
37669 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37670 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37671 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37672 .addReg(SspSecondShrReg);
37673
37674 // Save the value 128 to a register (will be used next with incssp).
37675 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37676 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37677 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37678 .addImm(128);
37679 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37680
37681 // Since incssp only looks at the lower 8 bits, we might need to do several
37682 // iterations of incssp until we finish fixing the shadow stack.
37683 Register DecReg = MRI.createVirtualRegister(PtrRC);
37684 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37685 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37686 .addReg(SspAfterShlReg)
37687 .addMBB(fixShadowLoopPrepareMBB)
37688 .addReg(DecReg)
37689 .addMBB(fixShadowLoopMBB);
37690
37691 // Every iteration we increase the SSP by 128.
37692 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37693
37694 // Every iteration we decrement the counter by 1.
37695 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37696 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37697
37698 // Jump if the counter is not zero yet.
37699 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37700 fixShadowLoopMBB->addSuccessor(sinkMBB);
37701 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37702
37703 return sinkMBB;
37704}
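// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// arithmetic the blocks above implement, shown for the 64-bit case where each
// shadow-stack entry is 8 bytes and INCSSP consumes only its low 8 bits:
//
//   uint64_t Delta   = PrevSSP - CurSSP;     // bytes of shadow stack to skip
//   uint64_t Entries = Delta >> 3;           // entries of 8 bytes each
//   incssp(Entries & 0xff);                  // first INCSSP: low 8 bits
//   uint64_t Rounds  = (Entries >> 8) << 1;  // remaining entries / 128
//   while (Rounds--)
//     incssp(128);                           // loop: 128 entries per round
//
// "incssp" stands in for the INCSSPQ instruction emitted above; the 32-bit
// path uses a shift of 2 and INCSSPD instead.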
37705
37706MachineBasicBlock *
37707X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37708 MachineBasicBlock *MBB) const {
37709 const DebugLoc &DL = MI.getDebugLoc();
37710 MachineFunction *MF = MBB->getParent();
37711 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37712 MachineRegisterInfo &MRI = MF->getRegInfo();
37713
37714 // Memory Reference
37715 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37716 MI.memoperands_end());
37717
37718 MVT PVT = getPointerTy(MF->getDataLayout());
37719 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37720 "Invalid Pointer Size!");
37721
37722 const TargetRegisterClass *RC =
37723 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37724 Register Tmp = MRI.createVirtualRegister(RC);
37725 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37726 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37727 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37728 Register SP = RegInfo->getStackRegister();
37729
37730 MachineInstrBuilder MIB;
37731
37732 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37733 const int64_t SPOffset = 2 * PVT.getStoreSize();
37734
37735 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37736 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37737
37738 MachineBasicBlock *thisMBB = MBB;
37739
37740 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37741 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37742 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37743 }
37744
37745 // Reload FP
37746 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37747 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37748 const MachineOperand &MO = MI.getOperand(i);
37749 if (MO.isReg()) // Don't add the whole operand, we don't want to
37750 // preserve kill flags.
37751 MIB.addReg(MO.getReg());
37752 else
37753 MIB.add(MO);
37754 }
37755 MIB.setMemRefs(MMOs);
37756
37757 // Reload IP
37758 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37759 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37760 const MachineOperand &MO = MI.getOperand(i);
37761 if (i == X86::AddrDisp)
37762 MIB.addDisp(MO, LabelOffset);
37763 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37764 // preserve kill flags.
37765 MIB.addReg(MO.getReg());
37766 else
37767 MIB.add(MO);
37768 }
37769 MIB.setMemRefs(MMOs);
37770
37771 // Reload SP
37772 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37773 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37774 if (i == X86::AddrDisp)
37775 MIB.addDisp(MI.getOperand(i), SPOffset);
37776 else
37777 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37778 // the last instruction of the expansion.
37779 }
37780 MIB.setMemRefs(MMOs);
37781
37782 // Jump
37783 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37784
37785 MI.eraseFromParent();
37786 return thisMBB;
37787}
37788
37789void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37790 MachineBasicBlock *MBB,
37791 MachineBasicBlock *DispatchBB,
37792 int FI) const {
37793 const DebugLoc &DL = MI.getDebugLoc();
37794 MachineFunction *MF = MBB->getParent();
37795 MachineRegisterInfo *MRI = &MF->getRegInfo();
37796 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37797
37798 MVT PVT = getPointerTy(MF->getDataLayout());
37799 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37800
37801 unsigned Op = 0;
37802 unsigned VR = 0;
37803
37804 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37805 !isPositionIndependent();
37806
37807 if (UseImmLabel) {
37808 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37809 } else {
37810 const TargetRegisterClass *TRC =
37811 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37812 VR = MRI->createVirtualRegister(TRC);
37813 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37814
37815 if (Subtarget.is64Bit())
37816 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37817 .addReg(X86::RIP)
37818 .addImm(1)
37819 .addReg(0)
37820 .addMBB(DispatchBB)
37821 .addReg(0);
37822 else
37823 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37824 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37825 .addImm(1)
37826 .addReg(0)
37827 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37828 .addReg(0);
37829 }
37830
37831 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37832 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37833 if (UseImmLabel)
37834 MIB.addMBB(DispatchBB);
37835 else
37836 MIB.addReg(VR);
37837}
37838
37839MachineBasicBlock *
37840X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37841 MachineBasicBlock *BB) const {
37842 const DebugLoc &DL = MI.getDebugLoc();
37843 MachineFunction *MF = BB->getParent();
37844 MachineRegisterInfo *MRI = &MF->getRegInfo();
37845 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37846 int FI = MF->getFrameInfo().getFunctionContextIndex();
37847
37848 // Get a mapping of the call site numbers to all of the landing pads they're
37849 // associated with.
37850 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37851 unsigned MaxCSNum = 0;
37852 for (auto &MBB : *MF) {
37853 if (!MBB.isEHPad())
37854 continue;
37855
37856 MCSymbol *Sym = nullptr;
37857 for (const auto &MI : MBB) {
37858 if (MI.isDebugInstr())
37859 continue;
37860
37861 assert(MI.isEHLabel() && "expected EH_LABEL");
37862 Sym = MI.getOperand(0).getMCSymbol();
37863 break;
37864 }
37865
37866 if (!MF->hasCallSiteLandingPad(Sym))
37867 continue;
37868
37869 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37870 CallSiteNumToLPad[CSI].push_back(&MBB);
37871 MaxCSNum = std::max(MaxCSNum, CSI);
37872 }
37873 }
37874
37875 // Get an ordered list of the machine basic blocks for the jump table.
37876 std::vector<MachineBasicBlock *> LPadList;
37877 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37878 LPadList.reserve(CallSiteNumToLPad.size());
37879
37880 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37881 for (auto &LP : CallSiteNumToLPad[CSI]) {
37882 LPadList.push_back(LP);
37883 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37884 }
37885 }
37886
37887 assert(!LPadList.empty() &&
37888 "No landing pad destinations for the dispatch jump table!");
37889
37890 // Create the MBBs for the dispatch code.
37891
37892 // Shove the dispatch's address into the return slot in the function context.
37893 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37894 DispatchBB->setIsEHPad(true);
37895
37896 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37897 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37898 DispatchBB->addSuccessor(TrapBB);
37899
37900 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37901 DispatchBB->addSuccessor(DispContBB);
37902
37903 // Insert MBBs.
37904 MF->push_back(DispatchBB);
37905 MF->push_back(DispContBB);
37906 MF->push_back(TrapBB);
37907
37908 // Insert code into the entry block that creates and registers the function
37909 // context.
37910 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37911
37912 // Create the jump table and associated information
37913 unsigned JTE = getJumpTableEncoding();
37914 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37915 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37916
37917 const X86RegisterInfo &RI = TII->getRegisterInfo();
37918 // Add a register mask with no preserved registers. This results in all
37919 // registers being marked as clobbered.
37920 if (RI.hasBasePointer(*MF)) {
37921 const bool FPIs64Bit =
37922 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37923 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37924 MFI->setRestoreBasePointer(MF);
37925
37926 Register FP = RI.getFrameRegister(*MF);
37927 Register BP = RI.getBaseRegister();
37928 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37929 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37930 MFI->getRestoreBasePointerOffset())
37931 .addRegMask(RI.getNoPreservedMask());
37932 } else {
37933 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37934 .addRegMask(RI.getNoPreservedMask());
37935 }
37936
37937 // IReg is used as an index in a memory operand and therefore can't be SP
37938 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37939 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37940 Subtarget.is64Bit() ? 8 : 4);
37941 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37942 .addReg(IReg)
37943 .addImm(LPadList.size());
37944 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37945
37946 if (Subtarget.is64Bit()) {
37947 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37948 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37949
37950 // leaq .LJTI0_0(%rip), BReg
37951 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37952 .addReg(X86::RIP)
37953 .addImm(1)
37954 .addReg(0)
37955 .addJumpTableIndex(MJTI)
37956 .addReg(0);
37957 // movzx IReg64, IReg
37958 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37959 .addImm(0)
37960 .addReg(IReg)
37961 .addImm(X86::sub_32bit);
37962
37963 switch (JTE) {
37964 case MachineJumpTableInfo::EK_BlockAddress:
37965 // jmpq *(BReg,IReg64,8)
37966 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37967 .addReg(BReg)
37968 .addImm(8)
37969 .addReg(IReg64)
37970 .addImm(0)
37971 .addReg(0);
37972 break;
37973 case MachineJumpTableInfo::EK_LabelDifference32: {
37974 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37975 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37976 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37977
37978 // movl (BReg,IReg64,4), OReg
37979 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37980 .addReg(BReg)
37981 .addImm(4)
37982 .addReg(IReg64)
37983 .addImm(0)
37984 .addReg(0);
37985 // movsx OReg64, OReg
37986 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37987 // addq BReg, OReg64, TReg
37988 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37989 .addReg(OReg64)
37990 .addReg(BReg);
37991 // jmpq *TReg
37992 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37993 break;
37994 }
37995 default:
37996 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37996)
;
37997 }
37998 } else {
37999 // jmpl *.LJTI0_0(,IReg,4)
38000 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
38001 .addReg(0)
38002 .addImm(4)
38003 .addReg(IReg)
38004 .addJumpTableIndex(MJTI)
38005 .addReg(0);
38006 }
38007
38008 // Add the jump table entries as successors to the MBB.
38009 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
38010 for (auto &LP : LPadList)
38011 if (SeenMBBs.insert(LP).second)
38012 DispContBB->addSuccessor(LP);
38013
38014 // N.B. the order the invoke BBs are processed in doesn't matter here.
38015 SmallVector<MachineBasicBlock *, 64> MBBLPads;
38016 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
38017 for (MachineBasicBlock *MBB : InvokeBBs) {
38018 // Remove the landing pad successor from the invoke block and replace it
38019 // with the new dispatch block.
38020 // Keep a copy of Successors since it's modified inside the loop.
38021 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
38022 MBB->succ_rend());
38023 // FIXME: Avoid quadratic complexity.
38024 for (auto *MBBS : Successors) {
38025 if (MBBS->isEHPad()) {
38026 MBB->removeSuccessor(MBBS);
38027 MBBLPads.push_back(MBBS);
38028 }
38029 }
38030
38031 MBB->addSuccessor(DispatchBB);
38032
38033 // Find the invoke call and mark all of the callee-saved registers as
38034 // 'implicit defined' so that they're spilled. This prevents code from
38035 // moving instructions to before the EH block, where they will never be
38036 // executed.
38037 for (auto &II : reverse(*MBB)) {
38038 if (!II.isCall())
38039 continue;
38040
38041 DenseMap<unsigned, bool> DefRegs;
38042 for (auto &MOp : II.operands())
38043 if (MOp.isReg())
38044 DefRegs[MOp.getReg()] = true;
38045
38046 MachineInstrBuilder MIB(*MF, &II);
38047 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
38048 unsigned Reg = SavedRegs[RegIdx];
38049 if (!DefRegs[Reg])
38050 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
38051 }
38052
38053 break;
38054 }
38055 }
38056
38057 // Mark all former landing pads as non-landing pads. The dispatch is the only
38058 // landing pad now.
38059 for (auto &LP : MBBLPads)
38060 LP->setIsEHPad(false);
38061
38062 // The instruction is gone now.
38063 MI.eraseFromParent();
38064 return BB;
38065}
38066
38067MachineBasicBlock *
38068X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
38069 MachineBasicBlock *BB) const {
38070 MachineFunction *MF = BB->getParent();
38071 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38072 const DebugLoc &DL = MI.getDebugLoc();
38073
38074 auto TMMImmToTMMReg = [](unsigned Imm) {
38075 assert (Imm < 8 && "Illegal tmm index");
38076 return X86::TMM0 + Imm;
38077 };
38078 switch (MI.getOpcode()) {
38079 default: llvm_unreachable("Unexpected instr type to insert");
38080 case X86::TLS_addr32:
38081 case X86::TLS_addr64:
38082 case X86::TLS_addrX32:
38083 case X86::TLS_base_addr32:
38084 case X86::TLS_base_addr64:
38085 case X86::TLS_base_addrX32:
38086 return EmitLoweredTLSAddr(MI, BB);
38087 case X86::INDIRECT_THUNK_CALL32:
38088 case X86::INDIRECT_THUNK_CALL64:
38089 case X86::INDIRECT_THUNK_TCRETURN32:
38090 case X86::INDIRECT_THUNK_TCRETURN64:
38091 return EmitLoweredIndirectThunk(MI, BB);
38092 case X86::CATCHRET:
38093 return EmitLoweredCatchRet(MI, BB);
38094 case X86::SEG_ALLOCA_32:
38095 case X86::SEG_ALLOCA_64:
38096 return EmitLoweredSegAlloca(MI, BB);
38097 case X86::PROBED_ALLOCA_32:
38098 case X86::PROBED_ALLOCA_64:
38099 return EmitLoweredProbedAlloca(MI, BB);
38100 case X86::TLSCall_32:
38101 case X86::TLSCall_64:
38102 return EmitLoweredTLSCall(MI, BB);
38103 case X86::CMOV_FR16:
38104 case X86::CMOV_FR16X:
38105 case X86::CMOV_FR32:
38106 case X86::CMOV_FR32X:
38107 case X86::CMOV_FR64:
38108 case X86::CMOV_FR64X:
38109 case X86::CMOV_GR8:
38110 case X86::CMOV_GR16:
38111 case X86::CMOV_GR32:
38112 case X86::CMOV_RFP32:
38113 case X86::CMOV_RFP64:
38114 case X86::CMOV_RFP80:
38115 case X86::CMOV_VR64:
38116 case X86::CMOV_VR128:
38117 case X86::CMOV_VR128X:
38118 case X86::CMOV_VR256:
38119 case X86::CMOV_VR256X:
38120 case X86::CMOV_VR512:
38121 case X86::CMOV_VK1:
38122 case X86::CMOV_VK2:
38123 case X86::CMOV_VK4:
38124 case X86::CMOV_VK8:
38125 case X86::CMOV_VK16:
38126 case X86::CMOV_VK32:
38127 case X86::CMOV_VK64:
38128 return EmitLoweredSelect(MI, BB);
38129
38130 case X86::FP80_ADDr:
38131 case X86::FP80_ADDm32: {
38132 // Change the floating point control register to use double extended
38133 // precision when performing the addition.
38134 int OrigCWFrameIdx =
38135 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38136 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
38137 OrigCWFrameIdx);
38138
38139 // Load the old value of the control word...
38140 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38141 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38142 OrigCWFrameIdx);
38143
38144 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
38145 // precision.
38146 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38147 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38148 .addReg(OldCW, RegState::Kill)
38149 .addImm(0x300);
38150
38151 // Extract to 16 bits.
38152 Register NewCW16 =
38153 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38154 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38155 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38156
38157 // Prepare memory for FLDCW.
38158 int NewCWFrameIdx =
38159 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38160 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38161 NewCWFrameIdx)
38162 .addReg(NewCW16, RegState::Kill);
38163
38164 // Reload the modified control word now...
38165 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38166 NewCWFrameIdx);
38167
38168 // Do the addition.
38169 if (MI.getOpcode() == X86::FP80_ADDr) {
38170 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
38171 .add(MI.getOperand(0))
38172 .add(MI.getOperand(1))
38173 .add(MI.getOperand(2));
38174 } else {
38175 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
38176 .add(MI.getOperand(0))
38177 .add(MI.getOperand(1))
38178 .add(MI.getOperand(2))
38179 .add(MI.getOperand(3))
38180 .add(MI.getOperand(4))
38181 .add(MI.getOperand(5))
38182 .add(MI.getOperand(6));
38183 }
38184
38185 // Reload the original control word now.
38186 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38187 OrigCWFrameIdx);
38188
38189 MI.eraseFromParent(); // The pseudo instruction is gone now.
38190 return BB;
38191 }
38192
38193 case X86::FP32_TO_INT16_IN_MEM:
38194 case X86::FP32_TO_INT32_IN_MEM:
38195 case X86::FP32_TO_INT64_IN_MEM:
38196 case X86::FP64_TO_INT16_IN_MEM:
38197 case X86::FP64_TO_INT32_IN_MEM:
38198 case X86::FP64_TO_INT64_IN_MEM:
38199 case X86::FP80_TO_INT16_IN_MEM:
38200 case X86::FP80_TO_INT32_IN_MEM:
38201 case X86::FP80_TO_INT64_IN_MEM: {
38202 // Change the floating point control register to use "round towards zero"
38203 // mode when truncating to an integer value.
38204 int OrigCWFrameIdx =
38205 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38206 addFrameReference(BuildMI(*BB, MI, DL,
38207 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38208
38209 // Load the old value of the control word...
38210 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38211 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38212 OrigCWFrameIdx);
38213
38214 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
38215 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38216 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38217 .addReg(OldCW, RegState::Kill).addImm(0xC00);
38218
38219 // Extract to 16 bits.
38220 Register NewCW16 =
38221 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38222 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38223 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38224
38225 // Prepare memory for FLDCW.
38226 int NewCWFrameIdx =
38227 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38228 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38229 NewCWFrameIdx)
38230 .addReg(NewCW16, RegState::Kill);
38231
38232 // Reload the modified control word now...
38233 addFrameReference(BuildMI(*BB, MI, DL,
38234 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38235
38236 // Get the X86 opcode to use.
38237 unsigned Opc;
38238 switch (MI.getOpcode()) {
38239 default: llvm_unreachable("illegal opcode!");
38240 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38241 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38242 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38243 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38244 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38245 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38246 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38247 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38248 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38249 }
38250
38251 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38252 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38253 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38254
38255 // Reload the original control word now.
38256 addFrameReference(BuildMI(*BB, MI, DL,
38257 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38258
38259 MI.eraseFromParent(); // The pseudo instruction is gone now.
38260 return BB;
38261 }
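// Editor's note (hedged, not part of X86ISelLowering.cpp): both control-word
// rewrites above operate on the x87 FPU control word saved by FNSTCW:
//
//   unsigned short CW;   // as stored to the stack slot by FNSTCW
//   CW |= 0x0300;        // bits 8-9  = 11b: 64-bit (double extended) precision,
//                        //             used by the FP80_ADD expansion
//   CW |= 0x0C00;        // bits 10-11 = 11b: round toward zero (truncate),
//                        //             used by the FP*_TO_INT*_IN_MEM expansion
//
// The modified word is reloaded with FLDCW and the original word is restored
// afterwards, as both cases above do.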
38262
38263 // xbegin
38264 case X86::XBEGIN:
38265 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38266
38267 case X86::VAARG_64:
38268 case X86::VAARG_X32:
38269 return EmitVAARGWithCustomInserter(MI, BB);
38270
38271 case X86::EH_SjLj_SetJmp32:
38272 case X86::EH_SjLj_SetJmp64:
38273 return emitEHSjLjSetJmp(MI, BB);
38274
38275 case X86::EH_SjLj_LongJmp32:
38276 case X86::EH_SjLj_LongJmp64:
38277 return emitEHSjLjLongJmp(MI, BB);
38278
38279 case X86::Int_eh_sjlj_setup_dispatch:
38280 return EmitSjLjDispatchBlock(MI, BB);
38281
38282 case TargetOpcode::STATEPOINT:
38283 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38284 // this point in the process. We diverge later.
38285 return emitPatchPoint(MI, BB);
38286
38287 case TargetOpcode::STACKMAP:
38288 case TargetOpcode::PATCHPOINT:
38289 return emitPatchPoint(MI, BB);
38290
38291 case TargetOpcode::PATCHABLE_EVENT_CALL:
38292 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38293 return BB;
38294
38295 case X86::LCMPXCHG8B: {
38296 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38297 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
38298 // requires a memory operand. If the current architecture happens to be
38299 // i686 and the current function needs a base pointer
38300 // - which is ESI for i686 - the register allocator would not be able to
38301 // allocate registers for an address of the form X(%reg, %reg, Y):
38302 // there would never be enough unreserved registers during regalloc
38303 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
38304 // We give the register allocator a hand by precomputing the address in
38305 // a new vreg using LEA.
38306
38307 // If it is not i686 or there is no base pointer - nothing to do here.
38308 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38309 return BB;
38310
38311 // Even though this code does not necessarily need the base pointer to
38312 // be ESI, we check for that. The reason: if this assert fails, something
38313 // has changed in the compiler's base pointer handling, which most
38314 // probably has to be addressed somehow here.
38315 assert(TRI->getBaseRegister() == X86::ESI &&(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
38316 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
38317 "base pointer in mind")(static_cast <bool> (TRI->getBaseRegister() == X86::
ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? void (0) : __assert_fail ("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 38317, __extension__
__PRETTY_FUNCTION__))
;
38318
38319 MachineRegisterInfo &MRI = MF->getRegInfo();
38320 MVT SPTy = getPointerTy(MF->getDataLayout());
38321 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38322 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38323
38324 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38325 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38326 // does not use index register.
38327 if (AM.IndexReg == X86::NoRegister)
38328 return BB;
38329
38330 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38331 // four operand definitions that are E[ABCD] registers. We skip them and
38332 // then insert the LEA.
38333 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38334 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38335 RMBBI->definesRegister(X86::EBX) ||
38336 RMBBI->definesRegister(X86::ECX) ||
38337 RMBBI->definesRegister(X86::EDX))) {
38338 ++RMBBI;
38339 }
38340 MachineBasicBlock::iterator MBBI(RMBBI);
38341 addFullAddress(
38342 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38343
38344 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38345
38346 return BB;
38347 }
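// Editor's note: a hedged sketch (not part of X86ISelLowering.cpp) of the
// address precomputation above. A CMPXCHG8B whose memory operand needs both
// the reserved base pointer and an index register, e.g.
//
//   cmpxchg8b X(%esi,%ecx,4)    // %esi is the reserved base pointer on i686
//
// is rewritten so the address is formed first in a fresh virtual register,
// leaving regalloc only one register to find for the memory operand:
//
//   leal X(%esi,%ecx,4), %vreg
//   cmpxchg8b (%vreg)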
38348 case X86::LCMPXCHG16B_NO_RBX: {
38349 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38350 Register BasePtr = TRI->getBaseRegister();
38351 if (TRI->hasBasePointer(*MF) &&
38352 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38353 if (!BB->isLiveIn(BasePtr))
38354 BB->addLiveIn(BasePtr);
38355 // Save RBX into a virtual register.
38356 Register SaveRBX =
38357 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38358 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38359 .addReg(X86::RBX);
38360 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38361 MachineInstrBuilder MIB =
38362 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38363 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38364 MIB.add(MI.getOperand(Idx));
38365 MIB.add(MI.getOperand(X86::AddrNumOperands));
38366 MIB.addReg(SaveRBX);
38367 } else {
38368 // Simple case, just copy the virtual register to RBX.
38369 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38370 .add(MI.getOperand(X86::AddrNumOperands));
38371 MachineInstrBuilder MIB =
38372 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38373 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38374 MIB.add(MI.getOperand(Idx));
38375 }
38376 MI.eraseFromParent();
38377 return BB;
38378 }
38379 case X86::MWAITX: {
38380 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38381 Register BasePtr = TRI->getBaseRegister();
38382 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38383 // If there is no need to save the base pointer, we generate MWAITXrrr;
38384 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38385 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38386 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38387 .addReg(MI.getOperand(0).getReg());
38388 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38389 .addReg(MI.getOperand(1).getReg());
38390 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38391 .addReg(MI.getOperand(2).getReg());
38392 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38393 MI.eraseFromParent();
38394 } else {
38395 if (!BB->isLiveIn(BasePtr)) {
38396 BB->addLiveIn(BasePtr);
38397 }
38398 // Parameters can be copied into ECX and EAX but not EBX yet.
38399 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38400 .addReg(MI.getOperand(0).getReg());
38401 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38402 .addReg(MI.getOperand(1).getReg());
38403 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38404 // Save RBX into a virtual register.
38405 Register SaveRBX =
38406 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38407 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38408 .addReg(X86::RBX);
38409 // Generate mwaitx pseudo.
38410 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38411 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38412 .addDef(Dst) // Destination tied in with SaveRBX.
38413 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38414 .addUse(SaveRBX); // Save of base pointer.
38415 MI.eraseFromParent();
38416 }
38417 return BB;
38418 }
38419 case TargetOpcode::PREALLOCATED_SETUP: {
38420 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38421 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38422 MFI->setHasPreallocatedCall(true);
38423 int64_t PreallocatedId = MI.getOperand(0).getImm();
38424 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38425 assert(StackAdjustment != 0 && "0 stack adjustment");
38426 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38427 << StackAdjustment << "\n");
38428 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38429 .addReg(X86::ESP)
38430 .addImm(StackAdjustment);
38431 MI.eraseFromParent();
38432 return BB;
38433 }
38434 case TargetOpcode::PREALLOCATED_ARG: {
38435 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38436 int64_t PreallocatedId = MI.getOperand(1).getImm();
38437 int64_t ArgIdx = MI.getOperand(2).getImm();
38438 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38439 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38440 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38441 << ", arg offset " << ArgOffset << "\n");
38442 // stack pointer + offset
38443 addRegOffset(
38444 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38445 X86::ESP, false, ArgOffset);
38446 MI.eraseFromParent();
38447 return BB;
38448 }
38449 case X86::PTDPBSSD:
38450 case X86::PTDPBSUD:
38451 case X86::PTDPBUSD:
38452 case X86::PTDPBUUD:
38453 case X86::PTDPBF16PS:
38454 case X86::PTDPFP16PS: {
38455 unsigned Opc;
38456 switch (MI.getOpcode()) {
38457 default: llvm_unreachable("illegal opcode!");
38458 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38459 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38460 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38461 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38462 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38463 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38464 }
38465
38466 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38467 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38468 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38469 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38470 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38471
38472 MI.eraseFromParent(); // The pseudo is gone now.
38473 return BB;
38474 }
38475 case X86::PTILEZERO: {
38476 unsigned Imm = MI.getOperand(0).getImm();
38477 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38478 MI.eraseFromParent(); // The pseudo is gone now.
38479 return BB;
38480 }
38481 case X86::PTILELOADD:
38482 case X86::PTILELOADDT1:
38483 case X86::PTILESTORED: {
38484 unsigned Opc;
38485 switch (MI.getOpcode()) {
38486 default: llvm_unreachable("illegal opcode!");
38487 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38488 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38489 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38490 }
38491
38492 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38493 unsigned CurOp = 0;
38494 if (Opc != X86::TILESTORED)
38495 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38496 RegState::Define);
38497
38498 MIB.add(MI.getOperand(CurOp++)); // base
38499 MIB.add(MI.getOperand(CurOp++)); // scale
38500 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38501 MIB.add(MI.getOperand(CurOp++)); // displacement
38502 MIB.add(MI.getOperand(CurOp++)); // segment
38503
38504 if (Opc == X86::TILESTORED)
38505 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38506 RegState::Undef);
38507
38508 MI.eraseFromParent(); // The pseudo is gone now.
38509 return BB;
38510 }
38511 case X86::PTCMMIMFP16PS:
38512 case X86::PTCMMRLFP16PS: {
38513 const DebugLoc &DL = MI.getDebugLoc();
38514 unsigned Opc;
38515 switch (MI.getOpcode()) {
38516 default: llvm_unreachable("Unexpected instruction!");
38517 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38518 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38519 }
38520 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38521 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38522 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38523 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38524 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38525 MI.eraseFromParent(); // The pseudo is gone now.
38526 return BB;
38527 }
38528 }
38529}
38530
38531//===----------------------------------------------------------------------===//
38532// X86 Optimization Hooks
38533//===----------------------------------------------------------------------===//
38534
38535bool
38536X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38537 const APInt &DemandedBits,
38538 const APInt &DemandedElts,
38539 TargetLoweringOpt &TLO) const {
38540 EVT VT = Op.getValueType();
38541 unsigned Opcode = Op.getOpcode();
38542 unsigned EltSize = VT.getScalarSizeInBits();
38543
38544 if (VT.isVector()) {
38545 // If the constant is only all signbits in the active bits, then we should
38546 // extend it to the entire constant to allow it to act as a boolean constant
38547 // vector.
38548 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38549 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38550 return false;
38551 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38552 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38553 continue;
38554 const APInt &Val = V.getConstantOperandAPInt(i);
38555 if (Val.getBitWidth() > Val.getNumSignBits() &&
38556 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38557 return true;
38558 }
38559 return false;
38560 };
38561 // For vectors - if we have a constant, then try to sign extend.
38562 // TODO: Handle AND/ANDN cases.
38563 unsigned ActiveBits = DemandedBits.getActiveBits();
38564 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38565 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38566 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38567 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38568 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38569 VT.getVectorNumElements());
38570 SDValue NewC =
38571 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38572 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38573 SDValue NewOp =
38574 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38575 return TLO.CombineTo(Op, NewOp);
38576 }
38577 return false;
38578 }
38579
38580 // Only optimize Ands to prevent shrinking a constant that could be
38581 // matched by movzx.
38582 if (Opcode != ISD::AND)
38583 return false;
38584
38585 // Make sure the RHS really is a constant.
38586 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38587 if (!C)
38588 return false;
38589
38590 const APInt &Mask = C->getAPIntValue();
38591
38592 // Clear all non-demanded bits initially.
38593 APInt ShrunkMask = Mask & DemandedBits;
38594
38595 // Find the width of the shrunk mask.
38596 unsigned Width = ShrunkMask.getActiveBits();
38597
38598 // If the mask is all 0s there's nothing to do here.
38599 if (Width == 0)
38600 return false;
38601
38602 // Find the next power of 2 width, rounding up to a byte.
38603 Width = llvm::bit_ceil(std::max(Width, 8U));
38604 // Truncate the width to size to handle illegal types.
38605 Width = std::min(Width, EltSize);
38606
38607 // Calculate a possible zero extend mask for this constant.
38608 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38609
38610 // If we aren't changing the mask, just return true to keep it and prevent
38611 // the caller from optimizing.
38612 if (ZeroExtendMask == Mask)
38613 return true;
38614
38615 // Make sure the new mask can be represented by a combination of mask bits
38616 // and non-demanded bits.
38617 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38618 return false;
38619
38620 // Replace the constant with the zero extend mask.
38621 SDLoc DL(Op);
38622 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38623 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38624 return TLO.CombineTo(Op, NewOp);
38625}
38626
38627void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38628 KnownBits &Known,
38629 const APInt &DemandedElts,
38630 const SelectionDAG &DAG,
38631 unsigned Depth) const {
38632 unsigned BitWidth = Known.getBitWidth();
38633 unsigned NumElts = DemandedElts.getBitWidth();
38634 unsigned Opc = Op.getOpcode();
38635 EVT VT = Op.getValueType();
38636 assert((Opc >= ISD::BUILTIN_OP_END ||
38637 Opc == ISD::INTRINSIC_WO_CHAIN ||
38638 Opc == ISD::INTRINSIC_W_CHAIN ||
38639 Opc == ISD::INTRINSIC_VOID) &&
38640 "Should use MaskedValueIsZero if you don't know whether Op"
38641 " is a target node!");
38642
38643 Known.resetAll();
38644 switch (Opc) {
38645 default: break;
38646 case X86ISD::MUL_IMM: {
38647 KnownBits Known2;
38648 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38649 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38650 Known = KnownBits::mul(Known, Known2);
38651 break;
38652 }
38653 case X86ISD::SETCC:
38654 Known.Zero.setBitsFrom(1);
38655 break;
38656 case X86ISD::MOVMSK: {
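// MOVMSK packs one sign bit per source element into the low bits of the
// scalar result, e.g. a v4f32 source sets at most bits [3:0].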
38657 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38658 Known.Zero.setBitsFrom(NumLoBits);
38659 break;
38660 }
38661 case X86ISD::PEXTRB:
38662 case X86ISD::PEXTRW: {
38663 SDValue Src = Op.getOperand(0);
38664 EVT SrcVT = Src.getValueType();
38665 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38666 Op.getConstantOperandVal(1));
38667 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38668 Known = Known.anyextOrTrunc(BitWidth);
38669 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38670 break;
38671 }
38672 case X86ISD::VSRAI:
38673 case X86ISD::VSHLI:
38674 case X86ISD::VSRLI: {
38675 unsigned ShAmt = Op.getConstantOperandVal(1);
38676 if (ShAmt >= VT.getScalarSizeInBits()) {
38677 // Out of range logical bit shifts are guaranteed to be zero.
38678 // Out of range arithmetic bit shifts splat the sign bit.
38679 if (Opc != X86ISD::VSRAI) {
38680 Known.setAllZero();
38681 break;
38682 }
38683
38684 ShAmt = VT.getScalarSizeInBits() - 1;
38685 }
38686
38687 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38688 if (Opc == X86ISD::VSHLI) {
38689 Known.Zero <<= ShAmt;
38690 Known.One <<= ShAmt;
38691 // Low bits are known zero.
38692 Known.Zero.setLowBits(ShAmt);
38693 } else if (Opc == X86ISD::VSRLI) {
38694 Known.Zero.lshrInPlace(ShAmt);
38695 Known.One.lshrInPlace(ShAmt);
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = KnownBits::commonBits(Known, Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = KnownBits::commonBits(Known, Known2);
38720 }
38721
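// PACKUS saturates rather than truncates when the upper half is nonzero, so
// the result is only usable if the top BitWidth bits are known zero.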
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::VBROADCAST: {
38728 SDValue Src = Op.getOperand(0);
38729 if (!Src.getSimpleValueType().isVector()) {
38730 Known = DAG.computeKnownBits(Src, Depth + 1);
38731 return;
38732 }
38733 break;
38734 }
38735 case X86ISD::AND: {
38736 if (Op.getResNo() == 0) {
38737 KnownBits Known2;
38738 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38739 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38740 Known &= Known2;
38741 }
38742 break;
38743 }
38744 case X86ISD::ANDNP: {
38745 KnownBits Known2;
38746 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38747 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38748
38749 // ANDNP = (~X & Y);
38750 Known.One &= Known2.Zero;
38751 Known.Zero |= Known2.One;
38752 break;
38753 }
38754 case X86ISD::FOR: {
38755 KnownBits Known2;
38756 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38757 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38758
38759 Known |= Known2;
38760 break;
38761 }
38762 case X86ISD::PSADBW: {
38763 assert(VT.getScalarType() == MVT::i64 &&
38764 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38765 "Unexpected PSADBW types");
38766
38767 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
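// Each i64 lane is the sum of 8 absolute byte differences, so its maximum
// value is 8 * 255 = 2040, which fits comfortably in the low 16 bits.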
38768 Known.Zero.setBitsFrom(16);
38769 break;
38770 }
38771 case X86ISD::PCMPGT:
38772 case X86ISD::PCMPEQ: {
38773 KnownBits KnownLhs =
38774 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38775 KnownBits KnownRhs =
38776 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38777 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38778 ? KnownBits::eq(KnownLhs, KnownRhs)
38779 : KnownBits::sgt(KnownLhs, KnownRhs);
38780 if (Res) {
38781 if (*Res)
38782 Known.setAllOnes();
38783 else
38784 Known.setAllZero();
38785 }
38786 break;
38787 }
38788 case X86ISD::PMULUDQ: {
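// PMULUDQ multiplies the low 32 bits of each 64-bit lane as unsigned values,
// so model both operands as 32-bit values zero-extended back to 64 bits.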
38789 KnownBits Known2;
38790 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38792
38793 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38794 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38795 Known = KnownBits::mul(Known, Known2);
38796 break;
38797 }
38798 case X86ISD::CMOV: {
38799 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38800 // If we don't know any bits, early out.
38801 if (Known.isUnknown())
38802 break;
38803 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38804
38805 // Only known if known in both the LHS and RHS.
38806 Known = KnownBits::commonBits(Known, Known2);
38807 break;
38808 }
38809 case X86ISD::BEXTR:
38810 case X86ISD::BEXTRI: {
38811 SDValue Op0 = Op.getOperand(0);
38812 SDValue Op1 = Op.getOperand(1);
38813
38814 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38815 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38816 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
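// e.g. a control value of 0x0804 encodes Shift = 4 and Length = 8, so the
// result is bits [11:4] of the source, zero-extended to the full width.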
38817
38818 // If the length is 0, the result is 0.
38819 if (Length == 0) {
38820 Known.setAllZero();
38821 break;
38822 }
38823
38824 if ((Shift + Length) <= BitWidth) {
38825 Known = DAG.computeKnownBits(Op0, Depth + 1);
38826 Known = Known.extractBits(Length, Shift);
38827 Known = Known.zextOrTrunc(BitWidth);
38828 }
38829 }
38830 break;
38831 }
38832 case X86ISD::PDEP: {
38833 KnownBits Known2;
38834 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38835 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38836 // Zeros are retained from the mask operand. But not ones.
38837 Known.One.clearAllBits();
38838 // The result will have at least as many trailing zeros as the non-mask
38839 // operand since bits can only map to the same or higher bit position.
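// e.g. if the source ends in two known-zero bits, the two lowest set bits of
// the mask can only receive zeros, so the result also has at least two
// trailing zeros.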
38840 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38841 break;
38842 }
38843 case X86ISD::PEXT: {
38844 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38845 // The result has at least as many leading zeros as the mask has zero bits.
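// e.g. for a 32-bit PEXT whose mask has 27 known-zero bits, at most 5 bits
// can be gathered into the low end, so the upper 27 result bits are zero.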
38846 unsigned Count = Known.Zero.popcount();
38847 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38848 Known.One.clearAllBits();
38849 break;
38850 }
38851 case X86ISD::VTRUNC:
38852 case X86ISD::VTRUNCS:
38853 case X86ISD::VTRUNCUS:
38854 case X86ISD::CVTSI2P:
38855 case X86ISD::CVTUI2P:
38856 case X86ISD::CVTP2SI:
38857 case X86ISD::CVTP2UI:
38858 case X86ISD::MCVTP2SI:
38859 case X86ISD::MCVTP2UI:
38860 case X86ISD::CVTTP2SI:
38861 case X86ISD::CVTTP2UI:
38862 case X86ISD::MCVTTP2SI:
38863 case X86ISD::MCVTTP2UI:
38864 case X86ISD::MCVTSI2P:
38865 case X86ISD::MCVTUI2P:
38866 case X86ISD::VFPROUND:
38867 case X86ISD::VMFPROUND:
38868 case X86ISD::CVTPS2PH:
38869 case X86ISD::MCVTPS2PH: {
38870 // Truncations/Conversions - upper elements are known zero.
38871 EVT SrcVT = Op.getOperand(0).getValueType();
38872 if (SrcVT.isVector()) {
38873 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38874 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38875 Known.setAllZero();
38876 }
38877 break;
38878 }
38879 case X86ISD::STRICT_CVTTP2SI:
38880 case X86ISD::STRICT_CVTTP2UI:
38881 case X86ISD::STRICT_CVTSI2P:
38882 case X86ISD::STRICT_CVTUI2P:
38883 case X86ISD::STRICT_VFPROUND:
38884 case X86ISD::STRICT_CVTPS2PH: {
38885 // Strict Conversions - upper elements are known zero.
38886 EVT SrcVT = Op.getOperand(1).getValueType();
38887 if (SrcVT.isVector()) {
38888 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38889 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38890 Known.setAllZero();
38891 }
38892 break;
38893 }
38894 case X86ISD::MOVQ2DQ: {
38895 // Move from MMX to XMM. Upper half of XMM should be 0.
38896 if (DemandedElts.countr_zero() >= (NumElts / 2))
38897 Known.setAllZero();
38898 break;
38899 }
38900 case X86ISD::VBROADCAST_LOAD: {
38901 APInt UndefElts;
38902 SmallVector<APInt, 16> EltBits;
38903 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38904 /*AllowWholeUndefs*/ false,
38905 /*AllowPartialUndefs*/ false)) {
38906 Known.Zero.setAllBits();
38907 Known.One.setAllBits();
38908 for (unsigned I = 0; I != NumElts; ++I) {
38909 if (!DemandedElts[I])
38910 continue;
38911 if (UndefElts[I]) {
38912 Known.resetAll();
38913 break;
38914 }
38915 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38916 Known = KnownBits::commonBits(Known, Known2);
38917 }
38918 return;
38919 }
38920 break;
38921 }
38922 }
38923
38924 // Handle target shuffles.
38925 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38926 if (isTargetShuffle(Opc)) {
38927 SmallVector<int, 64> Mask;
38928 SmallVector<SDValue, 2> Ops;
38929 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38930 unsigned NumOps = Ops.size();
38931 unsigned NumElts = VT.getVectorNumElements();
38932 if (Mask.size() == NumElts) {
38933 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38934 Known.Zero.setAllBits(); Known.One.setAllBits();
38935 for (unsigned i = 0; i != NumElts; ++i) {
38936 if (!DemandedElts[i])
38937 continue;
38938 int M = Mask[i];
38939 if (M == SM_SentinelUndef) {
38940 // For UNDEF elements, we don't know anything about the common state
38941 // of the shuffle result.
38942 Known.resetAll();
38943 break;
38944 }
38945 if (M == SM_SentinelZero) {
38946 Known.One.clearAllBits();
38947 continue;
38948 }
38949 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38950 "Shuffle index out of range");
38951
38952 unsigned OpIdx = (unsigned)M / NumElts;
38953 unsigned EltIdx = (unsigned)M % NumElts;
38954 if (Ops[OpIdx].getValueType() != VT) {
38955 // TODO - handle target shuffle ops with different value types.
38956 Known.resetAll();
38957 break;
38958 }
38959 DemandedOps[OpIdx].setBit(EltIdx);
38960 }
38961 // Known bits are the values that are shared by every demanded element.
38962 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38963 if (!DemandedOps[i])
38964 continue;
38965 KnownBits Known2 =
38966 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38967 Known = KnownBits::commonBits(Known, Known2);
38968 }
38969 }
38970 }
38971 }
38972}
38973
38974unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38975 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38976 unsigned Depth) const {
38977 EVT VT = Op.getValueType();
38978 unsigned VTBits = VT.getScalarSizeInBits();
38979 unsigned Opcode = Op.getOpcode();
38980 switch (Opcode) {
38981 case X86ISD::SETCC_CARRY:
38982 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38983 return VTBits;
38984
38985 case X86ISD::VTRUNC: {
38986 SDValue Src = Op.getOperand(0);
38987 MVT SrcVT = Src.getSimpleValueType();
38988 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38989 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38990 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38991 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38992 if (Tmp > (NumSrcBits - VTBits))
38993 return Tmp - (NumSrcBits - VTBits);
38994 return 1;
38995 }
38996
38997 case X86ISD::PACKSS: {
38998 // PACKSS is just a truncation if the sign bits extend to the packed size.
38999 APInt DemandedLHS, DemandedRHS;
39000 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39001 DemandedRHS);
39002
39003 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39004 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39005 if (!!DemandedLHS)
39006 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
39007 if (!!DemandedRHS)
39008 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
39009 unsigned Tmp = std::min(Tmp0, Tmp1);
39010 if (Tmp > (SrcBits - VTBits))
39011 return Tmp - (SrcBits - VTBits);
39012 return 1;
39013 }
39014
39015 case X86ISD::VBROADCAST: {
39016 SDValue Src = Op.getOperand(0);
39017 if (!Src.getSimpleValueType().isVector())
39018 return DAG.ComputeNumSignBits(Src, Depth + 1);
39019 break;
39020 }
39021
39022 case X86ISD::VSHLI: {
39023 SDValue Src = Op.getOperand(0);
39024 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39025 if (ShiftVal.uge(VTBits))
39026 return VTBits; // Shifted all bits out --> zero.
39027 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39028 if (ShiftVal.uge(Tmp))
39029 return 1; // Shifted all sign bits out --> unknown.
39030 return Tmp - ShiftVal.getZExtValue();
39031 }
39032
39033 case X86ISD::VSRAI: {
39034 SDValue Src = Op.getOperand(0);
39035 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39036 if (ShiftVal.uge(VTBits - 1))
39037 return VTBits; // Sign splat.
39038 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39039 ShiftVal += Tmp;
39040 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39041 }
39042
39043 case X86ISD::FSETCC:
39044 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39045 if (VT == MVT::f32 || VT == MVT::f64 ||
39046 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39047 return VTBits;
39048 break;
39049
39050 case X86ISD::PCMPGT:
39051 case X86ISD::PCMPEQ:
39052 case X86ISD::CMPP:
39053 case X86ISD::VPCOM:
39054 case X86ISD::VPCOMU:
39055 // Vector compares return zero/all-bits result values.
39056 return VTBits;
39057
39058 case X86ISD::ANDNP: {
39059 unsigned Tmp0 =
39060 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39061 if (Tmp0 == 1) return 1; // Early out.
39062 unsigned Tmp1 =
39063 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39064 return std::min(Tmp0, Tmp1);
39065 }
39066
39067 case X86ISD::CMOV: {
39068 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39069 if (Tmp0 == 1) return 1; // Early out.
39070 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39071 return std::min(Tmp0, Tmp1);
39072 }
39073 }
39074
39075 // Handle target shuffles.
39076 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39077 if (isTargetShuffle(Opcode)) {
39078 SmallVector<int, 64> Mask;
39079 SmallVector<SDValue, 2> Ops;
39080 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
39081 unsigned NumOps = Ops.size();
39082 unsigned NumElts = VT.getVectorNumElements();
39083 if (Mask.size() == NumElts) {
39084 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39085 for (unsigned i = 0; i != NumElts; ++i) {
39086 if (!DemandedElts[i])
39087 continue;
39088 int M = Mask[i];
39089 if (M == SM_SentinelUndef) {
39090 // For UNDEF elements, we don't know anything about the common state
39091 // of the shuffle result.
39092 return 1;
39093 } else if (M == SM_SentinelZero) {
39094 // Zero = all sign bits.
39095 continue;
39096 }
39097 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39098 "Shuffle index out of range");
39099
39100 unsigned OpIdx = (unsigned)M / NumElts;
39101 unsigned EltIdx = (unsigned)M % NumElts;
39102 if (Ops[OpIdx].getValueType() != VT) {
39103 // TODO - handle target shuffle ops with different value types.
39104 return 1;
39105 }
39106 DemandedOps[OpIdx].setBit(EltIdx);
39107 }
39108 unsigned Tmp0 = VTBits;
39109 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39110 if (!DemandedOps[i])
39111 continue;
39112 unsigned Tmp1 =
39113 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39114 Tmp0 = std::min(Tmp0, Tmp1);
39115 }
39116 return Tmp0;
39117 }
39118 }
39119 }
39120
39121 // Fallback case.
39122 return 1;
39123}
39124
39125SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39126 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39127 return N->getOperand(0);
39128 return N;
39129}
39130
39131// Helper to look for a normal load that can be narrowed into a vzload with the
39132// specified VT and memory VT. Returns SDValue() on failure.
39133static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39134 SelectionDAG &DAG) {
39135 // Can't if the load is volatile or atomic.
39136 if (!LN->isSimple())
39137 return SDValue();
39138
39139 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39140 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39141 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39142 LN->getPointerInfo(), LN->getOriginalAlign(),
39143 LN->getMemOperand()->getFlags());
39144}
39145
39146// Attempt to match a combined shuffle mask against supported unary shuffle
39147// instructions.
39148// TODO: Investigate sharing more of this with shuffle lowering.
39149static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39150 bool AllowFloatDomain, bool AllowIntDomain,
39151 SDValue V1, const SelectionDAG &DAG,
39152 const X86Subtarget &Subtarget, unsigned &Shuffle,
39153 MVT &SrcVT, MVT &DstVT) {
39154 unsigned NumMaskElts = Mask.size();
39155 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39156
39157 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39158 if (Mask[0] == 0 &&
39159 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39160 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39161 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39162 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39163 Shuffle = X86ISD::VZEXT_MOVL;
39164 if (MaskEltSize == 16)
39165 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39166 else
39167 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39168 return true;
39169 }
39170 }
39171
39172 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
39173 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
39174 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39175 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
39176 unsigned MaxScale = 64 / MaskEltSize;
39177 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39178 bool MatchAny = true;
39179 bool MatchZero = true;
39180 unsigned NumDstElts = NumMaskElts / Scale;
39181 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
39182 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39183 MatchAny = MatchZero = false;
39184 break;
39185 }
39186 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
39187 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
39188 }
39189 if (MatchAny || MatchZero) {
39190 assert(MatchZero && "Failed to match zext but matched aext?");
39191 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39192 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
39193 MVT::getIntegerVT(MaskEltSize);
39194 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39195
39196 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
39197 if (SrcVT.getVectorNumElements() != NumDstElts)
39198 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39199
39200 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39201 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39202 return true;
39203 }
39204 }
39205 }
39206
39207 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39208 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39209 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39210 isUndefOrEqual(Mask[0], 0) &&
39211 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39212 Shuffle = X86ISD::VZEXT_MOVL;
39213 if (MaskEltSize == 16)
39214 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39215 else
39216 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39217 return true;
39218 }
39219
39220 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39221 // instructions are no slower than UNPCKLPD but have the option to
39222 // fold the input operand into even an unaligned memory load.
39223 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39224 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39225 Shuffle = X86ISD::MOVDDUP;
39226 SrcVT = DstVT = MVT::v2f64;
39227 return true;
39228 }
39229 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39230 Shuffle = X86ISD::MOVSLDUP;
39231 SrcVT = DstVT = MVT::v4f32;
39232 return true;
39233 }
39234 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39235 Shuffle = X86ISD::MOVSHDUP;
39236 SrcVT = DstVT = MVT::v4f32;
39237 return true;
39238 }
39239 }
39240
39241 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39242 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39243 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39244 Shuffle = X86ISD::MOVDDUP;
39245 SrcVT = DstVT = MVT::v4f64;
39246 return true;
39247 }
39248 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39249 V1)) {
39250 Shuffle = X86ISD::MOVSLDUP;
39251 SrcVT = DstVT = MVT::v8f32;
39252 return true;
39253 }
39254 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39255 V1)) {
39256 Shuffle = X86ISD::MOVSHDUP;
39257 SrcVT = DstVT = MVT::v8f32;
39258 return true;
39259 }
39260 }
39261
39262 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39263 assert(Subtarget.hasAVX512() &&
39264 "AVX512 required for 512-bit vector shuffles");
39265 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39266 V1)) {
39267 Shuffle = X86ISD::MOVDDUP;
39268 SrcVT = DstVT = MVT::v8f64;
39269 return true;
39270 }
39271 if (isTargetShuffleEquivalent(
39272 MaskVT, Mask,
39273 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39274 Shuffle = X86ISD::MOVSLDUP;
39275 SrcVT = DstVT = MVT::v16f32;
39276 return true;
39277 }
39278 if (isTargetShuffleEquivalent(
39279 MaskVT, Mask,
39280 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39281 Shuffle = X86ISD::MOVSHDUP;
39282 SrcVT = DstVT = MVT::v16f32;
39283 return true;
39284 }
39285 }
39286
39287 return false;
39288}
39289
39290// Attempt to match a combined shuffle mask against supported unary immediate
39291// permute instructions.
39292// TODO: Investigate sharing more of this with shuffle lowering.
39293static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39294 const APInt &Zeroable,
39295 bool AllowFloatDomain, bool AllowIntDomain,
39296 const SelectionDAG &DAG,
39297 const X86Subtarget &Subtarget,
39298 unsigned &Shuffle, MVT &ShuffleVT,
39299 unsigned &PermuteImm) {
39300 unsigned NumMaskElts = Mask.size();
39301 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39302 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39303 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39304 bool ContainsZeros = isAnyZero(Mask);
39305
39306 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39307 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39308 // Check for lane crossing permutes.
39309 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39310 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39311 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39312 Shuffle = X86ISD::VPERMI;
39313 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39314 PermuteImm = getV4X86ShuffleImm(Mask);
39315 return true;
39316 }
39317 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39318 SmallVector<int, 4> RepeatedMask;
39319 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39320 Shuffle = X86ISD::VPERMI;
39321 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39322 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39323 return true;
39324 }
39325 }
39326 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39327 // VPERMILPD can permute with a non-repeating shuffle.
39328 Shuffle = X86ISD::VPERMILPI;
39329 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39330 PermuteImm = 0;
39331 for (int i = 0, e = Mask.size(); i != e; ++i) {
39332 int M = Mask[i];
39333 if (M == SM_SentinelUndef)
39334 continue;
39335 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39336 PermuteImm |= (M & 1) << i;
39337 }
39338 return true;
39339 }
39340 }
39341
39342 // We are checking for shuffle match or shift match. Loop twice so we can
39343 // choose which we try to match first depending on target preference.
39344 for (unsigned Order = 0; Order < 2; ++Order) {
39345 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39346 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39347 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39348 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39349 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39350 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39351 SmallVector<int, 4> RepeatedMask;
39352 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39353 // Narrow the repeated mask to create 32-bit element permutes.
39354 SmallVector<int, 4> WordMask = RepeatedMask;
39355 if (MaskScalarSizeInBits == 64)
39356 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39357
39358 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39359 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39360 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39361 PermuteImm = getV4X86ShuffleImm(WordMask);
39362 return true;
39363 }
39364 }
39365
39366 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39367 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39368 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39369 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39370 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39371 SmallVector<int, 4> RepeatedMask;
39372 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39373 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39374 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39375
39376 // PSHUFLW: permute lower 4 elements only.
39377 if (isUndefOrInRange(LoMask, 0, 4) &&
39378 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39379 Shuffle = X86ISD::PSHUFLW;
39380 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39381 PermuteImm = getV4X86ShuffleImm(LoMask);
39382 return true;
39383 }
39384
39385 // PSHUFHW: permute upper 4 elements only.
39386 if (isUndefOrInRange(HiMask, 4, 8) &&
39387 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39388 // Offset the HiMask so that we can create the shuffle immediate.
39389 int OffsetHiMask[4];
39390 for (int i = 0; i != 4; ++i)
39391 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39392
39393 Shuffle = X86ISD::PSHUFHW;
39394 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39395 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39396 return true;
39397 }
39398 }
39399 }
39400 } else {
39401 // Attempt to match against bit rotates.
39402 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39403 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39404 Subtarget.hasAVX512())) {
39405 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39406 Subtarget, Mask);
39407 if (0 < RotateAmt) {
39408 Shuffle = X86ISD::VROTLI;
39409 PermuteImm = (unsigned)RotateAmt;
39410 return true;
39411 }
39412 }
39413 }
39414 // Attempt to match against byte/bit shifts.
39415 if (AllowIntDomain &&
39416 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39417 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39418 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39419 int ShiftAmt =
39420 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39421 Zeroable, Subtarget);
39422 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39423 32 <= ShuffleVT.getScalarSizeInBits())) {
39424 // Byte shifts can be slower so only match them on second attempt.
39425 if (Order == 0 &&
39426 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39427 continue;
39428
39429 PermuteImm = (unsigned)ShiftAmt;
39430 return true;
39431 }
39432
39433 }
39434 }
39435
39436 return false;
39437}
39438
39439// Attempt to match a combined unary shuffle mask against supported binary
39440// shuffle instructions.
39441// TODO: Investigate sharing more of this with shuffle lowering.
39442static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39443 bool AllowFloatDomain, bool AllowIntDomain,
39444 SDValue &V1, SDValue &V2, const SDLoc &DL,
39445 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39446 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39447 bool IsUnary) {
39448 unsigned NumMaskElts = Mask.size();
39449 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39450 unsigned SizeInBits = MaskVT.getSizeInBits();
39451
39452 if (MaskVT.is128BitVector()) {
39453 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39454 AllowFloatDomain) {
39455 V2 = V1;
39456 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39457 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39458 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39459 return true;
39460 }
39461 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39462 AllowFloatDomain) {
39463 V2 = V1;
39464 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39465 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39466 return true;
39467 }
39468 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39469 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39470 std::swap(V1, V2);
39471 Shuffle = X86ISD::MOVSD;
39472 SrcVT = DstVT = MVT::v2f64;
39473 return true;
39474 }
39475 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39476 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39477 Shuffle = X86ISD::MOVSS;
39478 SrcVT = DstVT = MVT::v4f32;
39479 return true;
39480 }
39481 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39482 DAG) &&
39483 Subtarget.hasFP16()) {
39484 Shuffle = X86ISD::MOVSH;
39485 SrcVT = DstVT = MVT::v8f16;
39486 return true;
39487 }
39488 }
39489
39490 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39491 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39492 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39493 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39494 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39495 Subtarget)) {
39496 DstVT = MaskVT;
39497 return true;
39498 }
39499 }
39500
39501 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39502 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39503 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39504 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39505 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39506 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39507 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39508 Subtarget)) {
39509 SrcVT = DstVT = MaskVT;
39510 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39511 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39512 return true;
39513 }
39514 }
39515
39516 // Attempt to match against an OR if we're performing a blend shuffle and the
39517 // non-blended source element is zero in each case.
39518 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
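// e.g. a blend taking lanes {0, 2} from V1 and lanes {1, 3} from V2 is just
// (V1 | V2) when each operand is known zero in the lanes it does not supply.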
39519 if (SizeInBits == V1.getValueSizeInBits() &&
39520 SizeInBits == V2.getValueSizeInBits() &&
39521 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39522 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39523 bool IsBlend = true;
39524 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39525 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39526 unsigned Scale1 = NumV1Elts / NumMaskElts;
39527 unsigned Scale2 = NumV2Elts / NumMaskElts;
39528 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39529 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39530 for (unsigned i = 0; i != NumMaskElts; ++i) {
39531 int M = Mask[i];
39532 if (M == SM_SentinelUndef)
39533 continue;
39534 if (M == SM_SentinelZero) {
39535 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39536 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39537 continue;
39538 }
39539 if (M == (int)i) {
39540 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39541 continue;
39542 }
39543 if (M == (int)(i + NumMaskElts)) {
39544 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39545 continue;
39546 }
39547 IsBlend = false;
39548 break;
39549 }
39550 if (IsBlend) {
39551 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39552 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39553 Shuffle = ISD::OR;
39554 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39555 return true;
39556 }
39557 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39558 // FIXME: handle mismatched sizes?
39559 // TODO: investigate if `ISD::OR` handling in
39560 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39561 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39562 unsigned NumElts = V.getValueType().getVectorNumElements();
39563 KnownBits Known(NumElts);
39564 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39565 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39566 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39567 if (PeepholeKnown.isZero())
39568 Known.Zero.setBit(EltIdx);
39569 if (PeepholeKnown.isAllOnes())
39570 Known.One.setBit(EltIdx);
39571 }
39572 return Known;
39573 };
39574
39575 KnownBits V1Known = computeKnownBitsElementWise(V1);
39576 KnownBits V2Known = computeKnownBitsElementWise(V2);
39577
39578 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39579 int M = Mask[i];
39580 if (M == SM_SentinelUndef)
39581 continue;
39582 if (M == SM_SentinelZero) {
39583 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39584 continue;
39585 }
39586 if (M == (int)i) {
39587 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39588 continue;
39589 }
39590 if (M == (int)(i + NumMaskElts)) {
39591 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39592 continue;
39593 }
39594 llvm_unreachable("will not get here.");
39595 }
39596 if (IsBlend) {
39597 Shuffle = ISD::OR;
39598 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39599 return true;
39600 }
39601 }
39602 }
39603 }
39604
39605 return false;
39606}
39607
39608static bool matchBinaryPermuteShuffle(
39609 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39610 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39611 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39612 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39613 unsigned NumMaskElts = Mask.size();
39614 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39615
39616 // Attempt to match against VALIGND/VALIGNQ rotate.
39617 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39618 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39619 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39620 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39621 if (!isAnyZero(Mask)) {
39622 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39623 if (0 < Rotation) {
39624 Shuffle = X86ISD::VALIGN;
39625 if (EltSizeInBits == 64)
39626 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39627 else
39628 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39629 PermuteImm = Rotation;
39630 return true;
39631 }
39632 }
39633 }
39634
39635 // Attempt to match against PALIGNR byte rotate.
39636 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39637 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39638 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39639 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39640 if (0 < ByteRotation) {
39641 Shuffle = X86ISD::PALIGNR;
39642 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39643 PermuteImm = ByteRotation;
39644 return true;
39645 }
39646 }
39647
39648 // Attempt to combine to X86ISD::BLENDI.
39649 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39650 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39651 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39652 uint64_t BlendMask = 0;
39653 bool ForceV1Zero = false, ForceV2Zero = false;
39654 SmallVector<int, 8> TargetMask(Mask);
39655 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39656 ForceV2Zero, BlendMask)) {
39657 if (MaskVT == MVT::v16i16) {
39658 // We can only use v16i16 PBLENDW if the lanes are repeated.
39659 SmallVector<int, 8> RepeatedMask;
39660 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39661 RepeatedMask)) {
39662 assert(RepeatedMask.size() == 8 &&
39663 "Repeated mask size doesn't match!");
39664 PermuteImm = 0;
39665 for (int i = 0; i < 8; ++i)
39666 if (RepeatedMask[i] >= 8)
39667 PermuteImm |= 1 << i;
39668 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39669 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39670 Shuffle = X86ISD::BLENDI;
39671 ShuffleVT = MaskVT;
39672 return true;
39673 }
39674 } else {
39675 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39676 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39677 PermuteImm = (unsigned)BlendMask;
39678 Shuffle = X86ISD::BLENDI;
39679 ShuffleVT = MaskVT;
39680 return true;
39681 }
39682 }
39683 }
39684
39685 // Attempt to combine to INSERTPS, but only if it has elements that need to
39686 // be set to zero.
39687 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39688 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39689 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39690 Shuffle = X86ISD::INSERTPS;
39691 ShuffleVT = MVT::v4f32;
39692 return true;
39693 }
39694
39695 // Attempt to combine to SHUFPD.
39696 if (AllowFloatDomain && EltSizeInBits == 64 &&
39697 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39698 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39699 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39700 bool ForceV1Zero = false, ForceV2Zero = false;
39701 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39702 PermuteImm, Mask, Zeroable)) {
39703 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39704 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39705 Shuffle = X86ISD::SHUFP;
39706 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39707 return true;
39708 }
39709 }
39710
39711 // Attempt to combine to SHUFPS.
39712 if (AllowFloatDomain && EltSizeInBits == 32 &&
39713 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39714 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39715 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39716 SmallVector<int, 4> RepeatedMask;
39717 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39718 // Match each half of the repeated mask to determine if it's just
39719 // referencing one of the vectors, is zeroable or entirely undef.
39720 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39721 int M0 = RepeatedMask[Offset];
39722 int M1 = RepeatedMask[Offset + 1];
39723
39724 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39725 return DAG.getUNDEF(MaskVT);
39726 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39727 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39728 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39729 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39730 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39731 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39732 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39733 return V1;
39734 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39735 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39736 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39737 return V2;
39738 }
39739
39740 return SDValue();
39741 };
39742
39743 int ShufMask[4] = {-1, -1, -1, -1};
39744 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39745 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39746
39747 if (Lo && Hi) {
39748 V1 = Lo;
39749 V2 = Hi;
39750 Shuffle = X86ISD::SHUFP;
39751 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39752 PermuteImm = getV4X86ShuffleImm(ShufMask);
39753 return true;
39754 }
39755 }
39756 }
39757
39758 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39759 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39760 MaskVT.is128BitVector() &&
39761 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39762 Shuffle = X86ISD::INSERTPS;
39763 ShuffleVT = MVT::v4f32;
39764 return true;
39765 }
39766
39767 return false;
39768}
39769
39770static SDValue combineX86ShuffleChainWithExtract(
39771 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39772 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39773 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39774 const X86Subtarget &Subtarget);
39775
39776/// Combine an arbitrary chain of shuffles into a single instruction if
39777/// possible.
39778///
39779/// This is the leaf of the recursive combine below. When we have found some
39780/// chain of single-use x86 shuffle instructions and accumulated the combined
39781/// shuffle mask represented by them, this will try to pattern match that mask
39782/// into either a single instruction if there is a special purpose instruction
39783/// for this operation, or into a PSHUFB instruction which is a fully general
39784/// instruction but should only be used to replace chains over a certain depth.
39785static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39786 ArrayRef<int> BaseMask, int Depth,
39787 bool HasVariableMask,
39788 bool AllowVariableCrossLaneMask,
39789 bool AllowVariablePerLaneMask,
39790 SelectionDAG &DAG,
39791 const X86Subtarget &Subtarget) {
39792 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39793 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39794 "Unexpected number of shuffle inputs!");
39795
39796 SDLoc DL(Root);
39797 MVT RootVT = Root.getSimpleValueType();
39798 unsigned RootSizeInBits = RootVT.getSizeInBits();
39799 unsigned NumRootElts = RootVT.getVectorNumElements();
39800
39801 // Canonicalize shuffle input op to the requested type.
39802 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39803 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39804 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39805 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39806 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39807 return DAG.getBitcast(VT, Op);
39808 };
39809
39810 // Find the inputs that enter the chain. Note that multiple uses are OK
39811 // here, we're not going to remove the operands we find.
39812 bool UnaryShuffle = (Inputs.size() == 1);
39813 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39814 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39815 : peekThroughBitcasts(Inputs[1]));
39816
39817 MVT VT1 = V1.getSimpleValueType();
39818 MVT VT2 = V2.getSimpleValueType();
39819 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39820 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39821
39822 SDValue Res;
39823
39824 unsigned NumBaseMaskElts = BaseMask.size();
39825 if (NumBaseMaskElts == 1) {
39826 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39827 return CanonicalizeShuffleInput(RootVT, V1);
39828 }
39829
39830 bool OptForSize = DAG.shouldOptForSize();
39831 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39832 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39833 (RootVT.isFloatingPoint() && Depth >= 1) ||
39834 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39835
39836 // Don't combine if we are an AVX512/EVEX target and the mask element size
39837 // is different from the root element size - this would prevent writemasks
39838 // from being reused.
39839 bool IsMaskedShuffle = false;
39840 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39841 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39842 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39843 IsMaskedShuffle = true;
39844 }
39845 }
39846
39847 // If we are shuffling a splat (and not introducing zeros) then we can just
39848 // use it directly. This also works for smaller elements, as they already
39849 // repeat across each mask element.
39850 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39851 V1.getValueSizeInBits() >= RootSizeInBits &&
39852 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39853 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39854 return CanonicalizeShuffleInput(RootVT, V1);
39855 }
39856
39857 SmallVector<int, 64> Mask(BaseMask);
39858
39859 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39860 // etc. can be simplified.
39861 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39862 SmallVector<int> ScaledMask, IdentityMask;
39863 unsigned NumElts = VT1.getVectorNumElements();
39864 if (Mask.size() <= NumElts &&
39865 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39866 for (unsigned i = 0; i != NumElts; ++i)
39867 IdentityMask.push_back(i);
39868 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39869 V2))
39870 return CanonicalizeShuffleInput(RootVT, V1);
39871 }
39872 }
39873
39874 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39875 if (RootVT.is512BitVector() &&
39876 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39877 // If the upper subvectors are zeroable, then an extract+insert is more
39878 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39879 // to zero the upper subvectors.
39880 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39881 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39882 return SDValue(); // Nothing to do!
39883      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39884             "Unexpected lane shuffle");
39885 Res = CanonicalizeShuffleInput(RootVT, V1);
39886 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39887 bool UseZero = isAnyZero(Mask);
39888 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39889 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39890 }
39891
39892 // Narrow shuffle mask to v4x128.
39893 SmallVector<int, 4> ScaledMask;
39894    assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39895 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39896
39897 // Try to lower to vshuf64x2/vshuf32x4.
39898 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39899 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39900 SelectionDAG &DAG) {
39901 unsigned PermMask = 0;
39902      // Ensure elements came from the same Op.
39903 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39904 for (int i = 0; i < 4; ++i) {
39905        assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39906 if (ScaledMask[i] < 0)
39907 continue;
39908
39909 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39910 unsigned OpIndex = i / 2;
39911 if (Ops[OpIndex].isUndef())
39912 Ops[OpIndex] = Op;
39913 else if (Ops[OpIndex] != Op)
39914 return SDValue();
39915
39916 // Convert the 128-bit shuffle mask selection values into 128-bit
39917 // selection bits defined by a vshuf64x2 instruction's immediate control
39918 // byte.
39919 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39920 }
39921
39922 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39923 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39924 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39925 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39926 };
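    // For example (illustrative values, assuming a 512-bit root): for
    // ScaledMask = {0, 1, 4, 5}, i.e. the two low 128-bit lanes of V1 followed
    // by the two low lanes of V2, the loop above selects Ops = {V1, V2} and
    // computes
    //   PermMask = (0 << 0) | (1 << 2) | (0 << 4) | (1 << 6) = 0x44,
    // which is the control byte vshuf64x2/vshuf32x4 uses for that lane
    // selection.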
39927
39928 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39929 // doesn't work because our mask is for 128 bits and we don't have an MVT
39930 // to match that.
39931 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39932 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39933 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39934 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39935 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39936 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39937 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39938 ScaledMask[1] == (ScaledMask[3] % 2));
39939
39940 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39941 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39942 return SDValue(); // Nothing to do!
39943 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39944 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39945 return DAG.getBitcast(RootVT, V);
39946 }
39947 }
39948
39949 // Handle 128-bit lane shuffles of 256-bit vectors.
39950 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39951 // If the upper half is zeroable, then an extract+insert is more optimal
39952 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39953 // zero the upper half.
39954 if (isUndefOrZero(Mask[1])) {
39955 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39956 return SDValue(); // Nothing to do!
39957      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39958 Res = CanonicalizeShuffleInput(RootVT, V1);
39959 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39960 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39961 256);
39962 }
39963
39964 // If we're inserting the low subvector, an insert-subvector 'concat'
39965 // pattern is quicker than VPERM2X128.
39966 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39967 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39968 !Subtarget.hasAVX2()) {
39969 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39970 return SDValue(); // Nothing to do!
39971 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39972 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39973 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39974 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39975 }
39976
39977 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39978 return SDValue(); // Nothing to do!
39979
39980 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39981 // we need to use the zeroing feature.
39982 // Prefer blends for sequential shuffles unless we are optimizing for size.
39983 if (UnaryShuffle &&
39984 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39985 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39986 unsigned PermMask = 0;
39987 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39988 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39989 return DAG.getNode(
39990 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39991 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39992 }
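    // For example, a unary Mask = {SM_SentinelZero, 0} (which the earlier
    // extract+insert path does not catch) yields
    //   PermMask = (0x8 << 0) | (0 << 4) = 0x08,
    // i.e. the low 128-bit half of the result is zeroed via bit 3 and the high
    // half takes V1's low lane.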
39993
39994 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39995 return SDValue(); // Nothing to do!
39996
39997 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39998 if (!UnaryShuffle && !IsMaskedShuffle) {
39999      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40000             "Unexpected shuffle sentinel value");
40001 // Prefer blends to X86ISD::VPERM2X128.
40002 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40003 unsigned PermMask = 0;
40004 PermMask |= ((Mask[0] & 3) << 0);
40005 PermMask |= ((Mask[1] & 3) << 4);
40006 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40007 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40008 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40009 CanonicalizeShuffleInput(RootVT, LHS),
40010 CanonicalizeShuffleInput(RootVT, RHS),
40011 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40012 }
40013 }
40014 }
40015
40016 // For masks that have been widened to 128-bit elements or more,
40017 // narrow back down to 64-bit elements.
40018 if (BaseMaskEltSizeInBits > 64) {
40019    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40020 int MaskScale = BaseMaskEltSizeInBits / 64;
40021 SmallVector<int, 64> ScaledMask;
40022 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40023 Mask = std::move(ScaledMask);
40024 }
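  // For example, the unary 256-bit lane swap deferred to VPERMQ/VPERMPD above
  // arrives here as Mask = {1, 0} with BaseMaskEltSizeInBits == 128;
  // MaskScale == 2 rescales it to the 64-bit-element mask {2, 3, 0, 1}.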
40025
40026 // For masked shuffles, we're trying to match the root width for better
40027 // writemask folding, attempt to scale the mask.
40028 // TODO - variable shuffles might need this to be widened again.
40029 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40030    assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40031 int MaskScale = NumRootElts / Mask.size();
40032 SmallVector<int, 64> ScaledMask;
40033 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40034 Mask = std::move(ScaledMask);
40035 }
40036
40037 unsigned NumMaskElts = Mask.size();
40038 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40039
40040 // Determine the effective mask value type.
40041 FloatDomain &= (32 <= MaskEltSizeInBits);
40042 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40043 : MVT::getIntegerVT(MaskEltSizeInBits);
40044 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40045
40046 // Only allow legal mask types.
40047 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40048 return SDValue();
40049
40050 // Attempt to match the mask against known shuffle patterns.
40051 MVT ShuffleSrcVT, ShuffleVT;
40052 unsigned Shuffle, PermuteImm;
40053
40054 // Which shuffle domains are permitted?
40055 // Permit domain crossing at higher combine depths.
40056 // TODO: Should we indicate which domain is preferred if both are allowed?
40057 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40058 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40059 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40060
40061 // Determine zeroable mask elements.
40062 APInt KnownUndef, KnownZero;
40063 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40064 APInt Zeroable = KnownUndef | KnownZero;
40065
40066 if (UnaryShuffle) {
40067 // Attempt to match against broadcast-from-vector.
40068 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40069 if ((Subtarget.hasAVX2() ||
40070 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40071 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40072 if (isUndefOrEqual(Mask, 0)) {
40073 if (V1.getValueType() == MaskVT &&
40074 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40075 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40076 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40077 return SDValue(); // Nothing to do!
40078 Res = V1.getOperand(0);
40079 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40080 return DAG.getBitcast(RootVT, Res);
40081 }
40082 if (Subtarget.hasAVX2()) {
40083 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40084 return SDValue(); // Nothing to do!
40085 Res = CanonicalizeShuffleInput(MaskVT, V1);
40086 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40087 return DAG.getBitcast(RootVT, Res);
40088 }
40089 }
40090 }
40091
40092 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40093 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40094 (!IsMaskedShuffle ||
40095 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40096 if (Depth == 0 && Root.getOpcode() == Shuffle)
40097 return SDValue(); // Nothing to do!
40098 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40099 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40100 return DAG.getBitcast(RootVT, Res);
40101 }
40102
40103 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40104 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40105 PermuteImm) &&
40106 (!IsMaskedShuffle ||
40107 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40108 if (Depth == 0 && Root.getOpcode() == Shuffle)
40109 return SDValue(); // Nothing to do!
40110 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40111 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40112 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40113 return DAG.getBitcast(RootVT, Res);
40114 }
40115 }
40116
40117 // Attempt to combine to INSERTPS, but only if the inserted element has come
40118 // from a scalar.
40119 // TODO: Handle other insertions here as well?
40120 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40121 Subtarget.hasSSE41() &&
40122 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40123 if (MaskEltSizeInBits == 32) {
40124 SDValue SrcV1 = V1, SrcV2 = V2;
40125 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40126 DAG) &&
40127 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40128 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40129 return SDValue(); // Nothing to do!
40130 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40131 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40132 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40133 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40134 return DAG.getBitcast(RootVT, Res);
40135 }
40136 }
40137 if (MaskEltSizeInBits == 64 &&
40138 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40139 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40140 V2.getScalarValueSizeInBits() <= 32) {
40141 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40142 return SDValue(); // Nothing to do!
40143 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40144 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40145 CanonicalizeShuffleInput(MVT::v4f32, V1),
40146 CanonicalizeShuffleInput(MVT::v4f32, V2),
40147 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40148 return DAG.getBitcast(RootVT, Res);
40149 }
40150 }
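  // In the 64-bit case above the immediate works out to 0x20: bits [5:4]
  // (COUNT_D) select destination element 2, bits [7:6] (COUNT_S) select
  // element 0 of the second operand, and the zero mask in bits [3:0] is left
  // clear.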
40151
40152 SDValue NewV1 = V1; // Save operands in case early exit happens.
40153 SDValue NewV2 = V2;
40154 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40155 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40156 ShuffleVT, UnaryShuffle) &&
40157 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40158 if (Depth == 0 && Root.getOpcode() == Shuffle)
40159 return SDValue(); // Nothing to do!
40160 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40161 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40162 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40163 return DAG.getBitcast(RootVT, Res);
40164 }
40165
40166 NewV1 = V1; // Save operands in case early exit happens.
40167 NewV2 = V2;
40168 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40169 AllowIntDomain, NewV1, NewV2, DL, DAG,
40170 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40171 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40172 if (Depth == 0 && Root.getOpcode() == Shuffle)
40173 return SDValue(); // Nothing to do!
40174 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40175 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40176 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40177 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40178 return DAG.getBitcast(RootVT, Res);
40179 }
40180
40181 // Typically from here on, we need an integer version of MaskVT.
40182 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40183 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40184
40185 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40186 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40187 uint64_t BitLen, BitIdx;
40188 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40189 Zeroable)) {
40190 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
40191 return SDValue(); // Nothing to do!
40192 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40193 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40194 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40195 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40196 return DAG.getBitcast(RootVT, Res);
40197 }
40198
40199 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40200 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
40201 return SDValue(); // Nothing to do!
40202 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40203 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40204 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40205 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40206 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40207 return DAG.getBitcast(RootVT, Res);
40208 }
40209 }
40210
40211 // Match shuffle against TRUNCATE patterns.
40212 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40213 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40214 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40215 Subtarget)) {
40216 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40217 ShuffleSrcVT.getVectorNumElements();
40218 unsigned Opc =
40219 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40220 if (Depth == 0 && Root.getOpcode() == Opc)
40221 return SDValue(); // Nothing to do!
40222 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40223 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40224 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40225 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40226 return DAG.getBitcast(RootVT, Res);
40227 }
40228
40229 // Do we need a more general binary truncation pattern?
40230 if (RootSizeInBits < 512 &&
40231 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40232 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40233 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40234 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40235 // Bail if this was already a truncation or PACK node.
40236 // We sometimes fail to match PACK if we demand known undef elements.
40237 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40238 Root.getOpcode() == X86ISD::PACKSS ||
40239 Root.getOpcode() == X86ISD::PACKUS))
40240 return SDValue(); // Nothing to do!
40241 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40242 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40243 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40244 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40245 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40246 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40247 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40248 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40249 return DAG.getBitcast(RootVT, Res);
40250 }
40251 }
40252
40253 // Don't try to re-form single instruction chains under any circumstances now
40254 // that we've done encoding canonicalization for them.
40255 if (Depth < 1)
40256 return SDValue();
40257
40258 // Depth threshold above which we can efficiently use variable mask shuffles.
40259 int VariableCrossLaneShuffleDepth =
40260 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40261 int VariablePerLaneShuffleDepth =
40262 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40263 AllowVariableCrossLaneMask &=
40264 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40265 AllowVariablePerLaneMask &=
40266 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40267 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40268 // higher depth before combining them.
40269 bool AllowBWIVPERMV3 =
40270 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40271
40272 bool MaskContainsZeros = isAnyZero(Mask);
40273
40274 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40275 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40276 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40277 if (Subtarget.hasAVX2() &&
40278 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40279 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40280 Res = CanonicalizeShuffleInput(MaskVT, V1);
40281 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40282 return DAG.getBitcast(RootVT, Res);
40283 }
40284 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40285 if ((Subtarget.hasAVX512() &&
40286 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40287 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40288 (Subtarget.hasBWI() &&
40289 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40290 (Subtarget.hasVBMI() &&
40291 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40292 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40293 V2 = DAG.getUNDEF(MaskVT);
40294 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40295 return DAG.getBitcast(RootVT, Res);
40296 }
40297 }
40298
40299 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40300 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40301 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40302 ((Subtarget.hasAVX512() &&
40303 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40304 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40305 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40306 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40307 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40308 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40309 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40310 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40311 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40312 for (unsigned i = 0; i != NumMaskElts; ++i)
40313 if (Mask[i] == SM_SentinelZero)
40314 Mask[i] = NumMaskElts + i;
40315 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40316 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40317 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40318 return DAG.getBitcast(RootVT, Res);
40319 }
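    // For example, a v8i64 mask {0, SM_SentinelZero, 2, SM_SentinelZero, 4, 5, 6, 7}
    // is rewritten to {0, 9, 2, 11, 4, 5, 6, 7}, with indices 8..15 reading
    // from the all-zeros second source of the resulting VPERMV3 node.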
40320
40321 // If that failed and either input is extracted then try to combine as a
40322 // shuffle with the larger type.
40323 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40324 Inputs, Root, BaseMask, Depth, HasVariableMask,
40325 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40326 Subtarget))
40327 return WideShuffle;
40328
40329 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40330 // (non-VLX will pad to 512-bit shuffles).
40331 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40332 ((Subtarget.hasAVX512() &&
40333 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40334 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40335 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40336 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40337 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40338 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40339 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40340 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40341 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40342 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40343 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40344 return DAG.getBitcast(RootVT, Res);
40345 }
40346 return SDValue();
40347 }
40348
40349 // See if we can combine a single input shuffle with zeros to a bit-mask,
40350 // which is much simpler than any shuffle.
40351 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40352 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40353 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40354 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40355 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40356 APInt UndefElts(NumMaskElts, 0);
40357 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40358 for (unsigned i = 0; i != NumMaskElts; ++i) {
40359 int M = Mask[i];
40360 if (M == SM_SentinelUndef) {
40361 UndefElts.setBit(i);
40362 continue;
40363 }
40364 if (M == SM_SentinelZero)
40365 continue;
40366 EltBits[i] = AllOnes;
40367 }
40368 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40369 Res = CanonicalizeShuffleInput(MaskVT, V1);
40370 unsigned AndOpcode =
40371 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40372 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40373 return DAG.getBitcast(RootVT, Res);
40374 }
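  // For example, a v4i32 mask {0, 1, SM_SentinelZero, 3} passes the
  // sequential-or-zero check and becomes a single AND with the constant vector
  // {0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF}.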
40375
40376  // If we have a single input shuffle with different shuffle patterns in the
40377  // 128-bit lanes, use the variable mask to VPERMILPS.
40378 // TODO Combine other mask types at higher depths.
40379 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40380 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40381 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40382 SmallVector<SDValue, 16> VPermIdx;
40383 for (int M : Mask) {
40384 SDValue Idx =
40385 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40386 VPermIdx.push_back(Idx);
40387 }
40388 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40389 Res = CanonicalizeShuffleInput(MaskVT, V1);
40390 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40391 return DAG.getBitcast(RootVT, Res);
40392 }
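  // For example, a v8f32 mask {1, 0, 3, 2, 6, 7, 4, 5} (no 128-bit lane
  // crossing) produces the per-lane index vector {1, 0, 3, 2, 2, 3, 0, 1} for
  // the VPERMILPV node.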
40393
40394 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40395 // to VPERMIL2PD/VPERMIL2PS.
40396 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40397 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40398 MaskVT == MVT::v8f32)) {
40399 // VPERMIL2 Operation.
40400 // Bits[3] - Match Bit.
40401 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40402 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40403 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40404 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40405 SmallVector<int, 8> VPerm2Idx;
40406 unsigned M2ZImm = 0;
40407 for (int M : Mask) {
40408 if (M == SM_SentinelUndef) {
40409 VPerm2Idx.push_back(-1);
40410 continue;
40411 }
40412 if (M == SM_SentinelZero) {
40413 M2ZImm = 2;
40414 VPerm2Idx.push_back(8);
40415 continue;
40416 }
40417 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40418 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40419 VPerm2Idx.push_back(Index);
40420 }
40421 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40422 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40423 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40424 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40425 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40426 return DAG.getBitcast(RootVT, Res);
40427 }
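  // For example, with MaskVT == v4f32 a mask element M == 5 (element 1 of V2)
  // maps to the selector value 5: bits [1:0] pick the element within the lane
  // and bit 2 picks the second source. A zero element instead pushes the
  // match-bit value 8 and sets M2ZImm = 2 so matched lanes are zeroed.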
40428
40429 // If we have 3 or more shuffle instructions or a chain involving a variable
40430 // mask, we can replace them with a single PSHUFB instruction profitably.
40431  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40432 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40433 // more aggressive.
40434 if (UnaryShuffle && AllowVariablePerLaneMask &&
40435 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40436 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40437 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40438 SmallVector<SDValue, 16> PSHUFBMask;
40439 int NumBytes = RootVT.getSizeInBits() / 8;
40440 int Ratio = NumBytes / NumMaskElts;
40441 for (int i = 0; i < NumBytes; ++i) {
40442 int M = Mask[i / Ratio];
40443 if (M == SM_SentinelUndef) {
40444 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40445 continue;
40446 }
40447 if (M == SM_SentinelZero) {
40448 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40449 continue;
40450 }
40451 M = Ratio * M + i % Ratio;
40452      assert((M / 16) == (i / 16) && "Lane crossing detected");
40453 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40454 }
40455 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40456 Res = CanonicalizeShuffleInput(ByteVT, V1);
40457 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40458 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40459 return DAG.getBitcast(RootVT, Res);
40460 }
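  // For example, a 128-bit root with a v4i32 mask {1, 0, 3, 2} has Ratio == 4
  // and expands to the byte mask {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  // zeroable elements instead emit 0x80, which PSHUFB treats as "write zero".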
40461
40462 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40463 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40464 // slower than PSHUFB on targets that support both.
40465 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40466 Subtarget.hasXOP()) {
40467 // VPPERM Mask Operation
40468 // Bits[4:0] - Byte Index (0 - 31)
40469 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40470 SmallVector<SDValue, 16> VPPERMMask;
40471 int NumBytes = 16;
40472 int Ratio = NumBytes / NumMaskElts;
40473 for (int i = 0; i < NumBytes; ++i) {
40474 int M = Mask[i / Ratio];
40475 if (M == SM_SentinelUndef) {
40476 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40477 continue;
40478 }
40479 if (M == SM_SentinelZero) {
40480 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40481 continue;
40482 }
40483 M = Ratio * M + i % Ratio;
40484 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40485 }
40486 MVT ByteVT = MVT::v16i8;
40487 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40488 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40489 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40490 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40491 return DAG.getBitcast(RootVT, Res);
40492 }
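  // Unlike PSHUFB, VPPERM byte indices 16..31 select from the second source,
  // so e.g. a two-input v16i8 interleave mask {0, 16, 1, 17, ...} is emitted
  // directly (Ratio == 1) without needing a separate blend.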
40493
40494 // If that failed and either input is extracted then try to combine as a
40495 // shuffle with the larger type.
40496 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40497 Inputs, Root, BaseMask, Depth, HasVariableMask,
40498 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40499 return WideShuffle;
40500
40501 // If we have a dual input shuffle then lower to VPERMV3,
40502 // (non-VLX will pad to 512-bit shuffles)
40503 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40504 ((Subtarget.hasAVX512() &&
40505 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40506 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40507 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40508 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40509 MaskVT == MVT::v16i32)) ||
40510 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40511 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40512 MaskVT == MVT::v32i16)) ||
40513 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40514 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40515 MaskVT == MVT::v64i8)))) {
40516 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40517 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40518 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40519 return DAG.getBitcast(RootVT, Res);
40520 }
40521
40522 // Failed to find any combines.
40523 return SDValue();
40524}
40525
40526// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40527// instruction if possible.
40528//
40529// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40530// type size to attempt to combine:
40531// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40532// -->
40533// extract_subvector(shuffle(x,y,m2),0)
40534static SDValue combineX86ShuffleChainWithExtract(
40535 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40536 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40537 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40538 const X86Subtarget &Subtarget) {
40539 unsigned NumMaskElts = BaseMask.size();
40540 unsigned NumInputs = Inputs.size();
40541 if (NumInputs == 0)
40542 return SDValue();
40543
40544 EVT RootVT = Root.getValueType();
40545 unsigned RootSizeInBits = RootVT.getSizeInBits();
40546 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40547  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40548
40549 // Peek through extract_subvector to find widest legal vector.
40550 // TODO: Handle ISD::TRUNCATE
40551 unsigned WideSizeInBits = RootSizeInBits;
40552 for (unsigned I = 0; I != NumInputs; ++I) {
40553 SDValue Input = peekThroughBitcasts(Inputs[I]);
40554 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40555 Input = peekThroughBitcasts(Input.getOperand(0));
40556 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40557 WideSizeInBits < Input.getValueSizeInBits())
40558 WideSizeInBits = Input.getValueSizeInBits();
40559 }
40560
40561 // Bail if we fail to find a source larger than the existing root.
40562 unsigned Scale = WideSizeInBits / RootSizeInBits;
40563 if (WideSizeInBits <= RootSizeInBits ||
40564 (WideSizeInBits % RootSizeInBits) != 0)
40565 return SDValue();
40566
40567 // Create new mask for larger type.
40568 SmallVector<int, 64> WideMask(BaseMask);
40569 for (int &M : WideMask) {
40570 if (M < 0)
40571 continue;
40572 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40573 }
40574 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
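  // For example, widening a 128-bit root (NumMaskElts == 4) to 256 bits
  // (Scale == 2) maps BaseMask = {0, 5, 1, 4} to {0, 9, 1, 8} and then pads
  // the mask with four SM_SentinelUndef entries.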
40575
40576 // Attempt to peek through inputs and adjust mask when we extract from an
40577 // upper subvector.
40578 int AdjustedMasks = 0;
40579 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40580 for (unsigned I = 0; I != NumInputs; ++I) {
40581 SDValue &Input = WideInputs[I];
40582 Input = peekThroughBitcasts(Input);
40583 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40584 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40585 uint64_t Idx = Input.getConstantOperandVal(1);
40586 if (Idx != 0) {
40587 ++AdjustedMasks;
40588 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40589 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40590
40591 int lo = I * WideMask.size();
40592 int hi = (I + 1) * WideMask.size();
40593 for (int &M : WideMask)
40594 if (lo <= M && M < hi)
40595 M += Idx;
40596 }
40597 Input = peekThroughBitcasts(Input.getOperand(0));
40598 }
40599 }
40600
40601 // Remove unused/repeated shuffle source ops.
40602 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40603  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40604
40605 // Bail if we're always extracting from the lowest subvectors,
40606 // combineX86ShuffleChain should match this for the current width, or the
40607 // shuffle still references too many inputs.
40608 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40609 return SDValue();
40610
40611 // Minor canonicalization of the accumulated shuffle mask to make it easier
40612 // to match below. All this does is detect masks with sequential pairs of
40613 // elements, and shrink them to the half-width mask. It does this in a loop
40614 // so it will reduce the size of the mask to the minimal width mask which
40615 // performs an equivalent shuffle.
40616 while (WideMask.size() > 1) {
40617 SmallVector<int, 64> WidenedMask;
40618 if (!canWidenShuffleElements(WideMask, WidenedMask))
40619 break;
40620 WideMask = std::move(WidenedMask);
40621 }
40622
40623 // Canonicalization of binary shuffle masks to improve pattern matching by
40624 // commuting the inputs.
40625 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40626 ShuffleVectorSDNode::commuteMask(WideMask);
40627 std::swap(WideInputs[0], WideInputs[1]);
40628 }
40629
40630 // Increase depth for every upper subvector we've peeked through.
40631 Depth += AdjustedMasks;
40632
40633 // Attempt to combine wider chain.
40634 // TODO: Can we use a better Root?
40635 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40636 WideInputs.back().getValueSizeInBits()
40637 ? WideInputs.front()
40638 : WideInputs.back();
40639  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40640         "WideRootSize mismatch");
40641
40642 if (SDValue WideShuffle =
40643 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40644 HasVariableMask, AllowVariableCrossLaneMask,
40645 AllowVariablePerLaneMask, DAG, Subtarget)) {
40646 WideShuffle =
40647 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40648 return DAG.getBitcast(RootVT, WideShuffle);
40649 }
40650
40651 return SDValue();
40652}
40653
40654// Canonicalize the combined shuffle mask chain with horizontal ops.
40655// NOTE: This may update the Ops and Mask.
40656static SDValue canonicalizeShuffleMaskWithHorizOp(
40657 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40658 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40659 const X86Subtarget &Subtarget) {
40660 if (Mask.empty() || Ops.empty())
40661 return SDValue();
40662
40663 SmallVector<SDValue> BC;
40664 for (SDValue Op : Ops)
40665 BC.push_back(peekThroughBitcasts(Op));
40666
40667 // All ops must be the same horizop + type.
40668 SDValue BC0 = BC[0];
40669 EVT VT0 = BC0.getValueType();
40670 unsigned Opcode0 = BC0.getOpcode();
40671 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40672 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40673 }))
40674 return SDValue();
40675
40676 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40677 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40678 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40679 if (!isHoriz && !isPack)
40680 return SDValue();
40681
40682 // Do all ops have a single use?
40683 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40684 return Op.hasOneUse() &&
40685 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40686 });
40687
40688 int NumElts = VT0.getVectorNumElements();
40689 int NumLanes = VT0.getSizeInBits() / 128;
40690 int NumEltsPerLane = NumElts / NumLanes;
40691 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40692 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40693 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40694
40695 if (NumEltsPerLane >= 4 &&
40696 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40697 SmallVector<int> LaneMask, ScaledMask;
40698 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40699 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40700 // See if we can remove the shuffle by resorting the HOP chain so that
40701 // the HOP args are pre-shuffled.
40702 // TODO: Generalize to any sized/depth chain.
40703 // TODO: Add support for PACKSS/PACKUS.
40704 if (isHoriz) {
40705 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40706 auto GetHOpSrc = [&](int M) {
40707 if (M == SM_SentinelUndef)
40708 return DAG.getUNDEF(VT0);
40709 if (M == SM_SentinelZero)
40710 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40711 SDValue Src0 = BC[M / 4];
40712 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40713 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40714 return Src1.getOperand(M % 2);
40715 return SDValue();
40716 };
40717 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40718 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40719 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40720 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40721 if (M0 && M1 && M2 && M3) {
40722 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40723 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40724 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40725 }
40726 }
40727 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40728 if (Ops.size() >= 2) {
40729 SDValue LHS, RHS;
40730 auto GetHOpSrc = [&](int M, int &OutM) {
40731 // TODO: Support SM_SentinelZero
40732 if (M < 0)
40733 return M == SM_SentinelUndef;
40734 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40735 if (!LHS || LHS == Src) {
40736 LHS = Src;
40737 OutM = (M % 2);
40738 return true;
40739 }
40740 if (!RHS || RHS == Src) {
40741 RHS = Src;
40742 OutM = (M % 2) + 2;
40743 return true;
40744 }
40745 return false;
40746 };
40747 int PostMask[4] = {-1, -1, -1, -1};
40748 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40749 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40750 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40751 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40752 LHS = DAG.getBitcast(SrcVT, LHS);
40753 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40754 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40755 // Use SHUFPS for the permute so this will work on SSE3 targets,
40756 // shuffle combining and domain handling will simplify this later on.
40757 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40758 Res = DAG.getBitcast(ShuffleVT, Res);
40759 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40760 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40761 }
40762 }
40763 }
40764 }
40765
40766 if (2 < Ops.size())
40767 return SDValue();
40768
40769 SDValue BC1 = BC[BC.size() - 1];
40770 if (Mask.size() == VT0.getVectorNumElements()) {
40771 // Canonicalize binary shuffles of horizontal ops that use the
40772    // same sources to a unary shuffle.
40773 // TODO: Try to perform this fold even if the shuffle remains.
40774 if (Ops.size() == 2) {
40775 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40776 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40777 };
40778 // Commute if all BC0's ops are contained in BC1.
40779 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40780 ContainsOps(BC1, BC0.getOperand(1))) {
40781 ShuffleVectorSDNode::commuteMask(Mask);
40782 std::swap(Ops[0], Ops[1]);
40783 std::swap(BC0, BC1);
40784 }
40785
40786 // If BC1 can be represented by BC0, then convert to unary shuffle.
40787 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40788 ContainsOps(BC0, BC1.getOperand(1))) {
40789 for (int &M : Mask) {
40790 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40791 continue;
40792 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40793 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40794 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40795 M += NumHalfEltsPerLane;
40796 }
40797 }
40798 }
40799
40800 // Canonicalize unary horizontal ops to only refer to lower halves.
40801 for (int i = 0; i != NumElts; ++i) {
40802 int &M = Mask[i];
40803 if (isUndefOrZero(M))
40804 continue;
40805 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40806 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40807 M -= NumHalfEltsPerLane;
40808 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40809 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40810 M -= NumHalfEltsPerLane;
40811 }
40812 }
40813
40814 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40815 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40816 // represents the LHS/RHS inputs for the lower/upper halves.
40817 SmallVector<int, 16> TargetMask128, WideMask128;
40818 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40819 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40820    assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40821 bool SingleOp = (Ops.size() == 1);
40822 if (isPack || OneUseOps ||
40823 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40824 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40825 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40826 Lo = Lo.getOperand(WideMask128[0] & 1);
40827 Hi = Hi.getOperand(WideMask128[1] & 1);
40828 if (SingleOp) {
40829 SDValue Undef = DAG.getUNDEF(SrcVT);
40830 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40831 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40832 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40833 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40834 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40835 }
40836 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40837 }
40838 }
40839
40840 return SDValue();
40841}
40842
40843// Attempt to constant fold all of the constant source ops.
40844// Returns true if the entire shuffle is folded to a constant.
40845// TODO: Extend this to merge multiple constant Ops and update the mask.
40846static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40847 ArrayRef<int> Mask, SDValue Root,
40848 bool HasVariableMask,
40849 SelectionDAG &DAG,
40850 const X86Subtarget &Subtarget) {
40851 MVT VT = Root.getSimpleValueType();
40852
40853 unsigned SizeInBits = VT.getSizeInBits();
40854 unsigned NumMaskElts = Mask.size();
40855 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40856 unsigned NumOps = Ops.size();
40857
40858 // Extract constant bits from each source op.
40859 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40860 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40861 for (unsigned I = 0; I != NumOps; ++I)
40862 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40863 RawBitsOps[I]))
40864 return SDValue();
40865
40866 // If we're optimizing for size, only fold if at least one of the constants is
40867 // only used once or the combined shuffle has included a variable mask
40868 // shuffle, this is to avoid constant pool bloat.
40869 bool IsOptimizingSize = DAG.shouldOptForSize();
40870 if (IsOptimizingSize && !HasVariableMask &&
40871 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40872 return SDValue();
40873
40874 // Shuffle the constant bits according to the mask.
40875 SDLoc DL(Root);
40876 APInt UndefElts(NumMaskElts, 0);
40877 APInt ZeroElts(NumMaskElts, 0);
40878 APInt ConstantElts(NumMaskElts, 0);
40879 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40880 APInt::getZero(MaskSizeInBits));
40881 for (unsigned i = 0; i != NumMaskElts; ++i) {
40882 int M = Mask[i];
40883 if (M == SM_SentinelUndef) {
40884 UndefElts.setBit(i);
40885 continue;
40886 } else if (M == SM_SentinelZero) {
40887 ZeroElts.setBit(i);
40888 continue;
40889 }
40890    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40891
40892 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40893 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40894
40895 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40896 if (SrcUndefElts[SrcMaskIdx]) {
40897 UndefElts.setBit(i);
40898 continue;
40899 }
40900
40901 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40902 APInt &Bits = SrcEltBits[SrcMaskIdx];
40903 if (!Bits) {
40904 ZeroElts.setBit(i);
40905 continue;
40906 }
40907
40908 ConstantElts.setBit(i);
40909 ConstantBitData[i] = Bits;
40910 }
40911  assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40912
40913 // Attempt to create a zero vector.
40914 if ((UndefElts | ZeroElts).isAllOnes())
40915 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40916
40917 // Create the constant data.
40918 MVT MaskSVT;
40919 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40920 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40921 else
40922 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40923
40924 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40925 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40926 return SDValue();
40927
40928 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40929 return DAG.getBitcast(VT, CstOp);
40930}
40931
40932namespace llvm {
40933 namespace X86 {
40934 enum {
40935 MaxShuffleCombineDepth = 8
40936 };
40937 }
40938} // namespace llvm
40939
40940/// Fully generic combining of x86 shuffle instructions.
40941///
40942/// This should be the last combine run over the x86 shuffle instructions. Once
40943/// they have been fully optimized, this will recursively consider all chains
40944/// of single-use shuffle instructions, build a generic model of the cumulative
40945/// shuffle operation, and check for simpler instructions which implement this
40946/// operation. We use this primarily for two purposes:
40947///
40948/// 1) Collapse generic shuffles to specialized single instructions when
40949/// equivalent. In most cases, this is just an encoding size win, but
40950/// sometimes we will collapse multiple generic shuffles into a single
40951/// special-purpose shuffle.
40952/// 2) Look for sequences of shuffle instructions with 3 or more total
40953/// instructions, and replace them with the slightly more expensive SSSE3
40954/// PSHUFB instruction if available. We do this as the last combining step
40955/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40956/// a suitable short sequence of other instructions. The PSHUFB will either
40957/// use a register or have to read from memory and so is slightly (but only
40958/// slightly) more expensive than the other shuffle instructions.
40959///
40960/// Because this is inherently a quadratic operation (for each shuffle in
40961/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40962/// This should never be an issue in practice as the shuffle lowering doesn't
40963/// produce sequences of more than 8 instructions.
40964///
40965/// FIXME: We will currently miss some cases where the redundant shuffling
40966/// would simplify under the threshold for PSHUFB formation because of
40967/// combine-ordering. To fix this, we should do the redundant instruction
40968/// combining in this recursive walk.
40969static SDValue combineX86ShufflesRecursively(
40970 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40971 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40972 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40973 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40974 const X86Subtarget &Subtarget) {
40975  assert(!RootMask.empty() &&
40976         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40977         "Illegal shuffle root mask");
40978 MVT RootVT = Root.getSimpleValueType();
40979  assert(RootVT.isVector() && "Shuffles operate on vector types!");
40980 unsigned RootSizeInBits = RootVT.getSizeInBits();
40981
40982 // Bound the depth of our recursive combine because this is ultimately
40983 // quadratic in nature.
40984 if (Depth >= MaxDepth)
40985 return SDValue();
40986
40987 // Directly rip through bitcasts to find the underlying operand.
40988 SDValue Op = SrcOps[SrcOpIndex];
40989 Op = peekThroughOneUseBitcasts(Op);
40990
40991 EVT VT = Op.getValueType();
40992 if (!VT.isVector() || !VT.isSimple())
40993 return SDValue(); // Bail if we hit a non-simple non-vector.
40994
40995 // FIXME: Just bail on f16 for now.
40996 if (VT.getVectorElementType() == MVT::f16)
40997 return SDValue();
40998
40999  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41000         "Can only combine shuffles upto size of the root op.");
41001
41002 // Create a demanded elts mask from the referenced elements of Op.
41003 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41004 for (int M : RootMask) {
41005 int BaseIdx = RootMask.size() * SrcOpIndex;
41006 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41007 OpDemandedElts.setBit(M - BaseIdx);
41008 }
41009 if (RootSizeInBits != VT.getSizeInBits()) {
41010 // Op is smaller than Root - extract the demanded elts for the subvector.
41011 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41012 unsigned NumOpMaskElts = RootMask.size() / Scale;
41013 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41014 assert(OpDemandedElts
41015 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41016 .isZero() &&
41017 "Out of range elements referenced in root mask");
41018 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41019 }
41020 OpDemandedElts =
41021 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
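For illustration only, here is a minimal standalone sketch of the demanded-elements rescaling idea (not part of X86ISelLowering.cpp; the helper name widenDemandedMask is hypothetical, and plain integers stand in for APInt):

#include <cassert>
#include <cstdint>

// Widen a demanded-elements bitmask from OldElts to NewElts (assumed to be a
// whole multiple): each demanded element expands to Scale adjacent elements.
static uint64_t widenDemandedMask(uint64_t Mask, unsigned OldElts,
                                  unsigned NewElts) {
  assert(NewElts % OldElts == 0 && "expected a whole scale factor");
  unsigned Scale = NewElts / OldElts;
  uint64_t Wide = 0;
  for (unsigned i = 0; i != OldElts; ++i)
    if (Mask & (uint64_t(1) << i))
      for (unsigned j = 0; j != Scale; ++j)
        Wide |= uint64_t(1) << (i * Scale + j);
  return Wide;
}

int main() {
  // Demanding elements {0, 2} of a 4-element vector corresponds to elements
  // {0, 1, 4, 5} once the mask is rescaled to 8 elements.
  assert(widenDemandedMask(0x5, 4, 8) == 0x33);
}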
41022
41023 // Extract target shuffle mask and resolve sentinels and inputs.
41024 SmallVector<int, 64> OpMask;
41025 SmallVector<SDValue, 2> OpInputs;
41026 APInt OpUndef, OpZero;
41027 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
41028 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41029 OpZero, DAG, Depth, false)) {
41030 // Shuffle inputs must not be larger than the shuffle result.
41031 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41032 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41033 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41034 }))
41035 return SDValue();
41036 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41037 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41038 !isNullConstant(Op.getOperand(1))) {
41039 SDValue SrcVec = Op.getOperand(0);
41040 int ExtractIdx = Op.getConstantOperandVal(1);
41041 unsigned NumElts = VT.getVectorNumElements();
41042 OpInputs.assign({SrcVec});
41043 OpMask.assign(NumElts, SM_SentinelUndef);
41044 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41045 OpZero = OpUndef = APInt::getZero(NumElts);
41046 } else {
41047 return SDValue();
41048 }
41049
41050 // If the shuffle result was smaller than the root, we need to adjust the
41051 // mask indices and pad the mask with undefs.
41052 if (RootSizeInBits > VT.getSizeInBits()) {
41053 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41054 unsigned OpMaskSize = OpMask.size();
41055 if (OpInputs.size() > 1) {
41056 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41057 for (int &M : OpMask) {
41058 if (M < 0)
41059 continue;
41060 int EltIdx = M % OpMaskSize;
41061 int OpIdx = M / OpMaskSize;
41062 M = (PaddedMaskSize * OpIdx) + EltIdx;
41063 }
41064 }
41065 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41066 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41067 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41068 }
41069
41070 SmallVector<int, 64> Mask;
41071 SmallVector<SDValue, 16> Ops;
41072
41073 // We don't need to merge masks if the root is empty.
41074 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41075 if (EmptyRoot) {
41076 // Only resolve zeros if it will remove an input, otherwise we might end
41077 // up in an infinite loop.
41078 bool ResolveKnownZeros = true;
41079 if (!OpZero.isZero()) {
41080 APInt UsedInputs = APInt::getZero(OpInputs.size());
41081 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41082 int M = OpMask[i];
41083 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41084 continue;
41085 UsedInputs.setBit(M / OpMask.size());
41086 if (UsedInputs.isAllOnes()) {
41087 ResolveKnownZeros = false;
41088 break;
41089 }
41090 }
41091 }
41092 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41093 ResolveKnownZeros);
41094
41095 Mask = OpMask;
41096 Ops.append(OpInputs.begin(), OpInputs.end());
41097 } else {
41098 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41099
41100 // Add the inputs to the Ops list, avoiding duplicates.
41101 Ops.append(SrcOps.begin(), SrcOps.end());
41102
41103 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41104 // Attempt to find an existing match.
41105 SDValue InputBC = peekThroughBitcasts(Input);
41106 for (int i = 0, e = Ops.size(); i < e; ++i)
41107 if (InputBC == peekThroughBitcasts(Ops[i]))
41108 return i;
41109 // Match failed - should we replace an existing Op?
41110 if (InsertionPoint >= 0) {
41111 Ops[InsertionPoint] = Input;
41112 return InsertionPoint;
41113 }
41114 // Add to the end of the Ops list.
41115 Ops.push_back(Input);
41116 return Ops.size() - 1;
41117 };
41118
41119 SmallVector<int, 2> OpInputIdx;
41120 for (SDValue OpInput : OpInputs)
41121 OpInputIdx.push_back(
41122 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41123
41124 assert(((RootMask.size() > OpMask.size() &&
41125 RootMask.size() % OpMask.size() == 0) ||
41126 (OpMask.size() > RootMask.size() &&
41127 OpMask.size() % RootMask.size() == 0) ||
41128 OpMask.size() == RootMask.size()) &&
41129 "The smaller number of elements must divide the larger.");
41130
41131 // This function can be performance-critical, so we rely on the power-of-2
41132 // knowledge that we have about the mask sizes to replace div/rem ops with
41133 // bit-masks and shifts.
41134 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41135 "Non-power-of-2 shuffle mask sizes");
41136 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41137 "Non-power-of-2 shuffle mask sizes");
41138 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41139 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41140
41141 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41142 unsigned RootRatio =
41143 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41144 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41145 assert((RootRatio == 1 || OpRatio == 1) &&
41146 "Must not have a ratio for both incoming and op masks!");
41147
41148 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41149 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41150 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41151 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41152 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41153
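The power-of-2 comment above (lines 41131-41133) relies on the standard shift/mask identities; a minimal standalone sketch, not part of the listing:

#include <cassert>

int main() {
  // For a power-of-two N with Log2N = countr_zero(N):
  //   X / N == X >> Log2N    and    X % N == X & (N - 1)
  const unsigned N = 8, Log2N = 3;
  for (unsigned X = 0; X != 64; ++X) {
    assert(X / N == (X >> Log2N));
    assert(X % N == (X & (N - 1)));
  }
}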
41154 Mask.resize(MaskWidth, SM_SentinelUndef);
41155
41156 // Merge this shuffle operation's mask into our accumulated mask. Note that
41157 // this shuffle's mask will be the first applied to the input, followed by
41158 // the root mask to get us all the way to the root value arrangement. The
41159 // reason for this order is that we are recursing up the operation chain.
41160 for (unsigned i = 0; i < MaskWidth; ++i) {
41161 unsigned RootIdx = i >> RootRatioLog2;
41162 if (RootMask[RootIdx] < 0) {
41163 // This is a zero or undef lane, we're done.
41164 Mask[i] = RootMask[RootIdx];
41165 continue;
41166 }
41167
41168 unsigned RootMaskedIdx =
41169 RootRatio == 1
41170 ? RootMask[RootIdx]
41171 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41172
41173 // Just insert the scaled root mask value if it references an input other
41174 // than the SrcOp we're currently inserting.
41175 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41176 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41177 Mask[i] = RootMaskedIdx;
41178 continue;
41179 }
41180
41181 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41182 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41183 if (OpMask[OpIdx] < 0) {
41184 // The incoming lanes are zero or undef, it doesn't matter which ones we
41185 // are using.
41186 Mask[i] = OpMask[OpIdx];
41187 continue;
41188 }
41189
41190 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41191 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41192 : (OpMask[OpIdx] << OpRatioLog2) +
41193 (RootMaskedIdx & (OpRatio - 1));
41194
41195 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41196 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41197 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41198 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41199
41200 Mask[i] = OpMaskedIdx;
41201 }
41202 }
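To make the merge order concrete, here is a minimal standalone sketch of the simplest case (single input, RootRatio == OpRatio == 1, no sentinels), where the loop above reduces to composing the two masks as Composed[i] = OpMask[RootMask[i]]:

#include <array>
#include <cassert>

int main() {
  // The op's shuffle is applied to the input first; the root mask then
  // rearranges the op's result, so Composed[i] = OpMask[RootMask[i]].
  std::array<int, 4> RootMask = {2, 3, 0, 1};
  std::array<int, 4> OpMask = {1, 0, 3, 2};
  std::array<int, 4> Composed;
  for (int i = 0; i != 4; ++i)
    Composed[i] = OpMask[RootMask[i]];
  assert((Composed == std::array<int, 4>{3, 2, 1, 0}));
}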
41203
41204 // Peek through vector widenings and set out of bounds mask indices to undef.
41205 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41206 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
41207 SDValue &Op = Ops[I];
41208 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
41209 isNullConstant(Op.getOperand(2))) {
41210 Op = Op.getOperand(1);
41211 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41212 int Lo = I * Mask.size();
41213 int Hi = (I + 1) * Mask.size();
41214 int NewHi = Lo + (Mask.size() / Scale);
41215 for (int &M : Mask) {
41216 if (Lo <= M && NewHi <= M && M < Hi)
41217 M = SM_SentinelUndef;
41218 }
41219 }
41220 }
41221
41222 // Peek through any free extract_subvector nodes back to root size.
41223 for (SDValue &Op : Ops)
41224 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41225 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41226 isNullConstant(Op.getOperand(1)))
41227 Op = Op.getOperand(0);
41228
41229 // Remove unused/repeated shuffle source ops.
41230 resolveTargetShuffleInputsAndMask(Ops, Mask);
41231
41232 // Handle the all undef/zero/ones cases early.
41233 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41234 return DAG.getUNDEF(RootVT);
41235 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41236 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41237 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41238 !llvm::is_contained(Mask, SM_SentinelZero))
41239 return getOnesVector(RootVT, DAG, SDLoc(Root));
41240
41241 assert(!Ops.empty() && "Shuffle with no inputs detected");
41242 HasVariableMask |= IsOpVariableMask;
41243
41244 // Update the list of shuffle nodes that have been combined so far.
41245 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41246 SrcNodes.end());
41247 CombinedNodes.push_back(Op.getNode());
41248
41249 // See if we can recurse into each shuffle source op (if it's a target
41250 // shuffle). The source op should only be generally combined if it either has
41251 // a single use (i.e. current Op) or all its users have already been combined;
41252 // if not, then we can still combine but should prevent generation of variable
41253 // shuffles to avoid constant pool bloat.
41254 // Don't recurse if we already have more source ops than we can combine in
41255 // the remaining recursion depth.
41256 if (Ops.size() < (MaxDepth - Depth)) {
41257 for (int i = 0, e = Ops.size(); i < e; ++i) {
41258 // For empty roots, we need to resolve zeroable elements before combining
41259 // them with other shuffles.
41260 SmallVector<int, 64> ResolvedMask = Mask;
41261 if (EmptyRoot)
41262 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41263 bool AllowCrossLaneVar = false;
41264 bool AllowPerLaneVar = false;
41265 if (Ops[i].getNode()->hasOneUse() ||
41266 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41267 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41268 AllowPerLaneVar = AllowVariablePerLaneMask;
41269 }
41270 if (SDValue Res = combineX86ShufflesRecursively(
41271 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41272 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41273 Subtarget))
41274 return Res;
41275 }
41276 }
41277
41278 // Attempt to constant fold all of the constant source ops.
41279 if (SDValue Cst = combineX86ShufflesConstants(
41280 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41281 return Cst;
41282
41283 // If constant fold failed and we only have constants - then we have
41284 // multiple uses by a single non-variable shuffle - just bail.
41285 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41286 APInt UndefElts;
41287 SmallVector<APInt> RawBits;
41288 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41289 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41290 RawBits);
41291 })) {
41292 return SDValue();
41293 }
41294
41295 // Canonicalize the combined shuffle mask chain with horizontal ops.
41296 // NOTE: This will update the Ops and Mask.
41297 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41298 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41299 return DAG.getBitcast(RootVT, HOp);
41300
41301 // Try to refine our inputs given our knowledge of target shuffle mask.
41302 for (auto I : enumerate(Ops)) {
41303 int OpIdx = I.index();
41304 SDValue &Op = I.value();
41305
41306 // What range of shuffle mask element values results in picking from Op?
41307 int Lo = OpIdx * Mask.size();
41308 int Hi = Lo + Mask.size();
41309
41310 // Which elements of Op do we demand, given the mask's granularity?
41311 APInt OpDemandedElts(Mask.size(), 0);
41312 for (int MaskElt : Mask) {
41313 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41314 int OpEltIdx = MaskElt - Lo;
41315 OpDemandedElts.setBit(OpEltIdx);
41316 }
41317 }
41318
41319 // Is the shuffle result smaller than the root?
41320 if (Op.getValueSizeInBits() < RootSizeInBits) {
41321 // We padded the mask with undefs. But we now need to undo that.
41322 unsigned NumExpectedVectorElts = Mask.size();
41323 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41324 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41325 assert(!OpDemandedElts.extractBits(
41326 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41327 "Demanding the virtual undef widening padding?");
41328 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41329 }
41330
41331 // The Op itself may be of different VT, so we need to scale the mask.
41332 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41333 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41334
41335 // Can this operand be simplified any further, given its demanded elements?
41336 if (SDValue NewOp =
41337 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41338 Op, OpScaledDemandedElts, DAG))
41339 Op = NewOp;
41340 }
41341 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41342
41343 // Widen any subvector shuffle inputs we've collected.
41344 // TODO: Remove this to avoid generating temporary nodes, we should only
41345 // widen once combineX86ShuffleChain has found a match.
41346 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41347 return Op.getValueSizeInBits() < RootSizeInBits;
41348 })) {
41349 for (SDValue &Op : Ops)
41350 if (Op.getValueSizeInBits() < RootSizeInBits)
41351 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41352 RootSizeInBits);
41353 // Reresolve - we might have repeated subvector sources.
41354 resolveTargetShuffleInputsAndMask(Ops, Mask);
41355 }
41356
41357 // We can only combine unary and binary shuffle mask cases.
41358 if (Ops.size() <= 2) {
41359 // Minor canonicalization of the accumulated shuffle mask to make it easier
41360 // to match below. All this does is detect masks with sequential pairs of
41361 // elements, and shrink them to the half-width mask. It does this in a loop
41362 // so it will reduce the size of the mask to the minimal width mask which
41363 // performs an equivalent shuffle.
41364 while (Mask.size() > 1) {
41365 SmallVector<int, 64> WidenedMask;
41366 if (!canWidenShuffleElements(Mask, WidenedMask))
41367 break;
41368 Mask = std::move(WidenedMask);
41369 }
41370
41371 // Canonicalization of binary shuffle masks to improve pattern matching by
41372 // commuting the inputs.
41373 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41374 ShuffleVectorSDNode::commuteMask(Mask);
41375 std::swap(Ops[0], Ops[1]);
41376 }
41377
41378 // Try to combine into a single shuffle instruction.
41379 if (SDValue Shuffle = combineX86ShuffleChain(
41380 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41381 AllowVariablePerLaneMask, DAG, Subtarget))
41382 return Shuffle;
41383
41384 // If all the operands come from the same larger vector, fallthrough and try
41385 // to use combineX86ShuffleChainWithExtract.
41386 SDValue LHS = peekThroughBitcasts(Ops.front());
41387 SDValue RHS = peekThroughBitcasts(Ops.back());
41388 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41389 (RootSizeInBits / Mask.size()) != 64 ||
41390 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41391 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41392 LHS.getOperand(0) != RHS.getOperand(0))
41393 return SDValue();
41394 }
41395
41396 // If that failed and any input is extracted then try to combine as a
41397 // shuffle with the larger type.
41398 return combineX86ShuffleChainWithExtract(
41399 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41400 AllowVariablePerLaneMask, DAG, Subtarget);
41401}
41402
41403/// Helper entry wrapper to combineX86ShufflesRecursively.
41404static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41405 const X86Subtarget &Subtarget) {
41406 return combineX86ShufflesRecursively(
41407 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41408 /*HasVarMask*/ false,
41409 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41410 Subtarget);
41411}
41412
41413/// Get the PSHUF-style mask from PSHUF node.
41414///
41415 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41416/// PSHUF-style masks that can be reused with such instructions.
41417static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41418 MVT VT = N.getSimpleValueType();
41419 SmallVector<int, 4> Mask;
41420 SmallVector<SDValue, 2> Ops;
41421 bool HaveMask =
41422 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41423 (void)HaveMask;
41424 assert(HaveMask);
41425
41426 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41427 // matter. Check that the upper masks are repeats and remove them.
41428 if (VT.getSizeInBits() > 128) {
41429 int LaneElts = 128 / VT.getScalarSizeInBits();
41430#ifndef NDEBUG
41431 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41432 for (int j = 0; j < LaneElts; ++j)
41433 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41434 "Mask doesn't repeat in high 128-bit lanes!");
41435#endif
41436 Mask.resize(LaneElts);
41437 }
41438
41439 switch (N.getOpcode()) {
41440 case X86ISD::PSHUFD:
41441 return Mask;
41442 case X86ISD::PSHUFLW:
41443 Mask.resize(4);
41444 return Mask;
41445 case X86ISD::PSHUFHW:
41446 Mask.erase(Mask.begin(), Mask.begin() + 4);
41447 for (int &M : Mask)
41448 M -= 4;
41449 return Mask;
41450 default:
41451 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41451)
;
41452 }
41453}
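For reference, a PSHUFD/PSHUFLW/PSHUFHW immediate packs four 2-bit lane selectors. A minimal standalone sketch of decoding one such immediate (illustrative only; the function above instead goes through getTargetShuffleMask, and the helper name decodePSHUFImm is hypothetical):

#include <array>
#include <cassert>

// Decode an 8-bit PSHUF-style immediate into four 2-bit lane selectors.
static std::array<int, 4> decodePSHUFImm(unsigned Imm) {
  std::array<int, 4> Mask;
  for (int i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
  return Mask;
}

int main() {
  // 0x1B selects elements {3, 2, 1, 0}: a full reverse of the four lanes.
  assert((decodePSHUFImm(0x1B) == std::array<int, 4>{3, 2, 1, 0}));
}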
41454
41455/// Search for a combinable shuffle across a chain ending in pshufd.
41456///
41457/// We walk up the chain and look for a combinable shuffle, skipping over
41458/// shuffles that we could hoist this shuffle's transformation past without
41459/// altering anything.
41460static SDValue
41461combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41462 SelectionDAG &DAG) {
41463 assert(N.getOpcode() == X86ISD::PSHUFD &&
41464 "Called with something other than an x86 128-bit half shuffle!");
41465 SDLoc DL(N);
41466
41467 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41468 // of the shuffles in the chain so that we can form a fresh chain to replace
41469 // this one.
41470 SmallVector<SDValue, 8> Chain;
41471 SDValue V = N.getOperand(0);
41472 for (; V.hasOneUse(); V = V.getOperand(0)) {
41473 switch (V.getOpcode()) {
41474 default:
41475 return SDValue(); // Nothing combined!
41476
41477 case ISD::BITCAST:
41478 // Skip bitcasts as we always know the type for the target specific
41479 // instructions.
41480 continue;
41481
41482 case X86ISD::PSHUFD:
41483 // Found another dword shuffle.
41484 break;
41485
41486 case X86ISD::PSHUFLW:
41487 // Check that the low words (being shuffled) are the identity in the
41488 // dword shuffle, and the high words are self-contained.
41489 if (Mask[0] != 0 || Mask[1] != 1 ||
41490 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41491 return SDValue();
41492
41493 Chain.push_back(V);
41494 continue;
41495
41496 case X86ISD::PSHUFHW:
41497 // Check that the high words (being shuffled) are the identity in the
41498 // dword shuffle, and the low words are self-contained.
41499 if (Mask[2] != 2 || Mask[3] != 3 ||
41500 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41501 return SDValue();
41502
41503 Chain.push_back(V);
41504 continue;
41505
41506 case X86ISD::UNPCKL:
41507 case X86ISD::UNPCKH:
41508 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41509 // shuffle into a preceding word shuffle.
41510 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41511 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41512 return SDValue();
41513
41514 // Search for a half-shuffle which we can combine with.
41515 unsigned CombineOp =
41516 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41517 if (V.getOperand(0) != V.getOperand(1) ||
41518 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41519 return SDValue();
41520 Chain.push_back(V);
41521 V = V.getOperand(0);
41522 do {
41523 switch (V.getOpcode()) {
41524 default:
41525 return SDValue(); // Nothing to combine.
41526
41527 case X86ISD::PSHUFLW:
41528 case X86ISD::PSHUFHW:
41529 if (V.getOpcode() == CombineOp)
41530 break;
41531
41532 Chain.push_back(V);
41533
41534 [[fallthrough]];
41535 case ISD::BITCAST:
41536 V = V.getOperand(0);
41537 continue;
41538 }
41539 break;
41540 } while (V.hasOneUse());
41541 break;
41542 }
41543 // Break out of the loop if we break out of the switch.
41544 break;
41545 }
41546
41547 if (!V.hasOneUse())
41548 // We fell out of the loop without finding a viable combining instruction.
41549 return SDValue();
41550
41551 // Merge this node's mask and our incoming mask.
41552 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41553 for (int &M : Mask)
41554 M = VMask[M];
41555 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41556 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41557
41558 // Rebuild the chain around this new shuffle.
41559 while (!Chain.empty()) {
41560 SDValue W = Chain.pop_back_val();
41561
41562 if (V.getValueType() != W.getOperand(0).getValueType())
41563 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41564
41565 switch (W.getOpcode()) {
41566 default:
41567 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41567)
;
41568
41569 case X86ISD::UNPCKL:
41570 case X86ISD::UNPCKH:
41571 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41572 break;
41573
41574 case X86ISD::PSHUFD:
41575 case X86ISD::PSHUFLW:
41576 case X86ISD::PSHUFHW:
41577 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41578 break;
41579 }
41580 }
41581 if (V.getValueType() != N.getValueType())
41582 V = DAG.getBitcast(N.getValueType(), V);
41583
41584 // Return the new chain to replace N.
41585 return V;
41586}
41587
41588// Attempt to commute shufps LHS loads:
41589// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41590static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41591 SelectionDAG &DAG) {
41592 // TODO: Add vXf64 support.
41593 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41594 return SDValue();
41595
41596 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41597 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41598 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41599 return SDValue();
41600 SDValue N0 = V.getOperand(0);
41601 SDValue N1 = V.getOperand(1);
41602 unsigned Imm = V.getConstantOperandVal(2);
41603 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41604 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41605 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41606 return SDValue();
41607 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41608 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41609 DAG.getTargetConstant(Imm, DL, MVT::i8));
41610 };
41611
41612 switch (N.getOpcode()) {
41613 case X86ISD::VPERMILPI:
41614 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41615 unsigned Imm = N.getConstantOperandVal(1);
41616 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41617 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41618 }
41619 break;
41620 case X86ISD::SHUFP: {
41621 SDValue N0 = N.getOperand(0);
41622 SDValue N1 = N.getOperand(1);
41623 unsigned Imm = N.getConstantOperandVal(2);
41624 if (N0 == N1) {
41625 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41626 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41627 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41628 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41629 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41630 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41631 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41632 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41633 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41634 }
41635 break;
41636 }
41637 }
41638
41639 return SDValue();
41640}
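The commute at line 41607 swaps the two nibbles of the SHUFP immediate because the low nibble indexes the first source and the high nibble the second; the callers then adjust their own immediates (the ^0xAA / ^0x0A / ^0xA0 above) to account for the result halves changing places. A minimal standalone sketch of the nibble swap itself (the helper name is hypothetical):

#include <cassert>

// Swap the low and high nibbles of a SHUFP immediate, mirroring
// Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4) at line 41607.
static unsigned swapSHUFPImmNibbles(unsigned Imm) {
  return ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
}

int main() {
  // 0xB1 (low nibble 0x1 for src1, high nibble 0xB for src2) becomes 0x1B.
  assert(swapSHUFPImmNibbles(0xB1) == 0x1B);
  // Swapping twice restores the original immediate.
  assert(swapSHUFPImmNibbles(swapSHUFPImmNibbles(0x4E)) == 0x4E);
}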
41641
41642// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41643static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41644 const SDLoc &DL) {
41645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41646 EVT ShuffleVT = N.getValueType();
41647
41648 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41649 // AllZeros/AllOnes constants are freely shuffled and will peek through
41650 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41651 // merge with target shuffles if it has one use so shuffle combining is
41652 // likely to kick in. Shuffles of splats are expected to be removed.
41653 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41654 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41655 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41656 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41657 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41658 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41659 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41660 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41661 };
41662 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41663 // Ensure we only shuffle whole vector src elements, unless it's a logical
41664 // binop where we can more aggressively move shuffles from dst to src.
41665 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41666 BinOp == X86ISD::ANDNP ||
41667 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41668 };
41669
41670 unsigned Opc = N.getOpcode();
41671 switch (Opc) {
41672 // Unary and Unary+Permute Shuffles.
41673 case X86ISD::PSHUFB: {
41674 // Don't merge PSHUFB if it contains zero'd elements.
41675 SmallVector<int> Mask;
41676 SmallVector<SDValue> Ops;
41677 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41678 Mask))
41679 break;
41680 [[fallthrough]];
41681 }
41682 case X86ISD::VBROADCAST:
41683 case X86ISD::MOVDDUP:
41684 case X86ISD::PSHUFD:
41685 case X86ISD::PSHUFHW:
41686 case X86ISD::PSHUFLW:
41687 case X86ISD::VPERMI:
41688 case X86ISD::VPERMILPI: {
41689 if (N.getOperand(0).getValueType() == ShuffleVT &&
41690 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41691 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41692 unsigned SrcOpcode = N0.getOpcode();
41693 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41694 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41695 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41696 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41697 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41698 SDValue LHS, RHS;
41699 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41700 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41701 if (N.getNumOperands() == 2) {
41702 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41703 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41704 } else {
41705 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41706 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41707 }
41708 EVT OpVT = N0.getValueType();
41709 return DAG.getBitcast(ShuffleVT,
41710 DAG.getNode(SrcOpcode, DL, OpVT,
41711 DAG.getBitcast(OpVT, LHS),
41712 DAG.getBitcast(OpVT, RHS)));
41713 }
41714 }
41715 }
41716 break;
41717 }
41718 // Binary and Binary+Permute Shuffles.
41719 case X86ISD::INSERTPS: {
41720 // Don't merge INSERTPS if it contains zero'd elements.
41721 unsigned InsertPSMask = N.getConstantOperandVal(2);
41722 unsigned ZeroMask = InsertPSMask & 0xF;
41723 if (ZeroMask != 0)
41724 break;
41725 [[fallthrough]];
41726 }
41727 case X86ISD::MOVSD:
41728 case X86ISD::MOVSS:
41729 case X86ISD::BLENDI:
41730 case X86ISD::SHUFP:
41731 case X86ISD::UNPCKH:
41732 case X86ISD::UNPCKL: {
41733 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41734 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41735 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41736 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41737 unsigned SrcOpcode = N0.getOpcode();
41738 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41739 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41740 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41741 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41742 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41743 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41744 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41745 // Ensure the total number of shuffles doesn't increase by folding this
41746 // shuffle through to the source ops.
41747 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41748 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41749 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41750 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41751 SDValue LHS, RHS;
41752 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41753 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41754 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41755 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41756 if (N.getNumOperands() == 3) {
41757 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41758 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41759 } else {
41760 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41761 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41762 }
41763 EVT OpVT = N0.getValueType();
41764 return DAG.getBitcast(ShuffleVT,
41765 DAG.getNode(SrcOpcode, DL, OpVT,
41766 DAG.getBitcast(OpVT, LHS),
41767 DAG.getBitcast(OpVT, RHS)));
41768 }
41769 }
41770 }
41771 break;
41772 }
41773 }
41774 return SDValue();
41775}
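The SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)) canonicalization is legal because an elementwise binop commutes with a permutation applied uniformly to both operands; a minimal standalone sketch with plain arrays:

#include <array>
#include <cassert>

int main() {
  std::array<unsigned, 4> X = {1, 2, 3, 4}, Y = {10, 20, 30, 40};
  std::array<int, 4> Mask = {3, 1, 0, 2}; // same permutation on both sides

  // SHUFFLE(BINOP(X, Y)): add first, then permute the result.
  std::array<unsigned, 4> Z, ShuffledZ;
  for (int i = 0; i != 4; ++i)
    Z[i] = X[i] + Y[i];
  for (int i = 0; i != 4; ++i)
    ShuffledZ[i] = Z[Mask[i]];

  // BINOP(SHUFFLE(X), SHUFFLE(Y)): permute both sources, then add.
  std::array<unsigned, 4> Alt;
  for (int i = 0; i != 4; ++i)
    Alt[i] = X[Mask[i]] + Y[Mask[i]];

  assert(ShuffledZ == Alt);
}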
41776
41777/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41778static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41779 SelectionDAG &DAG,
41780 const SDLoc &DL) {
41781 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41782
41783 MVT VT = V.getSimpleValueType();
41784 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41785 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41786 unsigned SrcOpc0 = Src0.getOpcode();
41787 unsigned SrcOpc1 = Src1.getOpcode();
41788 EVT SrcVT0 = Src0.getValueType();
41789 EVT SrcVT1 = Src1.getValueType();
41790
41791 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41792 return SDValue();
41793
41794 switch (SrcOpc0) {
41795 case X86ISD::MOVDDUP: {
41796 SDValue LHS = Src0.getOperand(0);
41797 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41798 SDValue Res =
41799 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41800 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41801 return DAG.getBitcast(VT, Res);
41802 }
41803 case X86ISD::VPERMILPI:
41804 // TODO: Handle v4f64 permutes with different low/high lane masks.
41805 if (SrcVT0 == MVT::v4f64) {
41806 uint64_t Mask = Src0.getConstantOperandVal(1);
41807 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41808 break;
41809 }
41810 [[fallthrough]];
41811 case X86ISD::VSHLI:
41812 case X86ISD::VSRLI:
41813 case X86ISD::VSRAI:
41814 case X86ISD::PSHUFD:
41815 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41816 SDValue LHS = Src0.getOperand(0);
41817 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41818 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41819 V.getOperand(2));
41820 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41821 return DAG.getBitcast(VT, Res);
41822 }
41823 break;
41824 }
41825
41826 return SDValue();
41827}
41828
41829/// Try to combine x86 target specific shuffles.
41830static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41831 TargetLowering::DAGCombinerInfo &DCI,
41832 const X86Subtarget &Subtarget) {
41833 SDLoc DL(N);
41834 MVT VT = N.getSimpleValueType();
41835 SmallVector<int, 4> Mask;
41836 unsigned Opcode = N.getOpcode();
41837
41838 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41839 return R;
41840
41841 // Handle specific target shuffles.
41842 switch (Opcode) {
41843 case X86ISD::MOVDDUP: {
41844 SDValue Src = N.getOperand(0);
41845 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41846 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41847 ISD::isNormalLoad(Src.getNode())) {
41848 LoadSDNode *LN = cast<LoadSDNode>(Src);
41849 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41850 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41851 DCI.CombineTo(N.getNode(), Movddup);
41852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41853 DCI.recursivelyDeleteUnusedNodes(LN);
41854 return N; // Return N so it doesn't get rechecked!
41855 }
41856 }
41857
41858 return SDValue();
41859 }
41860 case X86ISD::VBROADCAST: {
41861 SDValue Src = N.getOperand(0);
41862 SDValue BC = peekThroughBitcasts(Src);
41863 EVT SrcVT = Src.getValueType();
41864 EVT BCVT = BC.getValueType();
41865
41866 // If broadcasting from another shuffle, attempt to simplify it.
41867 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41868 if (isTargetShuffle(BC.getOpcode()) &&
41869 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41870 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41871 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41872 SM_SentinelUndef);
41873 for (unsigned i = 0; i != Scale; ++i)
41874 DemandedMask[i] = i;
41875 if (SDValue Res = combineX86ShufflesRecursively(
41876 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41877 X86::MaxShuffleCombineDepth,
41878 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41879 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41880 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41881 DAG.getBitcast(SrcVT, Res));
41882 }
41883
41884 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41885 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41886 if (Src.getOpcode() == ISD::BITCAST &&
41887 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41888 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41889 FixedVectorType::isValidElementType(
41890 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41891 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41892 VT.getVectorNumElements());
41893 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41894 }
41895
41896 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41897 // If we're re-broadcasting a smaller type then broadcast with that type and
41898 // bitcast.
41899 // TODO: Do this for any splat?
41900 if (Src.getOpcode() == ISD::BITCAST &&
41901 (BC.getOpcode() == X86ISD::VBROADCAST ||
41902 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41903 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41904 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41905 MVT NewVT =
41906 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41907 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41908 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41909 }
41910
41911 // Reduce broadcast source vector to lowest 128-bits.
41912 if (SrcVT.getSizeInBits() > 128)
41913 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41914 extract128BitVector(Src, 0, DAG, DL));
41915
41916 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41917 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41918 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41919
41920 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41921 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41922 isNullConstant(Src.getOperand(1)) &&
41923 DAG.getTargetLoweringInfo().isTypeLegal(
41924 Src.getOperand(0).getValueType()))
41925 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41926
41927 // Share broadcast with the longest vector and extract low subvector (free).
41928 // Ensure the same SDValue from the SDNode use is being used.
41929 for (SDNode *User : Src->uses())
41930 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41931 Src == User->getOperand(0) &&
41932 User->getValueSizeInBits(0).getFixedValue() >
41933 VT.getFixedSizeInBits()) {
41934 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41935 VT.getSizeInBits());
41936 }
41937
41938 // vbroadcast(scalarload X) -> vbroadcast_load X
41939 // For float loads, extract other uses of the scalar from the broadcast.
41940 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41941 ISD::isNormalLoad(Src.getNode())) {
41942 LoadSDNode *LN = cast<LoadSDNode>(Src);
41943 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41944 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41945 SDValue BcastLd =
41946 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41947 LN->getMemoryVT(), LN->getMemOperand());
41948 // If the load value is used only by N, replace it via CombineTo N.
41949 bool NoReplaceExtract = Src.hasOneUse();
41950 DCI.CombineTo(N.getNode(), BcastLd);
41951 if (NoReplaceExtract) {
41952 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41953 DCI.recursivelyDeleteUnusedNodes(LN);
41954 } else {
41955 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41956 DAG.getIntPtrConstant(0, DL));
41957 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41958 }
41959 return N; // Return N so it doesn't get rechecked!
41960 }
41961
41962 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41963 // i16. So shrink it ourselves if we can make a broadcast_load.
41964 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41965 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41966 assert(Subtarget.hasAVX2() && "Expected AVX2");
41967 SDValue TruncIn = Src.getOperand(0);
41968
41969 // If this is a truncate of a non-extending load, we can just narrow it to
41970 // use a broadcast_load.
41971 if (ISD::isNormalLoad(TruncIn.getNode())) {
41972 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41974 // Unless it's volatile or atomic.
41974 if (LN->isSimple()) {
41975 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41976 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41977 SDValue BcastLd = DAG.getMemIntrinsicNode(
41978 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41979 LN->getPointerInfo(), LN->getOriginalAlign(),
41980 LN->getMemOperand()->getFlags());
41981 DCI.CombineTo(N.getNode(), BcastLd);
41982 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41983 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41984 return N; // Return N so it doesn't get rechecked!
41985 }
41986 }
41987
41988 // If this is a truncate of an i16 extload, we can directly replace it.
41989 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41990 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41991 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41992 if (LN->getMemoryVT().getSizeInBits() == 16) {
41993 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41994 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41995 SDValue BcastLd =
41996 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41997 LN->getMemoryVT(), LN->getMemOperand());
41998 DCI.CombineTo(N.getNode(), BcastLd);
41999 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42000 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42001 return N; // Return N so it doesn't get rechecked!
42002 }
42003 }
42004
42005 // If this is a truncate of a load that has been shifted right, we can
42006 // offset the pointer and use a narrower load.
42007 if (TruncIn.getOpcode() == ISD::SRL &&
42008 TruncIn.getOperand(0).hasOneUse() &&
42009 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42010 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42011 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42012 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42013 // Make sure the shift amount and the load size are divisible by 16.
42014 // Don't do this if the load is volatile or atomic.
42015 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42016 LN->isSimple()) {
42017 unsigned Offset = ShiftAmt / 8;
42018 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42019 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42020 TypeSize::Fixed(Offset), DL);
42021 SDValue Ops[] = { LN->getChain(), Ptr };
42022 SDValue BcastLd = DAG.getMemIntrinsicNode(
42023 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42024 LN->getPointerInfo().getWithOffset(Offset),
42025 LN->getOriginalAlign(),
42026 LN->getMemOperand()->getFlags());
42027 DCI.CombineTo(N.getNode(), BcastLd);
42028 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42029 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42030 return N; // Return N so it doesn't get rechecked!
42031 }
42032 }
42033 }
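The Offset = ShiftAmt / 8 computation above relies on the little-endian equivalence between a right shift of a wide load and a narrower load at a byte offset; a minimal standalone sketch (assumes a little-endian host, as on x86):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // On a little-endian target, (uint16_t)(Load32(p) >> 16) equals
  // Load16(p + 16/8), which is why the base pointer can simply be offset.
  uint32_t Wide = 0xAABBCCDD;
  unsigned char Bytes[4];
  std::memcpy(Bytes, &Wide, 4);

  uint16_t FromShift = static_cast<uint16_t>(Wide >> 16);
  uint16_t FromOffsetLoad;
  std::memcpy(&FromOffsetLoad, Bytes + 16 / 8, 2);
  assert(FromShift == FromOffsetLoad);
}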
42034
42035 // vbroadcast(vzload X) -> vbroadcast_load X
42036 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42037 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42038 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42039 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42040 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42041 SDValue BcastLd =
42042 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42043 LN->getMemoryVT(), LN->getMemOperand());
42044 DCI.CombineTo(N.getNode(), BcastLd);
42045 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42046 DCI.recursivelyDeleteUnusedNodes(LN);
42047 return N; // Return N so it doesn't get rechecked!
42048 }
42049 }
42050
42051 // vbroadcast(vector load X) -> vbroadcast_load
42052 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
42053 SrcVT == MVT::v4i32) &&
42054 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42055 LoadSDNode *LN = cast<LoadSDNode>(Src);
42056 // Unless the load is volatile or atomic.
42057 if (LN->isSimple()) {
42058 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42059 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42060 SDValue BcastLd = DAG.getMemIntrinsicNode(
42061 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42062 LN->getPointerInfo(), LN->getOriginalAlign(),
42063 LN->getMemOperand()->getFlags());
42064 DCI.CombineTo(N.getNode(), BcastLd);
42065 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42066 DCI.recursivelyDeleteUnusedNodes(LN);
42067 return N; // Return N so it doesn't get rechecked!
42068 }
42069 }
42070
42071 return SDValue();
42072 }
42073 case X86ISD::VZEXT_MOVL: {
42074 SDValue N0 = N.getOperand(0);
42075
42076 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42077 // the load is volatile.
42078 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42079 auto *LN = cast<LoadSDNode>(N0);
42080 if (SDValue VZLoad =
42081 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42082 DCI.CombineTo(N.getNode(), VZLoad);
42083 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42084 DCI.recursivelyDeleteUnusedNodes(LN);
42085 return N;
42086 }
42087 }
42088
42089 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42090 // and can just use a VZEXT_LOAD.
42091 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42092 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42093 auto *LN = cast<MemSDNode>(N0);
42094 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42095 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42096 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42097 SDValue VZLoad =
42098 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42099 LN->getMemoryVT(), LN->getMemOperand());
42100 DCI.CombineTo(N.getNode(), VZLoad);
42101 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42102 DCI.recursivelyDeleteUnusedNodes(LN);
42103 return N;
42104 }
42105 }
42106
42107 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42108 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42109 // if the upper bits of the i64 are zero.
42110 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42111 N0.getOperand(0).hasOneUse() &&
42112 N0.getOperand(0).getValueType() == MVT::i64) {
42113 SDValue In = N0.getOperand(0);
42114 APInt Mask = APInt::getHighBitsSet(64, 32);
42115 if (DAG.MaskedValueIsZero(In, Mask)) {
42116 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42117 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
42118 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42119 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42120 return DAG.getBitcast(VT, Movl);
42121 }
42122 }
42123
42124 // Load a scalar integer constant directly to XMM instead of transferring an
42125 // immediate value from GPR.
42126 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42127 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42128 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42129 // Create a vector constant - scalar constant followed by zeros.
42130 EVT ScalarVT = N0.getOperand(0).getValueType();
42131 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42132 unsigned NumElts = VT.getVectorNumElements();
42133 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42134 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42135 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42136
42137 // Load the vector constant from constant pool.
42138 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
42139 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42140 MachinePointerInfo MPI =
42141 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42142 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42143 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42144 MachineMemOperand::MOLoad);
42145 }
42146 }
42147
42148 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42149 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42150 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42151 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42152 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42153 SDValue V = peekThroughOneUseBitcasts(N0);
42154
42155 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42156 isNullConstant(V.getOperand(2))) {
42157 SDValue In = V.getOperand(1);
42158 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42159 In.getValueSizeInBits() /
42160 VT.getScalarSizeInBits());
42161 In = DAG.getBitcast(SubVT, In);
42162 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42163 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42164 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42165 V.getOperand(2));
42166 }
42167 }
42168
42169 return SDValue();
42170 }
42171 case X86ISD::BLENDI: {
42172 SDValue N0 = N.getOperand(0);
42173 SDValue N1 = N.getOperand(1);
42174
42175 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42176 // TODO: Handle MVT::v16i16 repeated blend mask.
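// E.g. (illustrative) a v4f64 blend with mask 0b0101 over bitcasts of v8f32
// sources becomes a v8f32 blend with the mask scaled by 2 to 0b00110011
// (0x33), followed by a bitcast back to v4f64.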
42177 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
42178 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42179 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42180 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
42181 SrcVT.getScalarSizeInBits() >= 32) {
42182 unsigned BlendMask = N.getConstantOperandVal(2);
42183 unsigned Size = VT.getVectorNumElements();
42184 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
42185 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
42186 return DAG.getBitcast(
42187 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42188 N1.getOperand(0),
42189 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
42190 }
42191 }
42192 return SDValue();
42193 }
42194 case X86ISD::SHUFP: {
42195 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42196 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42197 // TODO: Support types other than v4f32.
42198 if (VT == MVT::v4f32) {
42199 bool Updated = false;
42200 SmallVector<int> Mask;
42201 SmallVector<SDValue> Ops;
42202 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
42203 Ops.size() == 2) {
42204 for (int i = 0; i != 2; ++i) {
42205 SmallVector<SDValue> SubOps;
42206 SmallVector<int> SubMask, SubScaledMask;
42207 SDValue Sub = peekThroughBitcasts(Ops[i]);
42208 // TODO: Scaling might be easier if we specify the demanded elts.
42209 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42210 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42211 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42212 int Ofs = i * 2;
42213 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42214 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42215 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42216 Updated = true;
42217 }
42218 }
42219 }
42220 if (Updated) {
42221 for (int &M : Mask)
42222 M %= 4;
42223 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42224 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42225 }
42226 }
42227 return SDValue();
42228 }
42229 case X86ISD::VPERMI: {
42230 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42231 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42232 SDValue N0 = N.getOperand(0);
42233 SDValue N1 = N.getOperand(1);
42234 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42235 if (N0.getOpcode() == ISD::BITCAST &&
42236 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42237 SDValue Src = N0.getOperand(0);
42238 EVT SrcVT = Src.getValueType();
42239 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42240 return DAG.getBitcast(VT, Res);
42241 }
42242 return SDValue();
42243 }
42244 case X86ISD::VPERM2X128: {
42245 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42246 SDValue LHS = N->getOperand(0);
42247 SDValue RHS = N->getOperand(1);
42248 if (LHS.getOpcode() == ISD::BITCAST &&
42249 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42250 EVT SrcVT = LHS.getOperand(0).getValueType();
42251 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42252 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42253 DAG.getBitcast(SrcVT, LHS),
42254 DAG.getBitcast(SrcVT, RHS),
42255 N->getOperand(2)));
42256 }
42257 }
42258
42259 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42260 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42261 return Res;
42262
42263 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42264 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
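// E.g. (illustrative) with Imm == 0x31 the low half selects lane 1 of the
// first concat (Y) and the high half selects lane 1 of the second concat (W),
// giving concat(Y, W) with no cross-lane shuffle needed.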
42265 auto FindSubVector128 = [&](unsigned Idx) {
42266 if (Idx > 3)
42267 return SDValue();
42268 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42269 SmallVector<SDValue> SubOps;
42270 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42271 return SubOps[Idx & 1];
42272 unsigned NumElts = Src.getValueType().getVectorNumElements();
42273 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42274 Src.getOperand(1).getValueSizeInBits() == 128 &&
42275 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42276 return Src.getOperand(1);
42277 }
42278 return SDValue();
42279 };
42280 unsigned Imm = N.getConstantOperandVal(2);
42281 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42282 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42283 MVT SubVT = VT.getHalfNumVectorElementsVT();
42284 SubLo = DAG.getBitcast(SubVT, SubLo);
42285 SubHi = DAG.getBitcast(SubVT, SubHi);
42286 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42287 }
42288 }
42289 return SDValue();
42290 }
42291 case X86ISD::PSHUFD:
42292 case X86ISD::PSHUFLW:
42293 case X86ISD::PSHUFHW: {
42294 SDValue N0 = N.getOperand(0);
42295 SDValue N1 = N.getOperand(1);
42296 if (N0->hasOneUse()) {
42297 SDValue V = peekThroughOneUseBitcasts(N0);
42298 switch (V.getOpcode()) {
42299 case X86ISD::VSHL:
42300 case X86ISD::VSRL:
42301 case X86ISD::VSRA:
42302 case X86ISD::VSHLI:
42303 case X86ISD::VSRLI:
42304 case X86ISD::VSRAI:
42305 case X86ISD::VROTLI:
42306 case X86ISD::VROTRI: {
42307 MVT InnerVT = V.getSimpleValueType();
42308 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42309 SDValue Res = DAG.getNode(Opcode, DL, VT,
42310 DAG.getBitcast(VT, V.getOperand(0)), N1);
42311 Res = DAG.getBitcast(InnerVT, Res);
42312 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42313 return DAG.getBitcast(VT, Res);
42314 }
42315 break;
42316 }
42317 }
42318 }
42319
42320 Mask = getPSHUFShuffleMask(N);
42321 assert(Mask.size() == 4);
42322 break;
42323 }
42324 case X86ISD::MOVSD:
42325 case X86ISD::MOVSH:
42326 case X86ISD::MOVSS: {
42327 SDValue N0 = N.getOperand(0);
42328 SDValue N1 = N.getOperand(1);
42329
42330 // Canonicalize scalar FPOps:
42331 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42332 // If commutable, allow OP(N1[0], N0[0]).
42333 unsigned Opcode1 = N1.getOpcode();
42334 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42335 Opcode1 == ISD::FDIV) {
42336 SDValue N10 = N1.getOperand(0);
42337 SDValue N11 = N1.getOperand(1);
42338 if (N10 == N0 ||
42339 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42340 if (N10 != N0)
42341 std::swap(N10, N11);
42342 MVT SVT = VT.getVectorElementType();
42343 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42344 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42345 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42346 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42347 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42348 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42349 }
42350 }
42351
42352 return SDValue();
42353 }
42354 case X86ISD::INSERTPS: {
42355 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42356 SDValue Op0 = N.getOperand(0);
42357 SDValue Op1 = N.getOperand(1);
42358 unsigned InsertPSMask = N.getConstantOperandVal(2);
42359 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42360 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42361 unsigned ZeroMask = InsertPSMask & 0xF;
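// E.g. (illustrative) InsertPSMask == 0x9C decodes to SrcIdx = 2, DstIdx = 1
// and ZeroMask = 0b1100: take element 2 of Op1, insert it into lane 1 of Op0,
// and zero lanes 2 and 3 of the result.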
42362
42363 // If we zero out all elements from Op0 then we don't need to reference it.
42364 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42365 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42366 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42367
42368 // If we zero out the element from Op1 then we don't need to reference it.
42369 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42370 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42371 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42372
42373 // Attempt to merge insertps Op1 with an inner target shuffle node.
42374 SmallVector<int, 8> TargetMask1;
42375 SmallVector<SDValue, 2> Ops1;
42376 APInt KnownUndef1, KnownZero1;
42377 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42378 KnownZero1)) {
42379 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42380 // Zero/UNDEF insertion - zero out element and remove dependency.
42381 InsertPSMask |= (1u << DstIdx);
42382 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42383 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42384 }
42385 // Update insertps mask srcidx and reference the source input directly.
42386 int M = TargetMask1[SrcIdx];
42387 assert(0 <= M && M < 8 && "Shuffle index out of range");
42388 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42389 Op1 = Ops1[M < 4 ? 0 : 1];
42390 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42391 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42392 }
42393
42394 // Attempt to merge insertps Op0 with an inner target shuffle node.
42395 SmallVector<int, 8> TargetMask0;
42396 SmallVector<SDValue, 2> Ops0;
42397 APInt KnownUndef0, KnownZero0;
42398 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42399 KnownZero0)) {
42400 bool Updated = false;
42401 bool UseInput00 = false;
42402 bool UseInput01 = false;
42403 for (int i = 0; i != 4; ++i) {
42404 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42405 // No change if element is already zero or the inserted element.
42406 continue;
42407 }
42408
42409 if (KnownUndef0[i] || KnownZero0[i]) {
42410 // If the target mask is undef/zero then we must zero the element.
42411 InsertPSMask |= (1u << i);
42412 Updated = true;
42413 continue;
42414 }
42415
42416 // The input vector element must be inline.
42417 int M = TargetMask0[i];
42418 if (M != i && M != (i + 4))
42419 return SDValue();
42420
42421 // Determine which inputs of the target shuffle we're using.
42422 UseInput00 |= (0 <= M && M < 4);
42423 UseInput01 |= (4 <= M);
42424 }
42425
42426 // If we're not using both inputs of the target shuffle then use the
42427 // referenced input directly.
42428 if (UseInput00 && !UseInput01) {
42429 Updated = true;
42430 Op0 = Ops0[0];
42431 } else if (!UseInput00 && UseInput01) {
42432 Updated = true;
42433 Op0 = Ops0[1];
42434 }
42435
42436 if (Updated)
42437 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42438 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42439 }
42440
42441 // If we're inserting an element from a vbroadcast load, fold the
42442 // load into the X86insertps instruction. We need to convert the scalar
42443 // load to a vector and clear the source lane of the INSERTPS control.
42444 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42445 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42446 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42447 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42448 MemIntr->getBasePtr(),
42449 MemIntr->getMemOperand());
42450 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42451 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42452 Load),
42453 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42454 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42455 return Insert;
42456 }
42457 }
42458
42459 return SDValue();
42460 }
42461 default:
42462 return SDValue();
42463 }
42464
42465 // Nuke no-op shuffles that show up after combining.
42466 if (isNoopShuffleMask(Mask))
42467 return N.getOperand(0);
42468
42469 // Look for simplifications involving one or two shuffle instructions.
42470 SDValue V = N.getOperand(0);
42471 switch (N.getOpcode()) {
42472 default:
42473 break;
42474 case X86ISD::PSHUFLW:
42475 case X86ISD::PSHUFHW:
42476 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42477
42478 // See if this reduces to a PSHUFD which is no more expensive and can
42479 // combine with more operations. Note that it has to at least flip the
42480 // dwords as otherwise it would have been removed as a no-op.
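// E.g. (illustrative) a PSHUFLW with mask <2,3,0,1> swaps the two low dwords,
// which is the same as a PSHUFD with dword mask <1,0,2,3>.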
42481 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42482 int DMask[] = {0, 1, 2, 3};
42483 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42484 DMask[DOffset + 0] = DOffset + 1;
42485 DMask[DOffset + 1] = DOffset + 0;
42486 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42487 V = DAG.getBitcast(DVT, V);
42488 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42489 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42490 return DAG.getBitcast(VT, V);
42491 }
42492
42493 // Look for shuffle patterns which can be implemented as a single unpack.
42494 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42495 // only works when we have a PSHUFD followed by two half-shuffles.
42496 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42497 (V.getOpcode() == X86ISD::PSHUFLW ||
42498 V.getOpcode() == X86ISD::PSHUFHW) &&
42499 V.getOpcode() != N.getOpcode() &&
42500 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42501 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42502 if (D.getOpcode() == X86ISD::PSHUFD) {
42503 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42504 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42505 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42506 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42507 int WordMask[8];
42508 for (int i = 0; i < 4; ++i) {
42509 WordMask[i + NOffset] = Mask[i] + NOffset;
42510 WordMask[i + VOffset] = VMask[i] + VOffset;
42511 }
42512 // Map the word mask through the DWord mask.
42513 int MappedMask[8];
42514 for (int i = 0; i < 8; ++i)
42515 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42516 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42517 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42518 // We can replace all three shuffles with an unpack.
42519 V = DAG.getBitcast(VT, D.getOperand(0));
42520 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42521 : X86ISD::UNPCKH,
42522 DL, VT, V, V);
42523 }
42524 }
42525 }
42526
42527 break;
42528
42529 case X86ISD::PSHUFD:
42530 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42531 return NewN;
42532
42533 break;
42534 }
42535
42536 return SDValue();
42537}
42538
42539/// Checks if the shuffle mask takes subsequent elements
42540/// alternately from two vectors.
42541/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
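/// In <0, 5, 2, 7> the even elements come from the first source and the odd
/// elements from the second, so \p Op0Even is set; in
/// <8, 1, 10, 3, 12, 5, 14, 7> the roles are swapped and \p Op0Even is clear.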
42542static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42543
42544 int ParitySrc[2] = {-1, -1};
42545 unsigned Size = Mask.size();
42546 for (unsigned i = 0; i != Size; ++i) {
42547 int M = Mask[i];
42548 if (M < 0)
42549 continue;
42550
42551 // Make sure we are using the matching element from the input.
42552 if ((M % Size) != i)
42553 return false;
42554
42555 // Make sure we use the same input for all elements of the same parity.
42556 int Src = M / Size;
42557 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42558 return false;
42559 ParitySrc[i % 2] = Src;
42560 }
42561
42562 // Make sure each input is used.
42563 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42564 return false;
42565
42566 Op0Even = ParitySrc[0] == 0;
42567 return true;
42568}
42569
42570 /// Returns true iff the shuffle node \p N can be replaced with an
42571 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
42572 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
42573 ///
42574 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
42575 /// shuffle nodes so they are easier to match generically. We also insert dummy
42576 /// vector shuffle nodes for the operands which explicitly discard the lanes
42577 /// that are unused by this operation, so that the rest of the combiner can see
42578 /// that they are unused.
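/// For example (illustrative, v4f32): shuffle(fsub(A,B), fadd(A,B), <0,5,2,7>)
/// takes the even lanes from the FSUB and the odd lanes from the FADD, which
/// matches ADDSUB(A,B) (subtract in the even lanes, add in the odd lanes).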
42579static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42580 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42581 bool &IsSubAdd) {
42582
42583 EVT VT = N->getValueType(0);
42584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42585 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42586 !VT.getSimpleVT().isFloatingPoint())
42587 return false;
42588
42589 // We only handle target-independent shuffles.
42590 // FIXME: It would be easy and harmless to use the target shuffle mask
42591 // extraction tool to support more.
42592 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42593 return false;
42594
42595 SDValue V1 = N->getOperand(0);
42596 SDValue V2 = N->getOperand(1);
42597
42598 // Make sure we have an FADD and an FSUB.
42599 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42600 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42601 V1.getOpcode() == V2.getOpcode())
42602 return false;
42603
42604 // If there are other uses of these operations we can't fold them.
42605 if (!V1->hasOneUse() || !V2->hasOneUse())
42606 return false;
42607
42608 // Ensure that both operations have the same operands. Note that we can
42609 // commute the FADD operands.
42610 SDValue LHS, RHS;
42611 if (V1.getOpcode() == ISD::FSUB) {
42612 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42613 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42614 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42615 return false;
42616 } else {
42617 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42618 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42619 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42620 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42621 return false;
42622 }
42623
42624 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42625 bool Op0Even;
42626 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42627 return false;
42628
42629 // It's a subadd if the vector in the even parity is an FADD.
42630 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42631 : V2->getOpcode() == ISD::FADD;
42632
42633 Opnd0 = LHS;
42634 Opnd1 = RHS;
42635 return true;
42636}
42637
42638/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42639static SDValue combineShuffleToFMAddSub(SDNode *N,
42640 const X86Subtarget &Subtarget,
42641 SelectionDAG &DAG) {
42642 // We only handle target-independent shuffles.
42643 // FIXME: It would be easy and harmless to use the target shuffle mask
42644 // extraction tool to support more.
42645 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42646 return SDValue();
42647
42648 MVT VT = N->getSimpleValueType(0);
42649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42650 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42651 return SDValue();
42652
42653 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42654 SDValue Op0 = N->getOperand(0);
42655 SDValue Op1 = N->getOperand(1);
42656 SDValue FMAdd = Op0, FMSub = Op1;
42657 if (FMSub.getOpcode() != X86ISD::FMSUB)
42658 std::swap(FMAdd, FMSub);
42659
42660 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42661 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42662 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42663 FMAdd.getOperand(2) != FMSub.getOperand(2))
42664 return SDValue();
42665
42666 // Check for correct shuffle mask.
42667 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42668 bool Op0Even;
42669 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42670 return SDValue();
42671
42672 // FMAddSub takes zeroth operand from FMSub node.
42673 SDLoc DL(N);
42674 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42675 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42676 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42677 FMAdd.getOperand(2));
42678}
42679
42680/// Try to combine a shuffle into a target-specific add-sub or
42681/// mul-add-sub node.
42682static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42683 const X86Subtarget &Subtarget,
42684 SelectionDAG &DAG) {
42685 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42686 return V;
42687
42688 SDValue Opnd0, Opnd1;
42689 bool IsSubAdd;
42690 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42691 return SDValue();
42692
42693 MVT VT = N->getSimpleValueType(0);
42694 SDLoc DL(N);
42695
42696 // Try to generate X86ISD::FMADDSUB node here.
42697 SDValue Opnd2;
42698 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42699 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42700 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42701 }
42702
42703 if (IsSubAdd)
42704 return SDValue();
42705
42706 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42707 // the ADDSUB idiom has been successfully recognized. There are no known
42708 // X86 targets with 512-bit ADDSUB instructions!
42709 if (VT.is512BitVector())
42710 return SDValue();
42711
42712 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42713 // the ADDSUB idiom has been successfully recognized. There are no known
42714 // X86 targets with FP16 ADDSUB instructions!
42715 if (VT.getVectorElementType() == MVT::f16)
42716 return SDValue();
42717
42718 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42719}
42720
42721// We are looking for a shuffle where both sources are concatenated with undef
42722// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42723// if we can express this as a single-source shuffle, that's preferable.
42724static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42725 const X86Subtarget &Subtarget) {
42726 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42727 return SDValue();
42728
42729 EVT VT = N->getValueType(0);
42730
42731 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42732 if (!VT.is128BitVector() && !VT.is256BitVector())
42733 return SDValue();
42734
42735 if (VT.getVectorElementType() != MVT::i32 &&
42736 VT.getVectorElementType() != MVT::i64 &&
42737 VT.getVectorElementType() != MVT::f32 &&
42738 VT.getVectorElementType() != MVT::f64)
42739 return SDValue();
42740
42741 SDValue N0 = N->getOperand(0);
42742 SDValue N1 = N->getOperand(1);
42743
42744 // Check that both sources are concats with undef.
42745 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42746 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42747 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42748 !N1.getOperand(1).isUndef())
42749 return SDValue();
42750
42751 // Construct the new shuffle mask. Elements from the first source retain their
42752 // index, but elements from the second source no longer need to skip an undef.
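// E.g. (illustrative) for a v8i32 shuffle, index 9 (element 1 of the second
// concat) becomes index 5, which addresses t2's element 1 in the new
// concat(t1, t2).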
42753 SmallVector<int, 8> Mask;
42754 int NumElts = VT.getVectorNumElements();
42755
42756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42757 for (int Elt : SVOp->getMask())
42758 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42759
42760 SDLoc DL(N);
42761 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42762 N1.getOperand(0));
42763 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42764}
42765
42766/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42767/// low half of each source vector and does not set any high half elements in
42768/// the destination vector, narrow the shuffle to half its original size.
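/// For example (illustrative), a v8f32 shuffle with mask
/// <0, 8, 1, 9, -1, -1, -1, -1> only reads the low 128 bits of each source and
/// leaves the upper half undef, so it can be done as a v4f32 shuffle with mask
/// <0, 4, 1, 5> on the low halves.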
42769static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42770 if (!Shuf->getValueType(0).isSimple())
42771 return SDValue();
42772 MVT VT = Shuf->getSimpleValueType(0);
42773 if (!VT.is256BitVector() && !VT.is512BitVector())
42774 return SDValue();
42775
42776 // See if we can ignore all of the high elements of the shuffle.
42777 ArrayRef<int> Mask = Shuf->getMask();
42778 if (!isUndefUpperHalf(Mask))
42779 return SDValue();
42780
42781 // Check if the shuffle mask accesses only the low half of each input vector
42782 // (half-index output is 0 or 2).
42783 int HalfIdx1, HalfIdx2;
42784 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42785 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42786 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42787 return SDValue();
42788
42789 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42790 // The trick is knowing that all of the insert/extract are actually free
42791 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42792 // of narrow inputs into a narrow output, and that is always cheaper than
42793 // the wide shuffle that we started with.
42794 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42795 Shuf->getOperand(1), HalfMask, HalfIdx1,
42796 HalfIdx2, false, DAG, /*UseConcat*/true);
42797}
42798
42799static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42800 TargetLowering::DAGCombinerInfo &DCI,
42801 const X86Subtarget &Subtarget) {
42802 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42803 if (SDValue V = narrowShuffle(Shuf, DAG))
42804 return V;
42805
42806 // If we have legalized the vector types, look for blends of FADD and FSUB
42807 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42808 SDLoc dl(N);
42809 EVT VT = N->getValueType(0);
42810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42811 if (TLI.isTypeLegal(VT))
42812 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42813 return AddSub;
42814
42815 // Attempt to combine into a vector load/broadcast.
42816 if (SDValue LD = combineToConsecutiveLoads(
42817 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42818 return LD;
42819
42820 // For AVX2, we sometimes want to combine
42821 // (vector_shuffle <mask> (concat_vectors t1, undef)
42822 // (concat_vectors t2, undef))
42823 // Into:
42824 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42825 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42826 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42827 return ShufConcat;
42828
42829 if (isTargetShuffle(N->getOpcode())) {
42830 SDValue Op(N, 0);
42831 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42832 return Shuffle;
42833
42834 // Try recursively combining arbitrary sequences of x86 shuffle
42835 // instructions into higher-order shuffles. We do this after combining
42836 // specific PSHUF instruction sequences into their minimal form so that we
42837 // can evaluate how many specialized shuffle instructions are involved in
42838 // a particular chain.
42839 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42840 return Res;
42841
42842 // Simplify source operands based on shuffle mask.
42843 // TODO - merge this into combineX86ShufflesRecursively.
42844 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42845 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42846 return SDValue(N, 0);
42847
42848 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42849 // Perform this after other shuffle combines to allow inner shuffles to be
42850 // combined away first.
42851 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42852 return BinOp;
42853 }
42854
42855 return SDValue();
42856}
42857
42858// Simplify variable target shuffle masks based on the demanded elements.
42859// TODO: Handle DemandedBits in mask indices as well?
42860bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42861 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42862 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42863 // If we're demanding all elements don't bother trying to simplify the mask.
42864 unsigned NumElts = DemandedElts.getBitWidth();
42865 if (DemandedElts.isAllOnes())
42866 return false;
42867
42868 SDValue Mask = Op.getOperand(MaskIndex);
42869 if (!Mask.hasOneUse())
42870 return false;
42871
42872 // Attempt to generically simplify the variable shuffle mask.
42873 APInt MaskUndef, MaskZero;
42874 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42875 Depth + 1))
42876 return true;
42877
42878 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42879 // TODO: Support other types from getTargetShuffleMaskIndices?
42880 SDValue BC = peekThroughOneUseBitcasts(Mask);
42881 EVT BCVT = BC.getValueType();
42882 auto *Load = dyn_cast<LoadSDNode>(BC);
42883 if (!Load)
42884 return false;
42885
42886 const Constant *C = getTargetConstantFromNode(Load);
42887 if (!C)
42888 return false;
42889
42890 Type *CTy = C->getType();
42891 if (!CTy->isVectorTy() ||
42892 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42893 return false;
42894
42895 // Handle scaling for i64 elements on 32-bit targets.
42896 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42897 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42898 return false;
42899 unsigned Scale = NumCstElts / NumElts;
42900
42901 // Simplify mask if we have an undemanded element that is not undef.
42902 bool Simplified = false;
42903 SmallVector<Constant *, 32> ConstVecOps;
42904 for (unsigned i = 0; i != NumCstElts; ++i) {
42905 Constant *Elt = C->getAggregateElement(i);
42906 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42907 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42908 Simplified = true;
42909 continue;
42910 }
42911 ConstVecOps.push_back(Elt);
42912 }
42913 if (!Simplified)
42914 return false;
42915
42916 // Generate new constant pool entry + legalize immediately for the load.
42917 SDLoc DL(Op);
42918 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42919 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42920 SDValue NewMask = TLO.DAG.getLoad(
42921 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42922 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42923 Load->getAlign());
42924 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42925}
42926
42927bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42928 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42929 TargetLoweringOpt &TLO, unsigned Depth) const {
42930 int NumElts = DemandedElts.getBitWidth();
42931 unsigned Opc = Op.getOpcode();
42932 EVT VT = Op.getValueType();
42933
42934 // Handle special case opcodes.
42935 switch (Opc) {
42936 case X86ISD::PMULDQ:
42937 case X86ISD::PMULUDQ: {
42938 APInt LHSUndef, LHSZero;
42939 APInt RHSUndef, RHSZero;
42940 SDValue LHS = Op.getOperand(0);
42941 SDValue RHS = Op.getOperand(1);
42942 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42943 Depth + 1))
42944 return true;
42945 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42946 Depth + 1))
42947 return true;
42948 // Multiply by zero.
42949 KnownZero = LHSZero | RHSZero;
42950 break;
42951 }
42952 case X86ISD::VPMADDWD: {
42953 APInt LHSUndef, LHSZero;
42954 APInt RHSUndef, RHSZero;
42955 SDValue LHS = Op.getOperand(0);
42956 SDValue RHS = Op.getOperand(1);
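// Each result element uses two adjacent source elements, so scale the demanded
// mask up by 2: e.g. (illustrative) demanding only result element 1 of a v4i32
// VPMADDWD demands elements 2 and 3 of the v8i16 sources.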
42957 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42958
42959 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42960 Depth + 1))
42961 return true;
42962 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42963 Depth + 1))
42964 return true;
42965
42966 // TODO: Multiply by zero.
42967
42968 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42969 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42970 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42971 Depth + 1))
42972 return true;
42973 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42974 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42975 Depth + 1))
42976 return true;
42977 break;
42978 }
42979 case X86ISD::PSADBW: {
42980 SDValue LHS = Op.getOperand(0);
42981 SDValue RHS = Op.getOperand(1);
42982 assert(VT.getScalarType() == MVT::i64 &&
42983 LHS.getValueType() == RHS.getValueType() &&
42984 LHS.getValueType().getScalarType() == MVT::i8 &&
42985 "Unexpected PSADBW types");
42986
42987 // Aggressively peek through ops to get at the demanded elts.
42988 if (!DemandedElts.isAllOnes()) {
42989 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42990 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42991 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42992 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42993 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42994 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42995 if (NewLHS || NewRHS) {
42996 NewLHS = NewLHS ? NewLHS : LHS;
42997 NewRHS = NewRHS ? NewRHS : RHS;
42998 return TLO.CombineTo(
42999 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43000 }
43001 }
43002 break;
43003 }
43004 case X86ISD::VSHL:
43005 case X86ISD::VSRL:
43006 case X86ISD::VSRA: {
43007 // We only need the bottom 64-bits of the (128-bit) shift amount.
43008 SDValue Amt = Op.getOperand(1);
43009 MVT AmtVT = Amt.getSimpleValueType();
43010 assert(AmtVT.is128BitVector() && "Unexpected value type");
43011
43012 // If the shift amount is only ever reused as an SSE shift amount then we
43013 // know that only the bottom 64-bits are ever used.
43014 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
43015 unsigned UseOpc = Use->getOpcode();
43016 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43017 UseOpc == X86ISD::VSRA) &&
43018 Use->getOperand(0) != Amt;
43019 });
43020
43021 APInt AmtUndef, AmtZero;
43022 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43023 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43024 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43025 Depth + 1, AssumeSingleUse))
43026 return true;
43027 [[fallthrough]];
43028 }
43029 case X86ISD::VSHLI:
43030 case X86ISD::VSRLI:
43031 case X86ISD::VSRAI: {
43032 SDValue Src = Op.getOperand(0);
43033 APInt SrcUndef;
43034 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43035 Depth + 1))
43036 return true;
43037
43038 // Fold shift(0,x) -> 0
43039 if (DemandedElts.isSubsetOf(KnownZero))
43040 return TLO.CombineTo(
43041 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43042
43043 // Aggressively peek through ops to get at the demanded elts.
43044 if (!DemandedElts.isAllOnes())
43045 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43046 Src, DemandedElts, TLO.DAG, Depth + 1))
43047 return TLO.CombineTo(
43048 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43049 break;
43050 }
43051 case X86ISD::VPSHA:
43052 case X86ISD::VPSHL:
43053 case X86ISD::VSHLV:
43054 case X86ISD::VSRLV:
43055 case X86ISD::VSRAV: {
43056 APInt LHSUndef, LHSZero;
43057 APInt RHSUndef, RHSZero;
43058 SDValue LHS = Op.getOperand(0);
43059 SDValue RHS = Op.getOperand(1);
43060 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43061 Depth + 1))
43062 return true;
43063
43064 // Fold shift(0,x) -> 0
43065 if (DemandedElts.isSubsetOf(LHSZero))
43066 return TLO.CombineTo(
43067 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43068
43069 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43070 Depth + 1))
43071 return true;
43072
43073 KnownZero = LHSZero;
43074 break;
43075 }
43076 case X86ISD::KSHIFTL: {
43077 SDValue Src = Op.getOperand(0);
43078 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43079 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43080 unsigned ShiftAmt = Amt->getZExtValue();
43081
43082 if (ShiftAmt == 0)
43083 return TLO.CombineTo(Op, Src);
43084
43085 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43086 // single shift. We can do this if the bottom bits (which are shifted
43087 // out) are never demanded.
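// E.g. (illustrative) kshiftl(kshiftr(X, 2), 3) with the low 3 elements not
// demanded can be folded to kshiftl(X, 1), since Diff = 3 - 2 = 1.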
43088 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43089 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43090 unsigned C1 = Src.getConstantOperandVal(1);
43091 unsigned NewOpc = X86ISD::KSHIFTL;
43092 int Diff = ShiftAmt - C1;
43093 if (Diff < 0) {
43094 Diff = -Diff;
43095 NewOpc = X86ISD::KSHIFTR;
43096 }
43097
43098 SDLoc dl(Op);
43099 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43100 return TLO.CombineTo(
43101 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43102 }
43103 }
43104
43105 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43106 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43107 Depth + 1))
43108 return true;
43109
43110 KnownUndef <<= ShiftAmt;
43111 KnownZero <<= ShiftAmt;
43112 KnownZero.setLowBits(ShiftAmt);
43113 break;
43114 }
43115 case X86ISD::KSHIFTR: {
43116 SDValue Src = Op.getOperand(0);
43117 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43118 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43119 unsigned ShiftAmt = Amt->getZExtValue();
43120
43121 if (ShiftAmt == 0)
43122 return TLO.CombineTo(Op, Src);
43123
43124 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43125 // single shift. We can do this if the top bits (which are shifted
43126 // out) are never demanded.
43127 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43128 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43129 unsigned C1 = Src.getConstantOperandVal(1);
43130 unsigned NewOpc = X86ISD::KSHIFTR;
43131 int Diff = ShiftAmt - C1;
43132 if (Diff < 0) {
43133 Diff = -Diff;
43134 NewOpc = X86ISD::KSHIFTL;
43135 }
43136
43137 SDLoc dl(Op);
43138 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43139 return TLO.CombineTo(
43140 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43141 }
43142 }
43143
43144 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43145 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43146 Depth + 1))
43147 return true;
43148
43149 KnownUndef.lshrInPlace(ShiftAmt);
43150 KnownZero.lshrInPlace(ShiftAmt);
43151 KnownZero.setHighBits(ShiftAmt);
43152 break;
43153 }
43154 case X86ISD::ANDNP: {
43155 // ANDNP = (~LHS & RHS);
43156 SDValue LHS = Op.getOperand(0);
43157 SDValue RHS = Op.getOperand(1);
43158
43159 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43160 APInt UndefElts;
43161 SmallVector<APInt> EltBits;
43162 int NumElts = VT.getVectorNumElements();
43163 int EltSizeInBits = VT.getScalarSizeInBits();
43164 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43165 APInt OpElts = DemandedElts;
43166 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43167 EltBits)) {
43168 OpBits.clearAllBits();
43169 OpElts.clearAllBits();
43170 for (int I = 0; I != NumElts; ++I) {
43171 if (!DemandedElts[I])
43172 continue;
43173 if (UndefElts[I]) {
43174 // We can't assume an undef src element gives an undef dst - the
43175 // other src might be zero.
43176 OpBits.setAllBits();
43177 OpElts.setBit(I);
43178 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43179 (!Invert && !EltBits[I].isZero())) {
43180 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43181 OpElts.setBit(I);
43182 }
43183 }
43184 }
43185 return std::make_pair(OpBits, OpElts);
43186 };
43187 APInt BitsLHS, EltsLHS;
43188 APInt BitsRHS, EltsRHS;
43189 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43190 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43191
43192 APInt LHSUndef, LHSZero;
43193 APInt RHSUndef, RHSZero;
43194 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43195 Depth + 1))
43196 return true;
43197 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43198 Depth + 1))
43199 return true;
43200
43201 if (!DemandedElts.isAllOnes()) {
43202 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43203 TLO.DAG, Depth + 1);
43204 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43205 TLO.DAG, Depth + 1);
43206 if (NewLHS || NewRHS) {
43207 NewLHS = NewLHS ? NewLHS : LHS;
43208 NewRHS = NewRHS ? NewRHS : RHS;
43209 return TLO.CombineTo(
43210 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43211 }
43212 }
43213 break;
43214 }
43215 case X86ISD::CVTSI2P:
43216 case X86ISD::CVTUI2P: {
43217 SDValue Src = Op.getOperand(0);
43218 MVT SrcVT = Src.getSimpleValueType();
43219 APInt SrcUndef, SrcZero;
43220 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43221 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43222 Depth + 1))
43223 return true;
43224 break;
43225 }
43226 case X86ISD::PACKSS:
43227 case X86ISD::PACKUS: {
43228 SDValue N0 = Op.getOperand(0);
43229 SDValue N1 = Op.getOperand(1);
43230
43231 APInt DemandedLHS, DemandedRHS;
43232 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43233
43234 APInt LHSUndef, LHSZero;
43235 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43236 Depth + 1))
43237 return true;
43238 APInt RHSUndef, RHSZero;
43239 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43240 Depth + 1))
43241 return true;
43242
43243 // TODO - pass on known zero/undef.
43244
43245 // Aggressively peek through ops to get at the demanded elts.
43246 // TODO - we should do this for all target/faux shuffles ops.
43247 if (!DemandedElts.isAllOnes()) {
43248 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43249 TLO.DAG, Depth + 1);
43250 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43251 TLO.DAG, Depth + 1);
43252 if (NewN0 || NewN1) {
43253 NewN0 = NewN0 ? NewN0 : N0;
43254 NewN1 = NewN1 ? NewN1 : N1;
43255 return TLO.CombineTo(Op,
43256 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43257 }
43258 }
43259 break;
43260 }
43261 case X86ISD::HADD:
43262 case X86ISD::HSUB:
43263 case X86ISD::FHADD:
43264 case X86ISD::FHSUB: {
43265 SDValue N0 = Op.getOperand(0);
43266 SDValue N1 = Op.getOperand(1);
43267
43268 APInt DemandedLHS, DemandedRHS;
43269 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43270
43271 APInt LHSUndef, LHSZero;
43272 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43273 Depth + 1))
43274 return true;
43275 APInt RHSUndef, RHSZero;
43276 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43277 Depth + 1))
43278 return true;
43279
43280 // TODO - pass on known zero/undef.
43281
43282 // Aggressively peek through ops to get at the demanded elts.
43283 // TODO: Handle repeated operands.
43284 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43285 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43286 TLO.DAG, Depth + 1);
43287 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43288 TLO.DAG, Depth + 1);
43289 if (NewN0 || NewN1) {
43290 NewN0 = NewN0 ? NewN0 : N0;
43291 NewN1 = NewN1 ? NewN1 : N1;
43292 return TLO.CombineTo(Op,
43293 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43294 }
43295 }
43296 break;
43297 }
43298 case X86ISD::VTRUNC:
43299 case X86ISD::VTRUNCS:
43300 case X86ISD::VTRUNCUS: {
43301 SDValue Src = Op.getOperand(0);
43302 MVT SrcVT = Src.getSimpleValueType();
43303 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43304 APInt SrcUndef, SrcZero;
43305 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43306 Depth + 1))
43307 return true;
43308 KnownZero = SrcZero.zextOrTrunc(NumElts);
43309 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43310 break;
43311 }
43312 case X86ISD::BLENDV: {
43313 APInt SelUndef, SelZero;
43314 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43315 SelZero, TLO, Depth + 1))
43316 return true;
43317
43318 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43319 APInt LHSUndef, LHSZero;
43320 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43321 LHSZero, TLO, Depth + 1))
43322 return true;
43323
43324 APInt RHSUndef, RHSZero;
43325 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43326 RHSZero, TLO, Depth + 1))
43327 return true;
43328
43329 KnownZero = LHSZero & RHSZero;
43330 KnownUndef = LHSUndef & RHSUndef;
43331 break;
43332 }
43333 case X86ISD::VZEXT_MOVL: {
43334 // If upper demanded elements are already zero then we have nothing to do.
43335 SDValue Src = Op.getOperand(0);
43336 APInt DemandedUpperElts = DemandedElts;
43337 DemandedUpperElts.clearLowBits(1);
43338 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43339 return TLO.CombineTo(Op, Src);
43340 break;
43341 }
43342 case X86ISD::VBROADCAST: {
43343 SDValue Src = Op.getOperand(0);
43344 MVT SrcVT = Src.getSimpleValueType();
43345 if (!SrcVT.isVector())
43346 break;
43347 // Don't bother broadcasting if we just need the 0'th element.
43348 if (DemandedElts == 1) {
43349 if (Src.getValueType() != VT)
43350 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43351 SDLoc(Op));
43352 return TLO.CombineTo(Op, Src);
43353 }
43354 APInt SrcUndef, SrcZero;
43355 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43356 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43357 Depth + 1))
43358 return true;
43359 // Aggressively peek through src to get at the demanded elt.
43360 // TODO - we should do this for all target/faux shuffles ops.
43361 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43362 Src, SrcElts, TLO.DAG, Depth + 1))
43363 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43364 break;
43365 }
43366 case X86ISD::VPERMV:
43367 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43368 Depth))
43369 return true;
43370 break;
43371 case X86ISD::PSHUFB:
43372 case X86ISD::VPERMV3:
43373 case X86ISD::VPERMILPV:
43374 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43375 Depth))
43376 return true;
43377 break;
43378 case X86ISD::VPPERM:
43379 case X86ISD::VPERMIL2:
43380 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43381 Depth))
43382 return true;
43383 break;
43384 }
43385
43386 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43387 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43388 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43389 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43390 DemandedElts.lshr(NumElts / 2) == 0) {
43391 unsigned SizeInBits = VT.getSizeInBits();
43392 unsigned ExtSizeInBits = SizeInBits / 2;
43393
43394 // See if 512-bit ops only use the bottom 128-bits.
43395 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43396 ExtSizeInBits = SizeInBits / 4;
43397
43398 switch (Opc) {
43399 // Scalar broadcast.
43400 case X86ISD::VBROADCAST: {
43401 SDLoc DL(Op);
43402 SDValue Src = Op.getOperand(0);
43403 if (Src.getValueSizeInBits() > ExtSizeInBits)
43404 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43405 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43406 ExtSizeInBits / VT.getScalarSizeInBits());
43407 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43408 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43409 TLO.DAG, DL, ExtSizeInBits));
43410 }
43411 case X86ISD::VBROADCAST_LOAD: {
43412 SDLoc DL(Op);
43413 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43414 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43415 ExtSizeInBits / VT.getScalarSizeInBits());
43416 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43417 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43418 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43419 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43420 MemIntr->getMemOperand());
43421 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43422 Bcst.getValue(1));
43423 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43424 TLO.DAG, DL, ExtSizeInBits));
43425 }
43426 // Subvector broadcast.
43427 case X86ISD::SUBV_BROADCAST_LOAD: {
43428 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43429 EVT MemVT = MemIntr->getMemoryVT();
43430 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43431 SDLoc DL(Op);
43432 SDValue Ld =
43433 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43434 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43435 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43436 Ld.getValue(1));
43437 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43438 TLO.DAG, DL, ExtSizeInBits));
43439 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43440 SDLoc DL(Op);
43441 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43442 ExtSizeInBits / VT.getScalarSizeInBits());
43443 if (SDValue BcstLd =
43444 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43445 return TLO.CombineTo(Op,
43446 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43447 TLO.DAG, DL, ExtSizeInBits));
43448 }
43449 break;
43450 }
43451 // Byte shifts by immediate.
43452 case X86ISD::VSHLDQ:
43453 case X86ISD::VSRLDQ:
43454 // Shift by uniform.
43455 case X86ISD::VSHL:
43456 case X86ISD::VSRL:
43457 case X86ISD::VSRA:
43458 // Shift by immediate.
43459 case X86ISD::VSHLI:
43460 case X86ISD::VSRLI:
43461 case X86ISD::VSRAI: {
43462 SDLoc DL(Op);
43463 SDValue Ext0 =
43464 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43465 SDValue ExtOp =
43466 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43467 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43468 SDValue Insert =
43469 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43470 return TLO.CombineTo(Op, Insert);
43471 }
43472 case X86ISD::VPERMI: {
43473 // Simplify PERMPD/PERMQ to extract_subvector.
43474 // TODO: This should be done in shuffle combining.
43475 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43476 SmallVector<int, 4> Mask;
43477 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43478 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43479 SDLoc DL(Op);
43480 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43481 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43482 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43483 return TLO.CombineTo(Op, Insert);
43484 }
43485 }
43486 break;
43487 }
43488 case X86ISD::VPERM2X128: {
43489 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43490 SDLoc DL(Op);
43491 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43492 if (LoMask & 0x8)
43493 return TLO.CombineTo(
43494 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43495 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43496 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43497 SDValue ExtOp =
43498 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43499 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43500 SDValue Insert =
43501 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43502 return TLO.CombineTo(Op, Insert);
43503 }
43504 // Zero upper elements.
43505 case X86ISD::VZEXT_MOVL:
43506 // Target unary shuffles by immediate:
43507 case X86ISD::PSHUFD:
43508 case X86ISD::PSHUFLW:
43509 case X86ISD::PSHUFHW:
43510 case X86ISD::VPERMILPI:
43511 // (Non-Lane Crossing) Target Shuffles.
43512 case X86ISD::VPERMILPV:
43513 case X86ISD::VPERMIL2:
43514 case X86ISD::PSHUFB:
43515 case X86ISD::UNPCKL:
43516 case X86ISD::UNPCKH:
43517 case X86ISD::BLENDI:
43518 // Integer ops.
43519 case X86ISD::PACKSS:
43520 case X86ISD::PACKUS:
43521 // Horizontal Ops.
43522 case X86ISD::HADD:
43523 case X86ISD::HSUB:
43524 case X86ISD::FHADD:
43525 case X86ISD::FHSUB: {
43526 SDLoc DL(Op);
43527 SmallVector<SDValue, 4> Ops;
43528 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43529 SDValue SrcOp = Op.getOperand(i);
43530 EVT SrcVT = SrcOp.getValueType();
43531 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43532 "Unsupported vector size");
43533 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43534 ExtSizeInBits)
43535 : SrcOp);
43536 }
43537 MVT ExtVT = VT.getSimpleVT();
43538 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43539 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43540 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43541 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43542 SDValue Insert =
43543 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43544 return TLO.CombineTo(Op, Insert);
43545 }
43546 }
43547 }
43548
43549 // For splats, unless we *only* demand the 0'th element, stop attempts at
43550 // simplification here; we aren't going to improve things, and this is
43551 // better than any potential shuffle.
43552 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43553 return false;
43554
43555 // Get target/faux shuffle mask.
43556 APInt OpUndef, OpZero;
43557 SmallVector<int, 64> OpMask;
43558 SmallVector<SDValue, 2> OpInputs;
43559 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43560 OpZero, TLO.DAG, Depth, false))
43561 return false;
43562
43563 // Shuffle inputs must be the same size as the result.
43564 if (OpMask.size() != (unsigned)NumElts ||
43565 llvm::any_of(OpInputs, [VT](SDValue V) {
43566 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43567 !V.getValueType().isVector();
43568 }))
43569 return false;
43570
43571 KnownZero = OpZero;
43572 KnownUndef = OpUndef;
43573
43574 // Check if shuffle mask can be simplified to undef/zero/identity.
43575 int NumSrcs = OpInputs.size();
43576 for (int i = 0; i != NumElts; ++i)
43577 if (!DemandedElts[i])
43578 OpMask[i] = SM_SentinelUndef;
43579
43580 if (isUndefInRange(OpMask, 0, NumElts)) {
43581 KnownUndef.setAllBits();
43582 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43583 }
43584 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43585 KnownZero.setAllBits();
43586 return TLO.CombineTo(
43587 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43588 }
43589 for (int Src = 0; Src != NumSrcs; ++Src)
43590 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43591 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43592
43593 // Attempt to simplify inputs.
43594 for (int Src = 0; Src != NumSrcs; ++Src) {
43595 // TODO: Support inputs of different types.
43596 if (OpInputs[Src].getValueType() != VT)
43597 continue;
43598
43599 int Lo = Src * NumElts;
43600 APInt SrcElts = APInt::getZero(NumElts);
43601 for (int i = 0; i != NumElts; ++i)
43602 if (DemandedElts[i]) {
43603 int M = OpMask[i] - Lo;
43604 if (0 <= M && M < NumElts)
43605 SrcElts.setBit(M);
43606 }
43607
43608 // TODO - Propagate input undef/zero elts.
43609 APInt SrcUndef, SrcZero;
43610 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43611 TLO, Depth + 1))
43612 return true;
43613 }
43614
43615 // If we don't demand all elements, then attempt to combine to a simpler
43616 // shuffle.
43617 // We need to convert the depth to something combineX86ShufflesRecursively
43618 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
43619 // to match. This prevents combineX86ShuffleChain from returning a
43620 // combined shuffle that's the same as the original root, causing an
43621 // infinite loop.
43622 if (!DemandedElts.isAllOnes()) {
43623 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43624
43625 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43626 for (int i = 0; i != NumElts; ++i)
43627 if (DemandedElts[i])
43628 DemandedMask[i] = i;
43629
43630 SDValue NewShuffle = combineX86ShufflesRecursively(
43631 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43632 /*HasVarMask*/ false,
43633 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43634 Subtarget);
43635 if (NewShuffle)
43636 return TLO.CombineTo(Op, NewShuffle);
43637 }
43638
43639 return false;
43640}
43641
43642bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43643 SDValue Op, const APInt &OriginalDemandedBits,
43644 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43645 unsigned Depth) const {
43646 EVT VT = Op.getValueType();
43647 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43648 unsigned Opc = Op.getOpcode();
43649 switch(Opc) {
43650 case X86ISD::VTRUNC: {
43651 KnownBits KnownOp;
43652 SDValue Src = Op.getOperand(0);
43653 MVT SrcVT = Src.getSimpleValueType();
43654
43655 // Simplify the input, using demanded bit information.
43656 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43657 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43658 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43659 return true;
43660 break;
43661 }
43662 case X86ISD::PMULDQ:
43663 case X86ISD::PMULUDQ: {
43664 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43665 KnownBits KnownLHS, KnownRHS;
43666 SDValue LHS = Op.getOperand(0);
43667 SDValue RHS = Op.getOperand(1);
43668
43669 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43670 // FIXME: Can we bound this better?
43671 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43672 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43673 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43674
43675 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43676 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43677 DemandedMaskLHS = DemandedMask;
43678 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43679 DemandedMaskRHS = DemandedMask;
43680
43681 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43682 KnownLHS, TLO, Depth + 1))
43683 return true;
43684 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43685 KnownRHS, TLO, Depth + 1))
43686 return true;
43687
43688 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43689 KnownRHS = KnownRHS.trunc(32);
43690 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43691 KnownRHS.getConstant().isOne()) {
43692 SDLoc DL(Op);
43693 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43694 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43695 }
43696
43697 // Aggressively peek through ops to get at the demanded low bits.
43698 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43699 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43700 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43701 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43702 if (DemandedLHS || DemandedRHS) {
43703 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43704 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43705 return TLO.CombineTo(
43706 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43707 }
43708 break;
43709 }
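// Illustrative sketch, not part of the original source: a scalar model of one
// PMULUDQ lane (64-bit lanes, only the low 32 bits of each input are read):
//   uint64_t PMulUDQLane(uint64_t A, uint64_t B) {
//     return (A & 0xFFFFFFFFull) * (B & 0xFFFFFFFFull);
//   }
// With B == 1 the product is just the masked A, which is why the fold above
// rewrites PMULUDQ(X,1) as AND(X,(1<<32)-1).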
43710 case X86ISD::VSHLI: {
43711 SDValue Op0 = Op.getOperand(0);
43712
43713 unsigned ShAmt = Op.getConstantOperandVal(1);
43714 if (ShAmt >= BitWidth)
43715 break;
43716
43717 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
43722 if (Op0.getOpcode() == X86ISD::VSRLI &&
43723 OriginalDemandedBits.countr_zero() >= ShAmt) {
43724 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43725 if (Shift2Amt < BitWidth) {
43726 int Diff = ShAmt - Shift2Amt;
43727 if (Diff == 0)
43728 return TLO.CombineTo(Op, Op0.getOperand(0));
43729
43730 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43731 SDValue NewShift = TLO.DAG.getNode(
43732 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43733 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43734 return TLO.CombineTo(Op, NewShift);
43735 }
43736 }
43737
43738 // If we are only demanding sign bits then we can use the shift source directly.
43739 unsigned NumSignBits =
43740 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43741 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43742 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43743 return TLO.CombineTo(Op, Op0);
43744
43745 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43746 TLO, Depth + 1))
43747 return true;
43748
43749 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43750 Known.Zero <<= ShAmt;
43751 Known.One <<= ShAmt;
43752
43753 // Low bits known zero.
43754 Known.Zero.setLowBits(ShAmt);
43755 return false;
43756 }
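// Illustrative sketch, not part of the original source: merging
// ((X >>u C1) << C2) into a single shift only preserves the bits at or above
// bit C2. E.g. with C1 = 3, C2 = 5 the merged form is (X << 2): both forms
// agree on bits 5 and up, but the original has zeros in bits [0,4] while the
// merged form may not, so the fold above requires that the low ShAmt bits are
// not demanded (countr_zero() >= ShAmt).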
43757 case X86ISD::VSRLI: {
43758 unsigned ShAmt = Op.getConstantOperandVal(1);
43759 if (ShAmt >= BitWidth)
43760 break;
43761
43762 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43763
43764 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43765 OriginalDemandedElts, Known, TLO, Depth + 1))
43766 return true;
43767
43768 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43769 Known.Zero.lshrInPlace(ShAmt);
43770 Known.One.lshrInPlace(ShAmt);
43771
43772 // High bits known zero.
43773 Known.Zero.setHighBits(ShAmt);
43774 return false;
43775 }
43776 case X86ISD::VSRAI: {
43777 SDValue Op0 = Op.getOperand(0);
43778 SDValue Op1 = Op.getOperand(1);
43779
43780 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43781 if (ShAmt >= BitWidth)
43782 break;
43783
43784 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43785
43786 // If we just want the sign bit then we don't need to shift it.
43787 if (OriginalDemandedBits.isSignMask())
43788 return TLO.CombineTo(Op, Op0);
43789
43790 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43791 if (Op0.getOpcode() == X86ISD::VSHLI &&
43792 Op.getOperand(1) == Op0.getOperand(1)) {
43793 SDValue Op00 = Op0.getOperand(0);
43794 unsigned NumSignBits =
43795 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43796 if (ShAmt < NumSignBits)
43797 return TLO.CombineTo(Op, Op00);
43798 }
43799
43800 // If any of the demanded bits are produced by the sign extension, we also
43801 // demand the input sign bit.
43802 if (OriginalDemandedBits.countl_zero() < ShAmt)
43803 DemandedMask.setSignBit();
43804
43805 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43806 TLO, Depth + 1))
43807 return true;
43808
43809 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43810 Known.Zero.lshrInPlace(ShAmt);
43811 Known.One.lshrInPlace(ShAmt);
43812
43813 // If the input sign bit is known to be zero, or if none of the top bits
43814 // are demanded, turn this into an unsigned shift right.
43815 if (Known.Zero[BitWidth - ShAmt - 1] ||
43816 OriginalDemandedBits.countl_zero() >= ShAmt)
43817 return TLO.CombineTo(
43818 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43819
43820 // High bits are known one.
43821 if (Known.One[BitWidth - ShAmt - 1])
43822 Known.One.setHighBits(ShAmt);
43823 return false;
43824 }
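// Illustrative sketch, not part of the original source: once the replicated
// sign bit is known zero, arithmetic and logical right shifts agree. E.g. for
// an 8-bit lane X = 0b0110'1100 and ShAmt = 2:
//   ashr(X, 2) == 0b0001'1011 == lshr(X, 2)
// which is why the case above relaxes VSRAI to VSRLI in that situation.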
43825 case X86ISD::BLENDV: {
43826 SDValue Sel = Op.getOperand(0);
43827 SDValue LHS = Op.getOperand(1);
43828 SDValue RHS = Op.getOperand(2);
43829
43830 APInt SignMask = APInt::getSignMask(BitWidth);
43831 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43832 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43833 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43834 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43835 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43836 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43837
43838 if (NewSel || NewLHS || NewRHS) {
43839 NewSel = NewSel ? NewSel : Sel;
43840 NewLHS = NewLHS ? NewLHS : LHS;
43841 NewRHS = NewRHS ? NewRHS : RHS;
43842 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43843 NewSel, NewLHS, NewRHS));
43844 }
43845 break;
43846 }
43847 case X86ISD::PEXTRB:
43848 case X86ISD::PEXTRW: {
43849 SDValue Vec = Op.getOperand(0);
43850 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43851 MVT VecVT = Vec.getSimpleValueType();
43852 unsigned NumVecElts = VecVT.getVectorNumElements();
43853
43854 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43855 unsigned Idx = CIdx->getZExtValue();
43856 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43857
43858 // If we demand no bits from the vector then we must have demanded
43859 // bits from the implicit zext - simplify to zero.
43860 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43861 if (DemandedVecBits == 0)
43862 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43863
43864 APInt KnownUndef, KnownZero;
43865 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43866 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43867 KnownZero, TLO, Depth + 1))
43868 return true;
43869
43870 KnownBits KnownVec;
43871 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43872 KnownVec, TLO, Depth + 1))
43873 return true;
43874
43875 if (SDValue V = SimplifyMultipleUseDemandedBits(
43876 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43877 return TLO.CombineTo(
43878 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43879
43880 Known = KnownVec.zext(BitWidth);
43881 return false;
43882 }
43883 break;
43884 }
43885 case X86ISD::PINSRB:
43886 case X86ISD::PINSRW: {
43887 SDValue Vec = Op.getOperand(0);
43888 SDValue Scl = Op.getOperand(1);
43889 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43890 MVT VecVT = Vec.getSimpleValueType();
43891
43892 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43893 unsigned Idx = CIdx->getZExtValue();
43894 if (!OriginalDemandedElts[Idx])
43895 return TLO.CombineTo(Op, Vec);
43896
43897 KnownBits KnownVec;
43898 APInt DemandedVecElts(OriginalDemandedElts);
43899 DemandedVecElts.clearBit(Idx);
43900 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43901 KnownVec, TLO, Depth + 1))
43902 return true;
43903
43904 KnownBits KnownScl;
43905 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43906 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43907 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43908 return true;
43909
43910 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43911 Known = KnownBits::commonBits(KnownVec, KnownScl);
43912 return false;
43913 }
43914 break;
43915 }
43916 case X86ISD::PACKSS:
43917 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43918 // sign bit then we can just ask for the source operands' sign bits.
43919 // TODO - add known bits handling.
43920 if (OriginalDemandedBits.isSignMask()) {
43921 APInt DemandedLHS, DemandedRHS;
43922 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43923
43924 KnownBits KnownLHS, KnownRHS;
43925 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43926 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43927 KnownLHS, TLO, Depth + 1))
43928 return true;
43929 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43930 KnownRHS, TLO, Depth + 1))
43931 return true;
43932
43933 // Attempt to avoid multi-use ops if we don't need anything from them.
43934 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43935 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43936 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43937 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43938 if (DemandedOp0 || DemandedOp1) {
43939 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43940 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43941 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43942 }
43943 }
43944 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43945 break;
43946 case X86ISD::VBROADCAST: {
43947 SDValue Src = Op.getOperand(0);
43948 MVT SrcVT = Src.getSimpleValueType();
43949 APInt DemandedElts = APInt::getOneBitSet(
43950 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43951 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43952 TLO, Depth + 1))
43953 return true;
43954 // If we don't need the upper bits, attempt to narrow the broadcast source.
43955 // Don't attempt this on AVX512 as it might affect broadcast folding.
43956 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43957 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43958 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43959 Src->hasOneUse()) {
43960 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43961 SDValue NewSrc =
43962 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43963 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43964 SDValue NewBcst =
43965 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43966 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43967 }
43968 break;
43969 }
43970 case X86ISD::PCMPGT:
43971 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43972 // iff we only need the sign bit then we can use R directly.
43973 if (OriginalDemandedBits.isSignMask() &&
43974 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43975 return TLO.CombineTo(Op, Op.getOperand(1));
43976 break;
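// Illustrative sketch, not part of the original source: pcmpgt(0, R) is
// all-ones exactly when a lane of R is negative, i.e. it equals
// ashr(R, BitWidth-1). For an 8-bit lane:
//   R = 0x80 -> pcmpgt(0, R) = 0xFF = ashr(R, 7)
//   R = 0x7F -> pcmpgt(0, R) = 0x00 = ashr(R, 7)
// so if only the sign bit is demanded, R already provides it unchanged.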
43977 case X86ISD::MOVMSK: {
43978 SDValue Src = Op.getOperand(0);
43979 MVT SrcVT = Src.getSimpleValueType();
43980 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43981 unsigned NumElts = SrcVT.getVectorNumElements();
43982
43983 // If we don't need the sign bits at all just return zero.
43984 if (OriginalDemandedBits.countr_zero() >= NumElts)
43985 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43986
43987 // See if we only demand bits from the lower 128-bit vector.
43988 if (SrcVT.is256BitVector() &&
43989 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43990 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43991 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43992 }
43993
43994 // Only demand the vector elements of the sign bits we need.
43995 APInt KnownUndef, KnownZero;
43996 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43997 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43998 TLO, Depth + 1))
43999 return true;
44000
44001 Known.Zero = KnownZero.zext(BitWidth);
44002 Known.Zero.setHighBits(BitWidth - NumElts);
44003
44004 // MOVMSK only uses the MSB from each vector element.
44005 KnownBits KnownSrc;
44006 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44007 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44008 Depth + 1))
44009 return true;
44010
44011 if (KnownSrc.One[SrcBits - 1])
44012 Known.One.setLowBits(NumElts);
44013 else if (KnownSrc.Zero[SrcBits - 1])
44014 Known.Zero.setLowBits(NumElts);
44015
44016 // Attempt to avoid multi-use ops if we don't need anything from them.
44017 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44018 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44019 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44020 return false;
44021 }
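// Illustrative sketch, not part of the original source: MOVMSK collects the
// MSB of each source lane into the low bits of the scalar result, e.g. for a
// v4i32 source whose lane MSBs are [1, 0, 1, 1] the result is 0b1101, with
// all higher result bits zero. Hence only the NumElts low result bits can be
// set and only the lane sign bits are demanded from the source above.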
44022 case X86ISD::TESTP: {
44023 SDValue Op0 = Op.getOperand(0);
44024 SDValue Op1 = Op.getOperand(1);
44025 MVT OpVT = Op0.getSimpleValueType();
44026 assert((OpVT.getVectorElementType() == MVT::f32 ||
44027 OpVT.getVectorElementType() == MVT::f64) &&
44028 "Illegal vector type for X86ISD::TESTP");
44029
44030 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44031 KnownBits KnownSrc;
44032 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44033 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44034 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44035 AssumeSingleUse) ||
44036 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44037 AssumeSingleUse);
44038 }
44039 case X86ISD::BEXTR:
44040 case X86ISD::BEXTRI: {
44041 SDValue Op0 = Op.getOperand(0);
44042 SDValue Op1 = Op.getOperand(1);
44043
44044 // Only bottom 16-bits of the control bits are required.
44045 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44046 // NOTE: SimplifyDemandedBits won't do this for constants.
44047 uint64_t Val1 = Cst1->getZExtValue();
44048 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44049 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44050 SDLoc DL(Op);
44051 return TLO.CombineTo(
44052 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44053 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44054 }
44055
44056 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44057 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44058
44059 // If the length is 0, the result is 0.
44060 if (Length == 0) {
44061 Known.setAllZero();
44062 return false;
44063 }
44064
44065 if ((Shift + Length) <= BitWidth) {
44066 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44067 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44068 return true;
44069
44070 Known = Known.extractBits(Length, Shift);
44071 Known = Known.zextOrTrunc(BitWidth);
44072 return false;
44073 }
44074 } else {
44075 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44076 KnownBits Known1;
44077 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44078 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44079 return true;
44080
44081 // If the length is 0, replace with 0.
44082 KnownBits LengthBits = Known1.extractBits(8, 8);
44083 if (LengthBits.isZero())
44084 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44085 }
44086
44087 break;
44088 }
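// Illustrative sketch, not part of the original source: the BEXTR control
// operand packs the start bit into bits [7:0] and the length into bits
// [15:8], so only its low 16 bits matter. E.g. control 0x0604 extracts 6 bits
// starting at bit 4:
//   bextr(0x00000FF0, 0x0604) == 0x3F
// which matches the DemandedMask = bits [Shift, Shift + Length) used above.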
44089 case X86ISD::PDEP: {
44090 SDValue Op0 = Op.getOperand(0);
44091 SDValue Op1 = Op.getOperand(1);
44092
44093 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44094 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44095
44096 // If the demanded bits have leading zeroes, we don't demand those from the
44097 // mask.
44098 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44099 return true;
44100
44101 // The number of possible 1s in the mask determines the number of LSBs of
44102 // operand 0 used. Undemanded bits from the mask don't matter so filter
44103 // them before counting.
44104 KnownBits Known2;
44105 uint64_t Count = (~Known.Zero & LoMask).popcount();
44106 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44107 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44108 return true;
44109
44110 // Zeroes are retained from the mask, but not ones.
44111 Known.One.clearAllBits();
44112 // The result will have at least as many trailing zeros as the non-mask
44113 // operand since bits can only map to the same or higher bit position.
44114 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44115 return false;
44116 }
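// Illustrative sketch, not part of the original source: PDEP scatters the low
// bits of operand 0 to the set bit positions of the mask, e.g.:
//   pdep(src = 0b101, mask = 0b0110'1001) = 0b0010'0001
// Only popcount(mask) low source bits are consumed, and source bit i lands at
// the position of the i-th set mask bit, which is never below i - matching
// the trailing-zero reasoning above.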
44117 }
44118
44119 return TargetLowering::SimplifyDemandedBitsForTargetNode(
44120 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44121}
44122
44123SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44124 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44125 SelectionDAG &DAG, unsigned Depth) const {
44126 int NumElts = DemandedElts.getBitWidth();
44127 unsigned Opc = Op.getOpcode();
44128 EVT VT = Op.getValueType();
44129
44130 switch (Opc) {
44131 case X86ISD::PINSRB:
44132 case X86ISD::PINSRW: {
44133 // If we don't demand the inserted element, return the base vector.
44134 SDValue Vec = Op.getOperand(0);
44135 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44136 MVT VecVT = Vec.getSimpleValueType();
44137 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44138 !DemandedElts[CIdx->getZExtValue()])
44139 return Vec;
44140 break;
44141 }
44142 case X86ISD::VSHLI: {
44143 // If we are only demanding sign bits then we can use the shift source
44144 // directly.
44145 SDValue Op0 = Op.getOperand(0);
44146 unsigned ShAmt = Op.getConstantOperandVal(1);
44147 unsigned BitWidth = DemandedBits.getBitWidth();
44148 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44149 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44150 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44151 return Op0;
44152 break;
44153 }
44154 case X86ISD::VSRAI:
44155 // iff we only need the sign bit then we can use the source directly.
44156 // TODO: generalize where we only demand extended signbits.
44157 if (DemandedBits.isSignMask())
44158 return Op.getOperand(0);
44159 break;
44160 case X86ISD::PCMPGT:
44161 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44162 // iff we only need the sign bit then we can use R directly.
44163 if (DemandedBits.isSignMask() &&
44164 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44165 return Op.getOperand(1);
44166 break;
44167 case X86ISD::ANDNP: {
44168 // ANDNP = (~LHS & RHS);
44169 SDValue LHS = Op.getOperand(0);
44170 SDValue RHS = Op.getOperand(1);
44171
44172 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44173 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44174
44175 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44176 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44177 // this context, so return RHS.
44178 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44179 return RHS;
44180 break;
44181 }
44182 }
44183
44184 APInt ShuffleUndef, ShuffleZero;
44185 SmallVector<int, 16> ShuffleMask;
44186 SmallVector<SDValue, 2> ShuffleOps;
44187 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44188 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44189 // If all the demanded elts are from one operand and are inline,
44190 // then we can use the operand directly.
44191 int NumOps = ShuffleOps.size();
44192 if (ShuffleMask.size() == (unsigned)NumElts &&
44193 llvm::all_of(ShuffleOps, [VT](SDValue V) {
44194 return VT.getSizeInBits() == V.getValueSizeInBits();
44195 })) {
44196
44197 if (DemandedElts.isSubsetOf(ShuffleUndef))
44198 return DAG.getUNDEF(VT);
44199 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44200 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44201
44202 // Bitmask that indicates which ops have only been accessed 'inline'.
44203 APInt IdentityOp = APInt::getAllOnes(NumOps);
44204 for (int i = 0; i != NumElts; ++i) {
44205 int M = ShuffleMask[i];
44206 if (!DemandedElts[i] || ShuffleUndef[i])
44207 continue;
44208 int OpIdx = M / NumElts;
44209 int EltIdx = M % NumElts;
44210 if (M < 0 || EltIdx != i) {
44211 IdentityOp.clearAllBits();
44212 break;
44213 }
44214 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44215 if (IdentityOp == 0)
44216 break;
44217 }
44218 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44219 "Multiple identity shuffles detected");
44220
44221 if (IdentityOp != 0)
44222 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44223 }
44224 }
44225
44226 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44227 Op, DemandedBits, DemandedElts, DAG, Depth);
44228}
44229
44230bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44231 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44232 bool PoisonOnly, unsigned Depth) const {
44233 unsigned EltsBits = Op.getScalarValueSizeInBits();
44234 unsigned NumElts = DemandedElts.getBitWidth();
44235
44236 // TODO: Add more target shuffles.
44237 switch (Op.getOpcode()) {
44238 case X86ISD::PSHUFD:
44239 case X86ISD::VPERMILPI: {
44240 SmallVector<int, 8> Mask;
44241 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
44242
44243 APInt DemandedSrcElts = APInt::getZero(NumElts);
44244 for (unsigned I = 0; I != NumElts; ++I)
44245 if (DemandedElts[I])
44246 DemandedSrcElts.setBit(Mask[I]);
44247
44248 return DAG.isGuaranteedNotToBeUndefOrPoison(
44249 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44250 }
44251 }
44252 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44253 Op, DemandedElts, DAG, PoisonOnly, Depth);
44254}
44255
44256bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44257 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44258 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44259
44260 // TODO: Add more target shuffles.
44261 switch (Op.getOpcode()) {
44262 case X86ISD::PSHUFD:
44263 case X86ISD::VPERMILPI:
44264 return false;
44265 }
44266 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44267 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44268}
44269
44270bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44271 const APInt &DemandedElts,
44272 APInt &UndefElts,
44273 const SelectionDAG &DAG,
44274 unsigned Depth) const {
44275 unsigned NumElts = DemandedElts.getBitWidth();
44276 unsigned Opc = Op.getOpcode();
44277
44278 switch (Opc) {
44279 case X86ISD::VBROADCAST:
44280 case X86ISD::VBROADCAST_LOAD:
44281 UndefElts = APInt::getZero(NumElts);
44282 return true;
44283 }
44284
44285 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44286 DAG, Depth);
44287}
44288
44289// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44290// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44291static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44292 bool AllowTruncate) {
44293 switch (Src.getOpcode()) {
44294 case ISD::TRUNCATE:
44295 if (!AllowTruncate)
44296 return false;
44297 [[fallthrough]];
44298 case ISD::SETCC:
44299 return Src.getOperand(0).getValueSizeInBits() == Size;
44300 case ISD::AND:
44301 case ISD::XOR:
44302 case ISD::OR:
44303 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44304 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44305 case ISD::SELECT:
44306 case ISD::VSELECT:
44307 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44308 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44309 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44310 case ISD::BUILD_VECTOR:
44311 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44312 ISD::isBuildVectorAllOnes(Src.getNode());
44313 }
44314 return false;
44315}
44316
44317// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44318static unsigned getAltBitOpcode(unsigned Opcode) {
44319 switch(Opcode) {
44320 case ISD::AND: return X86ISD::FAND;
44321 case ISD::OR: return X86ISD::FOR;
44322 case ISD::XOR: return X86ISD::FXOR;
44323 case X86ISD::ANDNP: return X86ISD::FANDN;
44324 }
44325 llvm_unreachable("Unknown bitwise opcode");
44326}
44327
44328// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44329static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44330 const SDLoc &DL) {
44331 EVT SrcVT = Src.getValueType();
44332 if (SrcVT != MVT::v4i1)
44333 return SDValue();
44334
44335 switch (Src.getOpcode()) {
44336 case ISD::SETCC:
44337 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44338 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44339 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44340 SDValue Op0 = Src.getOperand(0);
44341 if (ISD::isNormalLoad(Op0.getNode()))
44342 return DAG.getBitcast(MVT::v4f32, Op0);
44343 if (Op0.getOpcode() == ISD::BITCAST &&
44344 Op0.getOperand(0).getValueType() == MVT::v4f32)
44345 return Op0.getOperand(0);
44346 }
44347 break;
44348 case ISD::AND:
44349 case ISD::XOR:
44350 case ISD::OR: {
44351 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44352 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44353 if (Op0 && Op1)
44354 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44355 Op1);
44356 break;
44357 }
44358 }
44359 return SDValue();
44360}
44361
44362// Helper to push sign extension of vXi1 SETCC result through bitops.
44363static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44364 SDValue Src, const SDLoc &DL) {
44365 switch (Src.getOpcode()) {
44366 case ISD::SETCC:
44367 case ISD::TRUNCATE:
44368 case ISD::BUILD_VECTOR:
44369 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44370 case ISD::AND:
44371 case ISD::XOR:
44372 case ISD::OR:
44373 return DAG.getNode(
44374 Src.getOpcode(), DL, SExtVT,
44375 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44376 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44377 case ISD::SELECT:
44378 case ISD::VSELECT:
44379 return DAG.getSelect(
44380 DL, SExtVT, Src.getOperand(0),
44381 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44382 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44383 }
44384 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44385}
44386
44387// Try to match patterns such as
44388// (i16 bitcast (v16i1 x))
44389// ->
44390 // (i16 movmsk (v16i8 sext (v16i1 x)))
44391// before the illegal vector is scalarized on subtargets that don't have legal
44392// vxi1 types.
44393static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44394 const SDLoc &DL,
44395 const X86Subtarget &Subtarget) {
44396 EVT SrcVT = Src.getValueType();
44397 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44398 return SDValue();
44399
44400 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44401 // legalization destroys the v4i32 type.
44402 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44403 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44404 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44405 DAG.getBitcast(MVT::v4f32, V));
44406 return DAG.getZExtOrTrunc(V, DL, VT);
44407 }
44408 }
44409
44410 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44411 // movmskb even with avx512. This will be better than truncating to vXi1 and
44412 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44413 // vpcmpeqb/vpcmpgtb.
44414 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44415 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44416 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44417 Src.getOperand(0).getValueType() == MVT::v64i8);
44418
44419 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44420 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44421 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44422 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44423 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44424 EVT CmpVT = Src.getOperand(0).getValueType();
44425 EVT EltVT = CmpVT.getVectorElementType();
44426 if (CmpVT.getSizeInBits() <= 256 &&
44427 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44428 PreferMovMsk = true;
44429 }
44430
44431 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44432 // MOVMSK is supported in SSE2 or later.
44433 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44434 return SDValue();
44435
44436 // If the upper ops of a concatenation are undef, then try to bitcast the
44437 // lower op and extend.
44438 SmallVector<SDValue, 4> SubSrcOps;
44439 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44440 SubSrcOps.size() >= 2) {
44441 SDValue LowerOp = SubSrcOps[0];
44442 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44443 if (LowerOp.getOpcode() == ISD::SETCC &&
44444 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44445 EVT SubVT = VT.getIntegerVT(
44446 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44447 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44448 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44449 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44450 }
44451 }
44452 }
44453
44454 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44455 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44456 // v8i16 and v16i16.
44457 // For these two cases, we can shuffle the upper element bytes to a
44458 // consecutive sequence at the start of the vector and treat the results as
44459 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44460 // for v16i16 this is not the case, because the shuffle is expensive, so we
44461 // avoid sign-extending to this type entirely.
44462 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44463 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44464 MVT SExtVT;
44465 bool PropagateSExt = false;
44466 switch (SrcVT.getSimpleVT().SimpleTy) {
44467 default:
44468 return SDValue();
44469 case MVT::v2i1:
44470 SExtVT = MVT::v2i64;
44471 break;
44472 case MVT::v4i1:
44473 SExtVT = MVT::v4i32;
44474 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44475 // sign-extend to a 256-bit operation to avoid truncation.
44476 if (Subtarget.hasAVX() &&
44477 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44478 SExtVT = MVT::v4i64;
44479 PropagateSExt = true;
44480 }
44481 break;
44482 case MVT::v8i1:
44483 SExtVT = MVT::v8i16;
44484 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44485 // sign-extend to a 256-bit operation to match the compare.
44486 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44487 // 256-bit because the shuffle is cheaper than sign extending the result of
44488 // the compare.
44489 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44490 checkBitcastSrcVectorSize(Src, 512, true))) {
44491 SExtVT = MVT::v8i32;
44492 PropagateSExt = true;
44493 }
44494 break;
44495 case MVT::v16i1:
44496 SExtVT = MVT::v16i8;
44497 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44498 // it is not profitable to sign-extend to 256-bit because this will
44499 // require an extra cross-lane shuffle which is more expensive than
44500 // truncating the result of the compare to 128-bits.
44501 break;
44502 case MVT::v32i1:
44503 SExtVT = MVT::v32i8;
44504 break;
44505 case MVT::v64i1:
44506 // If we have AVX512F but not AVX512BW, and the input is truncated from
44507 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44508 if (Subtarget.hasAVX512()) {
44509 if (Subtarget.hasBWI())
44510 return SDValue();
44511 SExtVT = MVT::v64i8;
44512 break;
44513 }
44514 // Split if this is a <64 x i8> comparison result.
44515 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44516 SExtVT = MVT::v64i8;
44517 break;
44518 }
44519 return SDValue();
44520 };
44521
44522 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44523 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44524
44525 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44526 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44527 } else {
44528 if (SExtVT == MVT::v8i16)
44529 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44530 DAG.getUNDEF(MVT::v8i16));
44531 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44532 }
44533
44534 EVT IntVT =
44535 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44536 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44537 return DAG.getBitcast(VT, V);
44538}
44539
44540// Convert a vXi1 constant build vector to the same width scalar integer.
44541static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44542 EVT SrcVT = Op.getValueType();
44543 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44544 "Expected a vXi1 vector");
44545 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44546 "Expected a constant build vector");
44547
44548 APInt Imm(SrcVT.getVectorNumElements(), 0);
44549 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44550 SDValue In = Op.getOperand(Idx);
44551 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44552 Imm.setBit(Idx);
44553 }
44554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44555 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44556}
44557
44558static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44559 TargetLowering::DAGCombinerInfo &DCI,
44560 const X86Subtarget &Subtarget) {
44561 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44562
44563 if (!DCI.isBeforeLegalizeOps())
44564 return SDValue();
44565
44566 // Only do this if we have k-registers.
44567 if (!Subtarget.hasAVX512())
44568 return SDValue();
44569
44570 EVT DstVT = N->getValueType(0);
44571 SDValue Op = N->getOperand(0);
44572 EVT SrcVT = Op.getValueType();
44573
44574 if (!Op.hasOneUse())
44575 return SDValue();
44576
44577 // Look for logic ops.
44578 if (Op.getOpcode() != ISD::AND &&
44579 Op.getOpcode() != ISD::OR &&
44580 Op.getOpcode() != ISD::XOR)
44581 return SDValue();
44582
44583 // Make sure we have a bitcast between mask registers and a scalar type.
44584 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44585 DstVT.isScalarInteger()) &&
44586 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44587 SrcVT.isScalarInteger()))
44588 return SDValue();
44589
44590 SDValue LHS = Op.getOperand(0);
44591 SDValue RHS = Op.getOperand(1);
44592
44593 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44594 LHS.getOperand(0).getValueType() == DstVT)
44595 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44596 DAG.getBitcast(DstVT, RHS));
44597
44598 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44599 RHS.getOperand(0).getValueType() == DstVT)
44600 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44601 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44602
44603 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44604 // Most of these have to move a constant from the scalar domain anyway.
44605 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44606 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44607 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44608 DAG.getBitcast(DstVT, LHS), RHS);
44609 }
44610
44611 return SDValue();
44612}
44613
44614static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44615 const X86Subtarget &Subtarget) {
44616 SDLoc DL(BV);
44617 unsigned NumElts = BV->getNumOperands();
44618 SDValue Splat = BV->getSplatValue();
44619
44620 // Build MMX element from integer GPR or SSE float values.
44621 auto CreateMMXElement = [&](SDValue V) {
44622 if (V.isUndef())
44623 return DAG.getUNDEF(MVT::x86mmx);
44624 if (V.getValueType().isFloatingPoint()) {
44625 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44626 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44627 V = DAG.getBitcast(MVT::v2i64, V);
44628 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44629 }
44630 V = DAG.getBitcast(MVT::i32, V);
44631 } else {
44632 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44633 }
44634 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44635 };
44636
44637 // Convert build vector ops to MMX data in the bottom elements.
44638 SmallVector<SDValue, 8> Ops;
44639
44640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44641
44642 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44643 if (Splat) {
44644 if (Splat.isUndef())
44645 return DAG.getUNDEF(MVT::x86mmx);
44646
44647 Splat = CreateMMXElement(Splat);
44648
44649 if (Subtarget.hasSSE1()) {
44650 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44651 if (NumElts == 8)
44652 Splat = DAG.getNode(
44653 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44654 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44655 TLI.getPointerTy(DAG.getDataLayout())),
44656 Splat, Splat);
44657
44658 // Use PSHUFW to repeat 16-bit elements.
44659 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44660 return DAG.getNode(
44661 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44662 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44663 TLI.getPointerTy(DAG.getDataLayout())),
44664 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44665 }
44666 Ops.append(NumElts, Splat);
44667 } else {
44668 for (unsigned i = 0; i != NumElts; ++i)
44669 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44670 }
44671
44672 // Use tree of PUNPCKLs to build up general MMX vector.
44673 while (Ops.size() > 1) {
44674 unsigned NumOps = Ops.size();
44675 unsigned IntrinOp =
44676 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44677 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44678 : Intrinsic::x86_mmx_punpcklbw));
44679 SDValue Intrin = DAG.getTargetConstant(
44680 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44681 for (unsigned i = 0; i != NumOps; i += 2)
44682 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44683 Ops[i], Ops[i + 1]);
44684 Ops.resize(NumOps / 2);
44685 }
44686
44687 return Ops[0];
44688}
44689
44690// Recursive function that attempts to find if a bool vector node was originally
44691// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44692// integer. If so, replace the scalar ops with bool vector equivalents back down
44693// the chain.
44694static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44695 SelectionDAG &DAG,
44696 const X86Subtarget &Subtarget) {
44697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44698 unsigned Opc = V.getOpcode();
44699 switch (Opc) {
44700 case ISD::BITCAST: {
44701 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44702 SDValue Src = V.getOperand(0);
44703 EVT SrcVT = Src.getValueType();
44704 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44705 return DAG.getBitcast(VT, Src);
44706 break;
44707 }
44708 case ISD::TRUNCATE: {
44709 // If we find a suitable source, a truncated scalar becomes a subvector.
44710 SDValue Src = V.getOperand(0);
44711 EVT NewSrcVT =
44712 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44713 if (TLI.isTypeLegal(NewSrcVT))
44714 if (SDValue N0 =
44715 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44716 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44717 DAG.getIntPtrConstant(0, DL));
44718 break;
44719 }
44720 case ISD::ANY_EXTEND:
44721 case ISD::ZERO_EXTEND: {
44722 // If we find a suitable source, an extended scalar becomes a subvector.
44723 SDValue Src = V.getOperand(0);
44724 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44725 Src.getScalarValueSizeInBits());
44726 if (TLI.isTypeLegal(NewSrcVT))
44727 if (SDValue N0 =
44728 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44729 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44730 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44731 : DAG.getConstant(0, DL, VT),
44732 N0, DAG.getIntPtrConstant(0, DL));
44733 break;
44734 }
44735 case ISD::OR: {
44736 // If we find suitable sources, we can just move an OR to the vector domain.
44737 SDValue Src0 = V.getOperand(0);
44738 SDValue Src1 = V.getOperand(1);
44739 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44740 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44741 return DAG.getNode(Opc, DL, VT, N0, N1);
44742 break;
44743 }
44744 case ISD::SHL: {
44745 // If we find a suitable source, a SHL becomes a KSHIFTL.
44746 SDValue Src0 = V.getOperand(0);
44747 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44748 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44749 break;
44750
44751 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44752 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44753 return DAG.getNode(
44754 X86ISD::KSHIFTL, DL, VT, N0,
44755 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44756 break;
44757 }
44758 }
44759 return SDValue();
44760}
44761
44762static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44763 TargetLowering::DAGCombinerInfo &DCI,
44764 const X86Subtarget &Subtarget) {
44765 SDValue N0 = N->getOperand(0);
44766 EVT VT = N->getValueType(0);
44767 EVT SrcVT = N0.getValueType();
44768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44769
44770 // Try to match patterns such as
44771 // (i16 bitcast (v16i1 x))
44772 // ->
44773 // (i16 movmsk (16i8 sext (v16i1 x)))
44774 // before the setcc result is scalarized on subtargets that don't have legal
44775 // vxi1 types.
44776 if (DCI.isBeforeLegalize()) {
44777 SDLoc dl(N);
44778 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44779 return V;
44780
44781 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44782 // type, widen both sides to avoid a trip through memory.
44783 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44784 Subtarget.hasAVX512()) {
44785 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44786 N0 = DAG.getBitcast(MVT::v8i1, N0);
44787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44788 DAG.getIntPtrConstant(0, dl));
44789 }
44790
44791 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44792 // type, widen both sides to avoid a trip through memory.
44793 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44794 Subtarget.hasAVX512()) {
44795 // Use zeros for the widening if we already have some zeroes. This can
44796 // allow SimplifyDemandedBits to remove scalar ANDs that may be
44797 // downstream of this.
44798 // FIXME: It might make sense to detect a concat_vectors with a mix of
44799 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44800 // a separate combine. What we can't do is canonicalize the operands of
44801 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44802 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44803 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44804 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44805 SrcVT = LastOp.getValueType();
44806 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44807 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44808 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44809 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44810 N0 = DAG.getBitcast(MVT::i8, N0);
44811 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44812 }
44813 }
44814
44815 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44816 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44817 Ops[0] = N0;
44818 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44819 N0 = DAG.getBitcast(MVT::i8, N0);
44820 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44821 }
44822 } else {
44823 // If we're bitcasting from iX to vXi1, see if the integer originally
44824 // began as a vXi1 and whether we can remove the bitcast entirely.
44825 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44826 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44827 if (SDValue V =
44828 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44829 return V;
44830 }
44831 }
44832
44833 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44834 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44835 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44836 // we can help with known bits propagation from the vXi1 domain to the
44837 // scalar domain.
44838 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44839 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44840 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44841 isNullConstant(N0.getOperand(1)))
44842 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44843 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44844
44845 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44846 // and the vbroadcast_load are both integer or both fp. In some cases this
44847 // will remove the bitcast entirely.
44848 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44849 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44850 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44851 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44852 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44853 // Don't swap i8/i16 since we don't have fp types of that size.
44854 if (MemSize >= 32) {
44855 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44856 : MVT::getIntegerVT(MemSize);
44857 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44858 : MVT::getIntegerVT(SrcVTSize);
44859 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44860
44861 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44862 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44863 SDValue ResNode =
44864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44865 MemVT, BCast->getMemOperand());
44866 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44867 return DAG.getBitcast(VT, ResNode);
44868 }
44869 }
44870
44871 // Since MMX types are special and don't usually play with other vector types,
44872 // it's better to handle them early to be sure we emit efficient code by
44873 // avoiding store-load conversions.
44874 if (VT == MVT::x86mmx) {
44875 // Detect MMX constant vectors.
44876 APInt UndefElts;
44877 SmallVector<APInt, 1> EltBits;
44878 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44879 SDLoc DL(N0);
44880 // Handle zero-extension of i32 with MOVD.
44881 if (EltBits[0].countl_zero() >= 32)
44882 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44883 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44884 // Else, bitcast to a double.
44885 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44886 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44887 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44888 }
44889
44890 // Detect bitcasts to x86mmx low word.
44891 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44892 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44893 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44894 bool LowUndef = true, AllUndefOrZero = true;
44895 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44896 SDValue Op = N0.getOperand(i);
44897 LowUndef &= Op.isUndef() || (i >= e/2);
44898 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44899 }
44900 if (AllUndefOrZero) {
44901 SDValue N00 = N0.getOperand(0);
44902 SDLoc dl(N00);
44903 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44904 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44905 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44906 }
44907 }
44908
44909 // Detect bitcasts of 64-bit build vectors and convert to a
44910 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44911 // lowest element.
44912 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44913 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44914 SrcVT == MVT::v8i8))
44915 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44916
44917 // Detect bitcasts from element or subvector extractions to x86mmx.
44918 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44919 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44920 isNullConstant(N0.getOperand(1))) {
44921 SDValue N00 = N0.getOperand(0);
44922 if (N00.getValueType().is128BitVector())
44923 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44924 DAG.getBitcast(MVT::v2i64, N00));
44925 }
44926
44927 // Detect bitcasts from FP_TO_SINT to x86mmx.
44928 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44929 SDLoc DL(N0);
44930 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44931 DAG.getUNDEF(MVT::v2i32));
44932 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44933 DAG.getBitcast(MVT::v2i64, Res));
44934 }
44935 }
44936
44937 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44938 // most of these to scalar anyway.
44939 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44940 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44941 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44942 return combinevXi1ConstantToInteger(N0, DAG);
44943 }
44944
44945 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44946 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44947 isa<ConstantSDNode>(N0)) {
44948 auto *C = cast<ConstantSDNode>(N0);
44949 if (C->isAllOnes())
44950 return DAG.getConstant(1, SDLoc(N0), VT);
44951 if (C->isZero())
44952 return DAG.getConstant(0, SDLoc(N0), VT);
44953 }
44954
44955 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44956 // Turn it into a sign bit compare that produces a k-register. This avoids
44957 // a trip through a GPR.
44958 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44959 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44960 isPowerOf2_32(VT.getVectorNumElements())) {
44961 unsigned NumElts = VT.getVectorNumElements();
44962 SDValue Src = N0;
44963
44964 // Peek through truncate.
44965 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44966 Src = N0.getOperand(0);
44967
44968 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44969 SDValue MovmskIn = Src.getOperand(0);
44970 MVT MovmskVT = MovmskIn.getSimpleValueType();
44971 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44972
44973 // We allow extra bits of the movmsk to be used since they are known zero.
44974 // We can't convert a VPMOVMSKB without avx512bw.
44975 if (MovMskElts <= NumElts &&
44976 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44977 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44978 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44979 SDLoc dl(N);
44980 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44981 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44982 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44983 if (EVT(CmpVT) == VT)
44984 return Cmp;
44985
44986 // Pad with zeroes up to original VT to replace the zeroes that were
44987 // being used from the MOVMSK.
44988 unsigned NumConcats = NumElts / MovMskElts;
44989 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44990 Ops[0] = Cmp;
44991 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44992 }
44993 }
44994 }
44995
44996 // Try to remove bitcasts from input and output of mask arithmetic to
44997 // remove GPR<->K-register crossings.
44998 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44999 return V;
45000
45001 // Convert a bitcasted integer logic operation that has one bitcasted
45002 // floating-point operand into a floating-point logic operation. This may
45003 // create a load of a constant, but that is cheaper than materializing the
45004 // constant in an integer register and transferring it to an SSE register or
45005 // transferring the SSE operand to integer register and back.
45006 unsigned FPOpcode;
45007 switch (N0.getOpcode()) {
45008 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45009 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45010 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45011 default: return SDValue();
45012 }
45013
45014 // Check if we have a bitcast from another integer type as well.
45015 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45016 (Subtarget.hasSSE2() && VT == MVT::f64) ||
45017 (Subtarget.hasFP16() && VT == MVT::f16) ||
45018 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45019 TLI.isTypeLegal(VT))))
45020 return SDValue();
45021
45022 SDValue LogicOp0 = N0.getOperand(0);
45023 SDValue LogicOp1 = N0.getOperand(1);
45024 SDLoc DL0(N0);
45025
45026 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45027 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45028 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45029 LogicOp0.getOperand(0).getValueType() == VT &&
45030 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45031 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45032 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45033 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45034 }
45035 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45036 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45037 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45038 LogicOp1.getOperand(0).getValueType() == VT &&
45039 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45040 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45041 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45042 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45043 }
45044
45045 return SDValue();
45046}
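
The integer-to-FP logic rewrite above keeps bitwise ops in the SSE domain instead of bouncing the value through a GPR. A standalone sketch of the same idea for clearing a sign bit, once through integer bits and once through ANDPS-style FP logic (illustration only, not part of X86ISelLowering.cpp, assuming SSE2 intrinsics; both paths produce the same bits):

#include <emmintrin.h>
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  // Integer AND on the float's bits vs. the FP-domain ANDPS the combine prefers:
  // both clear the sign bit of -3.5f, but the second stays in an XMM register.
  float F = -3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(F));
  Bits &= 0x7FFFFFFFu;                       // integer-domain abs
  float IntDomain;
  std::memcpy(&IntDomain, &Bits, sizeof(F));

  __m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
  float FpDomain = _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(F), Mask));

  std::printf("%f %f\n", IntDomain, FpDomain); // both 3.5
  return 0;
}
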
45047
45048 // (mul (zext a), (sext b))
45049static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45050 SDValue &Op1) {
45051 Op0 = Mul.getOperand(0);
45052 Op1 = Mul.getOperand(1);
45053
45054 // Operand 1 should be the sign-extended value.
45055 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45056 std::swap(Op0, Op1);
45057
45058 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45059 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45060 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45061 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45062 return true;
45063
45064 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45065 return (BV && BV->isConstant());
45066 };
45067
45068 // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
45069 // value, we need to check that Op0 is a zero-extended value. Op1 must be a
45070 // signed value, so we just check its number of significant bits.
45071 if ((IsFreeTruncation(Op0) &&
45072 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45073 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45074 return true;
45075
45076 return false;
45077}
45078
45079 // Given an ABS node, detect the following pattern:
45080// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45081// This is useful as it is the input into a SAD pattern.
45082static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45083 SDValue AbsOp1 = Abs->getOperand(0);
45084 if (AbsOp1.getOpcode() != ISD::SUB)
45085 return false;
45086
45087 Op0 = AbsOp1.getOperand(0);
45088 Op1 = AbsOp1.getOperand(1);
45089
45090 // Check if the operands of the sub are zero-extended from vectors of i8.
45091 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45092 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45093 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45094 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45095 return false;
45096
45097 return true;
45098}
45099
45100static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45101 unsigned &LogBias, const SDLoc &DL,
45102 const X86Subtarget &Subtarget) {
45103 // Extend or truncate to MVT::i8 first.
45104 MVT Vi8VT =
45105 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45106 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45107 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45108
45109 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45110 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45111 // The src A, B element type is i8, but the dst C element type is i32.
45112 // When we calculate the reduction stages we use the src vector type vXi8,
45113 // so we need a log-bias of 2 to avoid 2 extra stages.
45114 LogBias = 2;
45115
45116 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45117 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45118 RegSize = std::max(512u, RegSize);
45119
45120 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45121 // fill in the missing vector elements with 0.
45122 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45123 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45124 Ops[0] = LHS;
45125 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45126 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45127 Ops[0] = RHS;
45128 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45129
45130 // Actually build the DotProduct, split as 256/512 bits for
45131 // AVXVNNI/AVX512VNNI.
45132 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45133 ArrayRef<SDValue> Ops) {
45134 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45135 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45136 };
45137 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45138 SDValue Zero = DAG.getConstant(0, DL, DpVT);
45139
45140 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45141 DpBuilder, false);
45142}
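
A scalar reference for the VPDPBUSD behaviour described in the comments above (illustration only, not part of X86ISelLowering.cpp, restricted to a single 128-bit group): each i32 accumulator lane absorbs the products of four unsigned bytes with four signed bytes, which is why the reduction starts with a log-bias of 2 (= log2(4)) stages already folded in.

#include <cstdint>
#include <cstdio>

int main() {
  // 16 x u8 and 16 x s8 inputs, 4 x i32 accumulators (one 128-bit "register").
  uint8_t A[16];
  int8_t B[16];
  int32_t C[4] = {0, 0, 0, 0};
  for (int i = 0; i != 16; ++i) { A[i] = uint8_t(i + 1); B[i] = int8_t(i - 8); }

  // C[k] += A[4k+0]*B[4k+0] + ... + A[4k+3]*B[4k+3]  (one vpdpbusd lane).
  for (int k = 0; k != 4; ++k)
    for (int j = 0; j != 4; ++j)
      C[k] += int32_t(A[4 * k + j]) * int32_t(B[4 * k + j]);

  // The full sum now only needs log2(4) = 2 more pairwise additions.
  int32_t Sum = (C[0] + C[1]) + (C[2] + C[3]);
  std::printf("dot product = %d\n", Sum);
  return 0;
}
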
45143
45144// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45145// to these zexts.
45146static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45147 const SDValue &Zext1, const SDLoc &DL,
45148 const X86Subtarget &Subtarget) {
45149 // Find the appropriate width for the PSADBW.
45150 EVT InVT = Zext0.getOperand(0).getValueType();
45151 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45152
45153 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45154 // fill in the missing vector elements with 0.
45155 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45156 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45157 Ops[0] = Zext0.getOperand(0);
45158 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45159 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45160 Ops[0] = Zext1.getOperand(0);
45161 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45162
45163 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45164 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45165 ArrayRef<SDValue> Ops) {
45166 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45167 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45168 };
45169 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45170 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45171 PSADBWBuilder);
45172}
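
For reference, PSADBW produces one 64-bit sum of absolute byte differences per 64-bit lane, which is what lets it act as the first step of the wide add reductions below. A standalone check against a scalar loop (illustration only, assuming an x86 compiler where the SSE2 intrinsics are available):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) uint8_t A[16], B[16];
  for (int i = 0; i != 16; ++i) { A[i] = uint8_t(3 * i); B[i] = uint8_t(200 - i); }

  // PSADBW: each 64-bit lane holds the sum of |A[i]-B[i]| over its 8 bytes.
  __m128i Sad = _mm_sad_epu8(_mm_load_si128((const __m128i *)A),
                             _mm_load_si128((const __m128i *)B));
  alignas(16) uint64_t Lanes[2];
  _mm_store_si128((__m128i *)Lanes, Sad);

  uint64_t Ref = 0;
  for (int i = 0; i != 16; ++i)
    Ref += (A[i] > B[i]) ? A[i] - B[i] : B[i] - A[i];

  std::printf("psadbw=%llu scalar=%llu\n",
              (unsigned long long)(Lanes[0] + Lanes[1]), (unsigned long long)Ref);
  return 0;
}
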
45173
45174 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
45175// PHMINPOSUW.
45176static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45177 const X86Subtarget &Subtarget) {
45178 // Bail without SSE41.
45179 if (!Subtarget.hasSSE41())
45180 return SDValue();
45181
45182 EVT ExtractVT = Extract->getValueType(0);
45183 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45184 return SDValue();
45185
45186 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45187 ISD::NodeType BinOp;
45188 SDValue Src = DAG.matchBinOpReduction(
45189 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45190 if (!Src)
45191 return SDValue();
45192
45193 EVT SrcVT = Src.getValueType();
45194 EVT SrcSVT = SrcVT.getScalarType();
45195 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45196 return SDValue();
45197
45198 SDLoc DL(Extract);
45199 SDValue MinPos = Src;
45200
45201 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45202 while (SrcVT.getSizeInBits() > 128) {
45203 SDValue Lo, Hi;
45204 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45205 SrcVT = Lo.getValueType();
45206 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45207 }
45208 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
45209 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
45210 "Unexpected value type")(static_cast <bool> (((SrcVT == MVT::v8i16 && ExtractVT
== MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT ==
MVT::i8)) && "Unexpected value type") ? void (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 45210, __extension__
__PRETTY_FUNCTION__))
;
45211
45212 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
45213 // to flip the value accordingly.
45214 SDValue Mask;
45215 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45216 if (BinOp == ISD::SMAX)
45217 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45218 else if (BinOp == ISD::SMIN)
45219 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45220 else if (BinOp == ISD::UMAX)
45221 Mask = DAG.getAllOnesConstant(DL, SrcVT);
45222
45223 if (Mask)
45224 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45225
45226 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45227 // shuffling each upper element down and inserting zeros. This means that the
45228 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45229 // ready for the PHMINPOS.
45230 if (ExtractVT == MVT::i8) {
45231 SDValue Upper = DAG.getVectorShuffle(
45232 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45233 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45234 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45235 }
45236
45237 // Perform the PHMINPOS on a v8i16 vector.
45238 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45239 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45240 MinPos = DAG.getBitcast(SrcVT, MinPos);
45241
45242 if (Mask)
45243 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45244
45245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45246 DAG.getIntPtrConstant(0, DL));
45247}
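
The mask/XOR trick used above can be checked in scalar form: PHMINPOSUW only computes an unsigned minimum, and XOR-ing with the signed-min, signed-max, or all-ones mask maps SMIN, SMAX, and UMAX onto that unsigned minimum and back (illustration only, not part of X86ISelLowering.cpp):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int16_t A = -5, B = 7;
  uint16_t UA = uint16_t(A), UB = uint16_t(B);

  // SMIN: flipping the sign bit maps signed order onto unsigned order.
  uint16_t SMin = std::min<uint16_t>(UA ^ 0x8000u, UB ^ 0x8000u) ^ 0x8000u;
  assert(int16_t(SMin) == std::min(A, B));

  // SMAX: XOR with the signed-max mask reverses that mapping.
  uint16_t SMax = std::min<uint16_t>(UA ^ 0x7FFFu, UB ^ 0x7FFFu) ^ 0x7FFFu;
  assert(int16_t(SMax) == std::max(A, B));

  // UMAX: XOR with all-ones reverses the unsigned order.
  uint16_t UMax = std::min<uint16_t>(UA ^ 0xFFFFu, UB ^ 0xFFFFu) ^ 0xFFFFu;
  assert(UMax == std::max(UA, UB));
  return 0;
}
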
45248
45249// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45250static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45251 const X86Subtarget &Subtarget) {
45252 // Bail without SSE2.
45253 if (!Subtarget.hasSSE2())
45254 return SDValue();
45255
45256 EVT ExtractVT = Extract->getValueType(0);
45257 unsigned BitWidth = ExtractVT.getSizeInBits();
45258 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45259 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45260 return SDValue();
45261
45262 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45263 ISD::NodeType BinOp;
45264 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45265 if (!Match && ExtractVT == MVT::i1)
45266 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45267 if (!Match)
45268 return SDValue();
45269
45270 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45271 // which we can't support here for now.
45272 if (Match.getScalarValueSizeInBits() != BitWidth)
45273 return SDValue();
45274
45275 SDValue Movmsk;
45276 SDLoc DL(Extract);
45277 EVT MatchVT = Match.getValueType();
45278 unsigned NumElts = MatchVT.getVectorNumElements();
45279 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45280 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45281 LLVMContext &Ctx = *DAG.getContext();
45282
45283 if (ExtractVT == MVT::i1) {
45284 // Special case for (pre-legalization) vXi1 reductions.
45285 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45286 return SDValue();
45287 if (Match.getOpcode() == ISD::SETCC) {
45288 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45289 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45290 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45291 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45292 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45293 X86::CondCode X86CC;
45294 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45295 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45296 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45297 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45298 DAG, X86CC))
45299 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45300 getSETCC(X86CC, V, DL, DAG));
45301 }
45302 }
45303 if (TLI.isTypeLegal(MatchVT)) {
45304 // If this is a legal AVX512 predicate type then we can just bitcast.
45305 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45306 Movmsk = DAG.getBitcast(MovmskVT, Match);
45307 } else {
45308 // Use combineBitcastvxi1 to create the MOVMSK.
45309 while (NumElts > MaxElts) {
45310 SDValue Lo, Hi;
45311 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45312 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45313 NumElts /= 2;
45314 }
45315 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45316 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45317 }
45318 if (!Movmsk)
45319 return SDValue();
45320 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45321 } else {
45322 // FIXME: Better handling of k-registers or 512-bit vectors?
45323 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45324 if (!(MatchSizeInBits == 128 ||
45325 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45326 return SDValue();
45327
45328 // Make sure this isn't a vector of 1 element. The perf win from using
45329 // MOVMSK diminishes with fewer elements in the reduction, but it is
45330 // generally better to get the comparison over to the GPRs as soon as
45331 // possible to reduce the number of vector ops.
45332 if (Match.getValueType().getVectorNumElements() < 2)
45333 return SDValue();
45334
45335 // Check that we are extracting a reduction of all sign bits.
45336 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45337 return SDValue();
45338
45339 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45340 SDValue Lo, Hi;
45341 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45342 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45343 MatchSizeInBits = Match.getValueSizeInBits();
45344 }
45345
45346 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45347 MVT MaskSrcVT;
45348 if (64 == BitWidth || 32 == BitWidth)
45349 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45350 MatchSizeInBits / BitWidth);
45351 else
45352 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45353
45354 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45355 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45356 NumElts = MaskSrcVT.getVectorNumElements();
45357 }
45358 assert((NumElts <= 32 || NumElts == 64) &&
45359 "Not expecting more than 64 elements");
45360
45361 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45362 if (BinOp == ISD::XOR) {
45363 // parity -> (PARITY(MOVMSK X))
45364 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45365 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45366 }
45367
45368 SDValue CmpC;
45369 ISD::CondCode CondCode;
45370 if (BinOp == ISD::OR) {
45371 // any_of -> MOVMSK != 0
45372 CmpC = DAG.getConstant(0, DL, CmpVT);
45373 CondCode = ISD::CondCode::SETNE;
45374 } else {
45375 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45376 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45377 DL, CmpVT);
45378 CondCode = ISD::CondCode::SETEQ;
45379 }
45380
45381 // The setcc produces an i8 of 0/1, so extend that to the result width and
45382 // negate to get the final 0/-1 mask value.
45383 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45384 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45385 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45386 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45387 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45388}
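
A standalone sketch of the MOVMSK-based predicate reductions produced above, for a v4f32 compare (illustration only, assuming SSE intrinsics; __builtin_parity is a GCC/Clang builtin standing in for the PARITY node):

#include <xmmintrin.h>
#include <cstdio>

int main() {
  // A v4f32 compare yields an all-ones/all-zeros element per lane; MOVMSKPS
  // packs the four sign bits into the low bits of a GPR.
  __m128 X = _mm_set_ps(4.0f, -1.0f, 2.0f, 0.5f);
  __m128 Y = _mm_set1_ps(3.0f);
  int Mask = _mm_movemask_ps(_mm_cmplt_ps(X, Y)); // bit i set iff X[i] < Y[i]

  bool AnyOf = Mask != 0;               // OR-reduction of the predicate
  bool AllOf = Mask == 0xF;             // AND-reduction: all four bits set
  bool Parity = __builtin_parity(Mask); // XOR-reduction (GCC/Clang builtin)

  std::printf("mask=0x%x any=%d all=%d parity=%d\n",
              unsigned(Mask), AnyOf, AllOf, Parity);
  return 0;
}
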
45389
45390static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45391 const X86Subtarget &Subtarget) {
45392 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45393 return SDValue();
45394
45395 EVT ExtractVT = Extract->getValueType(0);
45396 // Verify the type we're extracting is i32, as the output element type of
45397 // vpdpbusd is i32.
45398 if (ExtractVT != MVT::i32)
45399 return SDValue();
45400
45401 EVT VT = Extract->getOperand(0).getValueType();
45402 if (!isPowerOf2_32(VT.getVectorNumElements()))
45403 return SDValue();
45404
45405 // Match shuffle + add pyramid.
45406 ISD::NodeType BinOp;
45407 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45408
45409 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45410 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
45411 // before adding into the accumulator.
45412 // TODO:
45413 // We also need to verify that the multiply has at least 2x the number of bits
45414 // of the input. We shouldn't match
45415 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45416 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45417 // Root = Root.getOperand(0);
45418
45419 // If there was a match, we want Root to be a mul.
45420 if (!Root || Root.getOpcode() != ISD::MUL)
45421 return SDValue();
45422
45423 // Check whether we have an extend and mul pattern
45424 SDValue LHS, RHS;
45425 if (!detectExtMul(DAG, Root, LHS, RHS))
45426 return SDValue();
45427
45428 // Create the dot product instruction.
45429 SDLoc DL(Extract);
45430 unsigned StageBias;
45431 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45432
45433 // If the original vector was wider than 4 elements, sum over the results
45434 // in the DP vector.
45435 unsigned Stages = Log2_32(VT.getVectorNumElements());
45436 EVT DpVT = DP.getValueType();
45437
45438 if (Stages > StageBias) {
45439 unsigned DpElems = DpVT.getVectorNumElements();
45440
45441 for (unsigned i = Stages - StageBias; i > 0; --i) {
45442 SmallVector<int, 16> Mask(DpElems, -1);
45443 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45444 Mask[j] = MaskEnd + j;
45445
45446 SDValue Shuffle =
45447 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45448 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45449 }
45450 }
45451
45452 // Return the lowest ExtractSizeInBits bits.
45453 EVT ResVT =
45454 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45455 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45456 DP = DAG.getBitcast(ResVT, DP);
45457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45458 Extract->getOperand(1));
45459}
45460
45461static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45462 const X86Subtarget &Subtarget) {
45463 // PSADBW is only supported on SSE2 and up.
45464 if (!Subtarget.hasSSE2())
1
Taking false branch
45465 return SDValue();
45466
45467 EVT ExtractVT = Extract->getValueType(0);
45468 // Verify the type we're extracting is either i32 or i64.
45469 // FIXME: Could support other types, but this is what we have coverage for.
45470 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45471 return SDValue();
45472
45473 EVT VT = Extract->getOperand(0).getValueType();
45474 if (!isPowerOf2_32(VT.getVectorNumElements()))
2
Taking false branch
45475 return SDValue();
45476
45477 // Match shuffle + add pyramid.
45478 ISD::NodeType BinOp;
45479 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45480
45481 // The operand is expected to be zero extended from i8
45482 // (verified in detectZextAbsDiff).
45483 // In order to convert to i64 and above, an additional any/zero/sign
45484 // extend is expected.
45485 // The zero extend from 32 bits has no mathematical effect on the result.
45486 // The sign extend is also effectively a zero extend
45487 // (it extends the sign bit, which is zero).
45488 // So it is correct to skip the sign/zero extend instruction.
45489 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
3
Assuming the condition is false
45490 Root.getOpcode() == ISD::ZERO_EXTEND ||
4
Assuming the condition is false
45491 Root.getOpcode() == ISD::ANY_EXTEND))
5
Assuming the condition is false
45492 Root = Root.getOperand(0);
45493
45494 // If there was a match, we want Root to be an ABS node that is the root of
45495 // an abs-diff pattern.
45496 if (!Root || Root.getOpcode() != ISD::ABS)
6
Assuming the condition is false
7
Taking false branch
45497 return SDValue();
45498
45499 // Check whether we have an abs-diff pattern feeding into the ABS.
45500 SDValue Zext0, Zext1;
45501 if (!detectZextAbsDiff(Root, Zext0, Zext1))
8
Taking false branch
45502 return SDValue();
45503
45504 // Create the SAD instruction.
45505 SDLoc DL(Extract);
45506 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45507
45508 // If the original vector was wider than 8 elements, sum over the results
45509 // in the SAD vector.
45510 unsigned Stages = Log2_32(VT.getVectorNumElements());
45511 EVT SadVT = SAD.getValueType();
45512 if (Stages > 3) {
8.1
'Stages' is > 3
9
Taking true branch
45513 unsigned SadElems = SadVT.getVectorNumElements();
45514
45515 for(unsigned i = Stages - 3; i > 0; --i) {
10
Loop condition is true. Entering loop body
45516 SmallVector<int, 16> Mask(SadElems, -1);
45517 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
11
The result of the left shift is undefined due to shifting by '4294967291', which is greater or equal to the width of type 'int'
45518 Mask[j] = MaskEnd + j;
45519
45520 SDValue Shuffle =
45521 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45522 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45523 }
45524 }
45525
45526 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45527 // Return the lowest ExtractSizeInBits bits.
45528 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45529 SadVT.getSizeInBits() / ExtractSizeInBits);
45530 SAD = DAG.getBitcast(ResVT, SAD);
45531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45532 Extract->getOperand(1));
45533}
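
On the diagnostic at line 45517 above: Stages comes from Log2_32(VT.getVectorNumElements()), which the analyzer does not bound to 31, so it explores a path where Stages is near UINT_MAX; on that path i - 1 becomes 4294967291 and 1 << (i - 1) shifts a 32-bit int by far more than its width, which is undefined behaviour. Assuming the usual LLVM definitions of isPowerOf2_32 and Log2_32, the earlier power-of-two check keeps the real shift amount below 32, so this looks like an over-approximated path rather than a reachable one. A minimal standalone reproduction of the shift rule itself (illustration only; the guard shown is just one hypothetical way to keep the amount in range, not a proposed fix to the LLVM code):

#include <cstdio>

int main() {
  unsigned i = 4294967292u;            // the value the analyzer assumes for 'i'
  unsigned ShiftAmt = i - 1;           // 4294967291, i.e. >= 32
  // unsigned MaskEnd = 1 << ShiftAmt; // undefined: a 32-bit int shifted by >= 32
  unsigned MaskEnd = 0;
  if (ShiftAmt < 32)                   // hypothetical guard keeping the shift defined
    MaskEnd = 1u << ShiftAmt;
  std::printf("ShiftAmt=%u MaskEnd=%u\n", ShiftAmt, MaskEnd);
  return 0;
}
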
45534
45535// Attempt to peek through a target shuffle and extract the scalar from the
45536// source.
45537static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45538 TargetLowering::DAGCombinerInfo &DCI,
45539 const X86Subtarget &Subtarget) {
45540 if (DCI.isBeforeLegalizeOps())
45541 return SDValue();
45542
45543 SDLoc dl(N);
45544 SDValue Src = N->getOperand(0);
45545 SDValue Idx = N->getOperand(1);
45546
45547 EVT VT = N->getValueType(0);
45548 EVT SrcVT = Src.getValueType();
45549 EVT SrcSVT = SrcVT.getVectorElementType();
45550 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45551 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45552
45553 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45554 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45555 return SDValue();
45556
45557 const APInt &IdxC = N->getConstantOperandAPInt(1);
45558 if (IdxC.uge(NumSrcElts))
45559 return SDValue();
45560
45561 SDValue SrcBC = peekThroughBitcasts(Src);
45562
45563 // Handle extract(bitcast(broadcast(scalar_value))).
45564 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45565 SDValue SrcOp = SrcBC.getOperand(0);
45566 EVT SrcOpVT = SrcOp.getValueType();
45567 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45568 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45569 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45570 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45571 // TODO support non-zero offsets.
45572 if (Offset == 0) {
45573 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45574 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45575 return SrcOp;
45576 }
45577 }
45578 }
45579
45580 // If we're extracting a single element from a broadcast load and there are
45581 // no other users, just create a single load.
45582 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45583 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45584 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45585 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45586 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45587 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45588 MemIntr->getBasePtr(),
45589 MemIntr->getPointerInfo(),
45590 MemIntr->getOriginalAlign(),
45591 MemIntr->getMemOperand()->getFlags());
45592 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45593 return Load;
45594 }
45595 }
45596
45597 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45598 // TODO: Move to DAGCombine?
45599 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45600 SrcBC.getValueType().isInteger() &&
45601 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45602 SrcBC.getScalarValueSizeInBits() ==
45603 SrcBC.getOperand(0).getValueSizeInBits()) {
45604 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45605 if (IdxC.ult(Scale)) {
45606 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45607 SDValue Scl = SrcBC.getOperand(0);
45608 EVT SclVT = Scl.getValueType();
45609 if (Offset) {
45610 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45611 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45612 }
45613 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45614 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45615 return Scl;
45616 }
45617 }
45618
45619 // Handle extract(truncate(x)) for 0'th index.
45620 // TODO: Treat this as a faux shuffle?
45621 // TODO: When can we use this for general indices?
45622 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45623 (SrcVT.getSizeInBits() % 128) == 0) {
45624 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45625 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45626 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45627 Idx);
45628 }
45629
45630 // We can only legally extract other elements from 128-bit vectors and in
45631 // certain circumstances, depending on SSE-level.
45632 // TODO: Investigate float/double extraction if it will be just stored.
45633 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45634 unsigned Idx) {
45635 EVT VecSVT = VecVT.getScalarType();
45636 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45637 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45638 VecSVT == MVT::i64)) {
45639 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45640 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45641 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45642 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45643 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45644 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45645 Idx &= (NumEltsPerLane - 1);
45646 }
45647 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45648 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45649 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45650 DAG.getBitcast(VecVT, Vec),
45651 DAG.getIntPtrConstant(Idx, dl));
45652 }
45653 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45654 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45655 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45656 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45657 DAG.getTargetConstant(Idx, dl, MVT::i8));
45658 }
45659 return SDValue();
45660 };
45661
45662 // Resolve the target shuffle inputs and mask.
45663 SmallVector<int, 16> Mask;
45664 SmallVector<SDValue, 2> Ops;
45665 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45666 return SDValue();
45667
45668 // Shuffle inputs must be the same size as the result.
45669 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45670 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45671 }))
45672 return SDValue();
45673
45674 // Attempt to narrow/widen the shuffle mask to the correct size.
45675 if (Mask.size() != NumSrcElts) {
45676 if ((NumSrcElts % Mask.size()) == 0) {
45677 SmallVector<int, 16> ScaledMask;
45678 int Scale = NumSrcElts / Mask.size();
45679 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45680 Mask = std::move(ScaledMask);
45681 } else if ((Mask.size() % NumSrcElts) == 0) {
45682 // Simplify Mask based on demanded element.
45683 int ExtractIdx = (int)IdxC.getZExtValue();
45684 int Scale = Mask.size() / NumSrcElts;
45685 int Lo = Scale * ExtractIdx;
45686 int Hi = Scale * (ExtractIdx + 1);
45687 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45688 if (i < Lo || Hi <= i)
45689 Mask[i] = SM_SentinelUndef;
45690
45691 SmallVector<int, 16> WidenedMask;
45692 while (Mask.size() > NumSrcElts &&
45693 canWidenShuffleElements(Mask, WidenedMask))
45694 Mask = std::move(WidenedMask);
45695 }
45696 }
45697
45698 // If narrowing/widening failed, see if we can extract+zero-extend.
45699 int ExtractIdx;
45700 EVT ExtractVT;
45701 if (Mask.size() == NumSrcElts) {
45702 ExtractIdx = Mask[IdxC.getZExtValue()];
45703 ExtractVT = SrcVT;
45704 } else {
45705 unsigned Scale = Mask.size() / NumSrcElts;
45706 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45707 return SDValue();
45708 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45709 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45710 return SDValue();
45711 ExtractIdx = Mask[ScaledIdx];
45712 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45713 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45714 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45715 "Failed to widen vector type");
45716 }
45717
45718 // If the shuffle source element is undef/zero then we can just accept it.
45719 if (ExtractIdx == SM_SentinelUndef)
45720 return DAG.getUNDEF(VT);
45721
45722 if (ExtractIdx == SM_SentinelZero)
45723 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45724 : DAG.getConstant(0, dl, VT);
45725
45726 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45727 ExtractIdx = ExtractIdx % Mask.size();
45728 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45729 return DAG.getZExtOrTrunc(V, dl, VT);
45730
45731 return SDValue();
45732}
45733
45734/// Extracting a scalar FP value from vector element 0 is free, so extract each
45735/// operand first, then perform the math as a scalar op.
45736static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45737 const X86Subtarget &Subtarget) {
45738 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45739 SDValue Vec = ExtElt->getOperand(0);
45740 SDValue Index = ExtElt->getOperand(1);
45741 EVT VT = ExtElt->getValueType(0);
45742 EVT VecVT = Vec.getValueType();
45743
45744 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45745 // non-zero element because the shuffle+scalar op will be cheaper?
45746 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45747 return SDValue();
45748
45749 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45750 // extract, the condition code), so deal with those as a special-case.
45751 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45752 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45753 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45754 return SDValue();
45755
45756 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45757 SDLoc DL(ExtElt);
45758 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45759 Vec.getOperand(0), Index);
45760 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45761 Vec.getOperand(1), Index);
45762 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45763 }
45764
45765 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45766 VT != MVT::f64)
45767 return SDValue();
45768
45769 // Vector FP selects don't fit the pattern of FP math ops (because the
45770 // condition has a different type and we have to change the opcode), so deal
45771 // with those here.
45772 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45773 // has i1 elements. If we loosen this we need to convert vector bool to a
45774 // scalar bool.
45775 if (Vec.getOpcode() == ISD::VSELECT &&
45776 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45777 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45778 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45779 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45780 SDLoc DL(ExtElt);
45781 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45782 Vec.getOperand(0).getValueType().getScalarType(),
45783 Vec.getOperand(0), Index);
45784 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45785 Vec.getOperand(1), Index);
45786 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45787 Vec.getOperand(2), Index);
45788 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45789 }
45790
45791 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45792 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45793 // missed load folding and fma+fneg combining.
45794 switch (Vec.getOpcode()) {
45795 case ISD::FMA: // Begin 3 operands
45796 case ISD::FMAD:
45797 case ISD::FADD: // Begin 2 operands
45798 case ISD::FSUB:
45799 case ISD::FMUL:
45800 case ISD::FDIV:
45801 case ISD::FREM:
45802 case ISD::FCOPYSIGN:
45803 case ISD::FMINNUM:
45804 case ISD::FMAXNUM:
45805 case ISD::FMINNUM_IEEE:
45806 case ISD::FMAXNUM_IEEE:
45807 case ISD::FMAXIMUM:
45808 case ISD::FMINIMUM:
45809 case X86ISD::FMAX:
45810 case X86ISD::FMIN:
45811 case ISD::FABS: // Begin 1 operand
45812 case ISD::FSQRT:
45813 case ISD::FRINT:
45814 case ISD::FCEIL:
45815 case ISD::FTRUNC:
45816 case ISD::FNEARBYINT:
45817 case ISD::FROUND:
45818 case ISD::FFLOOR:
45819 case X86ISD::FRCP:
45820 case X86ISD::FRSQRT: {
45821 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45822 SDLoc DL(ExtElt);
45823 SmallVector<SDValue, 4> ExtOps;
45824 for (SDValue Op : Vec->ops())
45825 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45826 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45827 }
45828 default:
45829 return SDValue();
45830 }
45831 llvm_unreachable("All opcodes should return within switch");
45832}
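
The scalarization above relies on element 0 of an SSE register being readable as a scalar for free, so extracting the operands first and doing scalar math costs no more than doing the vector op and extracting afterwards. A standalone illustration for FADD (illustration only, assuming SSE intrinsics):

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 X = _mm_set_ps(8.0f, 7.0f, 6.0f, 1.5f);  // element 0 is 1.5f
  __m128 Y = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.25f); // element 0 is 0.25f

  // extract (fadd X, Y), 0
  float VecThenExtract = _mm_cvtss_f32(_mm_add_ps(X, Y));
  // fadd (extract X, 0), (extract Y, 0) -- the form the combine produces
  float ExtractThenAdd = _mm_cvtss_f32(X) + _mm_cvtss_f32(Y);

  std::printf("%f %f\n", VecThenExtract, ExtractThenAdd); // both 1.75
  return 0;
}
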
45833
45834/// Try to convert a vector reduction sequence composed of binops and shuffles
45835/// into horizontal ops.
45836static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45837 const X86Subtarget &Subtarget) {
45838 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45839
45841 // We need at least SSE2 to do anything here.
45841 if (!Subtarget.hasSSE2())
45842 return SDValue();
45843
45844 ISD::NodeType Opc;
45845 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45846 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45847 if (!Rdx)
45848 return SDValue();
45849
45850 SDValue Index = ExtElt->getOperand(1);
45851 assert(isNullConstant(Index) &&
45852 "Reduction doesn't end in an extract from index 0");
45853
45854 EVT VT = ExtElt->getValueType(0);
45855 EVT VecVT = Rdx.getValueType();
45856 if (VecVT.getScalarType() != VT)
45857 return SDValue();
45858
45859 SDLoc DL(ExtElt);
45860 unsigned NumElts = VecVT.getVectorNumElements();
45861 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45862
45863 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45864 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45865 if (V.getValueType() == MVT::v4i8) {
45866 if (ZeroExtend && Subtarget.hasSSE41()) {
45867 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45868 DAG.getConstant(0, DL, MVT::v4i32),
45869 DAG.getBitcast(MVT::i32, V),
45870 DAG.getIntPtrConstant(0, DL));
45871 return DAG.getBitcast(MVT::v16i8, V);
45872 }
45873 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45874 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45875 : DAG.getUNDEF(MVT::v4i8));
45876 }
45877 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45878 DAG.getUNDEF(MVT::v8i8));
45879 };
45880
45881 // vXi8 mul reduction - promote to vXi16 mul reduction.
45882 if (Opc == ISD::MUL) {
45883 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45884 return SDValue();
45885 if (VecVT.getSizeInBits() >= 128) {
45886 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45887 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45888 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45889 Lo = DAG.getBitcast(WideVT, Lo);
45890 Hi = DAG.getBitcast(WideVT, Hi);
45891 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45892 while (Rdx.getValueSizeInBits() > 128) {
45893 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45894 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45895 }
45896 } else {
45897 Rdx = WidenToV16I8(Rdx, false);
45898 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45899 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45900 }
45901 if (NumElts >= 8)
45902 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45903 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45904 {4, 5, 6, 7, -1, -1, -1, -1}));
45905 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45906 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45907 {2, 3, -1, -1, -1, -1, -1, -1}));
45908 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45909 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45910 {1, -1, -1, -1, -1, -1, -1, -1}));
45911 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45912 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45913 }
45914
45915 // vXi8 add reduction - sub-128-bit vector.
45916 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45917 Rdx = WidenToV16I8(Rdx, true);
45918 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45919 DAG.getConstant(0, DL, MVT::v16i8));
45920 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45921 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45922 }
45923
45924 // Must be a >=128-bit vector with pow2 elements.
45925 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45926 return SDValue();
45927
45928 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45929 if (VT == MVT::i8) {
45930 while (Rdx.getValueSizeInBits() > 128) {
45931 SDValue Lo, Hi;
45932 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45933 VecVT = Lo.getValueType();
45934 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45935 }
45936 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45937
45938 SDValue Hi = DAG.getVectorShuffle(
45939 MVT::v16i8, DL, Rdx, Rdx,
45940 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45941 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45942 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45943 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45944 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45946 }
45947
45948 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45949 // If the source vector values are 0-255, then we can use PSADBW to
45950 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45951 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45952 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45953 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45954 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45955 Subtarget.hasAVX512())) {
45956 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45957 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45958 if (ByteVT.getSizeInBits() < 128)
45959 Rdx = WidenToV16I8(Rdx, true);
45960
45961 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45962 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45963 ArrayRef<SDValue> Ops) {
45964 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45965 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45966 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45967 };
45968 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45969 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45970
45971 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45972 while (Rdx.getValueSizeInBits() > 128) {
45973 SDValue Lo, Hi;
45974 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45975 VecVT = Lo.getValueType();
45976 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45977 }
45978 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45979
45980 if (NumElts > 8) {
45981 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45982 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45983 }
45984
45985 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45986 Rdx = DAG.getBitcast(VecVT, Rdx);
45987 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45988 }
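Both PSADBW-based reductions above rely on the same fact: a sum of absolute differences against an all-zero vector is simply the sum of the unsigned bytes in each 8-byte group. A minimal standalone sketch of that idea (illustrative C++ only, not part of the LLVM source; the helper name psadbwAgainstZero is made up):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Model of PSADBW against zero: each 8-byte group collapses to one u64 sum.
    static std::array<uint64_t, 2> psadbwAgainstZero(const std::array<uint8_t, 16> &V) {
      std::array<uint64_t, 2> Out{0, 0};
      for (int i = 0; i < 16; ++i)
        Out[i / 8] += V[i]; // |v - 0| == v for unsigned bytes
      return Out;
    }

    int main() {
      std::array<uint8_t, 16> V{};
      for (int i = 0; i < 16; ++i)
        V[i] = uint8_t(i + 1);
      auto Sad = psadbwAgainstZero(V);
      // Final reduction step: add the two 64-bit partial sums.
      printf("sum = %llu\n", (unsigned long long)(Sad[0] + Sad[1])); // 136
      return 0;
    }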
45989
45990 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
45991 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45992 return SDValue();
45993
45994 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45995
45996 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45997 // across the whole vector, so we need an extract + hop preliminary stage.
45998 // This is the only step where the operands of the hop are not the same value.
45999 // TODO: We could extend this to handle 512-bit or even longer vectors.
46000 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46001 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46002 unsigned NumElts = VecVT.getVectorNumElements();
46003 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46004 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46005 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46006 VecVT = Rdx.getValueType();
46007 }
46008 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46009 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46010 return SDValue();
46011
46012 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46013 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46014 for (unsigned i = 0; i != ReductionSteps; ++i)
46015 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46016
46017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46018}
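The (F)HADD path at the end of combineArithReduction works because Log2_32(NumElts) rounds of hadd(X, X) leave the full sum in lane 0. A standalone sketch of that shape, assuming plain C++ (haddSelf is an illustrative name, and this models only the 128-bit HADD form, not the per-128-bit-lane behaviour of the 256-bit instructions):

    #include <cstdio>
    #include <vector>

    // One HADD(x, x) step: lane i of the low half is x[2i] + x[2i+1], and the
    // high half repeats the same values (only lane 0 is needed at the end).
    static std::vector<int> haddSelf(const std::vector<int> &X) {
      std::vector<int> R(X.size());
      for (size_t i = 0; i < X.size() / 2; ++i)
        R[i] = R[i + X.size() / 2] = X[2 * i] + X[2 * i + 1];
      return R;
    }

    int main() {
      std::vector<int> V{1, 2, 3, 4, 5, 6, 7, 8}; // v8i32-style input
      unsigned Steps = 3;                         // Log2_32(8) reduction steps
      for (unsigned i = 0; i != Steps; ++i)
        V = haddSelf(V);
      printf("reduced sum in lane 0: %d\n", V[0]); // 36
      return 0;
    }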
46019
46020/// Detect vector gather/scatter index generation and convert it from being a
46021/// bunch of shuffles and extracts into a somewhat faster sequence.
46022/// For i686, the best sequence is apparently storing the value and loading
46023/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46024static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46025 TargetLowering::DAGCombinerInfo &DCI,
46026 const X86Subtarget &Subtarget) {
46027 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46028 return NewOp;
46029
46030 SDValue InputVector = N->getOperand(0);
46031 SDValue EltIdx = N->getOperand(1);
46032 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46033
46034 EVT SrcVT = InputVector.getValueType();
46035 EVT VT = N->getValueType(0);
46036 SDLoc dl(InputVector);
46037 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46038 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46039 unsigned NumEltBits = VT.getScalarSizeInBits();
46040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46041
46042 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46043 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46044
46045 // Integer Constant Folding.
46046 if (CIdx && VT.isInteger()) {
46047 APInt UndefVecElts;
46048 SmallVector<APInt, 16> EltBits;
46049 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46050 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46051 EltBits, true, false)) {
46052 uint64_t Idx = CIdx->getZExtValue();
46053 if (UndefVecElts[Idx])
46054 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46055 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46056 }
46057
46058 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
46059 // Improves lowering of bool masks in Rust, which splits them into a byte array.
46060 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46061 SDValue Src = peekThroughBitcasts(InputVector);
46062 if (Src.getValueType().getScalarType() == MVT::i1 &&
46063 TLI.isTypeLegal(Src.getValueType())) {
46064 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46065 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46066 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
46067 return DAG.getBitcast(VT, Sub);
46068 }
46069 }
46070 }
46071
46072 if (IsPextr) {
46073 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46074 DCI))
46075 return SDValue(N, 0);
46076
46077 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46078 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46079 InputVector.getOpcode() == X86ISD::PINSRW) &&
46080 InputVector.getOperand(2) == EltIdx) {
46081 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46082 "Vector type mismatch");
46083 SDValue Scl = InputVector.getOperand(1);
46084 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46085 return DAG.getZExtOrTrunc(Scl, dl, VT);
46086 }
46087
46088 // TODO - Remove this once we can handle the implicit zero-extension of
46089 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46090 // combineBasicSADPattern.
46091 return SDValue();
46092 }
46093
46094 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
46095 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46096 InputVector.getOpcode() == ISD::BITCAST &&
46097 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46098 isNullConstant(EltIdx) && InputVector.hasOneUse())
46099 return DAG.getBitcast(VT, InputVector);
46100
46101 // Detect mmx to i32 conversion through a v2i32 elt extract.
46102 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46103 InputVector.getOpcode() == ISD::BITCAST &&
46104 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46105 isNullConstant(EltIdx) && InputVector.hasOneUse())
46106 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46107 InputVector.getOperand(0));
46108
46109 // Check whether this extract is the root of a sum of absolute differences
46110 // pattern. This has to be done here because we really want it to happen
46111 // pre-legalization.
46112 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46113 return SAD;
46114
46115 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46116 return VPDPBUSD;
46117
46118 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46119 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46120 return Cmp;
46121
46122 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46123 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46124 return MinMax;
46125
46126 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion, etc.
46127 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46128 return V;
46129
46130 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
46131 return V;
46132
46133 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
46134 // and then testing the relevant element.
46135 //
46136 // Note that we only combine extracts on the *same* result number, i.e.
46137 // t0 = merge_values a0, a1, a2, a3
46138 // i1 = extract_vector_elt t0, Constant:i64<2>
46139 // i1 = extract_vector_elt t0, Constant:i64<3>
46140 // but not
46141 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46142 // since the latter would need its own MOVMSK.
46143 if (SrcVT.getScalarType() == MVT::i1) {
46144 bool IsVar = !CIdx;
46145 SmallVector<SDNode *, 16> BoolExtracts;
46146 unsigned ResNo = InputVector.getResNo();
46147 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46148 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46149 Use->getOperand(0).getResNo() == ResNo &&
46150 Use->getValueType(0) == MVT::i1) {
46151 BoolExtracts.push_back(Use);
46152 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46153 return true;
46154 }
46155 return false;
46156 };
46157 // TODO: Can we drop the oneuse check for constant extracts?
46158 if (all_of(InputVector->uses(), IsBoolExtract) &&
46159 (IsVar || BoolExtracts.size() > 1)) {
46160 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46161 if (SDValue BC =
46162 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46163 for (SDNode *Use : BoolExtracts) {
46164 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46165 // Mask = 1 << MaskIdx
46166 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46167 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46168 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46169 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46170 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46171 DCI.CombineTo(Use, Res);
46172 }
46173 return SDValue(N, 0);
46174 }
46175 }
46176 }
46177
46178 // If this extract is from a loaded vector value and will be used as an
46179 // integer, that requires a potentially expensive XMM -> GPR transfer.
46180 // Additionally, if we can convert to a scalar integer load, that will likely
46181 // be folded into a subsequent integer op.
46182 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
46183 // to a single-use of the loaded vector. For the reasons above, we
46184 // expect this to be profitable even if it creates an extra load.
46185 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
46186 return Use->getOpcode() == ISD::STORE ||
46187 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46188 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46189 });
46190 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
46191 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46192 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
46193 !LikelyUsedAsVector && LoadVec->isSimple()) {
46194 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46195 SDValue NewPtr =
46196 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
46197 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
46198 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46199 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46200 SDValue Load =
46201 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46202 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46203 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46204 return Load;
46205 }
46206
46207 return SDValue();
46208}
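The MOVMSK-based i1 extraction in combineExtractVectorElt boils down to packing one sign bit per lane into a scalar mask and then doing a bit test, i.e. ((movmsk X) & (1 << Idx)) == (1 << Idx). A small sketch using the SSE2 intrinsics, under the assumption that the bool vector comes from a signed greater-than compare (extractBoolElt is an illustrative helper):

    #include <emmintrin.h> // SSE2: _mm_movemask_epi8, _mm_cmpgt_epi8, ...
    #include <cstdio>

    // Extract "element Idx" of a v16i8 compare result as a bool via MOVMSK.
    static bool extractBoolElt(__m128i CmpResult, int Idx) {
      int Msk = _mm_movemask_epi8(CmpResult); // one sign bit per byte lane
      int Bit = 1 << Idx;
      return (Msk & Bit) == Bit;
    }

    int main() {
      __m128i A = _mm_setr_epi8(5, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
      __m128i Gt = _mm_cmpgt_epi8(A, _mm_setzero_si128()); // all-ones where A[i] > 0
      printf("%d %d\n", extractBoolElt(Gt, 0), extractBoolElt(Gt, 1)); // 1 0
      return 0;
    }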
46209
46210// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46211// This is more or less the reverse of combineBitcastvxi1.
46212static SDValue combineToExtendBoolVectorInReg(
46213 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46214 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46215 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46216 Opcode != ISD::ANY_EXTEND)
46217 return SDValue();
46218 if (!DCI.isBeforeLegalizeOps())
46219 return SDValue();
46220 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46221 return SDValue();
46222
46223 EVT SVT = VT.getScalarType();
46224 EVT InSVT = N0.getValueType().getScalarType();
46225 unsigned EltSizeInBits = SVT.getSizeInBits();
46226
46227 // Input type must be extending a bool vector (bit-casted from a scalar
46228 // integer) to legal integer types.
46229 if (!VT.isVector())
46230 return SDValue();
46231 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46232 return SDValue();
46233 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46234 return SDValue();
46235
46236 SDValue N00 = N0.getOperand(0);
46237 EVT SclVT = N00.getValueType();
46238 if (!SclVT.isScalarInteger())
46239 return SDValue();
46240
46241 SDValue Vec;
46242 SmallVector<int> ShuffleMask;
46243 unsigned NumElts = VT.getVectorNumElements();
46244 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46245
46246 // Broadcast the scalar integer to the vector elements.
46247 if (NumElts > EltSizeInBits) {
46248 // If the scalar integer is greater than the vector element size, then we
46249 // must split it down into sub-sections for broadcasting. For example:
46250 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46251 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46252 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46253 unsigned Scale = NumElts / EltSizeInBits;
46254 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46255 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46256 Vec = DAG.getBitcast(VT, Vec);
46257
46258 for (unsigned i = 0; i != Scale; ++i)
46259 ShuffleMask.append(EltSizeInBits, i);
46260 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46261 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46262 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46263 // If we have register broadcast instructions, use the scalar size as the
46264 // element type for the shuffle. Then cast to the wider element type. The
46265 // widened bits won't be used, and this might allow the use of a broadcast
46266 // load.
46267 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46268 unsigned Scale = EltSizeInBits / NumElts;
46269 EVT BroadcastVT =
46270 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46271 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46272 ShuffleMask.append(NumElts * Scale, 0);
46273 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46274 Vec = DAG.getBitcast(VT, Vec);
46275 } else {
46276 // For a smaller scalar integer, we can simply any-extend it to the vector
46277 // element size (we don't care about the upper bits) and broadcast it to all
46278 // elements.
46279 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46280 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46281 ShuffleMask.append(NumElts, 0);
46282 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46283 }
46284
46285 // Now, mask the relevant bit in each element.
46286 SmallVector<SDValue, 32> Bits;
46287 for (unsigned i = 0; i != NumElts; ++i) {
46288 int BitIdx = (i % EltSizeInBits);
46289 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46290 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46291 }
46292 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46293 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46294
46295 // Compare against the bitmask and extend the result.
46296 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46297 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46298 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46299
46300 // For SEXT, this is now done, otherwise shift the result down for
46301 // zero-extension.
46302 if (Opcode == ISD::SIGN_EXTEND)
46303 return Vec;
46304 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46305 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46306}
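combineToExtendBoolVectorInReg broadcasts the scalar, ANDs each lane with its own bit, compares for equality, and sign-extends (with a final shift right for zero-extension). A scalar model of that per-lane computation, assuming an 8-bit mask expanded to eight i32 lanes (extendBoolVector is an illustrative name, not part of the LLVM source):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Expand an 8-bit mask into 8 x i32 lanes: -1/0 for sign-extend, 1/0 for
    // zero-extend (the shift-right-by-(EltSizeInBits-1) at the end above).
    static std::array<int32_t, 8> extendBoolVector(uint8_t Mask, bool SignExt) {
      std::array<int32_t, 8> Out{};
      for (int i = 0; i < 8; ++i) {
        int32_t Bit = int32_t(1) << i;           // per-lane bit mask
        int32_t Masked = (int32_t)Mask & Bit;    // broadcast + AND
        int32_t Lane = (Masked == Bit) ? -1 : 0; // SETEQ + sign-extend
        Out[i] = SignExt ? Lane : int32_t((uint32_t)Lane >> 31); // SRL for zext
      }
      return Out;
    }

    int main() {
      auto S = extendBoolVector(0b00000101, /*SignExt=*/true);
      auto Z = extendBoolVector(0b00000101, /*SignExt=*/false);
      printf("%d %d %d | %d %d %d\n", S[0], S[1], S[2], Z[0], Z[1], Z[2]); // -1 0 -1 | 1 0 1
      return 0;
    }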
46307
46308/// If a vector select has an operand that is -1 or 0, try to simplify the
46309/// select to a bitwise logic operation.
46310/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46311static SDValue
46312combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46313 TargetLowering::DAGCombinerInfo &DCI,
46314 const X86Subtarget &Subtarget) {
46315 SDValue Cond = N->getOperand(0);
46316 SDValue LHS = N->getOperand(1);
46317 SDValue RHS = N->getOperand(2);
46318 EVT VT = LHS.getValueType();
46319 EVT CondVT = Cond.getValueType();
46320 SDLoc DL(N);
46321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46322
46323 if (N->getOpcode() != ISD::VSELECT)
46324 return SDValue();
46325
46326 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46327
46328 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46329 // TODO: Can we assert that both operands are not zeros (because that should
46330 // get simplified at node creation time)?
46331 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46332 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46333
46334 // If both inputs are 0/undef, create a complete zero vector.
46335 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46336 if (TValIsAllZeros && FValIsAllZeros) {
46337 if (VT.isFloatingPoint())
46338 return DAG.getConstantFP(0.0, DL, VT);
46339 return DAG.getConstant(0, DL, VT);
46340 }
46341
46342 // To use the condition operand as a bitwise mask, it must have elements that
46343 // are the same size as the select elements. Ie, the condition operand must
46344 // have already been promoted from the IR select condition type <N x i1>.
46345 // Don't check if the types themselves are equal because that excludes
46346 // vector floating-point selects.
46347 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46348 return SDValue();
46349
46350 // Try to invert the condition if true value is not all 1s and false value is
46351 // not all 0s. Only do this if the condition has one use.
46352 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46353 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46354 // Check if the selector will be produced by CMPP*/PCMP*.
46355 Cond.getOpcode() == ISD::SETCC &&
46356 // Check if SETCC has already been promoted.
46357 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46358 CondVT) {
46359 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46360
46361 if (TValIsAllZeros || FValIsAllOnes) {
46362 SDValue CC = Cond.getOperand(2);
46363 ISD::CondCode NewCC = ISD::getSetCCInverse(
46364 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46365 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46366 NewCC);
46367 std::swap(LHS, RHS);
46368 TValIsAllOnes = FValIsAllOnes;
46369 FValIsAllZeros = TValIsAllZeros;
46370 }
46371 }
46372
46373 // Cond value must be 'sign splat' to be converted to a logical op.
46374 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46375 return SDValue();
46376
46377 // vselect Cond, 111..., 000... -> Cond
46378 if (TValIsAllOnes && FValIsAllZeros)
46379 return DAG.getBitcast(VT, Cond);
46380
46381 if (!TLI.isTypeLegal(CondVT))
46382 return SDValue();
46383
46384 // vselect Cond, 111..., X -> or Cond, X
46385 if (TValIsAllOnes) {
46386 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46387 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46388 return DAG.getBitcast(VT, Or);
46389 }
46390
46391 // vselect Cond, X, 000... -> and Cond, X
46392 if (FValIsAllZeros) {
46393 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46394 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46395 return DAG.getBitcast(VT, And);
46396 }
46397
46398 // vselect Cond, 000..., X -> andn Cond, X
46399 if (TValIsAllZeros) {
46400 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46401 SDValue AndN;
46402 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
46403 if (CondVT.getScalarType() == MVT::i1)
46404 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46405 CastRHS);
46406 else
46407 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46408 return DAG.getBitcast(VT, AndN);
46409 }
46410
46411 return SDValue();
46412}
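When the condition lanes are known to be 0 or all-ones, the three special selects handled above reduce to single bitwise ops. A per-lane sketch in plain C++ (the helper names are illustrative only):

    #include <cstdint>
    #include <cstdio>

    // With a lane-wide (0 or all-ones) condition, each special select collapses
    // to one bitwise op, mirroring the combine above.
    static uint32_t selOnesX(uint32_t C, uint32_t X) { return C | X; }  // select C, -1, X
    static uint32_t selXZero(uint32_t C, uint32_t X) { return C & X; }  // select C, X, 0
    static uint32_t selZeroX(uint32_t C, uint32_t X) { return ~C & X; } // select C, 0, X

    int main() {
      for (uint32_t C : {0u, ~0u}) {
        uint32_t X = 0xDEADBEEF;
        uint32_t Ref1 = C ? ~0u : X, Ref2 = C ? X : 0u, Ref3 = C ? 0u : X;
        printf("%d %d %d\n", selOnesX(C, X) == Ref1, selXZero(C, X) == Ref2,
               selZeroX(C, X) == Ref3); // 1 1 1 for both condition values
      }
      return 0;
    }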
46413
46414/// If both arms of a vector select are concatenated vectors, split the select,
46415/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46416/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46417/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46418static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46419 const X86Subtarget &Subtarget) {
46420 unsigned Opcode = N->getOpcode();
46421 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46422 return SDValue();
46423
46424 // TODO: Split 512-bit vectors too?
46425 EVT VT = N->getValueType(0);
46426 if (!VT.is256BitVector())
46427 return SDValue();
46428
46429 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46430 SDValue Cond = N->getOperand(0);
46431 SDValue TVal = N->getOperand(1);
46432 SDValue FVal = N->getOperand(2);
46433 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46434 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46435 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46436 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46437 return SDValue();
46438
46439 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46440 ArrayRef<SDValue> Ops) {
46441 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46442 };
46443 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46444 makeBlend, /*CheckBWI*/ false);
46445}
46446
46447static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46448 SDValue Cond = N->getOperand(0);
46449 SDValue LHS = N->getOperand(1);
46450 SDValue RHS = N->getOperand(2);
46451 SDLoc DL(N);
46452
46453 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46454 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46455 if (!TrueC || !FalseC)
46456 return SDValue();
46457
46458 // Don't do this for crazy integer types.
46459 EVT VT = N->getValueType(0);
46460 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46461 return SDValue();
46462
46463 // We're going to use the condition bit in math or logic ops. We could allow
46464 // this with a wider condition value (post-legalization it becomes an i8),
46465 // but if nothing is creating selects that late, it doesn't matter.
46466 if (Cond.getValueType() != MVT::i1)
46467 return SDValue();
46468
46469 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46470 // 3, 5, or 9 with i32/i64, so those get transformed too.
46471 // TODO: For constants that overflow or do not differ by power-of-2 or small
46472 // multiplier, convert to 'and' + 'add'.
46473 const APInt &TrueVal = TrueC->getAPIntValue();
46474 const APInt &FalseVal = FalseC->getAPIntValue();
46475
46476 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46477 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46478 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46479 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46480 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46481 return SDValue();
46482 }
46483
46484 bool OV;
46485 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46486 if (OV)
46487 return SDValue();
46488
46489 APInt AbsDiff = Diff.abs();
46490 if (AbsDiff.isPowerOf2() ||
46491 ((VT == MVT::i32 || VT == MVT::i64) &&
46492 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46493
46494 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46495 // of the condition can usually be folded into a compare predicate, but even
46496 // without that, the sequence should be cheaper than a CMOV alternative.
46497 if (TrueVal.slt(FalseVal)) {
46498 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46499 std::swap(TrueC, FalseC);
46500 }
46501
46502 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46503 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46504
46505 // Multiply condition by the difference if non-one.
46506 if (!AbsDiff.isOne())
46507 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46508
46509 // Add the base if non-zero.
46510 if (!FalseC->isZero())
46511 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46512
46513 return R;
46514 }
46515
46516 return SDValue();
46517}
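The arithmetic identity behind combineSelectOfTwoConstants is select Cond, TC, FC == zext(Cond) * (TC - FC) + FC, which then lowers to a shift or LEA when the difference is a power of two or 3/5/9. A minimal check of the identity, assuming no overflow (selectViaMath is an illustrative name):

    #include <cassert>
    #include <cstdint>

    // select Cond, TC, FC --> zext(Cond) * (TC - FC) + FC, with Cond in {0, 1}.
    static int64_t selectViaMath(bool Cond, int64_t TC, int64_t FC) {
      return int64_t(Cond) * (TC - FC) + FC;
    }

    int main() {
      for (bool C : {false, true}) {
        assert(selectViaMath(C, 7, 3) == (C ? 7 : 3));   // diff 4: a shift
        assert(selectViaMath(C, 12, 3) == (C ? 12 : 3)); // diff 9: an LEA
      }
      return 0;
    }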
46518
46519/// If this is a *dynamic* select (non-constant condition) and we can match
46520/// this node with one of the variable blend instructions, restructure the
46521/// condition so that blends can use the high (sign) bit of each element.
46522/// This function will also call SimplifyDemandedBits on already created
46523/// BLENDV to perform additional simplifications.
46524static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46525 TargetLowering::DAGCombinerInfo &DCI,
46526 const X86Subtarget &Subtarget) {
46527 SDValue Cond = N->getOperand(0);
46528 if ((N->getOpcode() != ISD::VSELECT &&
46529 N->getOpcode() != X86ISD::BLENDV) ||
46530 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46531 return SDValue();
46532
46533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46534 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46535 EVT VT = N->getValueType(0);
46536
46537 // We can only handle the cases where VSELECT is directly legal on the
46538 // subtarget. We custom lower VSELECT nodes with constant conditions and
46539 // this makes it hard to see whether a dynamic VSELECT will correctly
46540 // lower, so we both check the operation's status and explicitly handle the
46541 // cases where a *dynamic* blend will fail even though a constant-condition
46542 // blend could be custom lowered.
46543 // FIXME: We should find a better way to handle this class of problems.
46544 // Potentially, we should combine constant-condition vselect nodes
46545 // pre-legalization into shuffles and not mark as many types as custom
46546 // lowered.
46547 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46548 return SDValue();
46549 // FIXME: We don't support i16-element blends currently. We could and
46550 // should support them by making *all* the bits in the condition be set
46551 // rather than just the high bit and using an i8-element blend.
46552 if (VT.getVectorElementType() == MVT::i16)
46553 return SDValue();
46554 // Dynamic blending was only available from SSE4.1 onward.
46555 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46556 return SDValue();
46557 // Byte blends are only available in AVX2
46558 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46559 return SDValue();
46560 // There are no 512-bit blend instructions that use sign bits.
46561 if (VT.is512BitVector())
46562 return SDValue();
46563
46564 // Don't optimize before the condition has been transformed to a legal type
46565 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46566 if (BitWidth < 8 || BitWidth > 64)
46567 return SDValue();
46568
46569 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46570 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46571 UI != UE; ++UI)
46572 if ((UI->getOpcode() != ISD::VSELECT &&
46573 UI->getOpcode() != X86ISD::BLENDV) ||
46574 UI.getOperandNo() != 0)
46575 return false;
46576
46577 return true;
46578 };
46579
46580 APInt DemandedBits(APInt::getSignMask(BitWidth));
46581
46582 if (OnlyUsedAsSelectCond(Cond)) {
46583 KnownBits Known;
46584 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46585 !DCI.isBeforeLegalizeOps());
46586 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46587 return SDValue();
46588
46589 // If we changed the computation somewhere in the DAG, this change will
46590 // affect all users of Cond. Update all the nodes so that we do not use
46591 // the generic VSELECT anymore. Otherwise, we may perform wrong
46592 // optimizations as we messed with the actual expectation for the vector
46593 // boolean values.
46594 for (SDNode *U : Cond->uses()) {
46595 if (U->getOpcode() == X86ISD::BLENDV)
46596 continue;
46597
46598 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46599 Cond, U->getOperand(1), U->getOperand(2));
46600 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46601 DCI.AddToWorklist(U);
46602 }
46603 DCI.CommitTargetLoweringOpt(TLO);
46604 return SDValue(N, 0);
46605 }
46606
46607 // Otherwise we can still at least try to simplify multiple use bits.
46608 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46609 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46610 N->getOperand(1), N->getOperand(2));
46611
46612 return SDValue();
46613}
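combineVSelectToBLENDV can restrict SimplifyDemandedBits to the sign mask because a variable blend reads nothing but the sign bit of each condition lane. A per-byte model of PBLENDVB-style selection (blendvLane is an illustrative helper; the operand order follows _mm_blendv_epi8, where a set sign bit picks the second source):

    #include <cstdint>
    #include <cstdio>

    // Only the sign bit of the mask byte decides which source byte is taken.
    static uint8_t blendvLane(uint8_t Mask, uint8_t A, uint8_t B) {
      return (Mask & 0x80) ? B : A;
    }

    int main() {
      // 0x7F has every low bit set but the result is still A; 0x80 has only the
      // sign bit set and flips the result to B.
      printf("%u %u\n", blendvLane(0x7F, 1, 2), blendvLane(0x80, 1, 2)); // 1 2
      return 0;
    }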
46614
46615// Try to match:
46616// (or (and (M, (sub 0, X)), (pandn M, X)))
46617// which is a special case of:
46618// (select M, (sub 0, X), X)
46619// Per:
46620// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46621// We know that, if fNegate is 0 or 1:
46622// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46623//
46624// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46625// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46626// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46627// This lets us transform our vselect to:
46628// (add (xor X, M), (and M, 1))
46629// And further to:
46630// (sub (xor X, M), M)
46631static SDValue combineLogicBlendIntoConditionalNegate(
46632 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46633 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46634 EVT MaskVT = Mask.getValueType();
46635 assert(MaskVT.isInteger() &&
46636 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46637 "Mask must be zero/all-bits");
46638
46639 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46640 return SDValue();
46641 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46642 return SDValue();
46643
46644 auto IsNegV = [](SDNode *N, SDValue V) {
46645 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46646 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46647 };
46648
46649 SDValue V;
46650 if (IsNegV(Y.getNode(), X))
46651 V = X;
46652 else if (IsNegV(X.getNode(), Y))
46653 V = Y;
46654 else
46655 return SDValue();
46656
46657 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46658 SDValue SubOp2 = Mask;
46659
46660 // If the negate was on the false side of the select, then
46661 // the operands of the SUB need to be swapped. PR 27251.
46662 // This is because the pattern being matched above is
46663 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46664 // but if the pattern matched was
46665 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46666 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46667 // pattern also needs to be a negation of the replacement pattern above.
46668 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46669 // sub accomplishes the negation of the replacement pattern.
46670 if (V == Y)
46671 std::swap(SubOp1, SubOp2);
46672
46673 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46674 return DAG.getBitcast(VT, Res);
46675}
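The conditional-negate identity used above, ((X ^ M) - M) == (M ? -X : X) for M in {0, -1}, can be checked directly on scalars. A minimal sketch (condNegate is an illustrative name; INT_MIN is deliberately avoided since negating it would overflow):

    #include <cassert>
    #include <cstdint>

    // (M ? -X : X) == ((X ^ M) - M) when M is 0 or all-ones: with M == -1 this
    // is (~X + 1), i.e. two's-complement negation.
    static int32_t condNegate(int32_t X, int32_t M) {
      return (X ^ M) - M;
    }

    int main() {
      for (int32_t X : {0, 1, -7, 42}) {
        assert(condNegate(X, 0) == X);   // mask clear: unchanged
        assert(condNegate(X, -1) == -X); // mask all-ones: negated
      }
      return 0;
    }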
46676
46677/// Do target-specific dag combines on SELECT and VSELECT nodes.
46678static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46679 TargetLowering::DAGCombinerInfo &DCI,
46680 const X86Subtarget &Subtarget) {
46681 SDLoc DL(N);
46682 SDValue Cond = N->getOperand(0);
46683 SDValue LHS = N->getOperand(1);
46684 SDValue RHS = N->getOperand(2);
46685
46686 // Try simplification again because we use this function to optimize
46687 // BLENDV nodes that are not handled by the generic combiner.
46688 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46689 return V;
46690
46691 EVT VT = LHS.getValueType();
46692 EVT CondVT = Cond.getValueType();
46693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46694 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46695
46696 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46697 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46698 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46699 if (CondVT.isVector() && CondVT.isInteger() &&
46700 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46701 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46702 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46703 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46704 DL, DAG, Subtarget))
46705 return V;
46706
46707 // Convert vselects with constant condition into shuffles.
46708 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46709 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46710 SmallVector<int, 64> Mask;
46711 if (createShuffleMaskFromVSELECT(Mask, Cond,
46712 N->getOpcode() == X86ISD::BLENDV))
46713 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46714 }
46715
46716 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46717 // by forcing the unselected elements to zero.
46718 // TODO: Can we handle more shuffles with this?
46719 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46720 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46721 LHS.hasOneUse() && RHS.hasOneUse()) {
46722 MVT SimpleVT = VT.getSimpleVT();
46723 SmallVector<SDValue, 1> LHSOps, RHSOps;
46724 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46725 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46726 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46727 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46728 int NumElts = VT.getVectorNumElements();
46729 for (int i = 0; i != NumElts; ++i) {
46730 // getConstVector sets negative shuffle mask values as undef, so ensure
46731 // we hardcode SM_SentinelZero values to zero (0x80).
46732 if (CondMask[i] < NumElts) {
46733 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46734 RHSMask[i] = 0x80;
46735 } else {
46736 LHSMask[i] = 0x80;
46737 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46738 }
46739 }
46740 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46741 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46742 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46743 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46744 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46745 }
46746 }
46747
46748 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46749 // instructions match the semantics of the common C idiom x<y?x:y but not
46750 // x<=y?x:y, because of how they handle negative zero (which can be
46751 // ignored in unsafe-math mode).
46752 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46753 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46754 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46755 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46756 (Subtarget.hasSSE2() ||
46757 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46758 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46759
46760 unsigned Opcode = 0;
46761 // Check for x CC y ? x : y.
46762 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46763 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46764 switch (CC) {
46765 default: break;
46766 case ISD::SETULT:
46767 // Converting this to a min would handle NaNs incorrectly, and swapping
46768 // the operands would cause it to handle comparisons between positive
46769 // and negative zero incorrectly.
46770 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46771 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46772 !(DAG.isKnownNeverZeroFloat(LHS) ||
46773 DAG.isKnownNeverZeroFloat(RHS)))
46774 break;
46775 std::swap(LHS, RHS);
46776 }
46777 Opcode = X86ISD::FMIN;
46778 break;
46779 case ISD::SETOLE:
46780 // Converting this to a min would handle comparisons between positive
46781 // and negative zero incorrectly.
46782 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46783 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46784 break;
46785 Opcode = X86ISD::FMIN;
46786 break;
46787 case ISD::SETULE:
46788 // Converting this to a min would handle both negative zeros and NaNs
46789 // incorrectly, but we can swap the operands to fix both.
46790 std::swap(LHS, RHS);
46791 [[fallthrough]];
46792 case ISD::SETOLT:
46793 case ISD::SETLT:
46794 case ISD::SETLE:
46795 Opcode = X86ISD::FMIN;
46796 break;
46797
46798 case ISD::SETOGE:
46799 // Converting this to a max would handle comparisons between positive
46800 // and negative zero incorrectly.
46801 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46802 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46803 break;
46804 Opcode = X86ISD::FMAX;
46805 break;
46806 case ISD::SETUGT:
46807 // Converting this to a max would handle NaNs incorrectly, and swapping
46808 // the operands would cause it to handle comparisons between positive
46809 // and negative zero incorrectly.
46810 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46811 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46812 !(DAG.isKnownNeverZeroFloat(LHS) ||
46813 DAG.isKnownNeverZeroFloat(RHS)))
46814 break;
46815 std::swap(LHS, RHS);
46816 }
46817 Opcode = X86ISD::FMAX;
46818 break;
46819 case ISD::SETUGE:
46820 // Converting this to a max would handle both negative zeros and NaNs
46821 // incorrectly, but we can swap the operands to fix both.
46822 std::swap(LHS, RHS);
46823 [[fallthrough]];
46824 case ISD::SETOGT:
46825 case ISD::SETGT:
46826 case ISD::SETGE:
46827 Opcode = X86ISD::FMAX;
46828 break;
46829 }
46830 // Check for x CC y ? y : x -- a min/max with reversed arms.
46831 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46832 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46833 switch (CC) {
46834 default: break;
46835 case ISD::SETOGE:
46836 // Converting this to a min would handle comparisons between positive
46837 // and negative zero incorrectly, and swapping the operands would
46838 // cause it to handle NaNs incorrectly.
46839 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46840 !(DAG.isKnownNeverZeroFloat(LHS) ||
46841 DAG.isKnownNeverZeroFloat(RHS))) {
46842 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46843 break;
46844 std::swap(LHS, RHS);
46845 }
46846 Opcode = X86ISD::FMIN;
46847 break;
46848 case ISD::SETUGT:
46849 // Converting this to a min would handle NaNs incorrectly.
46850 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46851 break;
46852 Opcode = X86ISD::FMIN;
46853 break;
46854 case ISD::SETUGE:
46855 // Converting this to a min would handle both negative zeros and NaNs
46856 // incorrectly, but we can swap the operands to fix both.
46857 std::swap(LHS, RHS);
46858 [[fallthrough]];
46859 case ISD::SETOGT:
46860 case ISD::SETGT:
46861 case ISD::SETGE:
46862 Opcode = X86ISD::FMIN;
46863 break;
46864
46865 case ISD::SETULT:
46866 // Converting this to a max would handle NaNs incorrectly.
46867 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46868 break;
46869 Opcode = X86ISD::FMAX;
46870 break;
46871 case ISD::SETOLE:
46872 // Converting this to a max would handle comparisons between positive
46873 // and negative zero incorrectly, and swapping the operands would
46874 // cause it to handle NaNs incorrectly.
46875 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46876 !DAG.isKnownNeverZeroFloat(LHS) &&
46877 !DAG.isKnownNeverZeroFloat(RHS)) {
46878 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46879 break;
46880 std::swap(LHS, RHS);
46881 }
46882 Opcode = X86ISD::FMAX;
46883 break;
46884 case ISD::SETULE:
46885 // Converting this to a max would handle both negative zeros and NaNs
46886 // incorrectly, but we can swap the operands to fix both.
46887 std::swap(LHS, RHS);
46888 [[fallthrough]];
46889 case ISD::SETOLT:
46890 case ISD::SETLT:
46891 case ISD::SETLE:
46892 Opcode = X86ISD::FMAX;
46893 break;
46894 }
46895 }
46896
46897 if (Opcode)
46898 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46899 }
46900
46901 // Some mask scalar intrinsics rely on checking if only one bit is set
46902 // and implement it in C code like this:
46903 // A[0] = (U & 1) ? A[0] : W[0];
46904 // This creates some redundant instructions that break pattern matching.
46905 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46906 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46907 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46908 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46909 SDValue AndNode = Cond.getOperand(0);
46910 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46911 isNullConstant(Cond.getOperand(1)) &&
46912 isOneConstant(AndNode.getOperand(1))) {
46913 // LHS and RHS swapped due to
46914 // setcc outputting 1 when AND resulted in 0 and vice versa.
46915 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46916 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46917 }
46918 }
46919
46920 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46921 // lowering on KNL. In this case we convert it to
46922 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46923 // The same applies to all i8 and i16 vector types without BWI.
46924 // Make sure we extend these even before type legalization gets a chance to
46925 // split wide vectors.
46926 // Since SKX these selects have a proper lowering.
46927 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46928 CondVT.getVectorElementType() == MVT::i1 &&
46929 (VT.getVectorElementType() == MVT::i8 ||
46930 VT.getVectorElementType() == MVT::i16)) {
46931 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46932 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46933 }
46934
46935 // AVX512 - Extend select with zero to merge with target shuffle.
46936 // select(mask, extract_subvector(shuffle(x)), zero) -->
46937 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46938 // TODO - support non target shuffles as well.
46939 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46940 CondVT.getVectorElementType() == MVT::i1) {
46941 auto SelectableOp = [&TLI](SDValue Op) {
46942 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46943 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46944 isNullConstant(Op.getOperand(1)) &&
46945 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46946 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46947 };
46948
46949 bool SelectableLHS = SelectableOp(LHS);
46950 bool SelectableRHS = SelectableOp(RHS);
46951 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46952 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46953
46954 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46955 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46956 : RHS.getOperand(0).getValueType();
46957 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46958 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46959 VT.getSizeInBits());
46960 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46961 VT.getSizeInBits());
46962 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46963 DAG.getUNDEF(SrcCondVT), Cond,
46964 DAG.getIntPtrConstant(0, DL));
46965 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46966 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46967 }
46968 }
46969
46970 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46971 return V;
46972
46973 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46974 Cond.hasOneUse()) {
46975 EVT CondVT = Cond.getValueType();
46976 SDValue Cond0 = Cond.getOperand(0);
46977 SDValue Cond1 = Cond.getOperand(1);
46978 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46979
46980 // Canonicalize min/max:
46981 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46982 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46983 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46984 // the need for an extra compare against zero. e.g.
46985 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46986 // subl %esi, %edi
46987 // testl %edi, %edi
46988 // movl $0, %eax
46989 // cmovgl %edi, %eax
46990 // =>
46991 // xorl %eax, %eax
46992 // subl %esi, %edi
46993 // cmovsl %eax, %edi
46994 //
46995 // We can also canonicalize
46996 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46997 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46998 // This allows the use of a test instruction for the compare.
46999 if (LHS == Cond0 && RHS == Cond1) {
47000 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47001 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47002 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47003 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47004 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47005 }
47006 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47007 ISD::CondCode NewCC = ISD::SETUGE;
47008 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47009 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47010 }
47011 }
47012
47013 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47014 // fold eq + gt/lt nested selects into ge/le selects
47015 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47016 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47017 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47018 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47019 // .. etc ..
47020 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47021 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47022 SDValue InnerSetCC = RHS.getOperand(0);
47023 ISD::CondCode InnerCC =
47024 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47025 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47026 Cond0 == InnerSetCC.getOperand(0) &&
47027 Cond1 == InnerSetCC.getOperand(1)) {
47028 ISD::CondCode NewCC;
47029 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47030 case ISD::SETGT: NewCC = ISD::SETGE; break;
47031 case ISD::SETLT: NewCC = ISD::SETLE; break;
47032 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47033 case ISD::SETULT: NewCC = ISD::SETULE; break;
47034 default: NewCC = ISD::SETCC_INVALID; break;
47035 }
47036 if (NewCC != ISD::SETCC_INVALID) {
47037 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47038 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47039 }
47040 }
47041 }
47042 }
47043
47044 // Check if the first operand is all zeros and Cond type is vXi1.
47045 // If this is an AVX512 target, we can improve the use of zero masking by
47046 // swapping the operands and inverting the condition.
47047 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47048 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47049 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47050 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47051 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47052 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47053 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47054 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47055 }
47056
47057 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47058 // get split by legalization.
47059 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47060 CondVT.getVectorElementType() == MVT::i1 &&
47061 TLI.isTypeLegal(VT.getScalarType())) {
47062 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47063 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47064 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47065 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47066 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47067 }
47068 }
47069
47070 // Early exit check
47071 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
47072 return SDValue();
47073
47074 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
47075 return V;
47076
47077 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
47078 return V;
47079
47080 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
47081 return V;
47082
47083 // select(~Cond, X, Y) -> select(Cond, Y, X)
47084 if (CondVT.getScalarType() != MVT::i1) {
47085 if (SDValue CondNot = IsNOT(Cond, DAG))
47086 return DAG.getNode(N->getOpcode(), DL, VT,
47087 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47088
47089 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47090 // signbit.
47091 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47092 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47093 Cond.hasOneUse()) {
47094 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47095 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47096 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47097 }
47098 }
47099
47100 // Try to optimize vXi1 selects if both operands are either all constants or
47101 // bitcasts from scalar integer type. In that case we can convert the operands
47102 // to integer and use an integer select which will be converted to a CMOV.
47103 // We need to take a little bit of care to avoid creating an i64 type after
47104 // type legalization.
47105 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47106 VT.getVectorElementType() == MVT::i1 &&
47107 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47108 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47109 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47110 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47111 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47112
47113 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47114 LHS.getOperand(0).getValueType() == IntVT)) &&
47115 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47116 RHS.getOperand(0).getValueType() == IntVT))) {
47117 if (LHSIsConst)
47118 LHS = combinevXi1ConstantToInteger(LHS, DAG);
47119 else
47120 LHS = LHS.getOperand(0);
47121
47122 if (RHSIsConst)
47123 RHS = combinevXi1ConstantToInteger(RHS, DAG);
47124 else
47125 RHS = RHS.getOperand(0);
47126
47127 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47128 return DAG.getBitcast(VT, Select);
47129 }
47130 }
47131 }
47132
47133 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47134 // single bits, then invert the predicate and swap the select operands.
47135 // This can lower using a vector shift bit-hack rather than mask and compare.
47136 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47137 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47138 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47139 Cond.getOperand(0).getOpcode() == ISD::AND &&
47140 isNullOrNullSplat(Cond.getOperand(1)) &&
47141 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47142 Cond.getOperand(0).getValueType() == VT) {
47143 // The 'and' mask must be composed of power-of-2 constants.
47144 SDValue And = Cond.getOperand(0);
47145 auto *C = isConstOrConstSplat(And.getOperand(1));
47146 if (C && C->getAPIntValue().isPowerOf2()) {
47147 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47148 SDValue NotCond =
47149 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47150 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47151 }
47152
47153 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47154 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47155 // 16-bit lacks a proper blendv.
47156 unsigned EltBitWidth = VT.getScalarSizeInBits();
47157 bool CanShiftBlend =
47158 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47159 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47160 (Subtarget.hasXOP()));
47161 if (CanShiftBlend &&
47162 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47163 return C->getAPIntValue().isPowerOf2();
47164 })) {
47165 // Create a left-shift constant to get the mask bits over to the sign-bit.
47166 SDValue Mask = And.getOperand(1);
47167 SmallVector<int, 32> ShlVals;
47168 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47169 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47170 ShlVals.push_back(EltBitWidth - 1 -
47171 MaskVal->getAPIntValue().exactLogBase2());
47172 }
47173 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47174 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47175 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47176 SDValue NewCond =
47177 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47178 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47179 }
47180 }
47181
47182 return SDValue();
47183}
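The final combine in combineSelect replaces a single-bit mask test with a shift that moves the tested bit into the sign position, inverting the predicate and swapping the select operands. A per-lane scalar model of that equivalence (the helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // Original form: vselect ((X & C) == 0), L, R.
    static int32_t selectByMaskBit(int32_t X, uint32_t C, int32_t L, int32_t R) {
      return ((uint32_t)X & C) == 0 ? L : R;
    }

    // Rewritten form: vselect ((shl X, C') < 0), R, L, where C' puts the tested
    // bit into the sign position.
    static int32_t selectByShiftedSign(int32_t X, unsigned ShlAmt, int32_t L, int32_t R) {
      int32_t Shifted = (int32_t)((uint32_t)X << ShlAmt);
      return Shifted < 0 ? R : L;
    }

    int main() {
      const uint32_t C = 1u << 5;     // power-of-2 mask bit
      const unsigned ShlAmt = 31 - 5; // EltBitWidth - 1 - log2(C)
      for (int32_t X : {0, 32, 33, -1})
        assert(selectByMaskBit(X, C, 10, 20) == selectByShiftedSign(X, ShlAmt, 10, 20));
      return 0;
    }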
47184
47185/// Combine:
47186/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47187/// to:
47188/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47189/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47190/// Note that this is only legal for some op/cc combinations.
47191static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47192 SelectionDAG &DAG,
47193 const X86Subtarget &Subtarget) {
47194 // This combine only operates on CMP-like nodes.
47195 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47196 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47197 return SDValue();
47198
47199 // Can't replace the cmp if it has more uses than the one we're looking at.
47200 // FIXME: We would like to be able to handle this, but would need to make sure
47201 // all uses were updated.
47202 if (!Cmp.hasOneUse())
47203 return SDValue();
47204
47205 // This only applies to variations of the common case:
47206 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47207 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47208 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47209 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47210 // Using the proper condcodes (see below), overflow is checked for.
47211
47212 // FIXME: We can generalize both constraints:
47213 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47214 // - LHS != 1
47215 // if the result is compared.
47216
47217 SDValue CmpLHS = Cmp.getOperand(0);
47218 SDValue CmpRHS = Cmp.getOperand(1);
47219 EVT CmpVT = CmpLHS.getValueType();
47220
47221 if (!CmpLHS.hasOneUse())
47222 return SDValue();
47223
47224 unsigned Opc = CmpLHS.getOpcode();
47225 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47226 return SDValue();
47227
47228 SDValue OpRHS = CmpLHS.getOperand(2);
47229 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47230 if (!OpRHSC)
47231 return SDValue();
47232
47233 APInt Addend = OpRHSC->getAPIntValue();
47234 if (Opc == ISD::ATOMIC_LOAD_SUB)
47235 Addend = -Addend;
47236
47237 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47238 if (!CmpRHSC)
47239 return SDValue();
47240
47241 APInt Comparison = CmpRHSC->getAPIntValue();
47242 APInt NegAddend = -Addend;
47243
47244 // See if we can adjust the CC to make the comparison match the negated
47245 // addend.
47246 if (Comparison != NegAddend) {
47247 APInt IncComparison = Comparison + 1;
47248 if (IncComparison == NegAddend) {
47249 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47250 Comparison = IncComparison;
47251 CC = X86::COND_AE;
47252 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47253 Comparison = IncComparison;
47254 CC = X86::COND_L;
47255 }
47256 }
47257 APInt DecComparison = Comparison - 1;
47258 if (DecComparison == NegAddend) {
47259 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47260 Comparison = DecComparison;
47261 CC = X86::COND_A;
47262 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47263 Comparison = DecComparison;
47264 CC = X86::COND_LE;
47265 }
47266 }
47267 }
47268
47269 // If the addend is the negation of the comparison value, then we can do
47270 // a full comparison by emitting the atomic arithmetic as a locked sub.
47271 if (Comparison == NegAddend) {
47272 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47273 // atomic sub.
47274 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47275 auto AtomicSub = DAG.getAtomic(
47276 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47277 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47278 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47279 AN->getMemOperand());
47280 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47281 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47282 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47283 return LockOp;
47284 }
47285
47286 // We can handle comparisons with zero in a number of cases by manipulating
47287 // the CC used.
47288 if (!Comparison.isZero())
47289 return SDValue();
47290
47291 if (CC == X86::COND_S && Addend == 1)
47292 CC = X86::COND_LE;
47293 else if (CC == X86::COND_NS && Addend == 1)
47294 CC = X86::COND_G;
47295 else if (CC == X86::COND_G && Addend == -1)
47296 CC = X86::COND_GE;
47297 else if (CC == X86::COND_LE && Addend == -1)
47298 CC = X86::COND_L;
47299 else
47300 return SDValue();
47301
47302 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47303 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47304 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47305 return LockOp;
47306}
47307
47308// Check whether a boolean test is testing a boolean value generated by
47309// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47310// code.
47311//
47312// Simplify the following patterns:
47313// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47314// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47315// to (Op EFLAGS Cond)
47316//
47317// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47318// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47319// to (Op EFLAGS !Cond)
47320//
47321// where Op could be BRCOND or CMOV.
47322//
47323static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47324 // This combine only operates on CMP-like nodes.
47325 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47326 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47327 return SDValue();
47328
47329 // Quit if not used as a boolean value.
47330 if (CC != X86::COND_E && CC != X86::COND_NE)
47331 return SDValue();
47332
47333 // Check CMP operands. One of them should be 0 or 1 and the other should be
47334 // a SetCC or extended from it.
47335 SDValue Op1 = Cmp.getOperand(0);
47336 SDValue Op2 = Cmp.getOperand(1);
47337
47338 SDValue SetCC;
47339 const ConstantSDNode* C = nullptr;
47340 bool needOppositeCond = (CC == X86::COND_E);
47341 bool checkAgainstTrue = false; // Is it a comparison against 1?
47342
47343 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47344 SetCC = Op2;
47345 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47346 SetCC = Op1;
47347 else // Quit if neither operand is a constant.
47348 return SDValue();
47349
47350 if (C->getZExtValue() == 1) {
47351 needOppositeCond = !needOppositeCond;
47352 checkAgainstTrue = true;
47353 } else if (C->getZExtValue() != 0)
47354 // Quit if the constant is neither 0 nor 1.
47355 return SDValue();
47356
47357 bool truncatedToBoolWithAnd = false;
47358 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47359 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47360 SetCC.getOpcode() == ISD::TRUNCATE ||
47361 SetCC.getOpcode() == ISD::AND) {
47362 if (SetCC.getOpcode() == ISD::AND) {
47363 int OpIdx = -1;
47364 if (isOneConstant(SetCC.getOperand(0)))
47365 OpIdx = 1;
47366 if (isOneConstant(SetCC.getOperand(1)))
47367 OpIdx = 0;
47368 if (OpIdx < 0)
47369 break;
47370 SetCC = SetCC.getOperand(OpIdx);
47371 truncatedToBoolWithAnd = true;
47372 } else
47373 SetCC = SetCC.getOperand(0);
47374 }
47375
47376 switch (SetCC.getOpcode()) {
47377 case X86ISD::SETCC_CARRY:
47378 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47379 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47380 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47381 // truncated to i1 using 'and'.
47382 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47383 break;
47384 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47385        "Invalid use of SETCC_CARRY!");
47386 [[fallthrough]];
47387 case X86ISD::SETCC:
47388 // Set the condition code or opposite one if necessary.
47389 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47390 if (needOppositeCond)
47391 CC = X86::GetOppositeBranchCondition(CC);
47392 return SetCC.getOperand(1);
47393 case X86ISD::CMOV: {
47394 // Check whether false/true value has canonical one, i.e. 0 or 1.
47395 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47396 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47397 // Quit if true value is not a constant.
47398 if (!TVal)
47399 return SDValue();
47400 // Quit if false value is not a constant.
47401 if (!FVal) {
47402 SDValue Op = SetCC.getOperand(0);
47403 // Skip 'zext' or 'trunc' node.
47404 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47405 Op.getOpcode() == ISD::TRUNCATE)
47406 Op = Op.getOperand(0);
47407 // A special case for rdrand/rdseed, where 0 is set if the false condition
47408 // is found.
47409 if ((Op.getOpcode() != X86ISD::RDRAND &&
47410 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47411 return SDValue();
47412 }
47413 // Quit if false value is not the constant 0 or 1.
47414 bool FValIsFalse = true;
47415 if (FVal && FVal->getZExtValue() != 0) {
47416 if (FVal->getZExtValue() != 1)
47417 return SDValue();
47418 // If FVal is 1, opposite cond is needed.
47419 needOppositeCond = !needOppositeCond;
47420 FValIsFalse = false;
47421 }
47422 // Quit if TVal is not the constant opposite of FVal.
47423 if (FValIsFalse && TVal->getZExtValue() != 1)
47424 return SDValue();
47425 if (!FValIsFalse && TVal->getZExtValue() != 0)
47426 return SDValue();
47427 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47428 if (needOppositeCond)
47429 CC = X86::GetOppositeBranchCondition(CC);
47430 return SetCC.getOperand(3);
47431 }
47432 }
47433
47434 return SDValue();
47435}
47436
47437/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47438/// Match:
47439/// (X86or (X86setcc) (X86setcc))
47440/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47441static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47442 X86::CondCode &CC1, SDValue &Flags,
47443 bool &isAnd) {
47444 if (Cond->getOpcode() == X86ISD::CMP) {
47445 if (!isNullConstant(Cond->getOperand(1)))
47446 return false;
47447
47448 Cond = Cond->getOperand(0);
47449 }
47450
47451 isAnd = false;
47452
47453 SDValue SetCC0, SetCC1;
47454 switch (Cond->getOpcode()) {
47455 default: return false;
47456 case ISD::AND:
47457 case X86ISD::AND:
47458 isAnd = true;
47459 [[fallthrough]];
47460 case ISD::OR:
47461 case X86ISD::OR:
47462 SetCC0 = Cond->getOperand(0);
47463 SetCC1 = Cond->getOperand(1);
47464 break;
47465 };
47466
47467 // Make sure we have SETCC nodes, using the same flags value.
47468 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47469 SetCC1.getOpcode() != X86ISD::SETCC ||
47470 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47471 return false;
47472
47473 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47474 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47475 Flags = SetCC0->getOperand(1);
47476 return true;
47477}
47478
47479 // When legalizing carry, we create carries via add X, -1.
47480 // If that comes from an actual carry, via setcc, we use the
47481 // carry directly.
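// For example (illustrative): the carry-out of
//   (X86ISD::ADD (zext (setcc COND_B, Flags)), -1)
// is set exactly when the setcc produced 1, i.e. when CF was set in Flags,
// so a consumer of that carry can read Flags directly instead of
// re-materializing the bit and adding -1.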
47482static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47483 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47484 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47485 bool FoundAndLSB = false;
47486 SDValue Carry = EFLAGS.getOperand(0);
47487 while (Carry.getOpcode() == ISD::TRUNCATE ||
47488 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47489 (Carry.getOpcode() == ISD::AND &&
47490 isOneConstant(Carry.getOperand(1)))) {
47491 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47492 Carry = Carry.getOperand(0);
47493 }
47494 if (Carry.getOpcode() == X86ISD::SETCC ||
47495 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47496 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47497 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47498 SDValue CarryOp1 = Carry.getOperand(1);
47499 if (CarryCC == X86::COND_B)
47500 return CarryOp1;
47501 if (CarryCC == X86::COND_A) {
47502 // Try to convert COND_A into COND_B in an attempt to facilitate
47503 // materializing "setb reg".
47504 //
47505 // Do not flip "e > c", where "c" is a constant, because Cmp
47506 // instruction cannot take an immediate as its first operand.
47507 //
47508 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47509 CarryOp1.getNode()->hasOneUse() &&
47510 CarryOp1.getValueType().isInteger() &&
47511 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47512 SDValue SubCommute =
47513 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47514 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47515 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47516 }
47517 }
47518 // If this is a check of the z flag of an add with 1, switch to the
47519 // C flag.
47520 if (CarryCC == X86::COND_E &&
47521 CarryOp1.getOpcode() == X86ISD::ADD &&
47522 isOneConstant(CarryOp1.getOperand(1)))
47523 return CarryOp1;
47524 } else if (FoundAndLSB) {
47525 SDLoc DL(Carry);
47526 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47527 if (Carry.getOpcode() == ISD::SRL) {
47528 BitNo = Carry.getOperand(1);
47529 Carry = Carry.getOperand(0);
47530 }
47531 return getBT(Carry, BitNo, DL, DAG);
47532 }
47533 }
47534 }
47535
47536 return SDValue();
47537}
47538
47539 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47540/// to avoid the inversion.
47541static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47542 SelectionDAG &DAG,
47543 const X86Subtarget &Subtarget) {
47544 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47545 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47546 EFLAGS.getOpcode() != X86ISD::TESTP)
47547 return SDValue();
47548
47549 // PTEST/TESTP sets EFLAGS as:
47550 // TESTZ: ZF = (Op0 & Op1) == 0
47551 // TESTC: CF = (~Op0 & Op1) == 0
47552 // TESTNZC: ZF == 0 && CF == 0
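 // For example (illustrative): if Op0 is known to be ~X, then
 //   TESTC(~X, Y): CF = (~~X & Y) == 0 = (X & Y) == 0,
 // which is exactly the ZF that TESTZ(X, Y) produces, so a COND_B (CF) test
 // of the original node becomes a COND_E (ZF) test of the un-inverted one.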
47553 MVT VT = EFLAGS.getSimpleValueType();
47554 SDValue Op0 = EFLAGS.getOperand(0);
47555 SDValue Op1 = EFLAGS.getOperand(1);
47556 MVT OpVT = Op0.getSimpleValueType();
47557
47558 // TEST*(~X,Y) == TEST*(X,Y)
47559 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47560 X86::CondCode InvCC;
47561 switch (CC) {
47562 case X86::COND_B:
47563 // testc -> testz.
47564 InvCC = X86::COND_E;
47565 break;
47566 case X86::COND_AE:
47567 // !testc -> !testz.
47568 InvCC = X86::COND_NE;
47569 break;
47570 case X86::COND_E:
47571 // testz -> testc.
47572 InvCC = X86::COND_B;
47573 break;
47574 case X86::COND_NE:
47575 // !testz -> !testc.
47576 InvCC = X86::COND_AE;
47577 break;
47578 case X86::COND_A:
47579 case X86::COND_BE:
47580 // testnzc -> testnzc (no change).
47581 InvCC = CC;
47582 break;
47583 default:
47584 InvCC = X86::COND_INVALID;
47585 break;
47586 }
47587
47588 if (InvCC != X86::COND_INVALID) {
47589 CC = InvCC;
47590 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47591 DAG.getBitcast(OpVT, NotOp0), Op1);
47592 }
47593 }
47594
47595 if (CC == X86::COND_B || CC == X86::COND_AE) {
47596 // TESTC(X,~X) == TESTC(X,-1)
47597 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47598 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47599 SDLoc DL(EFLAGS);
47600 return DAG.getNode(
47601 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
47602 DAG.getBitcast(OpVT,
47603 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
47604 }
47605 }
47606 }
47607
47608 if (CC == X86::COND_E || CC == X86::COND_NE) {
47609 // TESTZ(X,~Y) == TESTC(Y,X)
47610 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47611 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47612 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47613 DAG.getBitcast(OpVT, NotOp1), Op0);
47614 }
47615
47616 if (Op0 == Op1) {
47617 SDValue BC = peekThroughBitcasts(Op0);
47618 EVT BCVT = BC.getValueType();
47619
47620 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47621 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47622 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47623 DAG.getBitcast(OpVT, BC.getOperand(0)),
47624 DAG.getBitcast(OpVT, BC.getOperand(1)));
47625 }
47626
47627 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47628 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47629 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47630 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47631 DAG.getBitcast(OpVT, BC.getOperand(0)),
47632 DAG.getBitcast(OpVT, BC.getOperand(1)));
47633 }
47634
47635 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47636 // to more efficiently extract the sign bits and compare that.
47637 // TODO: Handle TESTC with comparison inversion.
47638 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47639 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
47640 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47641 unsigned EltBits = BCVT.getScalarSizeInBits();
47642 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47643 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47644 APInt SignMask = APInt::getSignMask(EltBits);
47645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47646 if (SDValue Res =
47647 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47648 // For vXi16 cases we need to use pmovmskb and extract every other
47649 // sign bit.
47650 SDLoc DL(EFLAGS);
47651 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
47652 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47653 MVT FloatVT =
47654 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47655 Res = DAG.getBitcast(FloatVT, Res);
47656 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47657 } else if (EltBits == 16) {
47658 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47659 Res = DAG.getBitcast(MovmskVT, Res);
47660 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47661 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47662 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47663 } else {
47664 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47665 }
47666 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47667 DAG.getConstant(0, DL, MVT::i32));
47668 }
47669 }
47670 }
47671 }
47672
47673 // TESTZ(-1,X) == TESTZ(X,X)
47674 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47675 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47676
47677 // TESTZ(X,-1) == TESTZ(X,X)
47678 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47679 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47680
47681 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47682 // TODO: Add COND_NE handling?
47683 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47684 SDValue Src0 = peekThroughBitcasts(Op0);
47685 SDValue Src1 = peekThroughBitcasts(Op1);
47686 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47687 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47688 peekThroughBitcasts(Src0.getOperand(1)), true);
47689 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47690 peekThroughBitcasts(Src1.getOperand(1)), true);
47691 if (Src0 && Src1) {
47692 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
47693 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47694 DAG.getBitcast(OpVT2, Src0),
47695 DAG.getBitcast(OpVT2, Src1));
47696 }
47697 }
47698 }
47699 }
47700
47701 return SDValue();
47702}
47703
47704// Attempt to simplify the MOVMSK input based on the comparison type.
47705static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47706 SelectionDAG &DAG,
47707 const X86Subtarget &Subtarget) {
47708 // Handle eq/ne against zero (any_of).
47709 // Handle eq/ne against -1 (all_of).
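 // For example (illustrative): MOVMSK packs one sign bit per element, so
 // "MOVMSK(V) == 0" means no element has its sign bit set (an any_of-style
 // test when paired with COND_NE), while "MOVMSK(V) == (1 << NumElts) - 1"
 // means every element does (an all_of-style test).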
47710 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47711 return SDValue();
47712 if (EFLAGS.getValueType() != MVT::i32)
47713 return SDValue();
47714 unsigned CmpOpcode = EFLAGS.getOpcode();
47715 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47716 return SDValue();
47717 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47718 if (!CmpConstant)
47719 return SDValue();
47720 const APInt &CmpVal = CmpConstant->getAPIntValue();
47721
47722 SDValue CmpOp = EFLAGS.getOperand(0);
47723 unsigned CmpBits = CmpOp.getValueSizeInBits();
47724 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47725
47726 // Peek through any truncate.
47727 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47728 CmpOp = CmpOp.getOperand(0);
47729
47730 // Bail if we don't find a MOVMSK.
47731 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47732 return SDValue();
47733
47734 SDValue Vec = CmpOp.getOperand(0);
47735 MVT VecVT = Vec.getSimpleValueType();
47736 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47737        "Unexpected MOVMSK operand");
47738 unsigned NumElts = VecVT.getVectorNumElements();
47739 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47740
47741 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47742 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47743 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47744 if (!IsAnyOf && !IsAllOf)
47745 return SDValue();
47746
47747 // TODO: Check more combining cases.
47748 // Here we check the number of uses of the cmp to decide whether to combine.
47749 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
47750 // are limited by this one-use constraint.
47751 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47752
47753 // See if we can peek through to a vector with a wider element type, if the
47754 // signbits extend down to all the sub-elements as well.
47755 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47756 // potential SimplifyDemandedBits/Elts cases.
47757 // If we looked through a truncate that discarded bits, we can't do this
47758 // transform.
47759 // FIXME: We could do this transform for truncates that discarded bits by
47760 // inserting an AND mask between the new MOVMSK and the CMP.
47761 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47762 SDValue BC = peekThroughBitcasts(Vec);
47763 MVT BCVT = BC.getSimpleValueType();
47764 unsigned BCNumElts = BCVT.getVectorNumElements();
47765 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47766 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47767 BCNumEltBits > NumEltBits &&
47768 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47769 SDLoc DL(EFLAGS);
47770 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47771 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47772 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47773 DAG.getConstant(CmpMask, DL, MVT::i32));
47774 }
47775 }
47776
47777 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47778 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47779 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47780 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
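 // (Illustrative reasoning: the sign bit of OR(X,Y) is the OR of the two
 // sign bits and the sign bit of AND(X,Y) is their AND, so a zero/all-ones
 // test of the concatenated mask is equivalent to a single half-width MOVMSK
 // of the OR/AND of the two halves.)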
47781 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47782 SmallVector<SDValue> Ops;
47783 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47784 Ops.size() == 2) {
47785 SDLoc DL(EFLAGS);
47786 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47787 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47788 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47789 DAG.getBitcast(SubVT, Ops[0]),
47790 DAG.getBitcast(SubVT, Ops[1]));
47791 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47792 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47793 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47794 DAG.getConstant(CmpMask, DL, MVT::i32));
47795 }
47796 }
47797
47798 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47799 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47800 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47801 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
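 // (Illustrative reasoning: every lane of PCMPEQ(X,Y) is all-ones iff X == Y
 // elementwise, iff XOR(X,Y) is all-zero, which is precisely the ZF condition
 // PTEST reports when both of its operands are that XOR.)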
47802 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47803 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47804 SDValue BC = peekThroughBitcasts(Vec);
47805 // Ensure MOVMSK was testing every signbit of BC.
47806 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47807 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47808 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47809 BC.getOperand(0), BC.getOperand(1));
47810 V = DAG.getBitcast(TestVT, V);
47811 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47812 }
47813 // Check for 256-bit split vector cases.
47814 if (BC.getOpcode() == ISD::AND &&
47815 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47816 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47817 SDValue LHS = BC.getOperand(0);
47818 SDValue RHS = BC.getOperand(1);
47819 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47820 LHS.getOperand(0), LHS.getOperand(1));
47821 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47822 RHS.getOperand(0), RHS.getOperand(1));
47823 LHS = DAG.getBitcast(TestVT, LHS);
47824 RHS = DAG.getBitcast(TestVT, RHS);
47825 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47826 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47827 }
47828 }
47829 }
47830
47831 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47832 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47833 // sign bits prior to the comparison with zero unless we know that
47834 // the vXi16 splats the sign bit down to the lower i8 half.
47835 // TODO: Handle all_of patterns.
47836 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47837 SDValue VecOp0 = Vec.getOperand(0);
47838 SDValue VecOp1 = Vec.getOperand(1);
47839 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47840 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47841 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47842 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47843 SDLoc DL(EFLAGS);
47844 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47845 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47846 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47847 if (!SignExt0) {
47848 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47849 DAG.getConstant(0xAAAA, DL, MVT::i16));
47850 }
47851 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47852 DAG.getConstant(0, DL, MVT::i16));
47853 }
47854 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47855 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47856 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47857 (IsAnyOf || (SignExt0 && SignExt1))) {
47858 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47859 SDLoc DL(EFLAGS);
47860 SDValue Result = peekThroughBitcasts(Src);
47861 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47862 Result.getValueType().getVectorNumElements() <= NumElts) {
47863 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47864 Result.getOperand(0), Result.getOperand(1));
47865 V = DAG.getBitcast(MVT::v4i64, V);
47866 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47867 }
47868 Result = DAG.getBitcast(MVT::v32i8, Result);
47869 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47870 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47871 if (!SignExt0 || !SignExt1) {
47872 assert(IsAnyOf &&
47873        "Only perform v16i16 signmasks for any_of patterns");
47874 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47875 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47876 }
47877 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47878 DAG.getConstant(CmpMask, DL, MVT::i32));
47879 }
47880 }
47881 }
47882
47883 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47884 SmallVector<int, 32> ShuffleMask;
47885 SmallVector<SDValue, 2> ShuffleInputs;
47886 if (NumElts <= CmpBits &&
47887 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47888 ShuffleMask, DAG) &&
47889 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47890 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47891 unsigned NumShuffleElts = ShuffleMask.size();
47892 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47893 for (int M : ShuffleMask) {
47894 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47895 DemandedElts.setBit(M);
47896 }
47897 if (DemandedElts.isAllOnes()) {
47898 SDLoc DL(EFLAGS);
47899 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47900 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47901 Result =
47902 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47903 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47904 EFLAGS.getOperand(1));
47905 }
47906 }
47907
47908 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47909 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47910 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47911 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47912 // iff every element is referenced.
47913 if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse &&
47914 (NumEltBits == 32 || NumEltBits == 64)) {
47915 SDLoc DL(EFLAGS);
47916 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47917 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47918 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
47919 SDValue LHS = Vec;
47920 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
47921 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47922 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
47923 DAG.getBitcast(FloatVT, LHS),
47924 DAG.getBitcast(FloatVT, RHS));
47925 }
47926
47927 return SDValue();
47928}
47929
47930/// Optimize an EFLAGS definition used according to the condition code \p CC
47931/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47932/// uses of chain values.
47933static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47934 SelectionDAG &DAG,
47935 const X86Subtarget &Subtarget) {
47936 if (CC == X86::COND_B)
47937 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47938 return Flags;
47939
47940 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47941 return R;
47942
47943 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47944 return R;
47945
47946 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47947 return R;
47948
47949 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47950}
47951
47952/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47953static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47954 TargetLowering::DAGCombinerInfo &DCI,
47955 const X86Subtarget &Subtarget) {
47956 SDLoc DL(N);
47957
47958 SDValue FalseOp = N->getOperand(0);
47959 SDValue TrueOp = N->getOperand(1);
47960 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47961 SDValue Cond = N->getOperand(3);
47962
47963 // cmov X, X, ?, ? --> X
47964 if (TrueOp == FalseOp)
47965 return TrueOp;
47966
47967 // Try to simplify the EFLAGS and condition code operands.
47968 // We can't always do this as FCMOV only supports a subset of X86 cond.
47969 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47970 if (!(FalseOp.getValueType() == MVT::f80 ||
47971 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47972 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47973 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47974 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47975 Flags};
47976 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47977 }
47978 }
47979
47980 // If this is a select between two integer constants, try to do some
47981 // optimizations. Note that the operands are ordered the opposite of SELECT
47982 // operands.
47983 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47984 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47985 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47986 // larger than FalseC (the false value).
47987 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47988 CC = X86::GetOppositeBranchCondition(CC);
47989 std::swap(TrueC, FalseC);
47990 std::swap(TrueOp, FalseOp);
47991 }
47992
47993 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47994 // This is efficient for any integer data type (including i8/i16) and
47995 // shift amount.
47996 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47997 Cond = getSETCC(CC, Cond, DL, DAG);
47998
47999 // Zero extend the condition if needed.
48000 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48001
48002 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48003 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48004 DAG.getConstant(ShAmt, DL, MVT::i8));
48005 return Cond;
48006 }
48007
48008 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
48009 // for any integer data type, including i8/i16.
48010 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48011 Cond = getSETCC(CC, Cond, DL, DAG);
48012
48013 // Zero extend the condition if needed.
48014 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48015 FalseC->getValueType(0), Cond);
48016 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48017 SDValue(FalseC, 0));
48018 return Cond;
48019 }
48020
48021 // Optimize cases that will turn into an LEA instruction. This requires
48022 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48023 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
48024 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48025 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
48026        "Implicit constant truncation");
48027
48028 bool isFastMultiplier = false;
48029 if (Diff.ult(10)) {
48030 switch (Diff.getZExtValue()) {
48031 default: break;
48032 case 1: // result = add base, cond
48033 case 2: // result = lea base( , cond*2)
48034 case 3: // result = lea base(cond, cond*2)
48035 case 4: // result = lea base( , cond*4)
48036 case 5: // result = lea base(cond, cond*4)
48037 case 8: // result = lea base( , cond*8)
48038 case 9: // result = lea base(cond, cond*8)
48039 isFastMultiplier = true;
48040 break;
48041 }
48042 }
48043
48044 if (isFastMultiplier) {
48045 Cond = getSETCC(CC, Cond, DL, DAG);
48046 // Zero extend the condition if needed.
48047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48048 Cond);
48049 // Scale the condition by the difference.
48050 if (Diff != 1)
48051 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48052 DAG.getConstant(Diff, DL, Cond.getValueType()));
48053
48054 // Add the base if non-zero.
48055 if (FalseC->getAPIntValue() != 0)
48056 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48057 SDValue(FalseC, 0));
48058 return Cond;
48059 }
48060 }
48061 }
48062 }
48063
48064 // Handle these cases:
48065 // (select (x != c), e, c) -> (select (x != c), e, x),
48066 // (select (x == c), c, e) -> (select (x == c), x, e)
48067 // where c is an integer constant, and the "select" is the combination
48068 // of CMOV and CMP.
48069 //
48070 // The rationale for this change is that the conditional-move from a constant
48071 // needs two instructions, however, conditional-move from a register needs
48072 // only one instruction.
48073 //
48074 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48075 // some instruction-combining opportunities. This opt needs to be
48076 // postponed as late as possible.
48077 //
48078 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48079 // the DCI.xxxx conditions are provided to postpone the optimization as
48080 // late as possible.
48081
48082 ConstantSDNode *CmpAgainst = nullptr;
48083 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48084 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48085 !isa<ConstantSDNode>(Cond.getOperand(0))) {
48086
48087 if (CC == X86::COND_NE &&
48088 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48089 CC = X86::GetOppositeBranchCondition(CC);
48090 std::swap(TrueOp, FalseOp);
48091 }
48092
48093 if (CC == X86::COND_E &&
48094 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48095 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48096 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48097 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48098 }
48099 }
48100 }
48101
48102 // Transform:
48103 //
48104 // (cmov 1 T (uge T 2))
48105 //
48106 // to:
48107 //
48108 // (adc T 0 (sub T 1))
48109 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48110 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48111 SDValue Cond0 = Cond.getOperand(0);
48112 if (Cond0.getOpcode() == ISD::TRUNCATE)
48113 Cond0 = Cond0.getOperand(0);
48114 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48115 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48116 EVT CondVT = Cond->getValueType(0);
48117 EVT OuterVT = N->getValueType(0);
48118 // Subtract 1 and generate a carry.
48119 SDValue NewSub =
48120 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48121 DAG.getConstant(1, DL, CondVT));
48122 SDValue EFLAGS(NewSub.getNode(), 1);
48123 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
48124 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
48125 }
48126 }
48127
48128 // Fold and/or of setcc's to double CMOV:
48129 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48130 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48131 //
48132 // This combine lets us generate:
48133 // cmovcc1 (jcc1 if we don't have CMOV)
48134 // cmovcc2 (same)
48135 // instead of:
48136 // setcc1
48137 // setcc2
48138 // and/or
48139 // cmovne (jne if we don't have CMOV)
48140 // When we can't use the CMOV instruction, it might increase branch
48141 // mispredicts.
48142 // When we can use CMOV, or when there is no mispredict, this improves
48143 // throughput and reduces register pressure.
48144 //
48145 if (CC == X86::COND_NE) {
48146 SDValue Flags;
48147 X86::CondCode CC0, CC1;
48148 bool isAndSetCC;
48149 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48150 if (isAndSetCC) {
48151 std::swap(FalseOp, TrueOp);
48152 CC0 = X86::GetOppositeBranchCondition(CC0);
48153 CC1 = X86::GetOppositeBranchCondition(CC1);
48154 }
48155
48156 SDValue LOps[] = {FalseOp, TrueOp,
48157 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48158 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
48159 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48160 Flags};
48161 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48162 return CMOV;
48163 }
48164 }
48165
48166 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48167 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48168 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48169 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48170 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48171 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48172 SDValue Add = TrueOp;
48173 SDValue Const = FalseOp;
48174 // Canonicalize the condition code for easier matching and output.
48175 if (CC == X86::COND_E)
48176 std::swap(Add, Const);
48177
48178 // We might have replaced the constant in the cmov with the LHS of the
48179 // compare. If so change it to the RHS of the compare.
48180 if (Const == Cond.getOperand(0))
48181 Const = Cond.getOperand(1);
48182
48183 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48184 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48185 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48186 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48187 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48188 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48189 EVT VT = N->getValueType(0);
48190 // This should constant fold.
48191 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48192 SDValue CMov =
48193 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48194 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48195 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48196 }
48197 }
48198
48199 return SDValue();
48200}
48201
48202/// Different mul shrinking modes.
48203enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
48204
48205static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48206 EVT VT = N->getOperand(0).getValueType();
48207 if (VT.getScalarSizeInBits() != 32)
48208 return false;
48209
48210 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48211 unsigned SignBits[2] = {1, 1};
48212 bool IsPositive[2] = {false, false};
48213 for (unsigned i = 0; i < 2; i++) {
48214 SDValue Opd = N->getOperand(i);
48215
48216 SignBits[i] = DAG.ComputeNumSignBits(Opd);
48217 IsPositive[i] = DAG.SignBitIsZero(Opd);
48218 }
48219
48220 bool AllPositive = IsPositive[0] && IsPositive[1];
48221 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
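 // (Illustrative note on the thresholds below: an i32 value in [-128, 127]
 // has at least 25 identical leading sign bits and one in [-32768, 32767]
 // has at least 17; the unsigned ranges [0, 255] and [0, 65535] give at
 // least 24 and 16 respectively, plus the requirement that the sign bit is
 // known zero.)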
48222 // When ranges are from -128 ~ 127, use MULS8 mode.
48223 if (MinSignBits >= 25)
48224 Mode = ShrinkMode::MULS8;
48225 // When ranges are from 0 ~ 255, use MULU8 mode.
48226 else if (AllPositive && MinSignBits >= 24)
48227 Mode = ShrinkMode::MULU8;
48228 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48229 else if (MinSignBits >= 17)
48230 Mode = ShrinkMode::MULS16;
48231 // When ranges are from 0 ~ 65535, use MULU16 mode.
48232 else if (AllPositive && MinSignBits >= 16)
48233 Mode = ShrinkMode::MULU16;
48234 else
48235 return false;
48236 return true;
48237}
48238
48239/// When the operands of vector mul are extended from smaller size values,
48240 /// like i8 and i16, the type of mul may be shrunk to generate more
48241/// efficient code. Two typical patterns are handled:
48242/// Pattern1:
48243/// %2 = sext/zext <N x i8> %1 to <N x i32>
48244/// %4 = sext/zext <N x i8> %3 to <N x i32>
48245 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48246/// %5 = mul <N x i32> %2, %4
48247///
48248/// Pattern2:
48249/// %2 = zext/sext <N x i16> %1 to <N x i32>
48250/// %4 = zext/sext <N x i16> %3 to <N x i32>
48251/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48252/// %5 = mul <N x i32> %2, %4
48253///
48254/// There are four mul shrinking modes:
48255/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48256 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48257/// generate pmullw+sext32 for it (MULS8 mode).
48258/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48259/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48260/// generate pmullw+zext32 for it (MULU8 mode).
48261/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48262/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48263/// generate pmullw+pmulhw for it (MULS16 mode).
48264/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48265/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48266/// generate pmullw+pmulhuw for it (MULU16 mode).
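/// For example (illustrative), in MULU16 mode each i32 product is rebuilt as
///   lo16 = pmullw(a, b), hi16 = pmulhuw(a, b),
///   result[i] = zext(lo16[i]) | (zext(hi16[i]) << 16),
/// and the punpcklwd/punpckhwd-style shuffles emitted below interleave the
/// low and high halves back into full i32 lanes.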
48267static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48268 const X86Subtarget &Subtarget) {
48269 // Check for legality
48270 // pmullw/pmulhw are not available before SSE2.
48271 if (!Subtarget.hasSSE2())
48272 return SDValue();
48273
48274 // Check for profitability
48275 // pmulld is supported since SSE41. It is better to use pmulld
48276 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48277 // the expansion.
48278 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48279 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48280 return SDValue();
48281
48282 ShrinkMode Mode;
48283 if (!canReduceVMulWidth(N, DAG, Mode))
48284 return SDValue();
48285
48286 SDLoc DL(N);
48287 SDValue N0 = N->getOperand(0);
48288 SDValue N1 = N->getOperand(1);
48289 EVT VT = N->getOperand(0).getValueType();
48290 unsigned NumElts = VT.getVectorNumElements();
48291 if ((NumElts % 2) != 0)
48292 return SDValue();
48293
48294 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48295
48296 // Shrink the operands of mul.
48297 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48298 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48299
48300 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48301 // lower part is needed.
48302 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48303 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48304 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48305 : ISD::SIGN_EXTEND,
48306 DL, VT, MulLo);
48307
48308 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48309 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48310 // the higher part is also needed.
48311 SDValue MulHi =
48312 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48313 ReducedVT, NewN0, NewN1);
48314
48315 // Repack the lower part and higher part result of mul into a wider
48316 // result.
48317 // Generate shuffle functioning as punpcklwd.
48318 SmallVector<int, 16> ShuffleMask(NumElts);
48319 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48320 ShuffleMask[2 * i] = i;
48321 ShuffleMask[2 * i + 1] = i + NumElts;
48322 }
48323 SDValue ResLo =
48324 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48325 ResLo = DAG.getBitcast(ResVT, ResLo);
48326 // Generate shuffle functioning as punpckhwd.
48327 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48328 ShuffleMask[2 * i] = i + NumElts / 2;
48329 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48330 }
48331 SDValue ResHi =
48332 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48333 ResHi = DAG.getBitcast(ResVT, ResHi);
48334 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48335}
48336
48337static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48338 EVT VT, const SDLoc &DL) {
48339
48340 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48341 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48342 DAG.getConstant(Mult, DL, VT));
48343 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48344 DAG.getConstant(Shift, DL, MVT::i8));
48345 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48346 N->getOperand(0));
48347 return Result;
48348 };
48349
48350 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48351 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48352 DAG.getConstant(Mul1, DL, VT));
48353 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48354 DAG.getConstant(Mul2, DL, VT));
48355 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48356 N->getOperand(0));
48357 return Result;
48358 };
48359
48360 switch (MulAmt) {
48361 default:
48362 break;
48363 case 11:
48364 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48365 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48366 case 21:
48367 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48368 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48369 case 41:
48370 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48371 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48372 case 22:
48373 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48374 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48375 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48376 case 19:
48377 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48378 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48379 case 37:
48380 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48381 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48382 case 73:
48383 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48384 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48385 case 13:
48386 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48387 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48388 case 23:
48389 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48390 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48391 case 26:
48392 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48393 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48394 case 28:
48395 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48396 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48397 case 29:
48398 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48399 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48400 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48401 }
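 // For example (illustrative): the "mul x, 11" case above becomes
 //   t = x * 5      ; lea t, [x + x*4]
 //   t = t << 1     ; t = x * 10
 //   t = t + x      ; t = x * 11
 // i.e. an LEA, a shift and an add instead of an imul.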
48402
48403 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift
48404 // followed by a single LEA.
48405 // First check if this is a sum of two powers of 2 because that's easy. Then
48406 // count how many trailing zeros there are before the first set bit.
48407 // TODO: We can do this even without LEA at a cost of two shifts and an add.
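 // For example (illustrative): MulAmt == 20 (16 + 4) gives ScaleShift == 2
 // and ShiftAmt == 4, so x*20 is formed as (x << 4) + (x << 2), where the
 // second term can be folded into an LEA's scaled-index addressing.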
48408 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48409 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48410 if (ScaleShift >= 1 && ScaleShift < 4) {
48411 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48412 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48413 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48414 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48415 DAG.getConstant(ScaleShift, DL, MVT::i8));
48416 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48417 }
48418 }
48419
48420 return SDValue();
48421}
48422
48423 // If the upper 17 bits of either operand are zero and the upper bits of the
48424 // other operand are all zero/sign bits, then we can use PMADDWD, which is
48425 // always at least as quick as PMULLD, except on KNL.
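// (Illustrative reasoning: PMADDWD treats each i32 lane as two i16s and
// computes lo0*lo1 + hi0*hi1. If one operand has its upper 17 bits clear,
// its hi half is zero (so the hi0*hi1 term vanishes) and its lo half is
// non-negative as an i16, so the signed 16x16->32 product lo0*lo1 equals the
// original i32 multiply whenever the other operand's significant bits also
// fit in its low i16.)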
48426static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48427 const X86Subtarget &Subtarget) {
48428 if (!Subtarget.hasSSE2())
48429 return SDValue();
48430
48431 if (Subtarget.isPMADDWDSlow())
48432 return SDValue();
48433
48434 EVT VT = N->getValueType(0);
48435
48436 // Only support vXi32 vectors.
48437 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48438 return SDValue();
48439
48440 // Make sure the type is legal or can split/widen to a legal type.
48441 // With AVX512 but without BWI, we would need to split v32i16.
48442 unsigned NumElts = VT.getVectorNumElements();
48443 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48444 return SDValue();
48445
48446 // With AVX512 but without BWI, we would need to split v32i16.
48447 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48448 return SDValue();
48449
48450 SDValue N0 = N->getOperand(0);
48451 SDValue N1 = N->getOperand(1);
48452
48453 // If we are zero/sign extending two steps without SSE4.1, it's better to
48454 // reduce the vmul width instead.
48455 if (!Subtarget.hasSSE41() &&
48456 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48457 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48458 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48459 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48460 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48461 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48462 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48463 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48464 return SDValue();
48465
48466 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48467 // the vmul width instead.
48468 if (!Subtarget.hasSSE41() &&
48469 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48470 N0.getOperand(0).getValueSizeInBits() > 128) &&
48471 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48472 N1.getOperand(0).getValueSizeInBits() > 128))
48473 return SDValue();
48474
48475 // Sign bits must extend down to the lowest i16.
48476 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48477 DAG.ComputeMaxSignificantBits(N0) > 16)
48478 return SDValue();
48479
48480 // At least one of the elements must be zero in the upper 17 bits, or can be
48481 // safely made zero without altering the final result.
48482 auto GetZeroableOp = [&](SDValue Op) {
48483 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48484 if (DAG.MaskedValueIsZero(Op, Mask17))
48485 return Op;
48486 // Mask off upper 16-bits of sign-extended constants.
48487 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48488 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48489 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48490 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48491 SDValue Src = Op.getOperand(0);
48492 // Convert sext(vXi16) to zext(vXi16).
48493 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48494 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48495 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48496 // which will expand the extension.
48497 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48498 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48499 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48500 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48501 }
48502 }
48503 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48504 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48505 N->isOnlyUserOf(Op.getNode())) {
48506 SDValue Src = Op.getOperand(0);
48507 if (Src.getScalarValueSizeInBits() == 16)
48508 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48509 }
48510 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48511 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48512 N->isOnlyUserOf(Op.getNode())) {
48513 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48514 Op.getOperand(1));
48515 }
48516 return SDValue();
48517 };
48518 SDValue ZeroN0 = GetZeroableOp(N0);
48519 SDValue ZeroN1 = GetZeroableOp(N1);
48520 if (!ZeroN0 && !ZeroN1)
48521 return SDValue();
48522 N0 = ZeroN0 ? ZeroN0 : N0;
48523 N1 = ZeroN1 ? ZeroN1 : N1;
48524
48525 // Use SplitOpsAndApply to handle AVX splitting.
48526 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48527 ArrayRef<SDValue> Ops) {
48528 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48529 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48530 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48531 DAG.getBitcast(OpVT, Ops[0]),
48532 DAG.getBitcast(OpVT, Ops[1]));
48533 };
48534 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48535 PMADDWDBuilder);
48536}
48537
48538static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48539 const X86Subtarget &Subtarget) {
48540 if (!Subtarget.hasSSE2())
48541 return SDValue();
48542
48543 EVT VT = N->getValueType(0);
48544
48545 // Only support vXi64 vectors.
48546 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48547 VT.getVectorNumElements() < 2 ||
48548 !isPowerOf2_32(VT.getVectorNumElements()))
48549 return SDValue();
48550
48551 SDValue N0 = N->getOperand(0);
48552 SDValue N1 = N->getOperand(1);
48553
48554 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48555 // 32-bits. We can lower with this if the sign bits stretch that far.
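 // (Illustrative reasoning: more than 32 sign bits means each i64 operand is
 // the sign-extension of its low 32 bits, so the full 64-bit product is
 // determined by those low halves, which is exactly the signed 32x32->64
 // multiply PMULDQ performs.)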
48556 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48557 DAG.ComputeNumSignBits(N1) > 32) {
48558 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48559 ArrayRef<SDValue> Ops) {
48560 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48561 };
48562 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48563 PMULDQBuilder, /*CheckBWI*/false);
48564 }
48565
48566 // If the upper bits are zero we can use a single pmuludq.
48567 APInt Mask = APInt::getHighBitsSet(64, 32);
48568 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48569 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48570 ArrayRef<SDValue> Ops) {
48571 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48572 };
48573 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48574 PMULUDQBuilder, /*CheckBWI*/false);
48575 }
48576
48577 return SDValue();
48578}
48579
48580static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48581 TargetLowering::DAGCombinerInfo &DCI,
48582 const X86Subtarget &Subtarget) {
48583 EVT VT = N->getValueType(0);
48584
48585 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48586 return V;
48587
48588 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48589 return V;
48590
48591 if (DCI.isBeforeLegalize() && VT.isVector())
48592 return reduceVMULWidth(N, DAG, Subtarget);
48593
48594 // Optimize a single multiply with constant into two operations in order to
48595 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
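  // For instance, following the decomposition below, a multiply by 45 can be
  // split as 9 * 5 (two LEAs), and a multiply by 40 as 5 * 8 (an LEA plus a
  // SHL by 3).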
48596 if (!MulConstantOptimization)
48597 return SDValue();
48598
48599 // An imul is usually smaller than the alternative sequence.
48600 if (DAG.getMachineFunction().getFunction().hasMinSize())
48601 return SDValue();
48602
48603 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48604 return SDValue();
48605
48606 if (VT != MVT::i64 && VT != MVT::i32)
48607 return SDValue();
48608
48609 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48610 if (!C)
48611 return SDValue();
48612 if (isPowerOf2_64(C->getZExtValue()))
48613 return SDValue();
48614
48615 int64_t SignMulAmt = C->getSExtValue();
48616 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48617 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48618
48619 SDLoc DL(N);
48620 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48621 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48622 DAG.getConstant(AbsMulAmt, DL, VT));
48623 if (SignMulAmt < 0)
48624 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48625 NewMul);
48626
48627 return NewMul;
48628 }
48629
48630 uint64_t MulAmt1 = 0;
48631 uint64_t MulAmt2 = 0;
48632 if ((AbsMulAmt % 9) == 0) {
48633 MulAmt1 = 9;
48634 MulAmt2 = AbsMulAmt / 9;
48635 } else if ((AbsMulAmt % 5) == 0) {
48636 MulAmt1 = 5;
48637 MulAmt2 = AbsMulAmt / 5;
48638 } else if ((AbsMulAmt % 3) == 0) {
48639 MulAmt1 = 3;
48640 MulAmt2 = AbsMulAmt / 3;
48641 }
48642
48643 SDValue NewMul;
48644 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48645 if (MulAmt2 &&
48646 (isPowerOf2_64(MulAmt2) ||
48647 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48648
48649 if (isPowerOf2_64(MulAmt2) &&
48650 !(SignMulAmt >= 0 && N->hasOneUse() &&
48651 N->use_begin()->getOpcode() == ISD::ADD))
48652 // If the second multiplier is pow2, issue it first. We want the multiply by
48653 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48654 // is an add. Only do this for positive multiply amounts since the
48655 // negate would prevent it from being used as an address mode anyway.
48656 std::swap(MulAmt1, MulAmt2);
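      // e.g. AbsMulAmt == 24 gives MulAmt1 = 3, MulAmt2 = 8; after the swap
      // the SHL by 3 is emitted first and the remaining multiply by 3 can
      // become a single LEA.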
48657
48658 if (isPowerOf2_64(MulAmt1))
48659 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48660 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48661 else
48662 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48663 DAG.getConstant(MulAmt1, DL, VT));
48664
48665 if (isPowerOf2_64(MulAmt2))
48666 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48667 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48668 else
48669 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48670 DAG.getConstant(MulAmt2, DL, VT));
48671
48672 // Negate the result.
48673 if (SignMulAmt < 0)
48674 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48675 NewMul);
48676 } else if (!Subtarget.slowLEA())
48677 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48678
48679 if (!NewMul) {
48680 assert(C->getZExtValue() != 0 &&
48681        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48682        "Both cases that could cause potential overflows should have "
48683        "already been handled.");
48684 if (isPowerOf2_64(AbsMulAmt - 1)) {
48685 // (mul x, 2^N + 1) => (add (shl x, N), x)
48686 NewMul = DAG.getNode(
48687 ISD::ADD, DL, VT, N->getOperand(0),
48688 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48689 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48690 MVT::i8)));
48691 // To negate, subtract the number from zero
48692 if (SignMulAmt < 0)
48693 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48694 DAG.getConstant(0, DL, VT), NewMul);
48695 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48696 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48697 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48698 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48699 DL, MVT::i8));
48700 // To negate, reverse the operands of the subtract.
48701 if (SignMulAmt < 0)
48702 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48703 else
48704 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48705 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48706 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48707 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48708 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48709 DL, MVT::i8));
48710 NewMul = DAG.getNode(
48711 ISD::ADD, DL, VT, NewMul,
48712 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48713 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48714 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48715 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48716 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48717 DL, MVT::i8));
48718 NewMul = DAG.getNode(
48719 ISD::SUB, DL, VT, NewMul,
48720 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48721 }
48722 }
48723
48724 return NewMul;
48725}
48726
48727// Try to form a MULHU or MULHS node by looking for
48728// (srl (mul ext, ext), 16)
48729// TODO: This is X86 specific because we want to be able to handle wide types
48730// before type legalization. But we can only do it if the vector will be
48731// legalized via widening/splitting. Type legalization can't handle promotion
48732// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48733// combiner.
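// For example, (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) with a v8i32
// result becomes (zext (mulhu X, Y)), which can then select to PMULHUW.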
48734static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48735 const X86Subtarget &Subtarget) {
48736 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48737        "SRL or SRA node is required here!");
48738 SDLoc DL(N);
48739
48740 if (!Subtarget.hasSSE2())
48741 return SDValue();
48742
48743 // The operation feeding into the shift must be a multiply.
48744 SDValue ShiftOperand = N->getOperand(0);
48745 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48746 return SDValue();
48747
48748 // Input type should be at least vXi32.
48749 EVT VT = N->getValueType(0);
48750 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48751 return SDValue();
48752
48753 // Need a shift by 16.
48754 APInt ShiftAmt;
48755 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48756 ShiftAmt != 16)
48757 return SDValue();
48758
48759 SDValue LHS = ShiftOperand.getOperand(0);
48760 SDValue RHS = ShiftOperand.getOperand(1);
48761
48762 unsigned ExtOpc = LHS.getOpcode();
48763 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48764 RHS.getOpcode() != ExtOpc)
48765 return SDValue();
48766
48767 // Peek through the extends.
48768 LHS = LHS.getOperand(0);
48769 RHS = RHS.getOperand(0);
48770
48771 // Ensure the input types match.
48772 EVT MulVT = LHS.getValueType();
48773 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48774 return SDValue();
48775
48776 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48777 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48778
48779 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48780 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48781}
48782
48783static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48784 SDValue N0 = N->getOperand(0);
48785 SDValue N1 = N->getOperand(1);
48786 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48787 EVT VT = N0.getValueType();
48788
48789 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48790 // since the result of setcc_c is all zeros or all ones.
48791 if (VT.isInteger() && !VT.isVector() &&
48792 N1C && N0.getOpcode() == ISD::AND &&
48793 N0.getOperand(1).getOpcode() == ISD::Constant) {
48794 SDValue N00 = N0.getOperand(0);
48795 APInt Mask = N0.getConstantOperandAPInt(1);
48796 Mask <<= N1C->getAPIntValue();
48797 bool MaskOK = false;
48798 // We can handle cases concerning bit-widening nodes containing setcc_c if
48799 // we carefully interrogate the mask to make sure the transform is
48800 // semantics preserving.
48801 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48802 // of the underlying setcc_c operation when the setcc_c was zero extended.
48803 // Consider the following example:
48804 // zext(setcc_c) -> i32 0x0000FFFF
48805 // c1 -> i32 0x0000FFFF
48806 // c2 -> i32 0x00000001
48807 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48808 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48809 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48810 MaskOK = true;
48811 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48812 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48813 MaskOK = true;
48814 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48815 N00.getOpcode() == ISD::ANY_EXTEND) &&
48816 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48817 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48818 }
48819 if (MaskOK && Mask != 0) {
48820 SDLoc DL(N);
48821 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48822 }
48823 }
48824
48825 return SDValue();
48826}
48827
48828static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48829 const X86Subtarget &Subtarget) {
48830 SDValue N0 = N->getOperand(0);
48831 SDValue N1 = N->getOperand(1);
48832 EVT VT = N0.getValueType();
48833 unsigned Size = VT.getSizeInBits();
48834
48835 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48836 return V;
48837
48838 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
48839 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
48840 // into (sra, (sext (a), SarConst - [56,48,32,24,16]))
48841 // depending on sign of (SarConst - [56,48,32,24,16])
48842
48843 // sexts on X86 are MOVs. The MOVs have the same code size
48844 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
48845 // However the MOVs have 2 advantages over a SHIFT:
48846 // 1. MOVs can write to a register that differs from source
48847 // 2. MOVs accept memory operands
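// For example, with i64: (ashr (shl X, 56), 59) becomes
// (sra (sext_inreg X, i8), 3), i.e. typically a MOVSX of the low byte
// followed by a smaller arithmetic shift.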
48848
48849 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48850 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48851 N0.getOperand(1).getOpcode() != ISD::Constant)
48852 return SDValue();
48853
48854 SDValue N00 = N0.getOperand(0);
48855 SDValue N01 = N0.getOperand(1);
48856 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48857 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48858 EVT CVT = N1.getValueType();
48859
48860 if (SarConst.isNegative())
48861 return SDValue();
48862
48863 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48864 unsigned ShiftSize = SVT.getSizeInBits();
48865 // Skip types without a corresponding sext/zext, and skip when ShlConst
48866 // is not one of [56,48,32,24,16].
48867 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48868 continue;
48869 SDLoc DL(N);
48870 SDValue NN =
48871 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48872 SarConst = SarConst - (Size - ShiftSize);
48873 if (SarConst == 0)
48874 return NN;
48875 if (SarConst.isNegative())
48876 return DAG.getNode(ISD::SHL, DL, VT, NN,
48877 DAG.getConstant(-SarConst, DL, CVT));
48878 return DAG.getNode(ISD::SRA, DL, VT, NN,
48879 DAG.getConstant(SarConst, DL, CVT));
48880 }
48881 return SDValue();
48882}
48883
48884static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48885 TargetLowering::DAGCombinerInfo &DCI,
48886 const X86Subtarget &Subtarget) {
48887 SDValue N0 = N->getOperand(0);
48888 SDValue N1 = N->getOperand(1);
48889 EVT VT = N0.getValueType();
48890
48891 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48892 return V;
48893
48894 // Only do this on the last DAG combine as it can interfere with other
48895 // combines.
48896 if (!DCI.isAfterLegalizeDAG())
48897 return SDValue();
48898
48899 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48900 // TODO: This is a generic DAG combine that became an x86-only combine to
48901 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48902 // and-not ('andn').
48903 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48904 return SDValue();
48905
48906 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48907 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48908 if (!ShiftC || !AndC)
48909 return SDValue();
48910
48911 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48912 // transform should reduce code size. It may also enable secondary transforms
48913 // from improved known-bits analysis or instruction selection.
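  // For example, srl (and X, 0x1F0), 4 --> and (srl X, 4), 0x1F: the new mask
  // fits in a sign-extended imm8, unlike the original 9-bit mask.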
48914 APInt MaskVal = AndC->getAPIntValue();
48915
48916 // If this can be matched by a zero extend, don't optimize.
48917 if (MaskVal.isMask()) {
48918 unsigned TO = MaskVal.countr_one();
48919 if (TO >= 8 && isPowerOf2_32(TO))
48920 return SDValue();
48921 }
48922
48923 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48924 unsigned OldMaskSize = MaskVal.getSignificantBits();
48925 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48926 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48927 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48928 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48929 SDLoc DL(N);
48930 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48931 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48932 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48933 }
48934 return SDValue();
48935}
48936
48937static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48938 const X86Subtarget &Subtarget) {
48939 unsigned Opcode = N->getOpcode();
48940 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48941
48942 SDLoc DL(N);
48943 EVT VT = N->getValueType(0);
48944 SDValue N0 = N->getOperand(0);
48945 SDValue N1 = N->getOperand(1);
48946 EVT SrcVT = N0.getValueType();
48947
48948 SDValue BC0 =
48949 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48950 SDValue BC1 =
48951 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48952
48953 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48954 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48955 // truncation trees that help us avoid lane crossing shuffles.
48956 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48957 // TODO: We don't handle vXf64 shuffles yet.
48958 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48959 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48960 SmallVector<SDValue> ShuffleOps;
48961 SmallVector<int> ShuffleMask, ScaledMask;
48962 SDValue Vec = peekThroughBitcasts(BCSrc);
48963 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48964 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48965 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48966 // shuffle to a v4X64 width - we can probably relax this in the future.
48967 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48968 ShuffleOps[0].getValueType().is256BitVector() &&
48969 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48970 SDValue Lo, Hi;
48971 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48972 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48973 Lo = DAG.getBitcast(SrcVT, Lo);
48974 Hi = DAG.getBitcast(SrcVT, Hi);
48975 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48976 Res = DAG.getBitcast(ShufVT, Res);
48977 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48978 return DAG.getBitcast(VT, Res);
48979 }
48980 }
48981 }
48982 }
48983
48984 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48985 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48986 // If either/both ops are a shuffle that can scale to v2x64,
48987 // then see if we can perform this as a v4x32 post shuffle.
48988 SmallVector<SDValue> Ops0, Ops1;
48989 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48990 bool IsShuf0 =
48991 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48992 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48993 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48994 bool IsShuf1 =
48995 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48996 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48997 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48998 if (IsShuf0 || IsShuf1) {
48999 if (!IsShuf0) {
49000 Ops0.assign({BC0});
49001 ScaledMask0.assign({0, 1});
49002 }
49003 if (!IsShuf1) {
49004 Ops1.assign({BC1});
49005 ScaledMask1.assign({0, 1});
49006 }
49007
49008 SDValue LHS, RHS;
49009 int PostShuffle[4] = {-1, -1, -1, -1};
49010 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49011 if (M < 0)
49012 return true;
49013 Idx = M % 2;
49014 SDValue Src = Ops[M / 2];
49015 if (!LHS || LHS == Src) {
49016 LHS = Src;
49017 return true;
49018 }
49019 if (!RHS || RHS == Src) {
49020 Idx += 2;
49021 RHS = Src;
49022 return true;
49023 }
49024 return false;
49025 };
49026 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49027 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49028 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49029 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49030 LHS = DAG.getBitcast(SrcVT, LHS);
49031 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49032 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49033 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49034 Res = DAG.getBitcast(ShufVT, Res);
49035 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49036 return DAG.getBitcast(VT, Res);
49037 }
49038 }
49039 }
49040
49041 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49042 if (VT.is256BitVector() && Subtarget.hasInt256()) {
49043 SmallVector<int> Mask0, Mask1;
49044 SmallVector<SDValue> Ops0, Ops1;
49045 SmallVector<int, 2> ScaledMask0, ScaledMask1;
49046 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49047 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49048 !Ops0.empty() && !Ops1.empty() &&
49049 all_of(Ops0,
49050 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49051 all_of(Ops1,
49052 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49053 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49054 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
49055 SDValue Op00 = peekThroughBitcasts(Ops0.front());
49056 SDValue Op10 = peekThroughBitcasts(Ops1.front());
49057 SDValue Op01 = peekThroughBitcasts(Ops0.back());
49058 SDValue Op11 = peekThroughBitcasts(Ops1.back());
49059 if ((Op00 == Op11) && (Op01 == Op10)) {
49060 std::swap(Op10, Op11);
49061 ShuffleVectorSDNode::commuteMask(ScaledMask1);
49062 }
49063 if ((Op00 == Op10) && (Op01 == Op11)) {
49064 const int Map[4] = {0, 2, 1, 3};
49065 SmallVector<int, 4> ShuffleMask(
49066 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49067 Map[ScaledMask1[1]]});
49068 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49069 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49070 DAG.getBitcast(SrcVT, Op01));
49071 Res = DAG.getBitcast(ShufVT, Res);
49072 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49073 return DAG.getBitcast(VT, Res);
49074 }
49075 }
49076 }
49077
49078 return SDValue();
49079}
49080
49081static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49082 TargetLowering::DAGCombinerInfo &DCI,
49083 const X86Subtarget &Subtarget) {
49084 unsigned Opcode = N->getOpcode();
49085 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49086        "Unexpected pack opcode");
49087
49088 EVT VT = N->getValueType(0);
49089 SDValue N0 = N->getOperand(0);
49090 SDValue N1 = N->getOperand(1);
49091 unsigned NumDstElts = VT.getVectorNumElements();
49092 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49093 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49094 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49095        N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49096        "Unexpected PACKSS/PACKUS input type");
49097
49098 bool IsSigned = (X86ISD::PACKSS == Opcode);
49099
49100 // Constant Folding.
49101 APInt UndefElts0, UndefElts1;
49102 SmallVector<APInt, 32> EltBits0, EltBits1;
49103 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49104 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49105 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
49106 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
49107 unsigned NumLanes = VT.getSizeInBits() / 128;
49108 unsigned NumSrcElts = NumDstElts / 2;
49109 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49110 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49111
49112 APInt Undefs(NumDstElts, 0);
49113 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49114 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49115 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49116 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49117 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49118 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49119
49120 if (UndefElts[SrcIdx]) {
49121 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49122 continue;
49123 }
49124
49125 APInt &Val = EltBits[SrcIdx];
49126 if (IsSigned) {
49127 // PACKSS: Truncate signed value with signed saturation.
49128 // Source values less than dst minint are saturated to minint.
49129 // Source values greater than dst maxint are saturated to maxint.
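          // e.g. packing i16 -> i8: 0x0042 stays 0x42, 0x1234 saturates to
          // 0x7F, and 0x8000 (-32768) saturates to 0x80 (-128).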
49130 if (Val.isSignedIntN(DstBitsPerElt))
49131 Val = Val.trunc(DstBitsPerElt);
49132 else if (Val.isNegative())
49133 Val = APInt::getSignedMinValue(DstBitsPerElt);
49134 else
49135 Val = APInt::getSignedMaxValue(DstBitsPerElt);
49136 } else {
49137 // PACKUS: Truncate signed value with unsigned saturation.
49138 // Source values less than zero are saturated to zero.
49139 // Source values greater than dst maxuint are saturated to maxuint.
49140 if (Val.isIntN(DstBitsPerElt))
49141 Val = Val.trunc(DstBitsPerElt);
49142 else if (Val.isNegative())
49143 Val = APInt::getZero(DstBitsPerElt);
49144 else
49145 Val = APInt::getAllOnes(DstBitsPerElt);
49146 }
49147 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49148 }
49149 }
49150
49151 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49152 }
49153
49154 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49155 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49156 return V;
49157
49158 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49159 // truncate to create a larger truncate.
49160 if (Subtarget.hasAVX512() &&
49161 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49162 N0.getOperand(0).getValueType() == MVT::v8i32) {
49163 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49164 (!IsSigned &&
49165 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49166 if (Subtarget.hasVLX())
49167 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49168
49169 // Widen input to v16i32 so we can truncate that.
49170 SDLoc dl(N);
49171 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49172 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49173 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49174 }
49175 }
49176
49177 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
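  // For example, PACKUS(zext(v8i8 X to v8i16), zext(v8i8 Y to v8i16)) with a
  // v16i8 result is just CONCAT(X, Y): the zero-extended values are already
  // in range, so the unsigned saturation is a no-op.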
49178 if (VT.is128BitVector()) {
49179 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49180 SDValue Src0, Src1;
49181 if (N0.getOpcode() == ExtOpc &&
49182 N0.getOperand(0).getValueType().is64BitVector() &&
49183 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49184 Src0 = N0.getOperand(0);
49185 }
49186 if (N1.getOpcode() == ExtOpc &&
49187 N1.getOperand(0).getValueType().is64BitVector() &&
49188 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49189 Src1 = N1.getOperand(0);
49190 }
49191 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49192 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49193 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49194 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49195 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49196 }
49197
49198 // Try again with pack(*_extend_vector_inreg, undef).
49199 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49200 : ISD::ZERO_EXTEND_VECTOR_INREG;
49201 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49202 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49203 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49204 DAG);
49205 }
49206
49207 // Attempt to combine as shuffle.
49208 SDValue Op(N, 0);
49209 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49210 return Res;
49211
49212 return SDValue();
49213}
49214
49215static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49216 TargetLowering::DAGCombinerInfo &DCI,
49217 const X86Subtarget &Subtarget) {
49218 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49219         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49220        "Unexpected horizontal add/sub opcode");
49221
49222 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49223 MVT VT = N->getSimpleValueType(0);
49224 SDValue LHS = N->getOperand(0);
49225 SDValue RHS = N->getOperand(1);
49226
49227 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
49228 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49229 LHS.getOpcode() == RHS.getOpcode() &&
49230 LHS.getValueType() == RHS.getValueType() &&
49231 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49232 SDValue LHS0 = LHS.getOperand(0);
49233 SDValue LHS1 = LHS.getOperand(1);
49234 SDValue RHS0 = RHS.getOperand(0);
49235 SDValue RHS1 = RHS.getOperand(1);
49236 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49237 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49238 SDLoc DL(N);
49239 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49240 LHS0.isUndef() ? LHS1 : LHS0,
49241 RHS0.isUndef() ? RHS1 : RHS0);
49242 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49243 Res = DAG.getBitcast(ShufVT, Res);
49244 SDValue NewLHS =
49245 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49246 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49247 SDValue NewRHS =
49248 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49249 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49250 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49251 DAG.getBitcast(VT, NewRHS));
49252 }
49253 }
49254 }
49255
49256 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49257 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49258 return V;
49259
49260 return SDValue();
49261}
49262
49263static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49264 TargetLowering::DAGCombinerInfo &DCI,
49265 const X86Subtarget &Subtarget) {
49266 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49267         X86ISD::VSRL == N->getOpcode()) &&
49268        "Unexpected shift opcode");
49269 EVT VT = N->getValueType(0);
49270 SDValue N0 = N->getOperand(0);
49271 SDValue N1 = N->getOperand(1);
49272
49273 // Shift zero -> zero.
49274 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49275 return DAG.getConstant(0, SDLoc(N), VT);
49276
49277 // Detect constant shift amounts.
49278 APInt UndefElts;
49279 SmallVector<APInt, 32> EltBits;
49280 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49281 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49282 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49283 EltBits[0].getZExtValue(), DAG);
49284 }
49285
49286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49287 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49288 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49289 return SDValue(N, 0);
49290
49291 return SDValue();
49292}
49293
49294static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49295 TargetLowering::DAGCombinerInfo &DCI,
49296 const X86Subtarget &Subtarget) {
49297 unsigned Opcode = N->getOpcode();
49298 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49299         X86ISD::VSRLI == Opcode) &&
49300        "Unexpected shift opcode");
49301 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49302 EVT VT = N->getValueType(0);
49303 SDValue N0 = N->getOperand(0);
49304 SDValue N1 = N->getOperand(1);
49305 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49306 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49307        "Unexpected value type");
49308 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49309
49310 // (shift undef, X) -> 0
49311 if (N0.isUndef())
49312 return DAG.getConstant(0, SDLoc(N), VT);
49313
49314 // Out of range logical bit shifts are guaranteed to be zero.
49315 // Out of range arithmetic bit shifts splat the sign bit.
49316 unsigned ShiftVal = N->getConstantOperandVal(1);
49317 if (ShiftVal >= NumBitsPerElt) {
49318 if (LogicalShift)
49319 return DAG.getConstant(0, SDLoc(N), VT);
49320 ShiftVal = NumBitsPerElt - 1;
49321 }
49322
49323 // (shift X, 0) -> X
49324 if (!ShiftVal)
49325 return N0;
49326
49327 // (shift 0, C) -> 0
49328 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49329 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49330 // result are all zeros, not undef.
49331 return DAG.getConstant(0, SDLoc(N), VT);
49332
49333 // (VSRAI -1, C) -> -1
49334 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49335 // N0 is all ones or undef. We guarantee that the bits shifted into the
49336 // result are all ones, not undef.
49337 return DAG.getConstant(-1, SDLoc(N), VT);
49338
49339 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49340 unsigned NewShiftVal = Amt0 + Amt1;
49341 if (NewShiftVal >= NumBitsPerElt) {
49342 // Out of range logical bit shifts are guaranteed to be zero.
49343 // Out of range arithmetic bit shifts splat the sign bit.
49344 if (LogicalShift)
49345 return DAG.getConstant(0, SDLoc(N), VT);
49346 NewShiftVal = NumBitsPerElt - 1;
49347 }
49348 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49349 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49350 };
49351
49352 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49353 if (Opcode == N0.getOpcode())
49354 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49355
49356 // (shl (add X, X), C) -> (shl X, (C + 1))
49357 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49358 N0.getOperand(0) == N0.getOperand(1))
49359 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49360
49361 // We can decode 'whole byte' logical bit shifts as shuffles.
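  // e.g. a VSRLI by 8 on v2i64 moves each 64-bit element down by one byte and
  // zeroes the top byte, which the shuffle combiner can treat as a byte
  // shuffle with a zero input.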
49362 if (LogicalShift && (ShiftVal % 8) == 0) {
49363 SDValue Op(N, 0);
49364 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49365 return Res;
49366 }
49367
49368 auto TryConstantFold = [&](SDValue V) {
49369 APInt UndefElts;
49370 SmallVector<APInt, 32> EltBits;
49371 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49372 return SDValue();
49373 assert(EltBits.size() == VT.getVectorNumElements() &&
49374        "Unexpected shift value type");
49375 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49376 // created an undef input due to no input bits being demanded, but user
49377 // still expects 0 in other bits.
49378 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49379 APInt &Elt = EltBits[i];
49380 if (UndefElts[i])
49381 Elt = 0;
49382 else if (X86ISD::VSHLI == Opcode)
49383 Elt <<= ShiftVal;
49384 else if (X86ISD::VSRAI == Opcode)
49385 Elt.ashrInPlace(ShiftVal);
49386 else
49387 Elt.lshrInPlace(ShiftVal);
49388 }
49389 // Reset undef elements since they were zeroed above.
49390 UndefElts = 0;
49391 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49392 };
49393
49394 // Constant Folding.
49395 if (N->isOnlyUserOf(N0.getNode())) {
49396 if (SDValue C = TryConstantFold(N0))
49397 return C;
49398
49399 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49400 // Don't break NOT patterns.
49401 SDValue BC = peekThroughOneUseBitcasts(N0);
49402 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49403 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49404 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49405 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49406 SDLoc DL(N);
49407 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49408 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49409 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49410 }
49411 }
49412 }
49413
49414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49415 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49416 DCI))
49417 return SDValue(N, 0);
49418
49419 return SDValue();
49420}
49421
49422static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49423 TargetLowering::DAGCombinerInfo &DCI,
49424 const X86Subtarget &Subtarget) {
49425 EVT VT = N->getValueType(0);
49426 unsigned Opcode = N->getOpcode();
49427 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49428         (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49429         Opcode == ISD::INSERT_VECTOR_ELT) &&
49430        "Unexpected vector insertion");
49431
49432 SDValue Vec = N->getOperand(0);
49433 SDValue Scl = N->getOperand(1);
49434 SDValue Idx = N->getOperand(2);
49435
49436 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49437 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49438 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49439
49440 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49441 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49443 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49444 APInt::getAllOnes(NumBitsPerElt), DCI))
49445 return SDValue(N, 0);
49446 }
49447
49448 // Attempt to combine insertion patterns to a shuffle.
49449 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49450 SDValue Op(N, 0);
49451 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49452 return Res;
49453 }
49454
49455 return SDValue();
49456}
49457
49458/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49459/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49460/// OR -> CMPNEQSS.
49461static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49462 TargetLowering::DAGCombinerInfo &DCI,
49463 const X86Subtarget &Subtarget) {
49464 unsigned opcode;
49465
49466 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49467 // we're requiring SSE2 for both.
49468 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49469 SDValue N0 = N->getOperand(0);
49470 SDValue N1 = N->getOperand(1);
49471 SDValue CMP0 = N0.getOperand(1);
49472 SDValue CMP1 = N1.getOperand(1);
49473 SDLoc DL(N);
49474
49475 // The SETCCs should both refer to the same CMP.
49476 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49477 return SDValue();
49478
49479 SDValue CMP00 = CMP0->getOperand(0);
49480 SDValue CMP01 = CMP0->getOperand(1);
49481 EVT VT = CMP00.getValueType();
49482
49483 if (VT == MVT::f32 || VT == MVT::f64 ||
49484 (VT == MVT::f16 && Subtarget.hasFP16())) {
49485 bool ExpectingFlags = false;
49486 // Check for any users that want flags:
49487 for (const SDNode *U : N->uses()) {
49488 if (ExpectingFlags)
49489 break;
49490
49491 switch (U->getOpcode()) {
49492 default:
49493 case ISD::BR_CC:
49494 case ISD::BRCOND:
49495 case ISD::SELECT:
49496 ExpectingFlags = true;
49497 break;
49498 case ISD::CopyToReg:
49499 case ISD::SIGN_EXTEND:
49500 case ISD::ZERO_EXTEND:
49501 case ISD::ANY_EXTEND:
49502 break;
49503 }
49504 }
49505
49506 if (!ExpectingFlags) {
49507 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49508 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49509
49510 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49511 X86::CondCode tmp = cc0;
49512 cc0 = cc1;
49513 cc1 = tmp;
49514 }
49515
49516 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49517 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49518 // FIXME: need symbolic constants for these magic numbers.
49519 // See X86ATTInstPrinter.cpp:printSSECC().
49520 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49521 if (Subtarget.hasAVX512()) {
49522 SDValue FSetCC =
49523 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49524 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49525 // Need to fill with zeros to ensure the bitcast will produce zeroes
49526 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49527 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49528 DAG.getConstant(0, DL, MVT::v16i1),
49529 FSetCC, DAG.getIntPtrConstant(0, DL));
49530 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49531 N->getSimpleValueType(0));
49532 }
49533 SDValue OnesOrZeroesF =
49534 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49535 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49536
49537 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49538 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49539
49540 if (is64BitFP && !Subtarget.is64Bit()) {
49541 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49542 // 64-bit integer, since that's not a legal type. Since
49543 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49544 // bits, but can do this little dance to extract the lowest 32 bits
49545 // and work with those going forward.
49546 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49547 OnesOrZeroesF);
49548 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49549 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49550 Vector32, DAG.getIntPtrConstant(0, DL));
49551 IntVT = MVT::i32;
49552 }
49553
49554 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49555 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49556 DAG.getConstant(1, DL, IntVT));
49557 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49558 ANDed);
49559 return OneBitOfTruth;
49560 }
49561 }
49562 }
49563 }
49564 return SDValue();
49565}
49566
49567/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49568static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49569 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49570
49571 MVT VT = N->getSimpleValueType(0);
49572 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49573 return SDValue();
49574
49575 SDValue X, Y;
49576 SDValue N0 = N->getOperand(0);
49577 SDValue N1 = N->getOperand(1);
49578
49579 if (SDValue Not = IsNOT(N0, DAG)) {
49580 X = Not;
49581 Y = N1;
49582 } else if (SDValue Not = IsNOT(N1, DAG)) {
49583 X = Not;
49584 Y = N0;
49585 } else
49586 return SDValue();
49587
49588 X = DAG.getBitcast(VT, X);
49589 Y = DAG.getBitcast(VT, Y);
49590 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49591}
49592
49593/// Try to fold:
49594/// and (vector_shuffle<Z,...,Z>
49595/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49596/// ->
49597/// andnp (vector_shuffle<Z,...,Z>
49598/// (insert_vector_elt undef, X, Z), undef), Y
49599static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49600 const X86Subtarget &Subtarget) {
49601 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49602
49603 EVT VT = N->getValueType(0);
49604 // Do not split 256 and 512 bit vectors with SSE2 as that would overwrite the
49605 // original value and require extra moves.
49606 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49607 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49608 return SDValue();
49609
49610 auto GetNot = [&DAG](SDValue V) {
49611 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49612 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49613 // end-users are ISD::AND including cases
49614 // (and(extract_vector_element(SVN), Y)).
49615 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49616 !SVN->getOperand(1).isUndef()) {
49617 return SDValue();
49618 }
49619 SDValue IVEN = SVN->getOperand(0);
49620 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49621 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49622 return SDValue();
49623 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49624 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49625 return SDValue();
49626 SDValue Src = IVEN.getOperand(1);
49627 if (SDValue Not = IsNOT(Src, DAG)) {
49628 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49629 SDValue NotIVEN =
49630 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49631 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49632 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49633 SVN->getOperand(1), SVN->getMask());
49634 }
49635 return SDValue();
49636 };
49637
49638 SDValue X, Y;
49639 SDValue N0 = N->getOperand(0);
49640 SDValue N1 = N->getOperand(1);
49641
49642 if (SDValue Not = GetNot(N0)) {
49643 X = Not;
49644 Y = N1;
49645 } else if (SDValue Not = GetNot(N1)) {
49646 X = Not;
49647 Y = N0;
49648 } else
49649 return SDValue();
49650
49651 X = DAG.getBitcast(VT, X);
49652 Y = DAG.getBitcast(VT, Y);
49653 SDLoc DL(N);
49654 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49655 // AVX2.
49656 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49657 SDValue LoX, HiX;
49658 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49659 SDValue LoY, HiY;
49660 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49661 EVT SplitVT = LoX.getValueType();
49662 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49663 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49664 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49665 }
49666 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49667}
49668
49669// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49670// logical operations, like in the example below.
49671// or (and (truncate x, truncate y)),
49672// (xor (truncate z, build_vector (constants)))
49673// Given a target type \p VT, we generate
49674// or (and x, y), (xor z, zext(build_vector (constants)))
49675 // given that x, y and z are of type \p VT. We can do so if each operand is
49676 // either a truncate from VT, a vector of constants, or can itself be
49677 // recursively promoted.
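// Editorial sketch (illustrative only, types chosen arbitrarily): with
// VT = v8i32 and v8i16 truncations, a pattern such as
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(constants))
// is rebuilt in the wide type as
//   or (and x, y), (xor z, zext(build_vector(constants)))
// so the logic is performed directly on the wide v8i32 values instead of on
// the truncated ones.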
49678static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49679 unsigned Depth) {
49680 // Limit recursion to avoid excessive compile times.
49681 if (Depth >= SelectionDAG::MaxRecursionDepth)
49682 return SDValue();
49683
49684 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49685 N->getOpcode() != ISD::OR)
49686 return SDValue();
49687
49688 SDValue N0 = N->getOperand(0);
49689 SDValue N1 = N->getOperand(1);
49690 SDLoc DL(N);
49691
49692 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49693 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49694 return SDValue();
49695
49696 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49697 N0 = NN0;
49698 else {
49699 // The left side has to be a 'trunc'.
49700 if (N0.getOpcode() != ISD::TRUNCATE)
49701 return SDValue();
49702
49703 // The type of the truncated inputs.
49704 if (N0.getOperand(0).getValueType() != VT)
49705 return SDValue();
49706
49707 N0 = N0.getOperand(0);
49708 }
49709
49710 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49711 N1 = NN1;
49712 else {
49713 // The right side has to be a 'trunc' or a constant vector.
49714 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49715 N1.getOperand(0).getValueType() == VT;
49716 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49717 return SDValue();
49718
49719 if (RHSTrunc)
49720 N1 = N1.getOperand(0);
49721 else
49722 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49723 }
49724
49725 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49726}
49727
49728 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
49729// register. In most cases we actually compare or select YMM-sized registers
49730// and mixing the two types creates horrible code. This method optimizes
49731// some of the transition sequences.
49732// Even with AVX-512 this is still useful for removing casts around logical
49733// operations on vXi1 mask types.
49734static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49735 const X86Subtarget &Subtarget) {
49736 EVT VT = N->getValueType(0);
49737 assert(VT.isVector() && "Expected vector type");
49738
49739 SDLoc DL(N);
49740 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49741         N->getOpcode() == ISD::ZERO_EXTEND ||
49742         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49743
49744 SDValue Narrow = N->getOperand(0);
49745 EVT NarrowVT = Narrow.getValueType();
49746
49747 // Generate the wide operation.
49748 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49749 if (!Op)
49750 return SDValue();
49751 switch (N->getOpcode()) {
49752 default: llvm_unreachable("Unexpected opcode");
49753 case ISD::ANY_EXTEND:
49754 return Op;
49755 case ISD::ZERO_EXTEND:
49756 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49757 case ISD::SIGN_EXTEND:
49758 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49759 Op, DAG.getValueType(NarrowVT));
49760 }
49761}
49762
49763static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49764 unsigned FPOpcode;
49765 switch (Opcode) {
49766 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49767 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49768 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49769 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49770 }
49771 return FPOpcode;
49772}
49773
49774/// If both input operands of a logic op are being cast from floating-point
49775/// types or FP compares, try to convert this into a floating-point logic node
49776/// to avoid unnecessary moves from SSE to integer registers.
49777static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49778 TargetLowering::DAGCombinerInfo &DCI,
49779 const X86Subtarget &Subtarget) {
49780 EVT VT = N->getValueType(0);
49781 SDValue N0 = N->getOperand(0);
49782 SDValue N1 = N->getOperand(1);
49783 SDLoc DL(N);
49784
49785 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49786 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49787 return SDValue();
49788
49789 SDValue N00 = N0.getOperand(0);
49790 SDValue N10 = N1.getOperand(0);
49791 EVT N00Type = N00.getValueType();
49792 EVT N10Type = N10.getValueType();
49793
49794 // Ensure that both types are the same and are legal scalar fp types.
49795 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49796 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49797 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49798 return SDValue();
49799
49800 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49801 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49802 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49803 return DAG.getBitcast(VT, FPLogic);
49804 }
49805
49806 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49807 !N1.hasOneUse())
49808 return SDValue();
49809
49810 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49811 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49812
49813 // The vector ISA for FP predicates is incomplete before AVX, so converting
49814 // COMIS* to CMPS* may not be a win before AVX.
49815 if (!Subtarget.hasAVX() &&
49816 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49817 return SDValue();
49818
49819 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49820 // and vector logic:
49821 // logic (setcc N00, N01), (setcc N10, N11) -->
49822 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
49823 unsigned NumElts = 128 / N00Type.getSizeInBits();
49824 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49825 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49826 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49827 SDValue N01 = N0.getOperand(1);
49828 SDValue N11 = N1.getOperand(1);
49829 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49830 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49831 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49832 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49833 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49834 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49835 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49837}
49838
49839// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49840// to reduce XMM->GPR traffic.
49841static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49842 unsigned Opc = N->getOpcode();
49843 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49844        "Unexpected bit opcode");
49845
49846 SDValue N0 = N->getOperand(0);
49847 SDValue N1 = N->getOperand(1);
49848
49849 // Both operands must be single use MOVMSK.
49850 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49851 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49852 return SDValue();
49853
49854 SDValue Vec0 = N0.getOperand(0);
49855 SDValue Vec1 = N1.getOperand(0);
49856 EVT VecVT0 = Vec0.getValueType();
49857 EVT VecVT1 = Vec1.getValueType();
49858
49859 // Both MOVMSK operands must be from vectors of the same size and same element
49860 // size, but it's OK for them to differ in fp/int type.
49861 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49862 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49863 return SDValue();
49864
49865 SDLoc DL(N);
49866 unsigned VecOpc =
49867 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49868 SDValue Result =
49869 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49870 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49871}
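// Editorial illustration (not from the original source): at the intrinsic
// level this is the rewrite
//   _mm_movemask_ps(a) & _mm_movemask_ps(b)
//     -> _mm_movemask_ps(_mm_and_ps(a, b))
// which needs a single MOVMSKPS (one XMM->GPR transfer) instead of two.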
49872
49873// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49874// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49875// handles in InstCombine.
49876static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49877 unsigned Opc = N->getOpcode();
49878 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49879        "Unexpected bit opcode");
49880
49881 SDValue N0 = N->getOperand(0);
49882 SDValue N1 = N->getOperand(1);
49883 EVT VT = N->getValueType(0);
49884
49885 // Both operands must be single use.
49886 if (!N0.hasOneUse() || !N1.hasOneUse())
49887 return SDValue();
49888
49889 // Search for matching shifts.
49890 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49891 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49892
49893 unsigned BCOpc = BC0.getOpcode();
49894 EVT BCVT = BC0.getValueType();
49895 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49896 return SDValue();
49897
49898 switch (BCOpc) {
49899 case X86ISD::VSHLI:
49900 case X86ISD::VSRLI:
49901 case X86ISD::VSRAI: {
49902 if (BC0.getOperand(1) != BC1.getOperand(1))
49903 return SDValue();
49904
49905 SDLoc DL(N);
49906 SDValue BitOp =
49907 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49908 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49909 return DAG.getBitcast(VT, Shift);
49910 }
49911 }
49912
49913 return SDValue();
49914}
49915
49916 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49917 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49918 /// with a shift-right to eliminate loading the vector constant mask value.
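/// Editorial sketch (illustrative only): if every element of %c is known to be
/// all-ones or zero (e.g. a vXi32 comparison result), then
///   and %c, <i32 1, i32 1, i32 1, i32 1>    ; requires loading a constant mask
/// can instead be emitted as
///   vsrli %c, 31                            ; logical shift-right, no constant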
49919static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49920 const X86Subtarget &Subtarget) {
49921 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49922 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49923 EVT VT = Op0.getValueType();
49924 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49925 return SDValue();
49926
49927 // Try to convert an "is positive" signbit masking operation into arithmetic
49928 // shift and "andn". This saves a materialization of a -1 vector constant.
49929 // The "is negative" variant should be handled more generally because it only
49930 // requires "and" rather than "andn":
49931 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49932 //
49933 // This is limited to the original type to avoid producing even more bitcasts.
49934 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49935 // will be profitable.
49936 if (N->getValueType(0) == VT &&
49937 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
49938 SDValue X, Y;
49939 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49940 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49941 X = Op1.getOperand(0);
49942 Y = Op0;
49943 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49944 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49945 X = Op0.getOperand(0);
49946 Y = Op1;
49947 }
49948 if (X && Y) {
49949 SDLoc DL(N);
49950 SDValue Sra =
49951 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49952 VT.getScalarSizeInBits() - 1, DAG);
49953 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49954 }
49955 }
49956
49957 APInt SplatVal;
49958 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49959 !SplatVal.isMask())
49960 return SDValue();
49961
49962 // Don't prevent creation of ANDN.
49963 if (isBitwiseNot(Op0))
49964 return SDValue();
49965
49966 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
49967 return SDValue();
49968
49969 unsigned EltBitWidth = VT.getScalarSizeInBits();
49970 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49971 return SDValue();
49972
49973 SDLoc DL(N);
49974 unsigned ShiftVal = SplatVal.countr_one();
49975 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49976 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49977 return DAG.getBitcast(N->getValueType(0), Shift);
49978}
49979
49980// Get the index node from the lowered DAG of a GEP IR instruction with one
49981// indexing dimension.
49982static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49983 if (Ld->isIndexed())
49984 return SDValue();
49985
49986 SDValue Base = Ld->getBasePtr();
49987
49988 if (Base.getOpcode() != ISD::ADD)
49989 return SDValue();
49990
49991 SDValue ShiftedIndex = Base.getOperand(0);
49992
49993 if (ShiftedIndex.getOpcode() != ISD::SHL)
49994 return SDValue();
49995
49996 return ShiftedIndex.getOperand(0);
49997
49998}
49999
50000static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50001 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
50002 switch (VT.getSizeInBits()) {
50003 default: return false;
50004 case 64: return Subtarget.is64Bit();
50005 case 32: return true;
50006 }
50007 }
50008 return false;
50009}
50010
50011 // This function recognizes cases where the X86 BZHI instruction can replace an
50012 // 'and-load' sequence.
50013 // When an integer value is loaded from an array of constants defined
50014 // as follows:
50015 //
50016 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50017 //
50018 // and the result is then bitwise-ANDed with another input, this is
50019 // equivalent to performing BZHI (zero high bits) on that input, with the
50020 // same index as the load.
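// Editorial sketch in plain C (function and table names are hypothetical,
// not from the original source); with -mbmi2 both forms reduce to one BZHI:
//
//   static const unsigned MaskTable[32] = {0x0, 0x1, 0x3, 0x7 /* , ... */};
//   unsigned zeroHighBits(unsigned Val, unsigned Idx) {
//     return Val & MaskTable[Idx];          // the 'and-load' form matched here
//     // equivalent: _bzhi_u32(Val, Idx)    // <immintrin.h>, single BZHI
//   }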
50021static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50022 const X86Subtarget &Subtarget) {
50023 MVT VT = Node->getSimpleValueType(0);
50024 SDLoc dl(Node);
50025
50026 // Check if subtarget has BZHI instruction for the node's type
50027 if (!hasBZHI(Subtarget, VT))
50028 return SDValue();
50029
50030 // Try matching the pattern for both operands.
50031 for (unsigned i = 0; i < 2; i++) {
50032 SDValue N = Node->getOperand(i);
50033 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
50034
50035 // Bail out if the operand is not a load instruction.
50036 if (!Ld)
50037 return SDValue();
50038
50039 const Value *MemOp = Ld->getMemOperand()->getValue();
50040
50041 if (!MemOp)
50042 return SDValue();
50043
50044 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50045 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50046 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50047
50048 Constant *Init = GV->getInitializer();
50049 Type *Ty = Init->getType();
50050 if (!isa<ConstantDataArray>(Init) ||
50051 !Ty->getArrayElementType()->isIntegerTy() ||
50052 Ty->getArrayElementType()->getScalarSizeInBits() !=
50053 VT.getSizeInBits() ||
50054 Ty->getArrayNumElements() >
50055 Ty->getArrayElementType()->getScalarSizeInBits())
50056 continue;
50057
50058 // Check if the array's constant elements are suitable for our case.
50059 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50060 bool ConstantsMatch = true;
50061 for (uint64_t j = 0; j < ArrayElementCount; j++) {
50062 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50063 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50064 ConstantsMatch = false;
50065 break;
50066 }
50067 }
50068 if (!ConstantsMatch)
50069 continue;
50070
50071 // Do the transformation (for a 32-bit type):
50072 //    (and (load arr[idx]), inp)
50073 // -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
50074 // which will then be selected as a single BZHI instruction.
50075 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
50076 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50077
50078 // Get the Node which indexes into the array.
50079 SDValue Index = getIndexFromUnindexedLoad(Ld);
50080 if (!Index)
50081 return SDValue();
50082 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50083
50084 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50085 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50086
50087 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50088 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50089
50090 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50091 }
50092 }
50093 }
50094 }
50095 return SDValue();
50096}
50097
50098 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
50099 // where C is a mask containing the same number of bits as the setcc and
50100 // where the setcc will freely zero the upper bits of the k-register. We can
50101 // replace the undef in the concat with 0s and remove the AND. This mainly
50102 // helps with v2i1/v4i1 setcc being cast to scalar.
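// Editorial sketch (illustrative only): with AVX-512VL, comparing two v4i32
// values yields a v4i1 mask. If that mask is widened as
//   (and (bitcast (concat_vectors (v4i1 setcc), undef) to i8), 0xF)
// then only the setcc bits survive the AND, so the undef half of the concat
// can be replaced with zeroes and the AND removed entirely.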
50103static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50104 const X86Subtarget &Subtarget) {
50105 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50106
50107 EVT VT = N->getValueType(0);
50108
50109 // Make sure this is an AND with a constant. We will check the value of the
50110 // constant later.
50111 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50112 if (!C1)
50113 return SDValue();
50114
50115 // This is implied by the ConstantSDNode.
50116 assert(!VT.isVector() && "Expected scalar VT!");
50117
50118 SDValue Src = N->getOperand(0);
50119 if (!Src.hasOneUse())
50120 return SDValue();
50121
50122 // (Optionally) peek through any_extend().
50123 if (Src.getOpcode() == ISD::ANY_EXTEND) {
50124 if (!Src.getOperand(0).hasOneUse())
50125 return SDValue();
50126 Src = Src.getOperand(0);
50127 }
50128
50129 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50130 return SDValue();
50131
50132 Src = Src.getOperand(0);
50133 EVT SrcVT = Src.getValueType();
50134
50135 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50136 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50137 !TLI.isTypeLegal(SrcVT))
50138 return SDValue();
50139
50140 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50141 return SDValue();
50142
50143 // We only care about the first subvector of the concat; we expect the
50144 // other subvectors to be ignored due to the AND if we make the change.
50145 SDValue SubVec = Src.getOperand(0);
50146 EVT SubVecVT = SubVec.getValueType();
50147
50148 // The RHS of the AND should be a mask with as many bits as SubVec.
50149 if (!TLI.isTypeLegal(SubVecVT) ||
50150 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50151 return SDValue();
50152
50153 // The first subvector should be a setcc with a legal result type or an
50154 // AND containing at least one setcc with a legal result type.
50155 auto IsLegalSetCC = [&](SDValue V) {
50156 if (V.getOpcode() != ISD::SETCC)
50157 return false;
50158 EVT SetccVT = V.getOperand(0).getValueType();
50159 if (!TLI.isTypeLegal(SetccVT) ||
50160 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50161 return false;
50162 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50163 return false;
50164 return true;
50165 };
50166 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50167 (IsLegalSetCC(SubVec.getOperand(0)) ||
50168 IsLegalSetCC(SubVec.getOperand(1))))))
50169 return SDValue();
50170
50171 // We passed all the checks. Rebuild the concat_vectors with zeroes
50172 // and cast it back to VT.
50173 SDLoc dl(N);
50174 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50175 DAG.getConstant(0, dl, SubVecVT));
50176 Ops[0] = SubVec;
50177 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50178 Ops);
50179 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50180 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50181}
50182
50183static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50184 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50185 // We don't want to go crazy with the recursion here. This isn't a super
50186 // important optimization.
50187 static constexpr unsigned kMaxDepth = 2;
50188
50189 // Only do this re-ordering if op has one use.
50190 if (!Op.hasOneUse())
50191 return SDValue();
50192
50193 SDLoc DL(Op);
50194 // If we hit another associative op, recurse further.
50195 if (Op.getOpcode() == Opc) {
50196 // Done recursing.
50197 if (Depth++ >= kMaxDepth)
50198 return SDValue();
50199
50200 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50201 if (SDValue R =
50202 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50203 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50204 Op.getOperand(1 - OpIdx));
50205
50206 } else if (Op.getOpcode() == ISD::SUB) {
50207 if (Opc == ISD::AND) {
50208 // BLSI: (and x, (sub 0, x))
50209 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50210 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50211 }
50212 // Opc must be ISD::AND or ISD::XOR
50213 // BLSR: (and x, (sub x, 1))
50214 // BLSMSK: (xor x, (sub x, 1))
50215 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50216 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50217
50218 } else if (Op.getOpcode() == ISD::ADD) {
50219 // Opc must be ISD::AND or ISD::XOR
50220 // BLSR: (and x, (add x, -1))
50221 // BLSMSK: (xor x, (add x, -1))
50222 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50223 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50224 }
50225 return SDValue();
50226}
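// Editorial illustration of the BMI idioms matched above (plain C, names are
// hypothetical): with -mbmi each function lowers to a single instruction.
//
//   unsigned blsr(unsigned x)   { return x & (x - 1); }  // clear lowest set bit
//   unsigned blsmsk(unsigned x) { return x ^ (x - 1); }  // mask through lowest set bit
//   unsigned blsi(unsigned x)   { return x & (0u - x); } // isolate lowest set bit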
50227
50228static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50229 const X86Subtarget &Subtarget) {
50230 EVT VT = N->getValueType(0);
50231 // Make sure this node is a candidate for BMI instructions.
50232 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50233 (VT != MVT::i32 && VT != MVT::i64))
50234 return SDValue();
50235
50236 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50237
50238 // Try to match LHS and RHS.
50239 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50240 if (SDValue OpMatch =
50241 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50242 N->getOperand(1 - OpIdx), 0))
50243 return OpMatch;
50244 return SDValue();
50245}
50246
50247static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
50248 TargetLowering::DAGCombinerInfo &DCI,
50249 const X86Subtarget &Subtarget) {
50250 SDValue N0 = N->getOperand(0);
50251 SDValue N1 = N->getOperand(1);
50252 EVT VT = N->getValueType(0);
50253 SDLoc dl(N);
50254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50255
50256 // If this is SSE1-only, convert to FAND to avoid scalarization.
50257 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50258 return DAG.getBitcast(MVT::v4i32,
50259 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50260 DAG.getBitcast(MVT::v4f32, N0),
50261 DAG.getBitcast(MVT::v4f32, N1)));
50262 }
50263
50264 // Use a 32-bit and+zext if upper bits known zero.
50265 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50266 APInt HiMask = APInt::getHighBitsSet(64, 32);
50267 if (DAG.MaskedValueIsZero(N1, HiMask) ||
50268 DAG.MaskedValueIsZero(N0, HiMask)) {
50269 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50270 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50271 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50272 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50273 }
50274 }
50275
50276 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50277 // TODO: Support multiple SrcOps.
50278 if (VT == MVT::i1) {
50279 SmallVector<SDValue, 2> SrcOps;
50280 SmallVector<APInt, 2> SrcPartials;
50281 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50282 SrcOps.size() == 1) {
50283 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50284 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50285 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50286 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50287 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50288 if (Mask) {
50289 assert(SrcPartials[0].getBitWidth() == NumElts &&
50290        "Unexpected partial reduction mask");
50291 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50292 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50293 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50294 }
50295 }
50296 }
50297
50298 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50299 return V;
50300
50301 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50302 return R;
50303
50304 if (SDValue R = combineBitOpWithShift(N, DAG))
50305 return R;
50306
50307 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50308 return FPLogic;
50309
50310 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50311 return R;
50312
50313 if (DCI.isBeforeLegalizeOps())
50314 return SDValue();
50315
50316 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50317 return R;
50318
50319 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50320 return R;
50321
50322 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50323 return ShiftRight;
50324
50325 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50326 return R;
50327
50328 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50329 // iff c2 is an all-bits/no-bits mask, i.e. a select-with-zero mask.
50330 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50331 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50332 unsigned Opc0 = N0.getOpcode();
50333 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50334 getTargetConstantFromNode(N0.getOperand(1)) &&
50335 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50336 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50337 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50338 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50339 }
50340 }
50341
50342 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
50343 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
50344 if (isOneConstant(N1) && N0->hasOneUse()) {
50345 SDValue Src = N0;
50346 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50347 Src.getOpcode() == ISD::TRUNCATE) &&
50348 Src.getOperand(0)->hasOneUse())
50349 Src = Src.getOperand(0);
50350 bool ContainsNOT = false;
50351 X86::CondCode X86CC = X86::COND_B;
50352 // Peek through AND(NOT(SRL(X,Y)),1).
50353 if (isBitwiseNot(Src)) {
50354 Src = Src.getOperand(0);
50355 X86CC = X86::COND_AE;
50356 ContainsNOT = true;
50357 }
50358 if (Src.getOpcode() == ISD::SRL &&
50359 !isa<ConstantSDNode>(Src.getOperand(1))) {
50360 SDValue BitNo = Src.getOperand(1);
50361 Src = Src.getOperand(0);
50362 // Peek through AND(SRL(NOT(X),Y),1).
50363 if (isBitwiseNot(Src)) {
50364 Src = Src.getOperand(0);
50365 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50366 ContainsNOT = true;
50367 }
50368 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50369 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50370 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50371 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50372 }
50373 }
50374
50375 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50376 // Attempt to recursively combine a bitmask AND with shuffles.
50377 SDValue Op(N, 0);
50378 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50379 return Res;
50380
50381 // If either operand is a constant mask, then only the elements that aren't
50382 // zero are actually demanded by the other operand.
50383 auto GetDemandedMasks = [&](SDValue Op) {
50384 APInt UndefElts;
50385 SmallVector<APInt> EltBits;
50386 int NumElts = VT.getVectorNumElements();
50387 int EltSizeInBits = VT.getScalarSizeInBits();
50388 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50389 APInt DemandedElts = APInt::getAllOnes(NumElts);
50390 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50391 EltBits)) {
50392 DemandedBits.clearAllBits();
50393 DemandedElts.clearAllBits();
50394 for (int I = 0; I != NumElts; ++I) {
50395 if (UndefElts[I]) {
50396 // We can't assume an undef src element gives an undef dst - the
50397 // other src might be zero.
50398 DemandedBits.setAllBits();
50399 DemandedElts.setBit(I);
50400 } else if (!EltBits[I].isZero()) {
50401 DemandedBits |= EltBits[I];
50402 DemandedElts.setBit(I);
50403 }
50404 }
50405 }
50406 return std::make_pair(DemandedBits, DemandedElts);
50407 };
50408 APInt Bits0, Elts0;
50409 APInt Bits1, Elts1;
50410 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50411 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50412
50413 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50414 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50415 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50416 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50417 if (N->getOpcode() != ISD::DELETED_NODE)
50418 DCI.AddToWorklist(N);
50419 return SDValue(N, 0);
50420 }
50421
50422 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50423 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50424 if (NewN0 || NewN1)
50425 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50426 NewN1 ? NewN1 : N1);
50427 }
50428
50429 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50430 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50431 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50432 isa<ConstantSDNode>(N0.getOperand(1))) {
50433 SDValue BitMask = N1;
50434 SDValue SrcVec = N0.getOperand(0);
50435 EVT SrcVecVT = SrcVec.getValueType();
50436
50437 // Check that the constant bitmask masks whole bytes.
50438 APInt UndefElts;
50439 SmallVector<APInt, 64> EltBits;
50440 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50441 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50442 llvm::all_of(EltBits, [](const APInt &M) {
50443 return M.isZero() || M.isAllOnes();
50444 })) {
50445 unsigned NumElts = SrcVecVT.getVectorNumElements();
50446 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50447 unsigned Idx = N0.getConstantOperandVal(1);
50448
50449 // Create a root shuffle mask from the byte mask and the extracted index.
50450 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50451 for (unsigned i = 0; i != Scale; ++i) {
50452 if (UndefElts[i])
50453 continue;
50454 int VecIdx = Scale * Idx + i;
50455 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50456 }
50457
50458 if (SDValue Shuffle = combineX86ShufflesRecursively(
50459 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50460 X86::MaxShuffleCombineDepth,
50461 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50462 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50463 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50464 N0.getOperand(1));
50465 }
50466 }
50467
50468 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50469 return R;
50470
50471 return SDValue();
50472}
50473
50474// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50475static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50476 const X86Subtarget &Subtarget) {
50477 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50478
50479 MVT VT = N->getSimpleValueType(0);
50480 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50481 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50482 return SDValue();
50483
50484 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50485 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50486 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50487 return SDValue();
50488
50489 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50490 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50491 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50492 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50493 return SDValue();
50494
50495 // Attempt to extract constant byte masks.
50496 APInt UndefElts0, UndefElts1;
50497 SmallVector<APInt, 32> EltBits0, EltBits1;
50498 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50499 false, false))
50500 return SDValue();
50501 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50502 false, false))
50503 return SDValue();
50504
50505 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50506 // TODO - add UNDEF elts support.
50507 if (UndefElts0[i] || UndefElts1[i])
50508 return SDValue();
50509 if (EltBits0[i] != ~EltBits1[i])
50510 return SDValue();
50511 }
50512
50513 SDLoc DL(N);
50514
50515 if (useVPTERNLOG(Subtarget, VT)) {
50516 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50518 // VPTERNLOG is only available for vXi32/vXi64 types.
50518 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50519 MVT OpVT =
50520 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50521 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50522 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50523 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50524 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50525 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50526 DAG, Subtarget);
50527 return DAG.getBitcast(VT, Res);
50528 }
50529
50530 SDValue X = N->getOperand(0);
50531 SDValue Y =
50532 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50533 DAG.getBitcast(VT, N1.getOperand(0)));
50534 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50535}
50536
50537// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50538static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50539 if (N->getOpcode() != ISD::OR)
50540 return false;
50541
50542 SDValue N0 = N->getOperand(0);
50543 SDValue N1 = N->getOperand(1);
50544
50545 // Canonicalize AND to LHS.
50546 if (N1.getOpcode() == ISD::AND)
50547 std::swap(N0, N1);
50548
50549 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50550 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50551 return false;
50552
50553 Mask = N1.getOperand(0);
50554 X = N1.getOperand(1);
50555
50556 // Check to see if the mask appeared in both the AND and ANDNP.
50557 if (N0.getOperand(0) == Mask)
50558 Y = N0.getOperand(1);
50559 else if (N0.getOperand(1) == Mask)
50560 Y = N0.getOperand(0);
50561 else
50562 return false;
50563
50564 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50565 // ANDNP combine allows other combines to happen that prevent matching.
50566 return true;
50567}
50568
50569// Try to fold:
50570// (or (and (m, y), (pandn m, x)))
50571// into:
50572 // (vselect m, y, x)
50573// As a special case, try to fold:
50574// (or (and (m, (sub 0, x)), (pandn m, x)))
50575// into:
50576// (sub (xor X, M), M)
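// Editorial sketch (illustrative only): if every byte of m is 0x00 or 0xFF,
//   (and m, y) | (pandn m, x)       ; picks y where m is set, x elsewhere
// is exactly a byte-wise blend, so on SSE4.1 it can be emitted as a single
//   pblendvb                        ; i.e. vselect m, y, x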
50577static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50578 const X86Subtarget &Subtarget) {
50579 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50580
50581 EVT VT = N->getValueType(0);
50582 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50583 (VT.is256BitVector() && Subtarget.hasInt256())))
50584 return SDValue();
50585
50586 SDValue X, Y, Mask;
50587 if (!matchLogicBlend(N, X, Y, Mask))
50588 return SDValue();
50589
50590 // Validate that X, Y, and Mask are bitcasts, and see through them.
50591 Mask = peekThroughBitcasts(Mask);
50592 X = peekThroughBitcasts(X);
50593 Y = peekThroughBitcasts(Y);
50594
50595 EVT MaskVT = Mask.getValueType();
50596 unsigned EltBits = MaskVT.getScalarSizeInBits();
50597
50598 // TODO: Attempt to handle floating point cases as well?
50599 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50600 return SDValue();
50601
50602 SDLoc DL(N);
50603
50604 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50605 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50606 DAG, Subtarget))
50607 return Res;
50608
50609 // PBLENDVB is only available on SSE 4.1.
50610 if (!Subtarget.hasSSE41())
50611 return SDValue();
50612
50613 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50614 if (Subtarget.hasVLX())
50615 return SDValue();
50616
50617 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50618
50619 X = DAG.getBitcast(BlendVT, X);
50620 Y = DAG.getBitcast(BlendVT, Y);
50621 Mask = DAG.getBitcast(BlendVT, Mask);
50622 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50623 return DAG.getBitcast(VT, Mask);
50624}
50625
50626// Helper function for combineOrCmpEqZeroToCtlzSrl
50627// Transforms:
50628// seteq(cmp x, 0)
50629// into:
50630// srl(ctlz x), log2(bitsize(x))
50631 // The input pattern is checked by the caller.
50632static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50633 SDValue Cmp = Op.getOperand(1);
50634 EVT VT = Cmp.getOperand(0).getValueType();
50635 unsigned Log2b = Log2_32(VT.getSizeInBits());
50636 SDLoc dl(Op);
50637 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50638 // The result of the shift is 0 or 1, and on X86 the 32-bit
50639 // encodings of shr and lzcnt are more desirable.
50640 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50641 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50642 DAG.getConstant(Log2b, dl, MVT::i8));
50643 return Scc;
50644}
50645
50646// Try to transform:
50647// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50648// into:
50649 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50650 // Will also attempt to match more generic cases, e.g.:
50651// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50652// Only applies if the target supports the FastLZCNT feature.
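// Editorial sketch in plain C (illustrative only; requires LZCNT so that
// lzcnt(0) == 32 for a 32-bit input):
//   int anyZero(unsigned x, unsigned y) { return (x == 0) | (y == 0); }
// can be lowered as
//   (lzcnt(x) | lzcnt(y)) >> 5
// because lzcnt produces 32 (bit 5 set) exactly when its input is zero.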
50653static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50654 TargetLowering::DAGCombinerInfo &DCI,
50655 const X86Subtarget &Subtarget) {
50656 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50657 return SDValue();
50658
50659 auto isORCandidate = [](SDValue N) {
50660 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50661 };
50662
50663 // Check that the zero extend is extending to 32 bits or more. The code
50664 // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
50665 // require extra instructions to clear the upper bits.
50666 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50667 !isORCandidate(N->getOperand(0)))
50668 return SDValue();
50669
50670 // Check the node matches: setcc(eq, cmp 0)
50671 auto isSetCCCandidate = [](SDValue N) {
50672 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50673 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50674 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50675 isNullConstant(N->getOperand(1).getOperand(1)) &&
50676 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50677 };
50678
50679 SDNode *OR = N->getOperand(0).getNode();
50680 SDValue LHS = OR->getOperand(0);
50681 SDValue RHS = OR->getOperand(1);
50682
50683 // Save nodes matching or(or, setcc(eq, cmp 0)).
50684 SmallVector<SDNode *, 2> ORNodes;
50685 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50686 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50687 ORNodes.push_back(OR);
50688 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50689 LHS = OR->getOperand(0);
50690 RHS = OR->getOperand(1);
50691 }
50692
50693 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50694 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50695 !isORCandidate(SDValue(OR, 0)))
50696 return SDValue();
50697
50698 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50699 // to
50700 // or(srl(ctlz),srl(ctlz)).
50701 // The dag combiner can then fold it into:
50702 // srl(or(ctlz, ctlz)).
50703 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50704 SDValue Ret, NewRHS;
50705 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50706 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50707
50708 if (!Ret)
50709 return SDValue();
50710
50711 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50712 while (!ORNodes.empty()) {
50713 OR = ORNodes.pop_back_val();
50714 LHS = OR->getOperand(0);
50715 RHS = OR->getOperand(1);
50716 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50717 if (RHS->getOpcode() == ISD::OR)
50718 std::swap(LHS, RHS);
50719 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50720 if (!NewRHS)
50721 return SDValue();
50722 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50723 }
50724
50725 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50726}
50727
50728static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50729 SDValue And1_L, SDValue And1_R,
50730 const SDLoc &DL, SelectionDAG &DAG) {
50731 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50732 return SDValue();
50733 SDValue NotOp = And0_L->getOperand(0);
50734 if (NotOp == And1_R)
50735 std::swap(And1_R, And1_L);
50736 if (NotOp != And1_L)
50737 return SDValue();
50738
50739 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50740 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50741 EVT VT = And1_L->getValueType(0);
50742 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50743 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50744 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50745 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50746 return Xor1;
50747}
50748
50749 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50750 /// equivalent `((x ^ y) & m) ^ y` pattern.
50751/// This is typically a better representation for targets without a fused
50752/// "and-not" operation. This function is intended to be called from a
50753/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
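/// Editorial sketch in plain C (illustrative only):
///   unsigned merge(unsigned m, unsigned x, unsigned y) {
///     return (m & x) | (~m & y);     // masked merge, needs an and-not
///   }
/// is rewritten here as
///     return ((x ^ y) & m) ^ y;      // same result, no and-not required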
50754static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50755 // Note that masked-merge variants using XOR or ADD expressions are
50756 // normalized to OR by InstCombine so we only check for OR.
50757 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50758 SDValue N0 = Node->getOperand(0);
50759 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50760 return SDValue();
50761 SDValue N1 = Node->getOperand(1);
50762 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50763 return SDValue();
50764
50765 SDLoc DL(Node);
50766 SDValue N00 = N0->getOperand(0);
50767 SDValue N01 = N0->getOperand(1);
50768 SDValue N10 = N1->getOperand(0);
50769 SDValue N11 = N1->getOperand(1);
50770 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50771 return Result;
50772 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50773 return Result;
50774 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50775 return Result;
50776 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50777 return Result;
50778 return SDValue();
50779}
50780
50781/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50782/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50783/// with CMP+{ADC, SBB}.
50784 /// Also try the (ADD/SUB)+(AND(SRL,1)) bit-extraction pattern with BT+{ADC, SBB}.
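/// Editorial sketch (illustrative only): in C,
///   unsigned long long f(unsigned long long x, unsigned a, unsigned b) {
///     return x + (a < b);            // carry-in add
///   }
/// can be compiled to "cmp a, b; adc x, 0" instead of setb + movzx + add.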
50785static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50786 SDValue X, SDValue Y,
50787 SelectionDAG &DAG,
50788 bool ZeroSecondOpOnly = false) {
50789 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50790 return SDValue();
50791
50792 // Look through a one-use zext.
50793 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50794 Y = Y.getOperand(0);
50795
50796 X86::CondCode CC;
50797 SDValue EFLAGS;
50798 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50799 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50800 EFLAGS = Y.getOperand(1);
50801 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50802 Y.hasOneUse()) {
50803 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50804 }
50805
50806 if (!EFLAGS)
50807 return SDValue();
50808
50809 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50810 // the general case below.
50811 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50812 if (ConstantX && !ZeroSecondOpOnly) {
50813 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50814 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50815 // This is a complicated way to get -1 or 0 from the carry flag:
50816 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50817 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50818 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50819 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50820 EFLAGS);
50821 }
50822
50823 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50824 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50825 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50826 EFLAGS.getValueType().isInteger() &&
50827 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50828 // Swap the operands of a SUB, and we have the same pattern as above.
50829 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50830 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50831 SDValue NewSub = DAG.getNode(
50832 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50833 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50834 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50835 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50836 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50837 NewEFLAGS);
50838 }
50839 }
50840 }
50841
50842 if (CC == X86::COND_B) {
50843 // X + SETB Z --> adc X, 0
50844 // X - SETB Z --> sbb X, 0
50845 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50846 DAG.getVTList(VT, MVT::i32), X,
50847 DAG.getConstant(0, DL, VT), EFLAGS);
50848 }
50849
50850 if (ZeroSecondOpOnly)
50851 return SDValue();
50852
50853 if (CC == X86::COND_A) {
50854 // Try to convert COND_A into COND_B in an attempt to facilitate
50855 // materializing "setb reg".
50856 //
50857 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50858 // cannot take an immediate as its first operand.
50859 //
50860 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50861 EFLAGS.getValueType().isInteger() &&
50862 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50863 SDValue NewSub =
50864 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50865 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50866 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50867 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50868 DAG.getVTList(VT, MVT::i32), X,
50869 DAG.getConstant(0, DL, VT), NewEFLAGS);
50870 }
50871 }
50872
50873 if (CC == X86::COND_AE) {
50874 // X + SETAE --> sbb X, -1
50875 // X - SETAE --> adc X, -1
50876 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50877 DAG.getVTList(VT, MVT::i32), X,
50878 DAG.getConstant(-1, DL, VT), EFLAGS);
50879 }
50880
50881 if (CC == X86::COND_BE) {
50882 // X + SETBE --> sbb X, -1
50883 // X - SETBE --> adc X, -1
50884 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50885 // materializing "setae reg".
50886 //
50887 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50888 // cannot take an immediate as its first operand.
50889 //
50890 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50891 EFLAGS.getValueType().isInteger() &&
50892 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50893 SDValue NewSub =
50894 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50895 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50896 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50897 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50898 DAG.getVTList(VT, MVT::i32), X,
50899 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50900 }
50901 }
50902
50903 if (CC != X86::COND_E && CC != X86::COND_NE)
50904 return SDValue();
50905
50906 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50907 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50908 !EFLAGS.getOperand(0).getValueType().isInteger())
50909 return SDValue();
50910
50911 SDValue Z = EFLAGS.getOperand(0);
50912 EVT ZVT = Z.getValueType();
50913
50914 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50915 // the general case below.
50916 if (ConstantX) {
50917 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50918 // fake operands:
50919 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50920 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50921 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50922 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50923 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50924 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50925 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50926 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50927 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50928 SDValue(Neg.getNode(), 1));
50929 }
50930
50931 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50932 // with fake operands:
50933 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50934 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50935 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50936 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50937 SDValue One = DAG.getConstant(1, DL, ZVT);
50938 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50939 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50940 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50941 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50942 Cmp1.getValue(1));
50943 }
50944 }
50945
50946 // (cmp Z, 1) sets the carry flag if Z is 0.
50947 SDValue One = DAG.getConstant(1, DL, ZVT);
50948 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50949 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50950
50951 // Add the flags type for ADC/SBB nodes.
50952 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50953
50954 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50955 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50956 if (CC == X86::COND_NE)
50957 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50958 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50959
50960 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50961 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50962 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50963 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50964}
50965
50966/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50967/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50968/// with CMP+{ADC, SBB}.
50969static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50970 bool IsSub = N->getOpcode() == ISD::SUB;
50971 SDValue X = N->getOperand(0);
50972 SDValue Y = N->getOperand(1);
50973 EVT VT = N->getValueType(0);
50974 SDLoc DL(N);
50975
50976 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50977 return ADCOrSBB;
50978
50979 // Commute and try again (negate the result for subtracts).
50980 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50981 if (IsSub)
50982 ADCOrSBB =
50983 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50984 return ADCOrSBB;
50985 }
50986
50987 return SDValue();
50988}
50989
50990static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50991 SelectionDAG &DAG) {
50992 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50993 "Unexpected opcode");
50994
50995 // Delegate to combineAddOrSubToADCOrSBB if we have:
50996 //
50997 // (xor/or (zero_extend (setcc)) imm)
50998 //
50999 // where imm is odd if and only if we have xor, in which case the XOR/OR are
51000 // equivalent to a SUB/ADD, respectively.
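// For example, since zext(setcc) is 0 or 1: with odd imm C = 5 and setcc = 1,
// C xor 1 == 4 == C - 1 (the SUB case); with even imm C = 6 and setcc = 1,
// C or 1 == 7 == C + 1 (the ADD case). With setcc = 0 both sides are just C.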
51001 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51002 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51003 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51004 bool IsSub = N->getOpcode() == ISD::XOR;
51005 bool N1COdd = N1C->getZExtValue() & 1;
51006 if (IsSub ? N1COdd : !N1COdd) {
51007 SDLoc DL(N);
51008 EVT VT = N->getValueType(0);
51009 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51010 return R;
51011 }
51012 }
51013 }
51014
51015 return SDValue();
51016}
51017
51018static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51019 TargetLowering::DAGCombinerInfo &DCI,
51020 const X86Subtarget &Subtarget) {
51021 SDValue N0 = N->getOperand(0);
51022 SDValue N1 = N->getOperand(1);
51023 EVT VT = N->getValueType(0);
51024 SDLoc dl(N);
51025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51026
51027 // If this is SSE1 only, convert to FOR to avoid scalarization.
51028 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51029 return DAG.getBitcast(MVT::v4i32,
51030 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51031 DAG.getBitcast(MVT::v4f32, N0),
51032 DAG.getBitcast(MVT::v4f32, N1)));
51033 }
51034
51035 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51036 // TODO: Support multiple SrcOps.
51037 if (VT == MVT::i1) {
51038 SmallVector<SDValue, 2> SrcOps;
51039 SmallVector<APInt, 2> SrcPartials;
51040 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51041 SrcOps.size() == 1) {
51042 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51043 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51044 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51045 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51046 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51047 if (Mask) {
51048 assert(SrcPartials[0].getBitWidth() == NumElts &&
51049 "Unexpected partial reduction mask");
51050 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51051 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51052 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51053 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51054 }
51055 }
51056 }
51057
51058 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51059 return R;
51060
51061 if (SDValue R = combineBitOpWithShift(N, DAG))
51062 return R;
51063
51064 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51065 return FPLogic;
51066
51067 if (DCI.isBeforeLegalizeOps())
51068 return SDValue();
51069
51070 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51071 return R;
51072
51073 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
51074 return R;
51075
51076 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
51077 return R;
51078
51079 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
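// For example, if SetCC == 1 the left side is (0 - 1) | C == -1 and the right
// side is (zext 0) * (C + 1) - 1 == -1; if SetCC == 0 both sides are C. With
// Val + 1 in {2,3,4,5,8,9} the multiply and the -1 can fold into an LEA.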
51080 if ((VT == MVT::i32 || VT == MVT::i64) &&
51081 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51082 isNullConstant(N0.getOperand(0))) {
51083 SDValue Cond = N0.getOperand(1);
51084 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51085 Cond = Cond.getOperand(0);
51086
51087 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51088 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51089 uint64_t Val = CN->getZExtValue();
51090 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51091 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51092 CCode = X86::GetOppositeBranchCondition(CCode);
51093 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51094
51095 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51096 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51097 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51098 return R;
51099 }
51100 }
51101 }
51102 }
51103
51104 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51105 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51106 // iff the upper elements of the non-shifted arg are zero.
51107 // KUNPCK requires 16+ bool vector elements.
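// For example, for v16i1: OR(X, KSHIFTL(Y, 8)) with the upper 8 lanes of X
// known zero is the same as CONCAT_VECTORS of the low v8i1 halves of X and Y,
// which maps onto KUNPCKBW.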
51108 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51109 unsigned NumElts = VT.getVectorNumElements();
51110 unsigned HalfElts = NumElts / 2;
51111 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51112 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51113 N1.getConstantOperandAPInt(1) == HalfElts &&
51114 DAG.MaskedVectorIsZero(N0, UpperElts)) {
51115 return DAG.getNode(
51116 ISD::CONCAT_VECTORS, dl, VT,
51117 extractSubVector(N0, 0, DAG, dl, HalfElts),
51118 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51119 }
51120 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51121 N0.getConstantOperandAPInt(1) == HalfElts &&
51122 DAG.MaskedVectorIsZero(N1, UpperElts)) {
51123 return DAG.getNode(
51124 ISD::CONCAT_VECTORS, dl, VT,
51125 extractSubVector(N1, 0, DAG, dl, HalfElts),
51126 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
51127 }
51128 }
51129
51130 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51131 // Attempt to recursively combine an OR of shuffles.
51132 SDValue Op(N, 0);
51133 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51134 return Res;
51135
51136 // If either operand is a constant mask, then only the elements that aren't
51137 // allones are actually demanded by the other operand.
51138 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
51139 APInt UndefElts;
51140 SmallVector<APInt> EltBits;
51141 int NumElts = VT.getVectorNumElements();
51142 int EltSizeInBits = VT.getScalarSizeInBits();
51143 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
51144 return false;
51145
51146 APInt DemandedElts = APInt::getZero(NumElts);
51147 for (int I = 0; I != NumElts; ++I)
51148 if (!EltBits[I].isAllOnes())
51149 DemandedElts.setBit(I);
51150
51151 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
51152 };
51153 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
51154 if (N->getOpcode() != ISD::DELETED_NODE)
51155 DCI.AddToWorklist(N);
51156 return SDValue(N, 0);
51157 }
51158 }
51159
51160 // We should fold "masked merge" patterns when `andn` is not available.
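// A "masked merge" is (X & M) | (Y & ~M); without BMI's ANDN the ~M costs an
// extra instruction, so a form such as ((X ^ Y) & M) ^ Y (presumably what
// foldMaskedMerge emits here) needs one fewer operation. E.g. X = 0b1100,
// Y = 0b0011, M = 0b1010 gives 0b1001 either way.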
51161 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
51162 if (SDValue R = foldMaskedMerge(N, DAG))
51163 return R;
51164
51165 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51166 return R;
51167
51168 return SDValue();
51169}
51170
51171/// Try to turn tests against the signbit in the form of:
51172/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
51173/// into:
51174/// SETGT(X, -1)
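// For example, for an i32 X, SRL(X, 31) isolates the sign bit, so the
// truncated value is 1 exactly when X < 0; xoring with 1 inverts that, giving
// 1 exactly when X >= 0, which is the same predicate as SETGT(X, -1).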
51175static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
51176 // This is only worth doing if the output type is i8 or i1.
51177 EVT ResultType = N->getValueType(0);
51178 if (ResultType != MVT::i8 && ResultType != MVT::i1)
51179 return SDValue();
51180
51181 SDValue N0 = N->getOperand(0);
51182 SDValue N1 = N->getOperand(1);
51183
51184 // We should be performing an xor against a truncated shift.
51185 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
51186 return SDValue();
51187
51188 // Make sure we are performing an xor against one.
51189 if (!isOneConstant(N1))
51190 return SDValue();
51191
51192 // SetCC on x86 zero extends so only act on this if it's a logical shift.
51193 SDValue Shift = N0.getOperand(0);
51194 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
51195 return SDValue();
51196
51197 // Make sure we are truncating from one of i16, i32 or i64.
51198 EVT ShiftTy = Shift.getValueType();
51199 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
51200 return SDValue();
51201
51202 // Make sure the shift amount extracts the sign bit.
51203 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
51204 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
51205 return SDValue();
51206
51207 // Create a greater-than comparison against -1.
51208 // N.B. Using SETGE against 0 works but we want a canonical-looking
51209 // comparison; using SETGT matches up with what TranslateX86CC expects.
51210 SDLoc DL(N);
51211 SDValue ShiftOp = Shift.getOperand(0);
51212 EVT ShiftOpTy = ShiftOp.getValueType();
51213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51214 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
51215 *DAG.getContext(), ResultType);
51216 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
51217 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
51218 if (SetCCResultType != ResultType)
51219 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
51220 return Cond;
51221}
51222
51223/// Turn vector tests of the signbit in the form of:
51224/// xor (sra X, elt_size(X)-1), -1
51225/// into:
51226/// pcmpgt X, -1
51227///
51228/// This should be called before type legalization because the pattern may not
51229/// persist after that.
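// For example, per i32 lane, (sra X, 31) is all-ones for negative elements
// and zero otherwise; xoring with -1 flips that, so a lane is all-ones exactly
// when its element is >= 0, which is what "pcmpgt X, -1" computes directly.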
51230static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
51231 const X86Subtarget &Subtarget) {
51232 EVT VT = N->getValueType(0);
51233 if (!VT.isSimple())
51234 return SDValue();
51235
51236 switch (VT.getSimpleVT().SimpleTy) {
51237 default: return SDValue();
51238 case MVT::v16i8:
51239 case MVT::v8i16:
51240 case MVT::v4i32:
51241 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
51242 case MVT::v32i8:
51243 case MVT::v16i16:
51244 case MVT::v8i32:
51245 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
51246 }
51247
51248 // There must be a shift right algebraic before the xor, and the xor must be a
51249 // 'not' operation.
51250 SDValue Shift = N->getOperand(0);
51251 SDValue Ones = N->getOperand(1);
51252 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
51253 !ISD::isBuildVectorAllOnes(Ones.getNode()))
51254 return SDValue();
51255
51256 // The shift should be smearing the sign bit across each vector element.
51257 auto *ShiftAmt =
51258 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51259 if (!ShiftAmt ||
51260 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51261 return SDValue();
51262
51263 // Create a greater-than comparison against -1. We don't use the more obvious
51264 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51265 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51266}
51267
51268/// Detect patterns of truncation with unsigned saturation:
51269///
51270/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51271/// Return the source value x to be truncated or SDValue() if the pattern was
51272/// not matched.
51273///
51274/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51275/// where C1 >= 0 and C2 is unsigned max of destination type.
51276///
51277/// (truncate (smax (smin (x, C2), C1)) to dest_type)
51278/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51279///
51280/// These two patterns are equivalent to:
51281/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51282/// So return the smax(x, C1) value to be truncated or SDValue() if the
51283/// pattern was not matched.
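// For example, for an i32 -> i8 truncation, form 1 is trunc(umin(x, 255)) and
// form 2 is e.g. trunc(smin(smax(x, 0), 255)); since smax(x, 0) is
// non-negative, the signed min against 255 behaves like an unsigned min, so
// returning smax(x, 0) as the value to truncate preserves the saturation.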
51284static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51285 const SDLoc &DL) {
51286 EVT InVT = In.getValueType();
51287
51288 // Saturation with truncation. We truncate from InVT to VT.
51289 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51290 "Unexpected types for truncate operation");
51291
51292 // Match min/max and return limit value as a parameter.
51293 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51294 if (V.getOpcode() == Opcode &&
51295 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51296 return V.getOperand(0);
51297 return SDValue();
51298 };
51299
51300 APInt C1, C2;
51301 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51302 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
51303 // the element size of the destination type.
51304 if (C2.isMask(VT.getScalarSizeInBits()))
51305 return UMin;
51306
51307 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51308 if (MatchMinMax(SMin, ISD::SMAX, C1))
51309 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51310 return SMin;
51311
51312 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51313 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51314 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51315 C2.uge(C1)) {
51316 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51317 }
51318
51319 return SDValue();
51320}
51321
51322/// Detect patterns of truncation with signed saturation:
51323/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51324/// signed_max_of_dest_type)) to dest_type)
51325/// or:
51326/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51327/// signed_min_of_dest_type)) to dest_type).
51328/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51329/// Return the source value to be truncated or SDValue() if the pattern was not
51330/// matched.
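// For example, for an i32 -> i8 truncation the matched clamp is
// smin(smax(x, -128), 127) (or with the smax/smin order swapped); with
// MatchPackUS the bounds become 0 and 255, which is the range the PACKUS
// instructions saturate to.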
51331static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51332 unsigned NumDstBits = VT.getScalarSizeInBits();
51333 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51334 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51335
51336 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51337 const APInt &Limit) -> SDValue {
51338 APInt C;
51339 if (V.getOpcode() == Opcode &&
51340 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51341 return V.getOperand(0);
51342 return SDValue();
51343 };
51344
51345 APInt SignedMax, SignedMin;
51346 if (MatchPackUS) {
51347 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51348 SignedMin = APInt(NumSrcBits, 0);
51349 } else {
51350 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51351 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51352 }
51353
51354 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51355 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51356 return SMax;
51357
51358 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51359 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51360 return SMin;
51361
51362 return SDValue();
51363}
51364
51365static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51366 SelectionDAG &DAG,
51367 const X86Subtarget &Subtarget) {
51368 if (!Subtarget.hasSSE2() || !VT.isVector())
51369 return SDValue();
51370
51371 EVT SVT = VT.getVectorElementType();
51372 EVT InVT = In.getValueType();
51373 EVT InSVT = InVT.getVectorElementType();
51374
51375 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51376 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51377 // and concatenate at the same time. Then we can use a final vpmovuswb to
51378 // clip to 0-255.
51379 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51380 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51381 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51382 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51383 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51384 DL, DAG, Subtarget);
51385 assert(Mid && "Failed to pack!");
51386 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51387 }
51388 }
51389
51390 // vXi32 truncate instructions are available with AVX512F.
51391 // vXi16 truncate instructions are only available with AVX512BW.
51392 // For 256-bit or smaller vectors, we require VLX.
51393 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51395 // If the result type is 256 bits or larger and we have disabled 512-bit
51396 // registers, we should go ahead and use the pack instructions if possible.
51396 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51397 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51398 (InVT.getSizeInBits() > 128) &&
51399 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51400 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51401
51402 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51403 VT.getSizeInBits() >= 64 &&
51404 (SVT == MVT::i8 || SVT == MVT::i16) &&
51405 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51406 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51407 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51408 // Only do this when the result is at least 64 bits or we'll be leaving
51409 // dangling PACKSSDW nodes.
51410 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51411 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51412 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51413 DAG, Subtarget);
51414 assert(Mid && "Failed to pack!");
51415 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51416 Subtarget);
51417 assert(V && "Failed to pack!");
51418 return V;
51419 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51420 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51421 Subtarget);
51422 }
51423 if (SDValue SSatVal = detectSSatPattern(In, VT))
51424 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51425 Subtarget);
51426 }
51427
51428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51429 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51430 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51431 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51432 unsigned TruncOpc = 0;
51433 SDValue SatVal;
51434 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51435 SatVal = SSatVal;
51436 TruncOpc = X86ISD::VTRUNCS;
51437 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51438 SatVal = USatVal;
51439 TruncOpc = X86ISD::VTRUNCUS;
51440 }
51441 if (SatVal) {
51442 unsigned ResElts = VT.getVectorNumElements();
51443 // If the input type is less than 512 bits and we don't have VLX, we need
51444 // to widen to 512 bits.
51445 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51446 unsigned NumConcats = 512 / InVT.getSizeInBits();
51447 ResElts *= NumConcats;
51448 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51449 ConcatOps[0] = SatVal;
51450 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51451 NumConcats * InVT.getVectorNumElements());
51452 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51453 }
51454 // Widen the result if it's narrower than 128 bits.
51455 if (ResElts * SVT.getSizeInBits() < 128)
51456 ResElts = 128 / SVT.getSizeInBits();
51457 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51458 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51459 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51460 DAG.getIntPtrConstant(0, DL));
51461 }
51462 }
51463
51464 return SDValue();
51465}
51466
51467/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51468/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51469/// ISD::AVGCEILU (AVG) instruction.
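// For example, with unsigned i8 inputs a = 200 and b = 101, (a + b + 1) / 2 =
// 302 / 2 = 151; the sum overflows i8 but not the widened type, and PAVGB
// (ISD::AVGCEILU) produces the same rounded-up average without the explicit
// zext/add/lshr/trunc sequence.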
51470static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51471 const X86Subtarget &Subtarget,
51472 const SDLoc &DL) {
51473 if (!VT.isVector())
51474 return SDValue();
51475 EVT InVT = In.getValueType();
51476 unsigned NumElems = VT.getVectorNumElements();
51477
51478 EVT ScalarVT = VT.getVectorElementType();
51479 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51480 return SDValue();
51481
51482 // InScalarVT is the intermediate type in the AVG pattern and it should be
51483 // wider than the original input type (i8/i16).
51484 EVT InScalarVT = InVT.getVectorElementType();
51485 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51486 return SDValue();
51487
51488 if (!Subtarget.hasSSE2())
51489 return SDValue();
51490
51491 // Detect the following pattern:
51492 //
51493 // %1 = zext <N x i8> %a to <N x i32>
51494 // %2 = zext <N x i8> %b to <N x i32>
51495 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51496 // %4 = add nuw nsw <N x i32> %3, %2
51497 // %5 = lshr <N x i32> %4, <i32 1 x N>
51498 // %6 = trunc <N x i32> %5 to <N x i8>
51499 //
51500 // In AVX512, the last instruction can also be a trunc store.
51501 if (In.getOpcode() != ISD::SRL)
51502 return SDValue();
51503
51504 // A lambda checking the given SDValue is a constant vector and each element
51505 // is in the range [Min, Max].
51506 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51507 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51508 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51509 });
51510 };
51511
51512 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51513 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51514 return MaxActiveBits <= ScalarVT.getSizeInBits();
51515 };
51516
51517 // Check if each element of the vector is right-shifted by one.
51518 SDValue LHS = In.getOperand(0);
51519 SDValue RHS = In.getOperand(1);
51520 if (!IsConstVectorInRange(RHS, 1, 1))
51521 return SDValue();
51522 if (LHS.getOpcode() != ISD::ADD)
51523 return SDValue();
51524
51525 // Detect a pattern of a + b + 1 where the order doesn't matter.
51526 SDValue Operands[3];
51527 Operands[0] = LHS.getOperand(0);
51528 Operands[1] = LHS.getOperand(1);
51529
51530 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51531 ArrayRef<SDValue> Ops) {
51532 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51533 };
51534
51535 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51536 for (SDValue &Op : Ops)
51537 if (Op.getValueType() != VT)
51538 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51539 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51540 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51541 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51542 if (NumElemsPow2 != NumElems) {
51543 for (SDValue &Op : Ops) {
51544 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51545 for (unsigned i = 0; i != NumElems; ++i) {
51546 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51547 EltsOfOp[i] =
51548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51549 }
51550 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51551 }
51552 }
51553 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51554 if (NumElemsPow2 == NumElems)
51555 return Res;
51556 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51557 DAG.getIntPtrConstant(0, DL));
51558 };
51559
51560 // Take care of the case when one of the operands is a constant vector whose
51561 // element is in the range [1, 256].
51562 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51563 IsZExtLike(Operands[0])) {
51564 // The pattern is detected. Subtract one from the constant vector, then
51565 // demote it and emit the AVG instruction.
51566 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51567 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51568 return AVGSplitter({Operands[0], Operands[1]});
51569 }
51570
51571 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
51572 // Match the or case only if it's 'add-like' - can be replaced by an add.
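// For example, Op0 = 0b0100 and Op1 = 0b0011 share no set bits, so
// (Op0 | Op1) and (Op0 + Op1) are both 0b0111; haveNoCommonBitsSet is what
// licenses treating the zero-extended or as an add here.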
51573 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51574 if (ISD::ADD == V.getOpcode()) {
51575 Op0 = V.getOperand(0);
51576 Op1 = V.getOperand(1);
51577 return true;
51578 }
51579 if (ISD::ZERO_EXTEND != V.getOpcode())
51580 return false;
51581 V = V.getOperand(0);
51582 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51583 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51584 return false;
51585 Op0 = V.getOperand(0);
51586 Op1 = V.getOperand(1);
51587 return true;
51588 };
51589
51590 SDValue Op0, Op1;
51591 if (FindAddLike(Operands[0], Op0, Op1))
51592 std::swap(Operands[0], Operands[1]);
51593 else if (!FindAddLike(Operands[1], Op0, Op1))
51594 return SDValue();
51595 Operands[2] = Op0;
51596 Operands[1] = Op1;
51597
51598 // Now we have three operands of two additions. Check that one of them is a
51599 // constant vector with ones, and the other two can be promoted from i8/i16.
51600 for (SDValue &Op : Operands) {
51601 if (!IsConstVectorInRange(Op, 1, 1))
51602 continue;
51603 std::swap(Op, Operands[2]);
51604
51605 // Check if Operands[0] and Operands[1] are results of type promotion.
51606 for (int j = 0; j < 2; ++j)
51607 if (Operands[j].getValueType() != VT)
51608 if (!IsZExtLike(Operands[j]))
51609 return SDValue();
51610
51611 // The pattern is detected; emit the AVG instruction(s).
51612 return AVGSplitter({Operands[0], Operands[1]});
51613 }
51614
51615 return SDValue();
51616}
51617
51618static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51619 TargetLowering::DAGCombinerInfo &DCI,
51620 const X86Subtarget &Subtarget) {
51621 LoadSDNode *Ld = cast<LoadSDNode>(N);
51622 EVT RegVT = Ld->getValueType(0);
51623 EVT MemVT = Ld->getMemoryVT();
51624 SDLoc dl(Ld);
51625 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51626
51627 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51628 // into two 16-byte operations. Also split non-temporal aligned loads on
51629 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51630 ISD::LoadExtType Ext = Ld->getExtensionType();
51631 unsigned Fast;
51632 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51633 Ext == ISD::NON_EXTLOAD &&
51634 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51635 Ld->getAlign() >= Align(16)) ||
51636 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51637 *Ld->getMemOperand(), &Fast) &&
51638 !Fast))) {
51639 unsigned NumElems = RegVT.getVectorNumElements();
51640 if (NumElems < 2)
51641 return SDValue();
51642
51643 unsigned HalfOffset = 16;
51644 SDValue Ptr1 = Ld->getBasePtr();
51645 SDValue Ptr2 =
51646 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51647 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51648 NumElems / 2);
51649 SDValue Load1 =
51650 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51651 Ld->getOriginalAlign(),
51652 Ld->getMemOperand()->getFlags());
51653 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51654 Ld->getPointerInfo().getWithOffset(HalfOffset),
51655 Ld->getOriginalAlign(),
51656 Ld->getMemOperand()->getFlags());
51657 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51658 Load1.getValue(1), Load2.getValue(1));
51659
51660 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51661 return DCI.CombineTo(N, NewVec, TF, true);
51662 }
51663
51664 // Bool vector load - attempt to cast to an integer, as we have good
51665 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51666 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51667 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51668 unsigned NumElts = RegVT.getVectorNumElements();
51669 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51670 if (TLI.isTypeLegal(IntVT)) {
51671 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51672 Ld->getPointerInfo(),
51673 Ld->getOriginalAlign(),
51674 Ld->getMemOperand()->getFlags());
51675 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51676 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51677 }
51678 }
51679
51680 // If we also broadcast this as a subvector to a wider type, then just extract
51681 // the lowest subvector.
51682 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51683 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51684 SDValue Ptr = Ld->getBasePtr();
51685 SDValue Chain = Ld->getChain();
51686 for (SDNode *User : Ptr->uses()) {
51687 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51688 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51689 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51690 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51691 MemVT.getSizeInBits() &&
51692 !User->hasAnyUseOfValue(1) &&
51693 User->getValueSizeInBits(0).getFixedValue() >
51694 RegVT.getFixedSizeInBits()) {
51695 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51696 RegVT.getSizeInBits());
51697 Extract = DAG.getBitcast(RegVT, Extract);
51698 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51699 }
51700 }
51701 }
51702
51703 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51704 unsigned AddrSpace = Ld->getAddressSpace();
51705 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51706 AddrSpace == X86AS::PTR32_UPTR) {
51707 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51708 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51709 SDValue Cast =
51710 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51711 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51712 Ld->getOriginalAlign(),
51713 Ld->getMemOperand()->getFlags());
51714 }
51715 }
51716
51717 return SDValue();
51718}
51719
51720/// If V is a build vector of boolean constants and exactly one of those
51721/// constants is true, return the operand index of that true element.
51722/// Otherwise, return -1.
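// For example, a v4i1 build vector <0, 0, 1, 0> yields index 2, while
// <1, 0, 1, 0> yields -1 because two elements are true; undef elements are
// skipped rather than counted.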
51723static int getOneTrueElt(SDValue V) {
51724 // This needs to be a build vector of booleans.
51725 // TODO: Checking for the i1 type matches the IR definition for the mask,
51726 // but the mask check could be loosened to i8 or other types. That might
51727 // also require checking more than 'allOnesValue'; eg, the x86 HW
51728 // instructions only require that the MSB is set for each mask element.
51729 // The ISD::MSTORE comments/definition do not specify how the mask operand
51730 // is formatted.
51731 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51732 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51733 return -1;
51734
51735 int TrueIndex = -1;
51736 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51737 for (unsigned i = 0; i < NumElts; ++i) {
51738 const SDValue &Op = BV->getOperand(i);
51739 if (Op.isUndef())
51740 continue;
51741 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51742 if (!ConstNode)
51743 return -1;
51744 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51745 // If we already found a one, this is too many.
51746 if (TrueIndex >= 0)
51747 return -1;
51748 TrueIndex = i;
51749 }
51750 }
51751 return TrueIndex;
51752}
51753
51754/// Given a masked memory load/store operation, return true if it has one mask
51755/// bit set. If it has one mask bit set, then also return the memory address of
51756/// the scalar element to load/store, the vector index to insert/extract that
51757/// scalar element, and the alignment for the scalar memory access.
51758static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51759 SelectionDAG &DAG, SDValue &Addr,
51760 SDValue &Index, Align &Alignment,
51761 unsigned &Offset) {
51762 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51763 if (TrueMaskElt < 0)
51764 return false;
51765
51766 // Get the address of the one scalar element that is specified by the mask
51767 // using the appropriate offset from the base pointer.
51768 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51769 Offset = 0;
51770 Addr = MaskedOp->getBasePtr();
51771 if (TrueMaskElt != 0) {
51772 Offset = TrueMaskElt * EltVT.getStoreSize();
51773 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51774 SDLoc(MaskedOp));
51775 }
51776
51777 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51778 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51779 EltVT.getStoreSize());
51780 return true;
51781}
51782
51783/// If exactly one element of the mask is set for a non-extending masked load,
51784/// it is a scalar load and vector insert.
51785/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51786/// mask have already been optimized in IR, so we don't bother with those here.
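// For example, a v4i32 masked load with mask <0, 0, 1, 0> becomes a plain i32
// load from base + 8 followed by an INSERT_VECTOR_ELT of that scalar into the
// pass-through vector at index 2.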
51787static SDValue
51788reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51789 TargetLowering::DAGCombinerInfo &DCI,
51790 const X86Subtarget &Subtarget) {
51791 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51792 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51793 // However, some target hooks may need to be added to know when the transform
51794 // is profitable. Endianness would also have to be considered.
51795
51796 SDValue Addr, VecIndex;
51797 Align Alignment;
51798 unsigned Offset;
51799 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51800 return SDValue();
51801
51802 // Load the one scalar element that is specified by the mask using the
51803 // appropriate offset from the base pointer.
51804 SDLoc DL(ML);
51805 EVT VT = ML->getValueType(0);
51806 EVT EltVT = VT.getVectorElementType();
51807
51808 EVT CastVT = VT;
51809 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51810 EltVT = MVT::f64;
51811 CastVT = VT.changeVectorElementType(EltVT);
51812 }
51813
51814 SDValue Load =
51815 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51816 ML->getPointerInfo().getWithOffset(Offset),
51817 Alignment, ML->getMemOperand()->getFlags());
51818
51819 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51820
51821 // Insert the loaded element into the appropriate place in the vector.
51822 SDValue Insert =
51823 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51824 Insert = DAG.getBitcast(VT, Insert);
51825 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51826}
51827
51828static SDValue
51829combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51830 TargetLowering::DAGCombinerInfo &DCI) {
51831 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51832 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51833 return SDValue();
51834
51835 SDLoc DL(ML);
51836 EVT VT = ML->getValueType(0);
51837
51838 // If we are loading the first and last elements of a vector, it is safe and
51839 // always faster to load the whole vector. Replace the masked load with a
51840 // vector load and select.
51841 unsigned NumElts = VT.getVectorNumElements();
51842 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51843 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51844 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51845 if (LoadFirstElt && LoadLastElt) {
51846 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51847 ML->getMemOperand());
51848 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51849 ML->getPassThru());
51850 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51851 }
51852
51853 // Convert a masked load with a constant mask into a masked load and a select.
51854 // This allows the select operation to use a faster kind of select instruction
51855 // (for example, vblendvps -> vblendps).
51856
51857 // Don't try this if the pass-through operand is already undefined. That would
51858 // cause an infinite loop because that's what we're about to create.
51859 if (ML->getPassThru().isUndef())
51860 return SDValue();
51861
51862 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51863 return SDValue();
51864
51865 // The new masked load has an undef pass-through operand. The select uses the
51866 // original pass-through operand.
51867 SDValue NewML = DAG.getMaskedLoad(
51868 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51869 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51870 ML->getAddressingMode(), ML->getExtensionType());
51871 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51872 ML->getPassThru());
51873
51874 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51875}
51876
51877static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51878 TargetLowering::DAGCombinerInfo &DCI,
51879 const X86Subtarget &Subtarget) {
51880 auto *Mld = cast<MaskedLoadSDNode>(N);
51881
51882 // TODO: Expanding load with constant mask may be optimized as well.
51883 if (Mld->isExpandingLoad())
51884 return SDValue();
51885
51886 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51887 if (SDValue ScalarLoad =
51888 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51889 return ScalarLoad;
51890
51891 // TODO: Do some AVX512 subsets benefit from this transform?
51892 if (!Subtarget.hasAVX512())
51893 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51894 return Blend;
51895 }
51896
51897 // If the mask value has been legalized to a non-boolean vector, try to
51898 // simplify ops leading up to it. We only demand the MSB of each lane.
51899 SDValue Mask = Mld->getMask();
51900 if (Mask.getScalarValueSizeInBits() != 1) {
51901 EVT VT = Mld->getValueType(0);
51902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51903 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51904 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51905 if (N->getOpcode() != ISD::DELETED_NODE)
51906 DCI.AddToWorklist(N);
51907 return SDValue(N, 0);
51908 }
51909 if (SDValue NewMask =
51910 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51911 return DAG.getMaskedLoad(
51912 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51913 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51914 Mld->getAddressingMode(), Mld->getExtensionType());
51915 }
51916
51917 return SDValue();
51918}
51919
51920/// If exactly one element of the mask is set for a non-truncating masked store,
51921/// it is a vector extract and scalar store.
51922/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51923/// mask have already been optimized in IR, so we don't bother with those here.
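// For example, a v4f32 masked store with mask <0, 1, 0, 0> becomes an
// EXTRACT_VECTOR_ELT of lane 1 followed by a plain f32 store to base + 4,
// using the alignment that scalar access can guarantee.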
51924static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51925 SelectionDAG &DAG,
51926 const X86Subtarget &Subtarget) {
51927 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51928 // However, some target hooks may need to be added to know when the transform
51929 // is profitable. Endianness would also have to be considered.
51930
51931 SDValue Addr, VecIndex;
51932 Align Alignment;
51933 unsigned Offset;
51934 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51935 return SDValue();
51936
51937 // Extract the one scalar element that is actually being stored.
51938 SDLoc DL(MS);
51939 SDValue Value = MS->getValue();
51940 EVT VT = Value.getValueType();
51941 EVT EltVT = VT.getVectorElementType();
51942 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51943 EltVT = MVT::f64;
51944 EVT CastVT = VT.changeVectorElementType(EltVT);
51945 Value = DAG.getBitcast(CastVT, Value);
51946 }
51947 SDValue Extract =
51948 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51949
51950 // Store that element at the appropriate offset from the base pointer.
51951 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51952 MS->getPointerInfo().getWithOffset(Offset),
51953 Alignment, MS->getMemOperand()->getFlags());
51954}
51955
51956static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51957 TargetLowering::DAGCombinerInfo &DCI,
51958 const X86Subtarget &Subtarget) {
51959 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51960 if (Mst->isCompressingStore())
51961 return SDValue();
51962
51963 EVT VT = Mst->getValue().getValueType();
51964 SDLoc dl(Mst);
51965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51966
51967 if (Mst->isTruncatingStore())
51968 return SDValue();
51969
51970 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51971 return ScalarStore;
51972
51973 // If the mask value has been legalized to a non-boolean vector, try to
51974 // simplify ops leading up to it. We only demand the MSB of each lane.
51975 SDValue Mask = Mst->getMask();
51976 if (Mask.getScalarValueSizeInBits() != 1) {
51977 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51978 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51979 if (N->getOpcode() != ISD::DELETED_NODE)
51980 DCI.AddToWorklist(N);
51981 return SDValue(N, 0);
51982 }
51983 if (SDValue NewMask =
51984 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51985 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51986 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51987 Mst->getMemoryVT(), Mst->getMemOperand(),
51988 Mst->getAddressingMode());
51989 }
51990
51991 SDValue Value = Mst->getValue();
51992 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51993 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51994 Mst->getMemoryVT())) {
51995 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51996 Mst->getBasePtr(), Mst->getOffset(), Mask,
51997 Mst->getMemoryVT(), Mst->getMemOperand(),
51998 Mst->getAddressingMode(), true);
51999 }
52000
52001 return SDValue();
52002}
52003
52004static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52005 TargetLowering::DAGCombinerInfo &DCI,
52006 const X86Subtarget &Subtarget) {
52007 StoreSDNode *St = cast<StoreSDNode>(N);
52008 EVT StVT = St->getMemoryVT();
52009 SDLoc dl(St);
52010 SDValue StoredVal = St->getValue();
52011 EVT VT = StoredVal.getValueType();
52012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52013
52014 // Convert a store of vXi1 into a store of iX and a bitcast.
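// For example, a store of v8i1 becomes a bitcast to i8 followed by an
// ordinary i8 store, so the boolean vector never needs vector legalization.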
52015 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52016 VT.getVectorElementType() == MVT::i1) {
52017
52018 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52019 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52020
52021 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52022 St->getPointerInfo(), St->getOriginalAlign(),
52023 St->getMemOperand()->getFlags());
52024 }
52025
52026 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52027 // This will avoid a copy to k-register.
52028 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52029 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52030 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52031 SDValue Val = StoredVal.getOperand(0);
52032 // We must store zeros to the unused bits.
52033 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52034 return DAG.getStore(St->getChain(), dl, Val,
52035 St->getBasePtr(), St->getPointerInfo(),
52036 St->getOriginalAlign(),
52037 St->getMemOperand()->getFlags());
52038 }
52039
52040 // Widen v2i1/v4i1 stores to v8i1.
52041 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52042 Subtarget.hasAVX512()) {
52043 unsigned NumConcats = 8 / VT.getVectorNumElements();
52044 // We must store zeros to the unused bits.
52045 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52046 Ops[0] = StoredVal;
52047 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52048 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52049 St->getPointerInfo(), St->getOriginalAlign(),
52050 St->getMemOperand()->getFlags());
52051 }
52052
52053 // Turn vXi1 stores of constants into a scalar store.
52054 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52055 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52056 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52057 // If it's a v64i1 store without 64-bit support, we need two stores.
52058 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52059 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52060 StoredVal->ops().slice(0, 32));
52061 Lo = combinevXi1ConstantToInteger(Lo, DAG);
52062 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52063 StoredVal->ops().slice(32, 32));
52064 Hi = combinevXi1ConstantToInteger(Hi, DAG);
52065
52066 SDValue Ptr0 = St->getBasePtr();
52067 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
52068
52069 SDValue Ch0 =
52070 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52071 St->getOriginalAlign(),
52072 St->getMemOperand()->getFlags());
52073 SDValue Ch1 =
52074 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52075 St->getPointerInfo().getWithOffset(4),
52076 St->getOriginalAlign(),
52077 St->getMemOperand()->getFlags());
52078 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52079 }
52080
52081 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52082 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52083 St->getPointerInfo(), St->getOriginalAlign(),
52084 St->getMemOperand()->getFlags());
52085 }
52086
52087 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52088 // Sandy Bridge, perform two 16-byte stores.
52089 unsigned Fast;
52090 if (VT.is256BitVector() && StVT == VT &&
52091 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52092 *St->getMemOperand(), &Fast) &&
52093 !Fast) {
52094 unsigned NumElems = VT.getVectorNumElements();
52095 if (NumElems < 2)
52096 return SDValue();
52097
52098 return splitVectorStore(St, DAG);
52099 }
52100
52101 // Split under-aligned vector non-temporal stores.
52102 if (St->isNonTemporal() && StVT == VT &&
52103 St->getAlign().value() < VT.getStoreSize()) {
52104 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52105 // vectors or the legalizer can scalarize it to use MOVNTI.
52106 if (VT.is256BitVector() || VT.is512BitVector()) {
52107 unsigned NumElems = VT.getVectorNumElements();
52108 if (NumElems < 2)
52109 return SDValue();
52110 return splitVectorStore(St, DAG);
52111 }
52112
52113 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52114 // to use MOVNTI.
52115 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52116 MVT NTVT = Subtarget.hasSSE4A()
52117 ? MVT::v2f64
52118 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52119 return scalarizeVectorStore(St, NTVT, DAG);
52120 }
52121 }
52122
52123 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52124 // supported, but AVX512F is, by extending to v16i32 and truncating.
52125 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52126 St->getValue().getOpcode() == ISD::TRUNCATE &&
52127 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52128 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52129 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52130 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52131 St->getValue().getOperand(0));
52132 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52133 MVT::v16i8, St->getMemOperand());
52134 }
52135
52136 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52137 if (!St->isTruncatingStore() &&
52138 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52139 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52140 StoredVal.hasOneUse() &&
52141 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52142 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52143 return EmitTruncSStore(IsSigned, St->getChain(),
52144 dl, StoredVal.getOperand(0), St->getBasePtr(),
52145 VT, St->getMemOperand(), DAG);
52146 }
52147
52148 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
52149 if (!St->isTruncatingStore()) {
52150 auto IsExtractedElement = [](SDValue V) {
52151 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52152 V = V.getOperand(0);
52153 unsigned Opc = V.getOpcode();
52154 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52155 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52156 V.getOperand(0).hasOneUse())
52157 return V.getOperand(0);
52158 return SDValue();
52159 };
52160 if (SDValue Extract = IsExtractedElement(StoredVal)) {
52161 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52162 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52163 SDValue Src = Trunc.getOperand(0);
52164 MVT DstVT = Trunc.getSimpleValueType();
52165 MVT SrcVT = Src.getSimpleValueType();
52166 unsigned NumSrcElts = SrcVT.getVectorNumElements();
52167 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52168 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52169 if (NumTruncBits == VT.getSizeInBits() &&
52170 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52171 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52172 TruncVT, St->getMemOperand());
52173 }
52174 }
52175 }
52176 }
52177
52178 // Optimize trunc store (of multiple scalars) to shuffle and store.
52179 // First, pack all of the elements in one place. Next, store to memory
52180 // in fewer chunks.
52181 if (St->isTruncatingStore() && VT.isVector()) {
52182 // Check if we can detect an AVG pattern from the truncation. If yes,
52183 // replace the trunc store by a normal store with the result of the AVG
52184 // instruction.
52185 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
52186 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
52187 Subtarget, dl))
52188 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
52189 St->getPointerInfo(), St->getOriginalAlign(),
52190 St->getMemOperand()->getFlags());
52191
52192 if (TLI.isTruncStoreLegal(VT, StVT)) {
52193 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52194 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52195 dl, Val, St->getBasePtr(),
52196 St->getMemoryVT(), St->getMemOperand(), DAG);
52197 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
52198 DAG, dl))
52199 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
52200 dl, Val, St->getBasePtr(),
52201 St->getMemoryVT(), St->getMemOperand(), DAG);
52202 }
52203
52204 return SDValue();
52205 }
52206
52207 // Cast ptr32 and ptr64 pointers to the default address space before a store.
52208 unsigned AddrSpace = St->getAddressSpace();
52209 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52210 AddrSpace == X86AS::PTR32_UPTR) {
52211 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52212 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
52213 SDValue Cast =
52214 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
52215 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
52216 St->getPointerInfo(), St->getOriginalAlign(),
52217 St->getMemOperand()->getFlags(), St->getAAInfo());
52218 }
52219 }
52220
52221 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
52222 // the FP state in cases where an emms may be missing.
52223 // A preferable solution to the general problem is to figure out the right
52224 // places to insert EMMS. This qualifies as a quick hack.
52225
52226 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
52227 if (VT.getSizeInBits() != 64)
52228 return SDValue();
52229
52230 const Function &F = DAG.getMachineFunction().getFunction();
52231 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
52232 bool F64IsLegal =
52233 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
52234 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
52235 isa<LoadSDNode>(St->getValue()) &&
52236 cast<LoadSDNode>(St->getValue())->isSimple() &&
52237 St->getChain().hasOneUse() && St->isSimple()) {
52238 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
52239
52240 if (!ISD::isNormalLoad(Ld))
52241 return SDValue();
52242
52243 // Avoid the transformation if there are multiple uses of the loaded value.
52244 if (!Ld->hasNUsesOfValue(1, 0))
52245 return SDValue();
52246
52247 SDLoc LdDL(Ld);
52248 SDLoc StDL(N);
52249 // Lower to a single movq load/store pair.
52250 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
52251 Ld->getBasePtr(), Ld->getMemOperand());
52252
52253 // Make sure new load is placed in same chain order.
52254 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
52255 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
52256 St->getMemOperand());
52257 }
52258
52259 // This is similar to the above case, but here we handle a scalar 64-bit
52260 // integer store that is extracted from a vector on a 32-bit target.
52261 // If we have SSE2, then we can treat it like a floating-point double
52262 // to get past legalization. The execution dependencies fixup pass will
52263 // choose the optimal machine instruction for the store if this really is
52264 // an integer or v2f32 rather than an f64.
52265 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52266 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52267 SDValue OldExtract = St->getOperand(1);
52268 SDValue ExtOp0 = OldExtract.getOperand(0);
52269 unsigned VecSize = ExtOp0.getValueSizeInBits();
52270 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52271 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52272 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52273 BitCast, OldExtract.getOperand(1));
52274 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52275 St->getPointerInfo(), St->getOriginalAlign(),
52276 St->getMemOperand()->getFlags());
52277 }
52278
52279 return SDValue();
52280}
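// Illustrative sketch of the 32-bit i64 load/store rewrite above (concrete
// operands chosen for clarity, not taken from a real DAG):
//   t1: i64,ch = load t0, %p
//   st: ch    = store t1, %q
// becomes
//   t1': f64,ch = load t0, %p
//   st': ch     = store t1', %q
// so a single MOVQ moves the 64 bits instead of a pair of 32-bit GPR ops.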
52281
52282static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52283 TargetLowering::DAGCombinerInfo &DCI,
52284 const X86Subtarget &Subtarget) {
52285 auto *St = cast<MemIntrinsicSDNode>(N);
52286
52287 SDValue StoredVal = N->getOperand(1);
52288 MVT VT = StoredVal.getSimpleValueType();
52289 EVT MemVT = St->getMemoryVT();
52290
52291 // Figure out which elements we demand.
52292 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52293 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52294
52295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52296 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52297 if (N->getOpcode() != ISD::DELETED_NODE)
52298 DCI.AddToWorklist(N);
52299 return SDValue(N, 0);
52300 }
52301
52302 return SDValue();
52303}
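// Illustrative example (sketch): for an extract-store that writes 64 bits of
// a v4i32 value, MemVT is i64, so StElts == 64 / 32 == 2 and DemandedElts
// covers only the two low elements; the upper elements of StoredVal can be
// simplified away.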
52304
52305/// Return 'true' if this vector operation is "horizontal"
52306/// and return the operands for the horizontal operation in LHS and RHS. A
52307/// horizontal operation performs the binary operation on successive elements
52308/// of its first operand, then on successive elements of its second operand,
52309/// returning the resulting values in a vector. For example, if
52310/// A = < float a0, float a1, float a2, float a3 >
52311/// and
52312/// B = < float b0, float b1, float b2, float b3 >
52313/// then the result of doing a horizontal operation on A and B is
52314/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52315/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52316/// A horizontal-op B, for some already available A and B, and if so then LHS is
52317/// set to A, RHS to B, and the routine returns 'true'.
52318static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52319 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52320 bool IsCommutative,
52321 SmallVectorImpl<int> &PostShuffleMask) {
52322 // If either operand is undef, bail out. The binop should be simplified.
52323 if (LHS.isUndef() || RHS.isUndef())
52324 return false;
52325
52326 // Look for the following pattern:
52327 // A = < float a0, float a1, float a2, float a3 >
52328 // B = < float b0, float b1, float b2, float b3 >
52329 // and
52330 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52331 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52332 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52333 // which is A horizontal-op B.
52334
52335 MVT VT = LHS.getSimpleValueType();
52336 assert((VT.is128BitVector() || VT.is256BitVector()) &&
52337 "Unsupported vector type for horizontal add/sub");
52338 unsigned NumElts = VT.getVectorNumElements();
52339
52340 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52341 SmallVectorImpl<int> &ShuffleMask) {
52342 bool UseSubVector = false;
52343 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52344 Op.getOperand(0).getValueType().is256BitVector() &&
52345 llvm::isNullConstant(Op.getOperand(1))) {
52346 Op = Op.getOperand(0);
52347 UseSubVector = true;
52348 }
52349 SmallVector<SDValue, 2> SrcOps;
52350 SmallVector<int, 16> SrcMask, ScaledMask;
52351 SDValue BC = peekThroughBitcasts(Op);
52352 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52353 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52354 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52355 })) {
52356 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52357 if (!UseSubVector && SrcOps.size() <= 2 &&
52358 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52359 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52360 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52361 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52362 }
52363 if (UseSubVector && SrcOps.size() == 1 &&
52364 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52365 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52366 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52367 ShuffleMask.assign(Mask.begin(), Mask.end());
52368 }
52369 }
52370 };
52371
52372 // View LHS in the form
52373 // LHS = VECTOR_SHUFFLE A, B, LMask
52374 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52375 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52376 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52377 SDValue A, B;
52378 SmallVector<int, 16> LMask;
52379 GetShuffle(LHS, A, B, LMask);
52380
52381 // Likewise, view RHS in the form
52382 // RHS = VECTOR_SHUFFLE C, D, RMask
52383 SDValue C, D;
52384 SmallVector<int, 16> RMask;
52385 GetShuffle(RHS, C, D, RMask);
52386
52387 // At least one of the operands should be a vector shuffle.
52388 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52389 if (NumShuffles == 0)
52390 return false;
52391
52392 if (LMask.empty()) {
52393 A = LHS;
52394 for (unsigned i = 0; i != NumElts; ++i)
52395 LMask.push_back(i);
52396 }
52397
52398 if (RMask.empty()) {
52399 C = RHS;
52400 for (unsigned i = 0; i != NumElts; ++i)
52401 RMask.push_back(i);
52402 }
52403
52404 // If we have a unary mask, ensure the other op is set to null.
52405 if (isUndefOrInRange(LMask, 0, NumElts))
52406 B = SDValue();
52407 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52408 A = SDValue();
52409
52410 if (isUndefOrInRange(RMask, 0, NumElts))
52411 D = SDValue();
52412 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52413 C = SDValue();
52414
52415 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52416 // RHS operands and shuffle mask.
52417 if (A != C) {
52418 std::swap(C, D);
52419 ShuffleVectorSDNode::commuteMask(RMask);
52420 }
52421 // Check that the shuffles are both shuffling the same vectors.
52422 if (!(A == C && B == D))
52423 return false;
52424
52425 PostShuffleMask.clear();
52426 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52427
52428 // LHS and RHS are now:
52429 // LHS = shuffle A, B, LMask
52430 // RHS = shuffle A, B, RMask
52431 // Check that the masks correspond to performing a horizontal operation.
52432 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52433 // so we just repeat the inner loop if this is a 256-bit op.
52434 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52435 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52436 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52437 assert((NumEltsPer128BitChunk % 2 == 0) &&
52438 "Vector type should have an even number of elements in each lane");
52439 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52440 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52441 // Ignore undefined components.
52442 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52443 if (LIdx < 0 || RIdx < 0 ||
52444 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52445 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52446 continue;
52447
52448 // Check that successive odd/even elements are being operated on. If not,
52449 // this is not a horizontal operation.
52450 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52451 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52452 return false;
52453
52454 // Compute the post-shuffle mask index based on where the element
52455 // is stored in the HOP result, and where it needs to be moved to.
52456 int Base = LIdx & ~1u;
52457 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52458 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52459
52460 // The low half of the 128-bit result must choose from A.
52461 // The high half of the 128-bit result must choose from B,
52462 // unless B is undef. In that case, we are always choosing from A.
52463 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52464 Index += NumEltsPer64BitChunk;
52465 PostShuffleMask[i + j] = Index;
52466 }
52467 }
52468
52469 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52470 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52471
52472 bool IsIdentityPostShuffle =
52473 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52474 if (IsIdentityPostShuffle)
52475 PostShuffleMask.clear();
52476
52477 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52478 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52479 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52480 return false;
52481
52482 // If the source nodes are already used in HorizOps then always accept this.
52483 // Shuffle folding should merge these back together.
52484 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52485 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52486 });
52487 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52488 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52489 });
52490 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52491
52492 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52493 // shuffle the result.
52494 if (!ForceHorizOp &&
52495 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52496 (NumShuffles < 2 || !IsIdentityPostShuffle),
52497 DAG, Subtarget))
52498 return false;
52499
52500 LHS = DAG.getBitcast(VT, NewLHS);
52501 RHS = DAG.getBitcast(VT, NewRHS);
52502 return true;
52503}
52504
52505// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52506static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52507 const X86Subtarget &Subtarget) {
52508 EVT VT = N->getValueType(0);
52509 unsigned Opcode = N->getOpcode();
52510 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52511 SmallVector<int, 8> PostShuffleMask;
52512
52513 switch (Opcode) {
52514 case ISD::FADD:
52515 case ISD::FSUB:
52516 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52517 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52518 SDValue LHS = N->getOperand(0);
52519 SDValue RHS = N->getOperand(1);
52520 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52521 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52522 PostShuffleMask)) {
52523 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52524 if (!PostShuffleMask.empty())
52525 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52526 DAG.getUNDEF(VT), PostShuffleMask);
52527 return HorizBinOp;
52528 }
52529 }
52530 break;
52531 case ISD::ADD:
52532 case ISD::SUB:
52533 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52534 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52535 SDValue LHS = N->getOperand(0);
52536 SDValue RHS = N->getOperand(1);
52537 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52538 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52539 PostShuffleMask)) {
52540 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52541 ArrayRef<SDValue> Ops) {
52542 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52543 };
52544 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52545 {LHS, RHS}, HOpBuilder);
52546 if (!PostShuffleMask.empty())
52547 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52548 DAG.getUNDEF(VT), PostShuffleMask);
52549 return HorizBinOp;
52550 }
52551 }
52552 break;
52553 }
52554
52555 return SDValue();
52556}
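// Illustrative example (a simplified sketch of the combine above):
//   LHS = vector_shuffle<0,2,4,6> A:v4f32, B:v4f32
//   RHS = vector_shuffle<1,3,5,7> A:v4f32, B:v4f32
//   t   = fadd LHS, RHS
// is matched by isHorizontalBinOp and rebuilt as
//   t   = X86ISD::FHADD A, B   // <a0+a1, a2+a3, b0+b1, b2+b3>
// with an optional post-shuffle if the HOP result still needs reordering.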
52557
52558// Try to combine the following nodes
52559// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52560// <i32 -2147483648[float -0.000000e+00]> 0
52561// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52562// <(load 4 from constant-pool)> t0, t29
52563// [t30: v16i32 = bitcast t27]
52564// t6: v16i32 = xor t7, t27[t30]
52565// t11: v16f32 = bitcast t6
52566// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52567// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52568// t22: v16f32 = bitcast t7
52569// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52570// t24: v32f16 = bitcast t23
52571static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52572 const X86Subtarget &Subtarget) {
52573 EVT VT = N->getValueType(0);
52574 SDValue LHS = N->getOperand(0);
52575 SDValue RHS = N->getOperand(1);
52576 int CombineOpcode =
52577 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52578 auto isConjugationConstant = [](const Constant *c) {
52579 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52580 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52581 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52582 switch (CI->getBitWidth()) {
52583 case 16:
52584 return false;
52585 case 32:
52586 return CI->getValue() == ConjugationInt32;
52587 case 64:
52588 return CI->getValue() == ConjugationInt64;
52589 default:
52590 llvm_unreachable("Unexpected bit width");
52591 }
52592 }
52593 if (const auto *CF = dyn_cast<ConstantFP>(c))
52594 return CF->isNegativeZeroValue();
52595 return false;
52596 };
52597 auto combineConjugation = [&](SDValue &r) {
52598 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52599 SDValue XOR = LHS.getOperand(0);
52600 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52601 SDValue XORRHS = XOR.getOperand(1);
52602 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52603 XORRHS = XORRHS.getOperand(0);
52604 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52605 XORRHS.getOperand(1).getNumOperands()) {
52606 ConstantPoolSDNode *CP =
52607 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52608 if (CP && isConjugationConstant(CP->getConstVal())) {
52609 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52610 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52611 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52612 r = DAG.getBitcast(VT, FCMulC);
52613 return true;
52614 }
52615 }
52616 }
52617 }
52618 return false;
52619 };
52620 SDValue Res;
52621 if (combineConjugation(Res))
52622 return Res;
52623 std::swap(LHS, RHS);
52624 if (combineConjugation(Res))
52625 return Res;
52626 return Res;
52627}
52628
52629// Try to combine the following nodes:
52630// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52631static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52632 const X86Subtarget &Subtarget) {
52633 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52634 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52635 Flags.hasAllowContract();
52636 };
52637
52638 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52639 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52640 Flags.hasNoSignedZeros();
52641 };
52642 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52643 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52644 return false;
52645 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52646 "Unexpected vector type!");
52647 if (ConstantPoolSDNode *CP =
52648 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52649 APInt AI = APInt(32, 0x80008000, true);
52650 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52651 return CI->getValue() == AI;
52652 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52653 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52654 }
52655 return false;
52656 };
52657
52658 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52659 !AllowContract(N->getFlags()))
52660 return SDValue();
52661
52662 EVT VT = N->getValueType(0);
52663 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52664 return SDValue();
52665
52666 SDValue LHS = N->getOperand(0);
52667 SDValue RHS = N->getOperand(1);
52668 bool IsConj;
52669 SDValue FAddOp1, MulOp0, MulOp1;
52670 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52671 &IsVectorAllNegativeZero,
52672 &HasNoSignedZero](SDValue N) -> bool {
52673 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52674 return false;
52675 SDValue Op0 = N.getOperand(0);
52676 unsigned Opcode = Op0.getOpcode();
52677 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52678 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52679 MulOp0 = Op0.getOperand(0);
52680 MulOp1 = Op0.getOperand(1);
52681 IsConj = Opcode == X86ISD::VFCMULC;
52682 return true;
52683 }
52684 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52685 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52686 HasNoSignedZero(Op0->getFlags())) ||
52687 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52688 MulOp0 = Op0.getOperand(0);
52689 MulOp1 = Op0.getOperand(1);
52690 IsConj = Opcode == X86ISD::VFCMADDC;
52691 return true;
52692 }
52693 }
52694 return false;
52695 };
52696
52697 if (GetCFmulFrom(LHS))
52698 FAddOp1 = RHS;
52699 else if (GetCFmulFrom(RHS))
52700 FAddOp1 = LHS;
52701 else
52702 return SDValue();
52703
52704 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52705 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52706 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52707 // FIXME: How do we handle the case where the fast math flags of FADD differ
52708 // from CFMUL's?
52709 SDValue CFmul =
52710 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52711 return DAG.getBitcast(VT, CFmul);
52712}
52713
52714/// Do target-specific dag combines on floating-point adds/subs.
52715static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52716 const X86Subtarget &Subtarget) {
52717 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52718 return HOp;
52719
52720 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52721 return COp;
52722
52723 return SDValue();
52724}
52725
52726/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52727/// the codegen.
52728/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52729/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52730/// anything that is guaranteed to be transformed by DAGCombiner.
52731static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52732 const X86Subtarget &Subtarget,
52733 const SDLoc &DL) {
52734 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52735 SDValue Src = N->getOperand(0);
52736 unsigned SrcOpcode = Src.getOpcode();
52737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52738
52739 EVT VT = N->getValueType(0);
52740 EVT SrcVT = Src.getValueType();
52741
52742 auto IsFreeTruncation = [VT](SDValue Op) {
52743 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52744
52745 // See if this has been extended from a smaller/equal size to
52746 // the truncation size, allowing a truncation to combine with the extend.
52747 unsigned Opcode = Op.getOpcode();
52748 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52749 Opcode == ISD::ZERO_EXTEND) &&
52750 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52751 return true;
52752
52753 // See if this is a single use constant which can be constant folded.
52754 // NOTE: We don't peek through bitcasts here because there is currently
52755 // no support for constant folding truncate+bitcast+vector_of_constants. So
52756 // we'll just end up with a truncate on both operands, which will
52757 // get turned back into (truncate (binop)), causing an infinite loop.
52758 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52759 };
52760
52761 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52762 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52763 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52764 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52765 };
52766
52767 // Don't combine if the operation has other uses.
52768 if (!Src.hasOneUse())
52769 return SDValue();
52770
52771 // Only support vector truncation for now.
52772 // TODO: i64 scalar math would benefit as well.
52773 if (!VT.isVector())
52774 return SDValue();
52775
52776 // In most cases it's only worth pre-truncating if we're only facing the cost
52777 // of one truncation.
52778 // i.e. if one of the inputs will constant fold or the input is repeated.
52779 switch (SrcOpcode) {
52780 case ISD::MUL:
52781 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52782 // better to truncate if we have the chance.
52783 if (SrcVT.getScalarType() == MVT::i64 &&
52784 TLI.isOperationLegal(SrcOpcode, VT) &&
52785 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52786 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52787 [[fallthrough]];
52788 case ISD::AND:
52789 case ISD::XOR:
52790 case ISD::OR:
52791 case ISD::ADD:
52792 case ISD::SUB: {
52793 SDValue Op0 = Src.getOperand(0);
52794 SDValue Op1 = Src.getOperand(1);
52795 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52796 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52797 return TruncateArithmetic(Op0, Op1);
52798 break;
52799 }
52800 }
52801
52802 return SDValue();
52803}
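// Illustrative example (sketch; concrete types chosen for clarity):
//   t = truncate (mul x:v4i64, y:v4i64) to v4i32
// becomes, when v4i32 MUL is legal but v4i64 MUL is not,
//   t = mul (truncate x to v4i32), (truncate y to v4i32)
// so the arithmetic is performed in the narrower, cheaper type.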
52804
52805/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52806/// e.g. trunc <8 x i32> X to <8 x i16> -->
52807/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52808/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52809static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52810 const X86Subtarget &Subtarget,
52811 SelectionDAG &DAG) {
52812 SDValue In = N->getOperand(0);
52813 EVT InVT = In.getValueType();
52814 EVT OutVT = N->getValueType(0);
52815
52816 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52817 OutVT.getScalarSizeInBits());
52818 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52819 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52820}
52821
52822/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
52823static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52824 const X86Subtarget &Subtarget,
52825 SelectionDAG &DAG) {
52826 SDValue In = N->getOperand(0);
52827 EVT InVT = In.getValueType();
52828 EVT OutVT = N->getValueType(0);
52829 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52830 DAG.getValueType(OutVT));
52831 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52832}
52833
52834/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52835/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52836/// legalization the truncation will be translated into a BUILD_VECTOR with each
52837 /// element extracted from a vector and then truncated, and it is
52838 /// difficult to perform this optimization on that form.
52839static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52840 const X86Subtarget &Subtarget) {
52841 EVT OutVT = N->getValueType(0);
52842 if (!OutVT.isVector())
52843 return SDValue();
52844
52845 SDValue In = N->getOperand(0);
52846 if (!In.getValueType().isSimple())
52847 return SDValue();
52848
52849 EVT InVT = In.getValueType();
52850 unsigned NumElems = OutVT.getVectorNumElements();
52851
52852 // AVX512 provides fast truncate ops.
52853 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52854 return SDValue();
52855
52856 EVT OutSVT = OutVT.getVectorElementType();
52857 EVT InSVT = InVT.getVectorElementType();
52858 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52859 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52860 NumElems >= 8))
52861 return SDValue();
52862
52863 // SSSE3's pshufb results in fewer instructions in the cases below.
52864 if (Subtarget.hasSSSE3() && NumElems == 8) {
52865 if (InSVT == MVT::i16)
52866 return SDValue();
52867 if (InSVT == MVT::i32 &&
52868 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52869 return SDValue();
52870 }
52871
52872 SDLoc DL(N);
52873 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52874 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52875 // truncate 2 x v4i32 to v8i16.
52876 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52877 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52878 if (InSVT == MVT::i32)
52879 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52880
52881 return SDValue();
52882}
52883
52884 /// This function transforms vector truncation of 'extended sign-bits' or
52885 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
52886 /// X86ISD::PACKSS/PACKUS operations.
52887static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52888 SelectionDAG &DAG,
52889 const X86Subtarget &Subtarget) {
52890 // Requires SSE2.
52891 if (!Subtarget.hasSSE2())
52892 return SDValue();
52893
52894 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52895 return SDValue();
52896
52897 SDValue In = N->getOperand(0);
52898 if (!In.getValueType().isSimple())
52899 return SDValue();
52900
52901 MVT VT = N->getValueType(0).getSimpleVT();
52902 MVT SVT = VT.getScalarType();
52903
52904 MVT InVT = In.getValueType().getSimpleVT();
52905 MVT InSVT = InVT.getScalarType();
52906
52907 // Check we have a truncation suited for PACKSS/PACKUS.
52908 if (!isPowerOf2_32(VT.getVectorNumElements()))
52909 return SDValue();
52910 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52911 return SDValue();
52912 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52913 return SDValue();
52914
52915 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52916 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52917 return SDValue();
52918
52919 // AVX512 has fast truncate, but if the input is already going to be split,
52920 // there's no harm in trying pack.
52921 if (Subtarget.hasAVX512() &&
52922 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52923 InVT.is512BitVector())) {
52924 // PACK should still be worth it for 128-bit vectors if the sources were
52925 // originally concatenated from subvectors.
52926 SmallVector<SDValue> ConcatOps;
52927 if (VT.getSizeInBits() > 128 ||
52928 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52929 return SDValue();
52930 }
52931
52932 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52933 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52934
52935 // Use PACKUS if the input has zero-bits that extend all the way to the
52936 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52937 KnownBits Known = DAG.computeKnownBits(In);
52938 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52939 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52940 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52941
52942 // Use PACKSS if the input has sign-bits that extend all the way to the
52943 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52944 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52945
52946 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52947 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52948 // on and combines/simplifications can't then use it.
52949 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52950 return SDValue();
52951
52952 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52953 if (NumSignBits > MinSignBits)
52954 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52955
52956 // If we have a srl that only generates signbits that we will discard in
52957 // the truncation then we can use PACKSS by converting the srl to a sra.
52958 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52959 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52960 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52961 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52962 if (*ShAmt == MinSignBits) {
52963 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52964 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52965 Subtarget);
52966 }
52967 }
52968
52969 return SDValue();
52970}
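// Illustrative example (sketch): a comparison result already consists of
// all-ones / all-zeros elements, so every bit is a sign bit:
//   c = setcc x:v4i64, y:v4i64, setgt
//   t = truncate c to v4i32
// can be lowered with PACKSS here, because the discarded bits are copies of
// the sign bit and signed saturation then matches plain truncation.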
52971
52972// Try to form a MULHU or MULHS node by looking for
52973// (trunc (srl (mul ext, ext), 16))
52974// TODO: This is X86 specific because we want to be able to handle wide types
52975// before type legalization. But we can only do it if the vector will be
52976// legalized via widening/splitting. Type legalization can't handle promotion
52977// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52978// combiner.
52979static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52980 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52981 // First instruction should be a right shift of a multiply.
52982 if (Src.getOpcode() != ISD::SRL ||
52983 Src.getOperand(0).getOpcode() != ISD::MUL)
52984 return SDValue();
52985
52986 if (!Subtarget.hasSSE2())
52987 return SDValue();
52988
52989 // Only handle vXi16 types that are at least 128-bits unless they will be
52990 // widened.
52991 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52992 return SDValue();
52993
52994 // Input type should be at least vXi32.
52995 EVT InVT = Src.getValueType();
52996 if (InVT.getVectorElementType().getSizeInBits() < 32)
52997 return SDValue();
52998
52999 // Need a shift by 16.
53000 APInt ShiftAmt;
53001 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
53002 ShiftAmt != 16)
53003 return SDValue();
53004
53005 SDValue LHS = Src.getOperand(0).getOperand(0);
53006 SDValue RHS = Src.getOperand(0).getOperand(1);
53007
53008 // Count leading sign/zero bits on both inputs - if there are enough then
53009 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53010 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53011 // truncations may actually be free by peeking through to the ext source.
53012 auto IsSext = [&DAG](SDValue V) {
53013 return DAG.ComputeMaxSignificantBits(V) <= 16;
53014 };
53015 auto IsZext = [&DAG](SDValue V) {
53016 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53017 };
53018
53019 bool IsSigned = IsSext(LHS) && IsSext(RHS);
53020 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53021 if (!IsSigned && !IsUnsigned)
53022 return SDValue();
53023
53024 // Check if both inputs are extensions, which will be removed by truncation.
53025 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
53026 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
53027 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
53028 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
53029 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
53030 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
53031
53032 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53033 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53034 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53035 // will have to split anyway.
53036 unsigned InSizeInBits = InVT.getSizeInBits();
53037 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53038 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53039 (InSizeInBits % 16) == 0) {
53040 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53041 InVT.getSizeInBits() / 16);
53042 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53043 DAG.getBitcast(BCVT, RHS));
53044 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53045 }
53046
53047 // Truncate back to source type.
53048 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53049 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53050
53051 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53052 return DAG.getNode(Opc, DL, VT, LHS, RHS);
53053}
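// Illustrative example (sketch) with x, y : v8i16:
//   t = truncate (srl (mul (zext x to v8i32), (zext y to v8i32)), 16) to v8i16
// is recognized here and emitted as
//   t = mulhu x, y          // PMULHUW
// since shifting each 32-bit product right by 16 selects exactly its
// high half.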
53054
53055// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53056// from one vector with signed bytes from another vector, adds together
53057// adjacent pairs of 16-bit products, and saturates the result before
53058// truncating to 16-bits.
53059//
53060// Which looks something like this:
53061// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53062// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53063static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53064 const X86Subtarget &Subtarget,
53065 const SDLoc &DL) {
53066 if (!VT.isVector() || !Subtarget.hasSSSE3())
53067 return SDValue();
53068
53069 unsigned NumElems = VT.getVectorNumElements();
53070 EVT ScalarVT = VT.getVectorElementType();
53071 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53072 return SDValue();
53073
53074 SDValue SSatVal = detectSSatPattern(In, VT);
53075 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53076 return SDValue();
53077
53078 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53079 // of multiplies from even/odd elements.
53080 SDValue N0 = SSatVal.getOperand(0);
53081 SDValue N1 = SSatVal.getOperand(1);
53082
53083 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53084 return SDValue();
53085
53086 SDValue N00 = N0.getOperand(0);
53087 SDValue N01 = N0.getOperand(1);
53088 SDValue N10 = N1.getOperand(0);
53089 SDValue N11 = N1.getOperand(1);
53090
53091 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53092 // Canonicalize zero_extend to LHS.
53093 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53094 std::swap(N00, N01);
53095 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53096 std::swap(N10, N11);
53097
53098 // Ensure we have a zero_extend and a sign_extend.
53099 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53100 N01.getOpcode() != ISD::SIGN_EXTEND ||
53101 N10.getOpcode() != ISD::ZERO_EXTEND ||
53102 N11.getOpcode() != ISD::SIGN_EXTEND)
53103 return SDValue();
53104
53105 // Peek through the extends.
53106 N00 = N00.getOperand(0);
53107 N01 = N01.getOperand(0);
53108 N10 = N10.getOperand(0);
53109 N11 = N11.getOperand(0);
53110
53111 // Ensure the extend is from vXi8.
53112 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53113 N01.getValueType().getVectorElementType() != MVT::i8 ||
53114 N10.getValueType().getVectorElementType() != MVT::i8 ||
53115 N11.getValueType().getVectorElementType() != MVT::i8)
53116 return SDValue();
53117
53118 // All inputs should be build_vectors.
53119 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53120 N01.getOpcode() != ISD::BUILD_VECTOR ||
53121 N10.getOpcode() != ISD::BUILD_VECTOR ||
53122 N11.getOpcode() != ISD::BUILD_VECTOR)
53123 return SDValue();
53124
53125 // N00/N10 are zero extended. N01/N11 are sign extended.
53126
53127 // For each element, we need to ensure we have an odd element from one vector
53128 // multiplied by the odd element of another vector and the even element from
53129 // one of the same vectors being multiplied by the even element from the
53130 // other vector. So we need to make sure for each element i, this operation
53131 // is being performed:
53132 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53133 SDValue ZExtIn, SExtIn;
53134 for (unsigned i = 0; i != NumElems; ++i) {
53135 SDValue N00Elt = N00.getOperand(i);
53136 SDValue N01Elt = N01.getOperand(i);
53137 SDValue N10Elt = N10.getOperand(i);
53138 SDValue N11Elt = N11.getOperand(i);
53139 // TODO: Be more tolerant to undefs.
53140 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53141 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53142 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53143 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53144 return SDValue();
53145 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53146 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53147 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53148 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53149 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53150 return SDValue();
53151 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53152 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53153 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53154 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53155 // Add is commutative so indices can be reordered.
53156 if (IdxN00 > IdxN10) {
53157 std::swap(IdxN00, IdxN10);
53158 std::swap(IdxN01, IdxN11);
53159 }
53160 // N0 indices must be the even element. N1 indices must be the next odd element.
53161 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53162 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53163 return SDValue();
53164 SDValue N00In = N00Elt.getOperand(0);
53165 SDValue N01In = N01Elt.getOperand(0);
53166 SDValue N10In = N10Elt.getOperand(0);
53167 SDValue N11In = N11Elt.getOperand(0);
53168 // The first time we find an input, capture it.
53169 if (!ZExtIn) {
53170 ZExtIn = N00In;
53171 SExtIn = N01In;
53172 }
53173 if (ZExtIn != N00In || SExtIn != N01In ||
53174 ZExtIn != N10In || SExtIn != N11In)
53175 return SDValue();
53176 }
53177
53178 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53179 ArrayRef<SDValue> Ops) {
53180 // Shrink by adding truncate nodes and let DAGCombine fold with the
53181 // sources.
53182 EVT InVT = Ops[0].getValueType();
53183 assert(InVT.getScalarType() == MVT::i8 &&
53184 "Unexpected scalar element type");
53185 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53186 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53187 InVT.getVectorNumElements() / 2);
53188 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53189 };
53190 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53191 PMADDBuilder);
53192}
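// Illustrative example (sketch): with A holding unsigned bytes and B signed
// bytes, each 16-bit output lane i of the pattern matched above computes
//   ssat16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// which is exactly what one lane of X86ISD::VPMADDUBSW A, B produces.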
53193
53194static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53195 const X86Subtarget &Subtarget) {
53196 EVT VT = N->getValueType(0);
53197 SDValue Src = N->getOperand(0);
53198 SDLoc DL(N);
53199
53200 // Attempt to pre-truncate inputs to arithmetic ops instead.
53201 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53202 return V;
53203
53204 // Try to detect AVG pattern first.
53205 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
53206 return Avg;
53207
53208 // Try to detect PMADD
53209 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53210 return PMAdd;
53211
53212 // Try to combine truncation with signed/unsigned saturation.
53213 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53214 return Val;
53215
53216 // Try to combine PMULHUW/PMULHW for vXi16.
53217 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53218 return V;
53219
53220 // The bitcast source is a direct mmx result.
53221 // Detect a bitcast from x86mmx to i32.
53222 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53223 SDValue BCSrc = Src.getOperand(0);
53224 if (BCSrc.getValueType() == MVT::x86mmx)
53225 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53226 }
53227
53228 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
53229 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
53230 return V;
53231
53232 return combineVectorTruncation(N, DAG, Subtarget);
53233}
53234
53235static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53236 TargetLowering::DAGCombinerInfo &DCI) {
53237 EVT VT = N->getValueType(0);
53238 SDValue In = N->getOperand(0);
53239 SDLoc DL(N);
53240
53241 if (SDValue SSatVal = detectSSatPattern(In, VT))
53242 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53243 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53244 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53245
53246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53247 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53248 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53249 return SDValue(N, 0);
53250
53251 return SDValue();
53252}
53253
53254/// Returns the negated value if the node \p N flips sign of FP value.
53255///
53256/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53257/// or FSUB(0, x)
53258/// AVX512F does not have FXOR, so FNEG is lowered as
53259/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53260 /// In this case we go through all bitcasts.
53261/// This also recognizes splat of a negated value and returns the splat of that
53262/// value.
53263static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53264 if (N->getOpcode() == ISD::FNEG)
53265 return N->getOperand(0);
53266
53267 // Don't recurse exponentially.
53268 if (Depth > SelectionDAG::MaxRecursionDepth)
53269 return SDValue();
53270
53271 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53272
53273 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53274 EVT VT = Op->getValueType(0);
53275
53276 // Make sure the element size doesn't change.
53277 if (VT.getScalarSizeInBits() != ScalarSize)
53278 return SDValue();
53279
53280 unsigned Opc = Op.getOpcode();
53281 switch (Opc) {
53282 case ISD::VECTOR_SHUFFLE: {
53283 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53284 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53285 if (!Op.getOperand(1).isUndef())
53286 return SDValue();
53287 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53288 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53289 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53290 cast<ShuffleVectorSDNode>(Op)->getMask());
53291 break;
53292 }
53293 case ISD::INSERT_VECTOR_ELT: {
53294 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53295 // -V, INDEX).
53296 SDValue InsVector = Op.getOperand(0);
53297 SDValue InsVal = Op.getOperand(1);
53298 if (!InsVector.isUndef())
53299 return SDValue();
53300 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53301 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53302 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53303 NegInsVal, Op.getOperand(2));
53304 break;
53305 }
53306 case ISD::FSUB:
53307 case ISD::XOR:
53308 case X86ISD::FXOR: {
53309 SDValue Op1 = Op.getOperand(1);
53310 SDValue Op0 = Op.getOperand(0);
53311
53312 // For XOR and FXOR, we want to check if constant
53313 // bits of Op1 are sign bit masks. For FSUB, we
53314 // have to check if constant bits of Op0 are sign
53315 // bit masks and hence we swap the operands.
53316 if (Opc == ISD::FSUB)
53317 std::swap(Op0, Op1);
53318
53319 APInt UndefElts;
53320 SmallVector<APInt, 16> EltBits;
53321 // Extract constant bits and see if they are all
53322 // sign bit masks. Ignore the undef elements.
53323 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53324 /* AllowWholeUndefs */ true,
53325 /* AllowPartialUndefs */ false)) {
53326 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53327 if (!UndefElts[I] && !EltBits[I].isSignMask())
53328 return SDValue();
53329
53330 // Only allow bitcast from correctly-sized constant.
53331 Op0 = peekThroughBitcasts(Op0);
53332 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53333 return Op0;
53334 }
53335 break;
53336 } // case
53337 } // switch
53338
53339 return SDValue();
53340}
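// Illustrative examples (sketch) of forms recognized above, both returning x:
//   fneg x
//   bitcast (xor (bitcast x to v4i32), splat(0x80000000)) to v4f32
// i.e. an explicit FNEG, or an XOR with a sign-bit mask hidden behind
// bitcasts.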
53341
53342static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53343 bool NegRes) {
53344 if (NegMul) {
53345 switch (Opcode) {
53346 default: llvm_unreachable("Unexpected opcode");
53347 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53348 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53349 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53350 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53351 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53352 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53353 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53354 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53355 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53356 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53357 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53358 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53359 }
53360 }
53361
53362 if (NegAcc) {
53363 switch (Opcode) {
53364 default: llvm_unreachable("Unexpected opcode");
53365 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53366 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53367 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53368 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53369 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53370 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53371 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53372 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53373 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53374 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53375 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53376 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53377 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53378 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53379 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53380 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53381 }
53382 }
53383
53384 if (NegRes) {
53385 switch (Opcode) {
53386 // For accuracy reasons, we never combine fneg and fma under strict FP.
53387 default: llvm_unreachable("Unexpected opcode");
53388 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53389 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53390 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53391 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53392 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53393 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53394 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53395 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53396 }
53397 }
53398
53399 return Opcode;
53400}
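// Usage sketch (values follow the switch tables above):
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false, /*NegRes=*/false)
//       returns X86ISD::FNMADD
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true, /*NegRes=*/false)
//       returns X86ISD::FMSUB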
53401
53402/// Do target-specific dag combines on floating point negations.
53403static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53404 TargetLowering::DAGCombinerInfo &DCI,
53405 const X86Subtarget &Subtarget) {
53406 EVT OrigVT = N->getValueType(0);
53407 SDValue Arg = isFNEG(DAG, N);
53408 if (!Arg)
53409 return SDValue();
53410
53411 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53412 EVT VT = Arg.getValueType();
53413 EVT SVT = VT.getScalarType();
53414 SDLoc DL(N);
53415
53416 // Let legalize expand this if it isn't a legal type yet.
53417 if (!TLI.isTypeLegal(VT))
53418 return SDValue();
53419
53420 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53421 // use of a constant by performing (-0 - A*B) instead.
53422 // FIXME: Check rounding control flags as well once it becomes available.
53423 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53424 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53425 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53426 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53427 Arg.getOperand(1), Zero);
53428 return DAG.getBitcast(OrigVT, NewNode);
53429 }
53430
53431 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53432 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53433 if (SDValue NegArg =
53434 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53435 return DAG.getBitcast(OrigVT, NegArg);
53436
53437 return SDValue();
53438}
53439
53440SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53441 bool LegalOperations,
53442 bool ForCodeSize,
53443 NegatibleCost &Cost,
53444 unsigned Depth) const {
53445 // fneg patterns are removable even if they have multiple uses.
53446 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53447 Cost = NegatibleCost::Cheaper;
53448 return DAG.getBitcast(Op.getValueType(), Arg);
53449 }
53450
53451 EVT VT = Op.getValueType();
53452 EVT SVT = VT.getScalarType();
53453 unsigned Opc = Op.getOpcode();
53454 SDNodeFlags Flags = Op.getNode()->getFlags();
53455 switch (Opc) {
53456 case ISD::FMA:
53457 case X86ISD::FMSUB:
53458 case X86ISD::FNMADD:
53459 case X86ISD::FNMSUB:
53460 case X86ISD::FMADD_RND:
53461 case X86ISD::FMSUB_RND:
53462 case X86ISD::FNMADD_RND:
53463 case X86ISD::FNMSUB_RND: {
53464 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53465 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53466 !isOperationLegal(ISD::FMA, VT))
53467 break;
53468
53469 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53470 // if it may have signed zeros.
53471 if (!Flags.hasNoSignedZeros())
53472 break;
53473
53474 // This is always negatible for free but we might be able to remove some
53475 // extra operand negations as well.
53476 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53477 for (int i = 0; i != 3; ++i)
53478 NewOps[i] = getCheaperNegatedExpression(
53479 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53480
53481 bool NegA = !!NewOps[0];
53482 bool NegB = !!NewOps[1];
53483 bool NegC = !!NewOps[2];
53484 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53485
53486 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53487 : NegatibleCost::Neutral;
53488
53489 // Fill in the non-negated ops with the original values.
53490 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53491 if (!NewOps[i])
53492 NewOps[i] = Op.getOperand(i);
53493 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53494 }
53495 case X86ISD::FRCP:
53496 if (SDValue NegOp0 =
53497 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53498 ForCodeSize, Cost, Depth + 1))
53499 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53500 break;
53501 }
53502
53503 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53504 ForCodeSize, Cost, Depth);
53505}
53506
53507static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53508 const X86Subtarget &Subtarget) {
53509 MVT VT = N->getSimpleValueType(0);
53510 // If we have integer vector types available, use the integer opcodes.
53511 if (!VT.isVector() || !Subtarget.hasSSE2())
53512 return SDValue();
53513
53514 SDLoc dl(N);
53515
53516 unsigned IntBits = VT.getScalarSizeInBits();
53517 MVT IntSVT = MVT::getIntegerVT(IntBits);
53518 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53519
53520 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53521 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53522 unsigned IntOpcode;
53523 switch (N->getOpcode()) {
53524 default: llvm_unreachable("Unexpected FP logic op");
53525 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53526 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53527 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53528 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53529 }
53530 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53531 return DAG.getBitcast(VT, IntOp);
53532}
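// Illustrative example (sketch): with SSE2 available,
//   t = X86ISD::FXOR a:v4f32, b:v4f32
// is rewritten as
//   t = bitcast (xor (bitcast a to v4i32), (bitcast b to v4i32)) to v4f32
// so the integer logic opcode can be used.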
53533
53534
53535/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53536static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53537 if (N->getOpcode() != ISD::XOR)
53538 return SDValue();
53539
53540 SDValue LHS = N->getOperand(0);
53541 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53542 return SDValue();
53543
53544 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53545 X86::CondCode(LHS->getConstantOperandVal(0)));
53546 SDLoc DL(N);
53547 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53548}
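// Illustrative example (sketch):
//   t = xor (X86ISD::SETCC COND_E, eflags), 1
// becomes
//   t = X86ISD::SETCC COND_NE, eflags
// i.e. the condition code is inverted instead of XOR-ing the result bit.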
53549
53550static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53551 const X86Subtarget &Subtarget) {
53552 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53553 "Invalid opcode for combing with CTLZ");
53554 if (Subtarget.hasFastLZCNT())
53555 return SDValue();
53556
53557 EVT VT = N->getValueType(0);
53558 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53559 (VT != MVT::i64 || !Subtarget.is64Bit()))
53560 return SDValue();
53561
53562 SDValue N0 = N->getOperand(0);
53563 SDValue N1 = N->getOperand(1);
53564
53565 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53566 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53567 return SDValue();
53568
53569 SDValue OpCTLZ;
53570 SDValue OpSizeTM1;
53571
53572 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53573 OpCTLZ = N1;
53574 OpSizeTM1 = N0;
53575 } else if (N->getOpcode() == ISD::SUB) {
53576 return SDValue();
53577 } else {
53578 OpCTLZ = N0;
53579 OpSizeTM1 = N1;
53580 }
53581
53582 if (!OpCTLZ.hasOneUse())
53583 return SDValue();
53584 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53585 if (!C)
53586 return SDValue();
53587
53588 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53589 return SDValue();
53590 SDLoc DL(N);
53591 EVT OpVT = VT;
53592 SDValue Op = OpCTLZ.getOperand(0);
53593 if (VT == MVT::i8) {
53594 // Zero extend to i32 since there is no i8 BSR.
53595 OpVT = MVT::i32;
53596 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53597 }
53598
53599 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53600 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53601 if (VT == MVT::i8)
53602 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53603
53604 return Op;
53605}
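
The fold above matches (bitwidth-1) ^ ctlz_zero_undef(x), or the equivalent subtract, which is just the index of the highest set bit, i.e. what BSR already computes. A minimal scalar sketch of that identity, assuming C++20 <bit> and a hypothetical helper name; illustrative only, not part of this file:

#include <bit>
#include <cstdint>

// For x != 0, countl_zero(x) is in [0, 31], so 31 ^ countl_zero(x) equals
// 31 - countl_zero(x): the position of the most significant set bit, which is
// what the x86 BSR instruction produces.
unsigned highestSetBitIndex(uint32_t x) {
  return 31u ^ static_cast<unsigned>(std::countl_zero(x)); // assumes x != 0
}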
53606
53607static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53608 TargetLowering::DAGCombinerInfo &DCI,
53609 const X86Subtarget &Subtarget) {
53610 SDValue N0 = N->getOperand(0);
53611 SDValue N1 = N->getOperand(1);
53612 EVT VT = N->getValueType(0);
53613
53614 // If this is SSE1-only, convert to FXOR to avoid scalarization.
53615 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53616 return DAG.getBitcast(MVT::v4i32,
53617 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53618 DAG.getBitcast(MVT::v4f32, N0),
53619 DAG.getBitcast(MVT::v4f32, N1)));
53620 }
53621
53622 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53623 return Cmp;
53624
53625 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53626 return R;
53627
53628 if (SDValue R = combineBitOpWithShift(N, DAG))
53629 return R;
53630
53631 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53632 return FPLogic;
53633
53634 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53635 return R;
53636
53637 if (DCI.isBeforeLegalizeOps())
53638 return SDValue();
53639
53640 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53641 return SetCC;
53642
53643 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53644 return R;
53645
53646 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53647 return RV;
53648
53649 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53651 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53652 N0.getOperand(0).getValueType().isVector() &&
53653 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53654 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53655 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53656 N0.getOperand(0).getValueType()));
53657 }
53658
53659 // Handle AVX512 mask widening.
53660 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53661 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53662 VT.getVectorElementType() == MVT::i1 &&
53663 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53664 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53665 return DAG.getNode(
53666 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53667 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53668 N0.getOperand(2));
53669 }
53670
53671 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53672 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53673 // TODO: Under what circumstances could this be performed in DAGCombine?
53674 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53675 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53676 SDValue TruncExtSrc = N0.getOperand(0);
53677 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53678 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53679 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53680 SDLoc DL(N);
53681 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53682 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53683 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53684 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53685 }
53686 }
53687
53688 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53689 return R;
53690
53691 return combineFneg(N, DAG, DCI, Subtarget);
53692}
53693
53694static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53695 TargetLowering::DAGCombinerInfo &DCI,
53696 const X86Subtarget &Subtarget) {
53697 EVT VT = N->getValueType(0);
53698 unsigned NumBits = VT.getSizeInBits();
53699
53700 // TODO - Constant Folding.
53701
53702 // Simplify the inputs.
53703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53704 APInt DemandedMask(APInt::getAllOnes(NumBits));
53705 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53706 return SDValue(N, 0);
53707
53708 return SDValue();
53709}
53710
53711static bool isNullFPScalarOrVectorConst(SDValue V) {
53712 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53713}
53714
53715/// If a value is a scalar FP zero or a vector FP zero (potentially including
53716/// undefined elements), return a zero constant that may be used to fold away
53717/// that value. In the case of a vector, the returned constant will not contain
53718/// undefined elements even if the input parameter does. This makes it suitable
53719/// to be used as a replacement operand with operations (eg, bitwise-and) where
53720/// an undef should not propagate.
53721static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53722 const X86Subtarget &Subtarget) {
53723 if (!isNullFPScalarOrVectorConst(V))
53724 return SDValue();
53725
53726 if (V.getValueType().isVector())
53727 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53728
53729 return V;
53730}
53731
53732static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53733 const X86Subtarget &Subtarget) {
53734 SDValue N0 = N->getOperand(0);
53735 SDValue N1 = N->getOperand(1);
53736 EVT VT = N->getValueType(0);
53737 SDLoc DL(N);
53738
53739 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53740 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53741 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53742 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53743 return SDValue();
53744
53745 auto isAllOnesConstantFP = [](SDValue V) {
53746 if (V.getSimpleValueType().isVector())
53747 return ISD::isBuildVectorAllOnes(V.getNode());
53748 auto *C = dyn_cast<ConstantFPSDNode>(V);
53749 return C && C->getConstantFPValue()->isAllOnesValue();
53750 };
53751
53752 // fand (fxor X, -1), Y --> fandn X, Y
53753 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53754 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53755
53756 // fand X, (fxor Y, -1) --> fandn Y, X
53757 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53758 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53759
53760 return SDValue();
53761}
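
The FANDN pattern matched above is the bitwise and-not: XOR with an all-ones constant is a bitwise NOT, so fand(fxor(X, -1), Y) keeps exactly the bits of Y that are clear in X. A scalar sketch assuming C++20 std::bit_cast; the function name is hypothetical and this is illustrative only, not part of this file:

#include <bit>
#include <cstdint>

// Scalar model of X86ISD::FANDN: (~X) & Y computed on the raw float bits.
float fandnScalar(float X, float Y) {
  uint32_t ix = std::bit_cast<uint32_t>(X);
  uint32_t iy = std::bit_cast<uint32_t>(Y);
  return std::bit_cast<float>(~ix & iy);
}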
53762
53763/// Do target-specific dag combines on X86ISD::FAND nodes.
53764static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53765 const X86Subtarget &Subtarget) {
53766 // FAND(0.0, x) -> 0.0
53767 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53768 return V;
53769
53770 // FAND(x, 0.0) -> 0.0
53771 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53772 return V;
53773
53774 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53775 return V;
53776
53777 return lowerX86FPLogicOp(N, DAG, Subtarget);
53778}
53779
53780/// Do target-specific dag combines on X86ISD::FANDN nodes.
53781static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53782 const X86Subtarget &Subtarget) {
53783 // FANDN(0.0, x) -> x
53784 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53785 return N->getOperand(1);
53786
53787 // FANDN(x, 0.0) -> 0.0
53788 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53789 return V;
53790
53791 return lowerX86FPLogicOp(N, DAG, Subtarget);
53792}
53793
53794/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53795static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53796 TargetLowering::DAGCombinerInfo &DCI,
53797 const X86Subtarget &Subtarget) {
53798 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53799
53800 // F[X]OR(0.0, x) -> x
53801 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53802 return N->getOperand(1);
53803
53804 // F[X]OR(x, 0.0) -> x
53805 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53806 return N->getOperand(0);
53807
53808 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53809 return NewVal;
53810
53811 return lowerX86FPLogicOp(N, DAG, Subtarget);
53812}
53813
53814/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53815static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53816 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53817
53818 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53819 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53820 !DAG.getTarget().Options.NoSignedZerosFPMath)
53821 return SDValue();
53822
53823 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53824 // into FMINC and FMAXC, which are commutative operations.
53825 unsigned NewOp = 0;
53826 switch (N->getOpcode()) {
53827 default: llvm_unreachable("unknown opcode");
53828 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53829 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53830 }
53831
53832 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53833 N->getOperand(0), N->getOperand(1));
53834}
53835
53836static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53837 const X86Subtarget &Subtarget) {
53838 EVT VT = N->getValueType(0);
53839 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53840 return SDValue();
53841
53842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53843
53844 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53845 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53846 (Subtarget.hasFP16() && VT == MVT::f16) ||
53847 (VT.isVector() && TLI.isTypeLegal(VT))))
53848 return SDValue();
53849
53850 SDValue Op0 = N->getOperand(0);
53851 SDValue Op1 = N->getOperand(1);
53852 SDLoc DL(N);
53853 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53854
53855 // If we don't have to respect NaN inputs, this is a direct translation to x86
53856 // min/max instructions.
53857 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53858 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53859
53860 // If one of the operands is known non-NaN use the native min/max instructions
53861 // with the non-NaN input as second operand.
53862 if (DAG.isKnownNeverNaN(Op1))
53863 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53864 if (DAG.isKnownNeverNaN(Op0))
53865 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53866
53867 // If we have to respect NaN inputs, this takes at least 3 instructions.
53868 // Favor a library call when operating on a scalar and minimizing code size.
53869 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53870 return SDValue();
53871
53872 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53873 VT);
53874
53875 // There are 4 possibilities involving NaN inputs, and these are the required
53876 // outputs:
53877 // Op1
53878 // Num NaN
53879 // ----------------
53880 // Num | Max | Op0 |
53881 // Op0 ----------------
53882 // NaN | Op1 | NaN |
53883 // ----------------
53884 //
53885 // The SSE FP max/min instructions were not designed for this case, but rather
53886 // to implement:
53887 // Min = Op1 < Op0 ? Op1 : Op0
53888 // Max = Op1 > Op0 ? Op1 : Op0
53889 //
53890 // So they always return Op0 if either input is a NaN. However, we can still
53891 // use those instructions for fmaxnum by selecting away a NaN input.
53892
53893 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53894 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53895 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53896
53897 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53898 // are NaN, the NaN value of Op1 is the result.
53899 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53900}
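
The table and comments above describe how an SSE max, which passes Op0 through whenever an input is NaN, plus one unordered compare and select yields fmaxnum-style behaviour. A scalar model of the emitted sequence; the helper name is hypothetical and this is illustrative only, not the DAG code itself:

#include <cmath>

// Models DAG.getNode(FMAX, Op1, Op0) followed by the SETUO compare + select:
// the C++ '>' is false when either operand is NaN, so Op0 passes through, and
// the final select replaces a NaN Op0 with Op1 (if both are NaN, Op1 wins).
float fmaxnumViaSSEMax(float Op0, float Op1) {
  float MinOrMax = (Op1 > Op0) ? Op1 : Op0; // MAXSS-like: Op0 on NaN
  return std::isnan(Op0) ? Op1 : MinOrMax;  // select away a NaN Op0
}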
53901
53902static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53903 TargetLowering::DAGCombinerInfo &DCI) {
53904 EVT VT = N->getValueType(0);
53905 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53906
53907 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53908 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53909 return SDValue(N, 0);
53910
53911 // Convert a full vector load into vzload when not all bits are needed.
53912 SDValue In = N->getOperand(0);
53913 MVT InVT = In.getSimpleValueType();
53914 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53915 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53916 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53917 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53918 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53919 MVT MemVT = MVT::getIntegerVT(NumBits);
53920 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53921 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53922 SDLoc dl(N);
53923 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53924 DAG.getBitcast(InVT, VZLoad));
53925 DCI.CombineTo(N, Convert);
53926 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53927 DCI.recursivelyDeleteUnusedNodes(LN);
53928 return SDValue(N, 0);
53929 }
53930 }
53931
53932 return SDValue();
53933}
53934
53935static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53936 TargetLowering::DAGCombinerInfo &DCI) {
53937 bool IsStrict = N->isTargetStrictFPOpcode();
53938 EVT VT = N->getValueType(0);
53939
53940 // Convert a full vector load into vzload when not all bits are needed.
53941 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53942 MVT InVT = In.getSimpleValueType();
53943 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53944 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53945 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53946 LoadSDNode *LN = cast<LoadSDNode>(In);
53947 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53948 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53949 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53950 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53951 SDLoc dl(N);
53952 if (IsStrict) {
53953 SDValue Convert =
53954 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53955 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53956 DCI.CombineTo(N, Convert, Convert.getValue(1));
53957 } else {
53958 SDValue Convert =
53959 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53960 DCI.CombineTo(N, Convert);
53961 }
53962 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53963 DCI.recursivelyDeleteUnusedNodes(LN);
53964 return SDValue(N, 0);
53965 }
53966 }
53967
53968 return SDValue();
53969}
53970
53971/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53972static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53973 TargetLowering::DAGCombinerInfo &DCI,
53974 const X86Subtarget &Subtarget) {
53975 SDValue N0 = N->getOperand(0);
53976 SDValue N1 = N->getOperand(1);
53977 MVT VT = N->getSimpleValueType(0);
53978 int NumElts = VT.getVectorNumElements();
53979 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53980
53981 // ANDNP(undef, x) -> 0
53982 // ANDNP(x, undef) -> 0
53983 if (N0.isUndef() || N1.isUndef())
53984 return DAG.getConstant(0, SDLoc(N), VT);
53985
53986 // ANDNP(0, x) -> x
53987 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53988 return N1;
53989
53990 // ANDNP(x, 0) -> 0
53991 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), VT);
53993
53994 // Turn ANDNP back to AND if input is inverted.
53995 if (SDValue Not = IsNOT(N0, DAG))
53996 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53997
53998 // Constant Folding
53999 APInt Undefs0, Undefs1;
54000 SmallVector<APInt> EltBits0, EltBits1;
54001 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
54002 SDLoc DL(N);
54003 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
54004 SmallVector<APInt> ResultBits;
54005 for (int I = 0; I != NumElts; ++I)
54006 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54007 return getConstVector(ResultBits, VT, DAG, DL);
54008 }
54009
54010 // Constant fold NOT(N0) to allow us to use AND.
54011 // Ensure this is only performed if we can confirm that the bitcasted source
54012 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
54013 if (N0->hasOneUse()) {
54014 SDValue BC0 = peekThroughOneUseBitcasts(N0);
54015 if (BC0.getOpcode() != ISD::BITCAST) {
54016 for (APInt &Elt : EltBits0)
54017 Elt = ~Elt;
54018 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54019 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54020 }
54021 }
54022 }
54023
54024 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54025 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54026 SDValue Op(N, 0);
54027 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54028 return Res;
54029
54030 // If either operand is a constant mask, then only the elements that aren't
54031 // zero are actually demanded by the other operand.
54032 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54033 APInt UndefElts;
54034 SmallVector<APInt> EltBits;
54035 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54036 APInt DemandedElts = APInt::getAllOnes(NumElts);
54037 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54038 EltBits)) {
54039 DemandedBits.clearAllBits();
54040 DemandedElts.clearAllBits();
54041 for (int I = 0; I != NumElts; ++I) {
54042 if (UndefElts[I]) {
54043 // We can't assume an undef src element gives an undef dst - the
54044 // other src might be zero.
54045 DemandedBits.setAllBits();
54046 DemandedElts.setBit(I);
54047 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54048 (!Invert && !EltBits[I].isZero())) {
54049 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54050 DemandedElts.setBit(I);
54051 }
54052 }
54053 }
54054 return std::make_pair(DemandedBits, DemandedElts);
54055 };
54056 APInt Bits0, Elts0;
54057 APInt Bits1, Elts1;
54058 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54059 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54060
54061 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54062 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54063 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54064 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54065 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54066 if (N->getOpcode() != ISD::DELETED_NODE)
54067 DCI.AddToWorklist(N);
54068 return SDValue(N, 0);
54069 }
54070 }
54071
54072 return SDValue();
54073}
54074
54075static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54076 TargetLowering::DAGCombinerInfo &DCI) {
54077 SDValue N1 = N->getOperand(1);
54078
54079 // BT ignores high bits in the bit index operand.
54080 unsigned BitWidth = N1.getValueSizeInBits();
54081 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
54082 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54083 if (N->getOpcode() != ISD::DELETED_NODE)
54084 DCI.AddToWorklist(N);
54085 return SDValue(N, 0);
54086 }
54087
54088 return SDValue();
54089}
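
combineBT relies on the hardware behaviour that a register-form BT only consumes the low log2(width) bits of the bit-index operand, which is why only those bits are demanded above. A BT-like scalar sketch with a hypothetical helper name; illustrative only, not part of this file:

#include <cstdint>

// Bit test on a 32-bit value: the index is taken modulo 32, so simplifying
// the high bits of Idx cannot change the result.
bool testBit32(uint32_t Val, uint32_t Idx) {
  return (Val >> (Idx & 31u)) & 1u;
}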
54090
54091static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54092 TargetLowering::DAGCombinerInfo &DCI) {
54093 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54094 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54095
54096 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54098 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54099 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54100 if (N->getOpcode() != ISD::DELETED_NODE)
54101 DCI.AddToWorklist(N);
54102 return SDValue(N, 0);
54103 }
54104
54105 // Convert a full vector load into vzload when not all bits are needed.
54106 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54107 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54108 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54109 SDLoc dl(N);
54110 if (IsStrict) {
54111 SDValue Convert = DAG.getNode(
54112 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54113 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54114 DCI.CombineTo(N, Convert, Convert.getValue(1));
54115 } else {
54116 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54117 DAG.getBitcast(MVT::v8i16, VZLoad));
54118 DCI.CombineTo(N, Convert);
54119 }
54120
54121 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54122 DCI.recursivelyDeleteUnusedNodes(LN);
54123 return SDValue(N, 0);
54124 }
54125 }
54126 }
54127
54128 return SDValue();
54129}
54130
54131// Try to combine sext_in_reg of a cmov of constants by extending the constants.
54132static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54133 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54134
54135 EVT DstVT = N->getValueType(0);
54136
54137 SDValue N0 = N->getOperand(0);
54138 SDValue N1 = N->getOperand(1);
54139 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54140
54141 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54142 return SDValue();
54143
54144 // Look through single use any_extends / truncs.
54145 SDValue IntermediateBitwidthOp;
54146 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54147 N0.hasOneUse()) {
54148 IntermediateBitwidthOp = N0;
54149 N0 = N0.getOperand(0);
54150 }
54151
54152 // See if we have a single use cmov.
54153 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54154 return SDValue();
54155
54156 SDValue CMovOp0 = N0.getOperand(0);
54157 SDValue CMovOp1 = N0.getOperand(1);
54158
54159 // Make sure both operands are constants.
54160 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54161 !isa<ConstantSDNode>(CMovOp1.getNode()))
54162 return SDValue();
54163
54164 SDLoc DL(N);
54165
54166 // If we looked through an any_extend/trunc above, add one to the constants.
54167 if (IntermediateBitwidthOp) {
54168 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54169 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54170 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54171 }
54172
54173 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54174 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54175
54176 EVT CMovVT = DstVT;
54177 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54178 if (DstVT == MVT::i16) {
54179 CMovVT = MVT::i32;
54180 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54181 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54182 }
54183
54184 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54185 N0.getOperand(2), N0.getOperand(3));
54186
54187 if (CMovVT != DstVT)
54188 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54189
54190 return CMov;
54191}
54192
54193static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54194 const X86Subtarget &Subtarget) {
54195 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54196
54197 if (SDValue V = combineSextInRegCmov(N, DAG))
54198 return V;
54199
54200 EVT VT = N->getValueType(0);
54201 SDValue N0 = N->getOperand(0);
54202 SDValue N1 = N->getOperand(1);
54203 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54204 SDLoc dl(N);
54205
54206 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
54207 // SSE and AVX2 since there is no sign-extended shift right
54208 // operation on a vector with 64-bit elements.
54209 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
54210 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
54211 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54212 N0.getOpcode() == ISD::SIGN_EXTEND)) {
54213 SDValue N00 = N0.getOperand(0);
54214
54215 // EXTLOAD has a better solution on AVX2:
54216 // it may be replaced with an X86ISD::VSEXT node.
54217 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
54218 if (!ISD::isNormalLoad(N00.getNode()))
54219 return SDValue();
54220
54221 // Attempt to promote any comparison mask ops before moving the
54222 // SIGN_EXTEND_INREG in the way.
54223 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
54224 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
54225
54226 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
54227 SDValue Tmp =
54228 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
54229 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
54230 }
54231 }
54232 return SDValue();
54233}
54234
54235/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
54236/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
54237/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
54238/// opportunities to combine math ops, use an LEA, or use a complex addressing
54239/// mode. This can eliminate extend, add, and shift instructions.
54240static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
54241 const X86Subtarget &Subtarget) {
54242 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
54243 Ext->getOpcode() != ISD::ZERO_EXTEND)
54244 return SDValue();
54245
54246 // TODO: This should be valid for other integer types.
54247 EVT VT = Ext->getValueType(0);
54248 if (VT != MVT::i64)
54249 return SDValue();
54250
54251 SDValue Add = Ext->getOperand(0);
54252 if (Add.getOpcode() != ISD::ADD)
54253 return SDValue();
54254
54255 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54256 bool NSW = Add->getFlags().hasNoSignedWrap();
54257 bool NUW = Add->getFlags().hasNoUnsignedWrap();
54258
54259 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54260 // into the 'zext'.
54261 if ((Sext && !NSW) || (!Sext && !NUW))
54262 return SDValue();
54263
54264 // Having a constant operand to the 'add' ensures that we are not increasing
54265 // the instruction count because the constant is extended for free below.
54266 // A constant operand can also become the displacement field of an LEA.
54267 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54268 if (!AddOp1)
54269 return SDValue();
54270
54271 // Don't make the 'add' bigger if there's no hope of combining it with some
54272 // other 'add' or 'shl' instruction.
54273 // TODO: It may be profitable to generate simpler LEA instructions in place
54274 // of single 'add' instructions, but the cost model for selecting an LEA
54275 // currently has a high threshold.
54276 bool HasLEAPotential = false;
54277 for (auto *User : Ext->uses()) {
54278 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54279 HasLEAPotential = true;
54280 break;
54281 }
54282 }
54283 if (!HasLEAPotential)
54284 return SDValue();
54285
54286 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54287 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54288 SDValue AddOp0 = Add.getOperand(0);
54289 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54290 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54291
54292 // The wider add is guaranteed to not wrap because both operands are
54293 // sign-extended.
54294 SDNodeFlags Flags;
54295 Flags.setNoSignedWrap(NSW);
54296 Flags.setNoUnsignedWrap(NUW);
54297 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54298}
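
The promotion above is justified by the no-wrap flags: when the narrow add cannot overflow, extending first and adding the extended constant gives the same value, and the constant can then fold into an LEA displacement (the zext/nuw case is analogous). A small sketch using an arbitrary example constant and hypothetical names; illustrative only, not part of this file:

#include <cstdint>

// With 'add nsw' semantics (no signed overflow), these two are equal:
//   sext(add_nsw(x, 20))  ==  add(sext(x), 20)
int64_t addThenExtend(int32_t x) {
  return static_cast<int64_t>(x + 20);   // narrow add, then sign extend
}
int64_t extendThenAdd(int32_t x) {
  return static_cast<int64_t>(x) + 20;   // sign extend, then wide add
}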
54299
54300// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54301// operands and the result of CMOV is not used anywhere else - promote CMOV
54302// itself instead of promoting its result. This could be beneficial, because:
54303// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54304// (or more) pseudo-CMOVs only when they go one-after-another and
54305// getting rid of result extension code after CMOV will help that.
54306// 2) Promotion of constant CMOV arguments is free, hence the
54307// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54308 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
54309 // this promotion is also good in terms of code size.
54310 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
54311 // promotion).
54312static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54313 SDValue CMovN = Extend->getOperand(0);
54314 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54315 return SDValue();
54316
54317 EVT TargetVT = Extend->getValueType(0);
54318 unsigned ExtendOpcode = Extend->getOpcode();
54319 SDLoc DL(Extend);
54320
54321 EVT VT = CMovN.getValueType();
54322 SDValue CMovOp0 = CMovN.getOperand(0);
54323 SDValue CMovOp1 = CMovN.getOperand(1);
54324
54325 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54326 !isa<ConstantSDNode>(CMovOp1.getNode()))
54327 return SDValue();
54328
54329 // Only extend to i32 or i64.
54330 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54331 return SDValue();
54332
54333 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54334 // are free.
54335 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54336 return SDValue();
54337
54338 // If this is a zero extend to i64, we should only extend to i32 and use a free
54339 // zero extend to finish.
54340 EVT ExtendVT = TargetVT;
54341 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54342 ExtendVT = MVT::i32;
54343
54344 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54345 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54346
54347 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54348 CMovN.getOperand(2), CMovN.getOperand(3));
54349
54350 // Finish extending if needed.
54351 if (ExtendVT != TargetVT)
54352 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54353
54354 return Res;
54355}
54356
54357// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54358// result type.
54359static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54360 const X86Subtarget &Subtarget) {
54361 SDValue N0 = N->getOperand(0);
54362 EVT VT = N->getValueType(0);
54363 SDLoc dl(N);
54364
54365 // Only do this combine with AVX512 for vector extends.
54366 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54367 return SDValue();
54368
54369 // Only combine legal element types.
54370 EVT SVT = VT.getVectorElementType();
54371 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54372 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54373 return SDValue();
54374
54375 // We don't have a CMPP instruction for vXf16.
54376 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54377 return SDValue();
54378 // We can only do this if the vector size is 256 bits or less.
54379 unsigned Size = VT.getSizeInBits();
54380 if (Size > 256 && Subtarget.useAVX512Regs())
54381 return SDValue();
54382
54383 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54384 // those are the only integer compares we have.
54385 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54386 if (ISD::isUnsignedIntSetCC(CC))
54387 return SDValue();
54388
54389 // Only do this combine if the extension will be fully consumed by the setcc.
54390 EVT N00VT = N0.getOperand(0).getValueType();
54391 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54392 if (Size != MatchingVecType.getSizeInBits())
54393 return SDValue();
54394
54395 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54396
54397 if (N->getOpcode() == ISD::ZERO_EXTEND)
54398 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54399
54400 return Res;
54401}
54402
54403static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54404 TargetLowering::DAGCombinerInfo &DCI,
54405 const X86Subtarget &Subtarget) {
54406 SDValue N0 = N->getOperand(0);
54407 EVT VT = N->getValueType(0);
54408 SDLoc DL(N);
54409
54410 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54411 if (!DCI.isBeforeLegalizeOps() &&
54412 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54413 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54414 N0->getOperand(1));
54415 bool ReplaceOtherUses = !N0.hasOneUse();
54416 DCI.CombineTo(N, Setcc);
54417 // Replace other uses with a truncate of the widened setcc_carry.
54418 if (ReplaceOtherUses) {
54419 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54420 N0.getValueType(), Setcc);
54421 DCI.CombineTo(N0.getNode(), Trunc);
54422 }
54423
54424 return SDValue(N, 0);
54425 }
54426
54427 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54428 return NewCMov;
54429
54430 if (!DCI.isBeforeLegalizeOps())
54431 return SDValue();
54432
54433 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54434 return V;
54435
54436 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54437 DAG, DCI, Subtarget))
54438 return V;
54439
54440 if (VT.isVector()) {
54441 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54442 return R;
54443
54444 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54445 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54446 }
54447
54448 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54449 return NewAdd;
54450
54451 return SDValue();
54452}
54453
54454static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54455 TargetLowering::DAGCombinerInfo &DCI,
54456 const X86Subtarget &Subtarget) {
54457 SDLoc dl(N);
54458 EVT VT = N->getValueType(0);
54459 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54460
54461 // Let legalize expand this if it isn't a legal type yet.
54462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54463 if (!TLI.isTypeLegal(VT))
54464 return SDValue();
54465
54466 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54467 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54468 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54469
54470 // If the operation allows fast-math and the target does not support FMA,
54471 // split this into mul+add to avoid libcall(s).
54472 SDNodeFlags Flags = N->getFlags();
54473 if (!IsStrict && Flags.hasAllowReassociation() &&
54474 TLI.isOperationExpand(ISD::FMA, VT)) {
54475 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54476 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54477 }
54478
54479 EVT ScalarVT = VT.getScalarType();
54480 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54481 !Subtarget.hasAnyFMA()) &&
54482 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54483 return SDValue();
54484
54485 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54486 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54487 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54488 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54489 CodeSize)) {
54490 V = NegV;
54491 return true;
54492 }
54493 // Look through extract_vector_elts. If it comes from an FNEG, create a
54494 // new extract from the FNEG input.
54495 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54496 isNullConstant(V.getOperand(1))) {
54497 SDValue Vec = V.getOperand(0);
54498 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54499 Vec, DAG, LegalOperations, CodeSize)) {
54500 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54501 NegV, V.getOperand(1));
54502 return true;
54503 }
54504 }
54505
54506 return false;
54507 };
54508
54509 // Do not convert the passthru input of scalar intrinsics.
54510 // FIXME: We could allow negations of the lower element only.
54511 bool NegA = invertIfNegative(A);
54512 bool NegB = invertIfNegative(B);
54513 bool NegC = invertIfNegative(C);
54514
54515 if (!NegA && !NegB && !NegC)
54516 return SDValue();
54517
54518 unsigned NewOpcode =
54519 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54520
54521 // Propagate fast-math-flags to new FMA node.
54522 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54523 if (IsStrict) {
54524 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54525 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54526 {N->getOperand(0), A, B, C});
54527 } else {
54528 if (N->getNumOperands() == 4)
54529 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54530 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54531 }
54532}
54533
54534// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54535// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54536static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54537 TargetLowering::DAGCombinerInfo &DCI) {
54538 SDLoc dl(N);
54539 EVT VT = N->getValueType(0);
54540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54541 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54542 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54543
54544 SDValue N2 = N->getOperand(2);
54545
54546 SDValue NegN2 =
54547 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54548 if (!NegN2)
54549 return SDValue();
54550 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54551
54552 if (N->getNumOperands() == 4)
54553 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54554 NegN2, N->getOperand(3));
54555 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54556 NegN2);
54557}
54558
54559static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54560 TargetLowering::DAGCombinerInfo &DCI,
54561 const X86Subtarget &Subtarget) {
54562 SDLoc dl(N);
54563 SDValue N0 = N->getOperand(0);
54564 EVT VT = N->getValueType(0);
54565
54566 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54567 // FIXME: Is this needed? We don't seem to have any tests for it.
54568 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54569 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54570 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54571 N0->getOperand(1));
54572 bool ReplaceOtherUses = !N0.hasOneUse();
54573 DCI.CombineTo(N, Setcc);
54574 // Replace other uses with a truncate of the widened setcc_carry.
54575 if (ReplaceOtherUses) {
54576 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54577 N0.getValueType(), Setcc);
54578 DCI.CombineTo(N0.getNode(), Trunc);
54579 }
54580
54581 return SDValue(N, 0);
54582 }
54583
54584 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54585 return NewCMov;
54586
54587 if (DCI.isBeforeLegalizeOps())
54588 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54589 return V;
54590
54591 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54592 DAG, DCI, Subtarget))
54593 return V;
54594
54595 if (VT.isVector())
54596 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54597 return R;
54598
54599 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54600 return NewAdd;
54601
54602 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54603 return R;
54604
54605 // TODO: Combine with any target/faux shuffle.
54606 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54607 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54608 SDValue N00 = N0.getOperand(0);
54609 SDValue N01 = N0.getOperand(1);
54610 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54611 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54612 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54613 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54614 return concatSubVectors(N00, N01, DAG, dl);
54615 }
54616 }
54617
54618 return SDValue();
54619}
54620
54621 /// If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
54622/// pre-promote its result type since vXi1 vectors don't get promoted
54623/// during type legalization.
54624static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54625 SDValue RHS, ISD::CondCode CC,
54626 const SDLoc &DL, SelectionDAG &DAG,
54627 const X86Subtarget &Subtarget) {
54628 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54629 VT.getVectorElementType() == MVT::i1 &&
54630 (OpVT.getVectorElementType() == MVT::i8 ||
54631 OpVT.getVectorElementType() == MVT::i16)) {
54632 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54633 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54634 }
54635 return SDValue();
54636}
54637
54638static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54639 TargetLowering::DAGCombinerInfo &DCI,
54640 const X86Subtarget &Subtarget) {
54641 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54642 const SDValue LHS = N->getOperand(0);
54643 const SDValue RHS = N->getOperand(1);
54644 EVT VT = N->getValueType(0);
54645 EVT OpVT = LHS.getValueType();
54646 SDLoc DL(N);
54647
54648 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54649 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54650 Subtarget))
54651 return V;
54652
54653 if (VT == MVT::i1) {
54654 X86::CondCode X86CC;
54655 if (SDValue V =
54656 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54657 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54658 }
54659
54660 if (OpVT.isScalarInteger()) {
54661 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54662 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54663 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54664 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54665 if (N0.getOperand(0) == N1)
54666 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54667 N0.getOperand(1));
54668 if (N0.getOperand(1) == N1)
54669 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54670 N0.getOperand(0));
54671 }
54672 return SDValue();
54673 };
54674 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54675 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54676 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54677 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54678
54679 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54680 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54681 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54682 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54683 if (N0.getOperand(0) == N1)
54684 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54685 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54686 if (N0.getOperand(1) == N1)
54687 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54688 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54689 }
54690 return SDValue();
54691 };
54692 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54693 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54694 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54695 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54696
54697 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54698 // cmpne(trunc(x),C) --> cmpne(x,C)
54699 // iff x upper bits are zero.
54700 if (LHS.getOpcode() == ISD::TRUNCATE &&
54701 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54702 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
54703 EVT SrcVT = LHS.getOperand(0).getValueType();
54704 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54705 OpVT.getScalarSizeInBits());
54706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54707 auto *C = cast<ConstantSDNode>(RHS);
54708 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54709 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54710 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54711 DAG.getConstant(C->getAPIntValue().zextOrTrunc(
54712 SrcVT.getScalarSizeInBits()),
54713 DL, SrcVT),
54714 CC);
54715 }
54716
54717 // With C as a power of 2 and C != 0 and C != INT_MIN:
54718 // icmp eq Abs(X) C ->
54719 // (icmp eq A, C) | (icmp eq A, -C)
54720 // icmp ne Abs(X) C ->
54721 // (icmp ne A, C) & (icmp ne A, -C)
54722 // Both of these patterns can be better optimized in
54723 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54724 // integers which is checked above.
54725 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54726 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54727 const APInt &CInt = C->getAPIntValue();
54728 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54729 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54730 SDValue BaseOp = LHS.getOperand(0);
54731 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54732 SDValue SETCC1 = DAG.getSetCC(
54733 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54734 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54735 SETCC0, SETCC1);
54736 }
54737 }
54738 }
54739 }
54740 }
54741
54742 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54743 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54744 // Using temporaries to avoid messing up operand ordering for later
54745 // transformations if this doesn't work.
54746 SDValue Op0 = LHS;
54747 SDValue Op1 = RHS;
54748 ISD::CondCode TmpCC = CC;
54749 // Put build_vector on the right.
54750 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54751 std::swap(Op0, Op1);
54752 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54753 }
54754
54755 bool IsSEXT0 =
54756 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54757 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54758 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54759
54760 if (IsSEXT0 && IsVZero1) {
54761 assert(VT == Op0.getOperand(0).getValueType() &&
54762 "Unexpected operand type");
54763 if (TmpCC == ISD::SETGT)
54764 return DAG.getConstant(0, DL, VT);
54765 if (TmpCC == ISD::SETLE)
54766 return DAG.getConstant(1, DL, VT);
54767 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54768 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54769
54770 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54771 "Unexpected condition code!");
54772 return Op0.getOperand(0);
54773 }
54774 }
54775
54776 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54777 // only ordered vector compare is the signed `PCMPGT`, and on AVX512 it's often
54778 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
54779 // it's going to a mask, there are signed AVX512 comparisons).
54780 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54781 bool CanMakeSigned = false;
54782 if (ISD::isUnsignedIntSetCC(CC)) {
54783 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54784 DAG.computeKnownBits(RHS));
54785 // If we know LHS/RHS share the same sign bit at each element we can
54786 // make this signed.
54787 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54788 // across all lanes. So a pattern where the sign varies from lane to
54789 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54790 // missed. We could get around this by demanding each lane
54791 // independently, but this isn't the most important optimization and
54792 // that may eat into compile time.
54793 CanMakeSigned =
54794 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54795 }
54796 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54797 SDValue LHSOut = LHS;
54798 SDValue RHSOut = RHS;
54799 ISD::CondCode NewCC = CC;
54800 switch (CC) {
54801 case ISD::SETGE:
54802 case ISD::SETUGE:
54803 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54804 /*NSW*/ true))
54805 LHSOut = NewLHS;
54806 else if (SDValue NewRHS = incDecVectorConstant(
54807 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54808 RHSOut = NewRHS;
54809 else
54810 break;
54811
54812 [[fallthrough]];
54813 case ISD::SETUGT:
54814 NewCC = ISD::SETGT;
54815 break;
54816
54817 case ISD::SETLE:
54818 case ISD::SETULE:
54819 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54820 /*NSW*/ true))
54821 LHSOut = NewLHS;
54822 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54823 /*NSW*/ true))
54824 RHSOut = NewRHS;
54825 else
54826 break;
54827
54828 [[fallthrough]];
54829 case ISD::SETULT:
54830 // Will be swapped to SETGT in LowerVSETCC*.
54831 NewCC = ISD::SETLT;
54832 break;
54833 default:
54834 break;
54835 }
54836 if (NewCC != CC) {
54837 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54838 NewCC, DL, DAG, Subtarget))
54839 return R;
54840 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54841 }
54842 }
54843 }
54844
54845 if (SDValue R =
54846 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54847 return R;
54848
54849 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54850 // to avoid scalarization via legalization because v4i32 is not a legal type.
54851 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54852 LHS.getValueType() == MVT::v4f32)
54853 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54854
54855 // X pred 0.0 --> X pred -X
54856 // If the negation of X already exists, use it in the comparison. This removes
54857 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54858 // instructions in patterns with a 'select' node.
54859 if (isNullFPScalarOrVectorConst(RHS)) {
54860 SDVTList FNegVT = DAG.getVTList(OpVT);
54861 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54862 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54863 }
54864
54865 return SDValue();
54866}
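
Several of the scalar-integer folds above are plain bit identities, e.g. cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0): the OR equals X exactly when Y contributes no bits outside X. A quick illustration with hypothetical helper names, not part of this file:

#include <cstdint>

// (X | Y) == X holds exactly when (~X & Y) == 0, i.e. Y sets no bit that X
// does not already have - the identity behind the MatchOrCmpEq fold above.
bool orEqualsLHS(uint32_t X, uint32_t Y) { return (X | Y) == X; }
bool andnotIsZero(uint32_t X, uint32_t Y) { return (~X & Y) == 0; }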
54867
54868static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54869 TargetLowering::DAGCombinerInfo &DCI,
54870 const X86Subtarget &Subtarget) {
54871 SDValue Src = N->getOperand(0);
54872 MVT SrcVT = Src.getSimpleValueType();
54873 MVT VT = N->getSimpleValueType(0);
54874 unsigned NumBits = VT.getScalarSizeInBits();
54875 unsigned NumElts = SrcVT.getVectorNumElements();
54876 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54877 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54878
54879 // Perform constant folding.
54880 APInt UndefElts;
54881 SmallVector<APInt, 32> EltBits;
54882 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54883 APInt Imm(32, 0);
54884 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54885 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54886 Imm.setBit(Idx);
54887
54888 return DAG.getConstant(Imm, SDLoc(N), VT);
54889 }
54890
54891 // Look through int->fp bitcasts that don't change the element width.
54892 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54893 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54894 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54895 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54896
54897 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54898 // with scalar comparisons.
54899 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54900 SDLoc DL(N);
54901 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54902 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54903 return DAG.getNode(ISD::XOR, DL, VT,
54904 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54905 DAG.getConstant(NotMask, DL, VT));
54906 }
54907
54908 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54909 // results with scalar comparisons.
54910 if (Src.getOpcode() == X86ISD::PCMPGT &&
54911 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54912 SDLoc DL(N);
54913 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54914 return DAG.getNode(ISD::XOR, DL, VT,
54915 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54916 DAG.getConstant(NotMask, DL, VT));
54917 }
54918
54919 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54920 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54921 // iff pow2splat(c1).
54922 // Use KnownBits to determine if only a single bit is non-zero
54923 // in each element (pow2 or zero), and shift that bit to the msb.
54924 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54925 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54926 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54927 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54928 if (KnownLHS.countMaxPopulation() == 1 &&
54929 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54930 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54931 SDLoc DL(N);
54932 MVT ShiftVT = SrcVT;
54933 SDValue ShiftLHS = Src.getOperand(0);
54934 SDValue ShiftRHS = Src.getOperand(1);
54935 if (ShiftVT.getScalarType() == MVT::i8) {
54936 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54937 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54938 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54939 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54940 }
54941 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54942 ShiftLHS, ShiftAmt, DAG);
54943 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54944 ShiftRHS, ShiftAmt, DAG);
54945 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54946 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54947 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54948 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54949 }
54950 }
54951
54952 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54953 if (N->isOnlyUserOf(Src.getNode())) {
54954 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54955 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54956 APInt UndefElts;
54957 SmallVector<APInt, 32> EltBits;
54958 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54959 UndefElts, EltBits)) {
54960 APInt Mask = APInt::getZero(NumBits);
54961 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54962 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54963 Mask.setBit(Idx);
54964 }
54965 SDLoc DL(N);
54966 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54967 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54968 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54969 DAG.getConstant(Mask, DL, VT));
54970 }
54971 }
54972 }
54973
54974 // Simplify the inputs.
54975 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54976 APInt DemandedMask(APInt::getAllOnes(NumBits));
54977 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54978 return SDValue(N, 0);
54979
54980 return SDValue();
54981}
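// Illustrative sketch, not part of the LLVM sources: the movmsk(not(x)) ->
// not(movmsk(x)) fold above rests on the sign bit of ~x being the complement of
// the sign bit of x. A standalone per-lane model, assuming 8 x i16 lanes
// (helper names are made up for the illustration).
#include <cassert>
#include <cstdint>

static unsigned movmsk8x16(const uint16_t Lanes[8]) {
  unsigned Mask = 0;
  for (unsigned I = 0; I != 8; ++I)
    Mask |= unsigned(Lanes[I] >> 15) << I; // collect each lane's sign bit
  return Mask;
}

static void checkMovmskOfNot(const uint16_t Lanes[8]) {
  uint16_t Not[8];
  for (unsigned I = 0; I != 8; ++I)
    Not[I] = uint16_t(~Lanes[I]);
  // movmsk(~x) == movmsk(x) ^ low-NumElts-bits mask (NotMask in the code above).
  assert(movmsk8x16(Not) == (movmsk8x16(Lanes) ^ 0xFFu));
}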
54982
54983static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54984 TargetLowering::DAGCombinerInfo &DCI,
54985 const X86Subtarget &Subtarget) {
54986 MVT VT = N->getSimpleValueType(0);
54987 unsigned NumBits = VT.getScalarSizeInBits();
54988
54989 // Simplify the inputs.
54990 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54991 APInt DemandedMask(APInt::getAllOnes(NumBits));
54992 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54993 return SDValue(N, 0);
54994
54995 return SDValue();
54996}
54997
54998static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54999 TargetLowering::DAGCombinerInfo &DCI,
55000 const X86Subtarget &Subtarget) {
55001 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55002 SDValue BasePtr = MemOp->getBasePtr();
55003 SDValue Index = MemOp->getIndex();
55004 SDValue Scale = MemOp->getScale();
55005 SDValue Mask = MemOp->getMask();
55006
55007 // Attempt to fold an index scale into the scale value directly.
55008 // For smaller indices, implicit sext is performed BEFORE scale, preventing
55009 // this fold under most circumstances.
55010 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
55011 if ((Index.getOpcode() == X86ISD::VSHLI ||
55012 (Index.getOpcode() == ISD::ADD &&
55013 Index.getOperand(0) == Index.getOperand(1))) &&
55014 isa<ConstantSDNode>(Scale) &&
55015 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
55016 unsigned ShiftAmt =
55017 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
55018 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55019 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
55020 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
55021 SDValue NewIndex = Index.getOperand(0);
55022 SDValue NewScale =
55023 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
55024 if (N->getOpcode() == X86ISD::MGATHER)
55025 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
55026 MemOp->getOperand(1), Mask,
55027 MemOp->getBasePtr(), NewIndex, NewScale,
55028 MemOp->getChain(), Subtarget);
55029 if (N->getOpcode() == X86ISD::MSCATTER)
55030 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
55031 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
55032 NewIndex, NewScale, MemOp->getChain(), Subtarget);
55033 }
55034 }
55035
55036 // With vector masks we only demand the upper bit of the mask.
55037 if (Mask.getScalarValueSizeInBits() != 1) {
55038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55039 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55040 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55041 if (N->getOpcode() != ISD::DELETED_NODE)
55042 DCI.AddToWorklist(N);
55043 return SDValue(N, 0);
55044 }
55045 }
55046
55047 return SDValue();
55048}
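// Illustrative sketch, not part of the LLVM sources: the index-scale fold above
// uses (Index << Shift) * Scale == Index * (Scale << Shift) together with the
// x86 addressing restriction that the final scale must be 1, 2, 4 or 8. Plain
// 64-bit integers stand in for the vector lanes; names are made up.
#include <cassert>
#include <cstdint>

static bool foldShiftIntoScale(uint64_t ScaleAmt, unsigned ShiftAmt,
                               uint64_t &NewScaleAmt) {
  NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
  bool IsPow2 = NewScaleAmt != 0 && (NewScaleAmt & (NewScaleAmt - 1)) == 0;
  return IsPow2 && NewScaleAmt <= 8; // same legality bound as the combine
}

static void checkScaleFold(uint64_t Index, uint64_t ScaleAmt, unsigned ShiftAmt) {
  uint64_t NewScaleAmt;
  if (ShiftAmt < 64 && foldShiftIntoScale(ScaleAmt, ShiftAmt, NewScaleAmt))
    assert((Index << ShiftAmt) * ScaleAmt == Index * NewScaleAmt);
}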
55049
55050static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
55051 SDValue Index, SDValue Base, SDValue Scale,
55052 SelectionDAG &DAG) {
55053 SDLoc DL(GorS);
55054
55055 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55056 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55057 Gather->getMask(), Base, Index, Scale } ;
55058 return DAG.getMaskedGather(Gather->getVTList(),
55059 Gather->getMemoryVT(), DL, Ops,
55060 Gather->getMemOperand(),
55061 Gather->getIndexType(),
55062 Gather->getExtensionType());
55063 }
55064 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55065 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55066 Scatter->getMask(), Base, Index, Scale };
55067 return DAG.getMaskedScatter(Scatter->getVTList(),
55068 Scatter->getMemoryVT(), DL,
55069 Ops, Scatter->getMemOperand(),
55070 Scatter->getIndexType(),
55071 Scatter->isTruncatingStore());
55072}
55073
55074static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
55075 TargetLowering::DAGCombinerInfo &DCI) {
55076 SDLoc DL(N);
55077 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55078 SDValue Index = GorS->getIndex();
55079 SDValue Base = GorS->getBasePtr();
55080 SDValue Scale = GorS->getScale();
55081
55082 if (DCI.isBeforeLegalize()) {
55083 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55084
55085 // Shrink constant indices if they are larger than 32 bits.
55086 // Only do this before legalize types since v2i64 could become v2i32.
55087 // FIXME: We could check that the type is legal if we're after legalize
55088 // types, but then we would need to construct test cases where that happens.
55089 // FIXME: We could support more than just constant vectors, but we need to
55090 // be careful with costing. A truncate that can be optimized out would be fine.
55091 // Otherwise we might only want to create a truncate if it avoids a split.
55092 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55093 if (BV->isConstant() && IndexWidth > 32 &&
55094 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55095 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55096 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55097 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55098 }
55099 }
55100
55101 // Shrink any sign/zero extend from a type of 32 bits or smaller to one
55102 // larger than 32 bits if there are sufficient sign bits. Only do this
55103 // before type legalization to avoid creating illegal types in the truncate.
55104 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55105 Index.getOpcode() == ISD::ZERO_EXTEND) &&
55106 IndexWidth > 32 &&
55107 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55108 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55109 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55110 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55111 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55112 }
55113 }
55114
55115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55116 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55117 // Try to move splat constant adders from the index operand to the base
55118 // pointer operand, taking care to multiply by the scale. We can only do
55119 // this when the index element type is the same as the pointer type;
55120 // otherwise we would need to be sure the math doesn't wrap before the scale.
55121 if (Index.getOpcode() == ISD::ADD &&
55122 Index.getValueType().getVectorElementType() == PtrVT &&
55123 isa<ConstantSDNode>(Scale)) {
55124 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55125 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
55126 BitVector UndefElts;
55127 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
55128 // FIXME: Allow non-constant?
55129 if (UndefElts.none()) {
55130 // Apply the scale.
55131 APInt Adder = C->getAPIntValue() * ScaleAmt;
55132 // Add it to the existing base.
55133 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
55134 DAG.getConstant(Adder, DL, PtrVT));
55135 Index = Index.getOperand(0);
55136 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55137 }
55138 }
55139
55140 // It's also possible base is just a constant. In that case, just
55141 // replace it with 0 and move the displacement into the index.
55142 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
55143 isOneConstant(Scale)) {
55144 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
55145 // Combine the constant build_vector and the constant base.
55146 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55147 Index.getOperand(1), Splat);
55148 // Add to the LHS of the original Index add.
55149 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55150 Index.getOperand(0), Splat);
55151 Base = DAG.getConstant(0, DL, Base.getValueType());
55152 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55153 }
55154 }
55155 }
55156
55157 if (DCI.isBeforeLegalizeOps()) {
55158 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55159
55160 // Make sure the index is either i32 or i64
55161 if (IndexWidth != 32 && IndexWidth != 64) {
55162 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
55163 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
55164 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
55165 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55166 }
55167 }
55168
55169 // With vector masks we only demand the upper bit of the mask.
55170 SDValue Mask = GorS->getMask();
55171 if (Mask.getScalarValueSizeInBits() != 1) {
55172 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55173 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55174 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55175 if (N->getOpcode() != ISD::DELETED_NODE)
55176 DCI.AddToWorklist(N);
55177 return SDValue(N, 0);
55178 }
55179 }
55180
55181 return SDValue();
55182}
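// Illustrative sketch, not part of the LLVM sources: moving a splat constant
// adder from the index to the base pointer relies on
//   Base + (Index + C) * Scale == (Base + C * Scale) + Index * Scale,
// modelled here per lane with plain integers (names are made up).
#include <cassert>
#include <cstdint>

static void checkSplatAdderIntoBase(uint64_t Base, uint64_t Index, uint64_t C,
                                    uint64_t ScaleAmt) {
  uint64_t Before = Base + (Index + C) * ScaleAmt; // original address
  uint64_t NewBase = Base + C * ScaleAmt;          // adder folded into the base
  assert(Before == NewBase + Index * ScaleAmt);
}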
55183
55184// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
55185static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
55186 const X86Subtarget &Subtarget) {
55187 SDLoc DL(N);
55188 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
55189 SDValue EFLAGS = N->getOperand(1);
55190
55191 // Try to simplify the EFLAGS and condition code operands.
55192 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
55193 return getSETCC(CC, Flags, DL, DAG);
55194
55195 return SDValue();
55196}
55197
55198/// Optimize branch condition evaluation.
55199static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
55200 const X86Subtarget &Subtarget) {
55201 SDLoc DL(N);
55202 SDValue EFLAGS = N->getOperand(3);
55203 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
55204
55205 // Try to simplify the EFLAGS and condition code operands.
55206 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
55207 // RAUW them under us.
55208 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
55209 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
55210 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
55211 N->getOperand(1), Cond, Flags);
55212 }
55213
55214 return SDValue();
55215}
55216
55217// TODO: Could we move this to DAGCombine?
55218static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
55219 SelectionDAG &DAG) {
55220 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
55221 // to optimize away the operation when it is applied to a constant.
55222 //
55223 // The general transformation is:
55224 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
55225 // AND(VECTOR_CMP(x,y), constant2)
55226 // constant2 = UNARYOP(constant)
55227
55228 // Early exit if this isn't a vector operation, the operand of the
55229 // unary operation isn't a bitwise AND, or if the sizes of the operations
55230 // aren't the same.
55231 EVT VT = N->getValueType(0);
55232 bool IsStrict = N->isStrictFPOpcode();
55233 unsigned NumEltBits = VT.getScalarSizeInBits();
55234 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55235 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
55236 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
55237 VT.getSizeInBits() != Op0.getValueSizeInBits())
55238 return SDValue();
55239
55240 // Now check that the other operand of the AND is a constant. We could
55241 // make the transformation for non-constant splats as well, but it's unclear
55242 // that would be a benefit as it would not eliminate any operations, just
55243 // perform one more step in scalar code before moving to the vector unit.
55244 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
55245 // Bail out if the vector isn't a constant.
55246 if (!BV->isConstant())
55247 return SDValue();
55248
55249 // Everything checks out. Build up the new and improved node.
55250 SDLoc DL(N);
55251 EVT IntVT = BV->getValueType(0);
55252 // Create a new constant of the appropriate type for the transformed
55253 // DAG.
55254 SDValue SourceConst;
55255 if (IsStrict)
55256 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
55257 {N->getOperand(0), SDValue(BV, 0)});
55258 else
55259 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55260 // The AND node needs bitcasts to/from an integer vector type around it.
55261 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55262 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55263 MaskConst);
55264 SDValue Res = DAG.getBitcast(VT, NewAnd);
55265 if (IsStrict)
55266 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55267 return Res;
55268 }
55269
55270 return SDValue();
55271}
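// Illustrative sketch, not part of the LLVM sources: per lane the vector compare
// is 0 or all-ones, so applying the unary op after the AND gives either
// UNARYOP(C) or UNARYOP(0), and UNARYOP(0) is +0.0 with an all-zero bit pattern.
// Taking SINT_TO_FP as the unary op and i32/f32 lanes (an assumption made only
// for this sketch):
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

static void checkLane(int32_t C, bool CmpLaneTrue) {
  uint32_t Mask = CmpLaneTrue ? 0xFFFFFFFFu : 0u; // one lane of VECTOR_CMP(x,y)
  int32_t Masked = CmpLaneTrue ? C : 0;           // AND(VECTOR_CMP, C) per lane
  uint32_t Before = bitsOf(float(Masked));        // UNARYOP(AND(cmp, C))
  uint32_t After = bitsOf(float(C)) & Mask;       // AND(cmp, UNARYOP(C))
  assert(Before == After);
}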
55272
55273/// If we are converting a value to floating-point, try to replace scalar
55274/// truncate of an extracted vector element with a bitcast. This tries to keep
55275/// the sequence on XMM registers rather than moving between vector and GPRs.
55276static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55277 // TODO: This is currently only used by combineSIntToFP, but it is generalized
55278 // to allow being called by any similar cast opcode.
55279 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55280 SDValue Trunc = N->getOperand(0);
55281 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55282 return SDValue();
55283
55284 SDValue ExtElt = Trunc.getOperand(0);
55285 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55286 !isNullConstant(ExtElt.getOperand(1)))
55287 return SDValue();
55288
55289 EVT TruncVT = Trunc.getValueType();
55290 EVT SrcVT = ExtElt.getValueType();
55291 unsigned DestWidth = TruncVT.getSizeInBits();
55292 unsigned SrcWidth = SrcVT.getSizeInBits();
55293 if (SrcWidth % DestWidth != 0)
55294 return SDValue();
55295
55296 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
55297 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55298 unsigned VecWidth = SrcVecVT.getSizeInBits();
55299 unsigned NumElts = VecWidth / DestWidth;
55300 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55301 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55302 SDLoc DL(N);
55303 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55304 BitcastVec, ExtElt.getOperand(1));
55305 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55306}
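// Illustrative sketch, not part of the LLVM sources: on little-endian x86,
// truncating lane 0 of a v2i64 to i32 reads the same bytes as lane 0 of the
// value bitcast to v4i32, which is the rewrite performed above.
#include <cassert>
#include <cstdint>
#include <cstring>

static void checkTruncExtractViaBitcast(const uint64_t Vec[2]) {
  uint32_t Trunc = uint32_t(Vec[0]);      // trunc(extract_elt(X, 0))
  uint32_t Lanes[4];
  std::memcpy(Lanes, Vec, sizeof(Lanes)); // bitcast v2i64 -> v4i32
  assert(Trunc == Lanes[0]);              // extract_elt(bitcast(X), 0)
}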
55307
55308static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55309 const X86Subtarget &Subtarget) {
55310 bool IsStrict = N->isStrictFPOpcode();
55311 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55312 EVT VT = N->getValueType(0);
55313 EVT InVT = Op0.getValueType();
55314
55315 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55316 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55317 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55318 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55319 unsigned ScalarSize = InVT.getScalarSizeInBits();
55320 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55321 return SDValue();
55322 SDLoc dl(N);
55323 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55324 ScalarSize < 16 ? MVT::i16
55325 : ScalarSize < 32 ? MVT::i32
55326 : MVT::i64,
55327 InVT.getVectorNumElements());
55328 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55329 if (IsStrict)
55330 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55331 {N->getOperand(0), P});
55332 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55333 }
55334
55335 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55336 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55337 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55338 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55339 VT.getScalarType() != MVT::f16) {
55340 SDLoc dl(N);
55341 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55342 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55343
55344 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55345 if (IsStrict)
55346 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55347 {N->getOperand(0), P});
55348 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55349 }
55350
55351 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55352 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55353 // the optimization here.
55354 if (DAG.SignBitIsZero(Op0)) {
55355 if (IsStrict)
55356 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55357 {N->getOperand(0), Op0});
55358 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55359 }
55360
55361 return SDValue();
55362}
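// Illustrative sketch, not part of the LLVM sources: when the sign bit of the
// input is known zero, the signed and unsigned conversions agree, which is what
// the SignBitIsZero fold above relies on (i32 -> double chosen for the sketch).
#include <cassert>
#include <cstdint>

static void checkUIntToFPAsSIntToFP(uint32_t X) {
  if (int32_t(X) >= 0)                       // sign bit known zero
    assert(double(X) == double(int32_t(X))); // UINT_TO_FP == SINT_TO_FP
}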
55363
55364static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55365 TargetLowering::DAGCombinerInfo &DCI,
55366 const X86Subtarget &Subtarget) {
55367 // First try to optimize away the conversion entirely when it's
55368 // conditionally from a constant. Vectors only.
55369 bool IsStrict = N->isStrictFPOpcode();
55370 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55371 return Res;
55372
55373 // Now move on to more general possibilities.
55374 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55375 EVT VT = N->getValueType(0);
55376 EVT InVT = Op0.getValueType();
55377
55378 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55379 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55380 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55381 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55382 unsigned ScalarSize = InVT.getScalarSizeInBits();
55383 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55384 return SDValue();
55385 SDLoc dl(N);
55386 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55387 ScalarSize < 16 ? MVT::i16
55388 : ScalarSize < 32 ? MVT::i32
55389 : MVT::i64,
55390 InVT.getVectorNumElements());
55391 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55392 if (IsStrict)
55393 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55394 {N->getOperand(0), P});
55395 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55396 }
55397
55398 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55399 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55400 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55401 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55402 VT.getScalarType() != MVT::f16) {
55403 SDLoc dl(N);
55404 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55405 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55406 if (IsStrict)
55407 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55408 {N->getOperand(0), P});
55409 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55410 }
55411
55412 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55413 // vectors and scalars, see if we know that the upper bits are all the sign
55414 // bit, in which case we can truncate the input to i32 and convert from that.
55415 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55416 unsigned BitWidth = InVT.getScalarSizeInBits();
55417 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55418 if (NumSignBits >= (BitWidth - 31)) {
55419 EVT TruncVT = MVT::i32;
55420 if (InVT.isVector())
55421 TruncVT = InVT.changeVectorElementType(TruncVT);
55422 SDLoc dl(N);
55423 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55425 if (IsStrict)
55426 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55427 {N->getOperand(0), Trunc});
55428 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55429 }
55430 // If we're after legalize and the type is v2i32 we need to shuffle and
55431 // use CVTSI2P.
55432 assert(InVT == MVT::v2i64 && "Unexpected VT!");
55433 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55434 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55435 { 0, 2, -1, -1 });
55436 if (IsStrict)
55437 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55438 {N->getOperand(0), Shuf});
55439 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55440 }
55441 }
55442
55443 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55444 // a 32-bit target where SSE doesn't support i64->FP operations.
55445 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55446 Op0.getOpcode() == ISD::LOAD) {
55447 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55448
55449 // This transformation is not supported if the result type is f16 or f128.
55450 if (VT == MVT::f16 || VT == MVT::f128)
55451 return SDValue();
55452
55453 // If we have AVX512DQ we can use packed conversion instructions unless
55454 // the VT is f80.
55455 if (Subtarget.hasDQI() && VT != MVT::f80)
55456 return SDValue();
55457
55458 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55459 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55460 std::pair<SDValue, SDValue> Tmp =
55461 Subtarget.getTargetLowering()->BuildFILD(
55462 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55463 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55464 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55465 return Tmp.first;
55466 }
55467 }
55468
55469 if (IsStrict)
55470 return SDValue();
55471
55472 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55473 return V;
55474
55475 return SDValue();
55476}
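// Illustrative sketch, not part of the LLVM sources: if an i64 input has at
// least 33 sign bits it is exactly representable as i32, so converting the
// truncated value yields the same FP result, as in the !hasDQI() path above.
#include <cassert>
#include <cstdint>

static void checkNarrowedSIntToFP(int64_t X) {
  if (X >= INT32_MIN && X <= INT32_MAX)      // upper bits are all sign bits
    assert(double(X) == double(int32_t(X))); // convert from the truncated i32
}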
55477
55478static bool needCarryOrOverflowFlag(SDValue Flags) {
55479 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55480
55481 for (const SDNode *User : Flags->uses()) {
55482 X86::CondCode CC;
55483 switch (User->getOpcode()) {
55484 default:
55485 // Be conservative.
55486 return true;
55487 case X86ISD::SETCC:
55488 case X86ISD::SETCC_CARRY:
55489 CC = (X86::CondCode)User->getConstantOperandVal(0);
55490 break;
55491 case X86ISD::BRCOND:
55492 case X86ISD::CMOV:
55493 CC = (X86::CondCode)User->getConstantOperandVal(2);
55494 break;
55495 }
55496
55497 switch (CC) {
55498 default: break;
55499 case X86::COND_A: case X86::COND_AE:
55500 case X86::COND_B: case X86::COND_BE:
55501 case X86::COND_O: case X86::COND_NO:
55502 case X86::COND_G: case X86::COND_GE:
55503 case X86::COND_L: case X86::COND_LE:
55504 return true;
55505 }
55506 }
55507
55508 return false;
55509}
55510
55511static bool onlyZeroFlagUsed(SDValue Flags) {
55512 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55513
55514 for (const SDNode *User : Flags->uses()) {
55515 unsigned CCOpNo;
55516 switch (User->getOpcode()) {
55517 default:
55518 // Be conservative.
55519 return false;
55520 case X86ISD::SETCC:
55521 case X86ISD::SETCC_CARRY:
55522 CCOpNo = 0;
55523 break;
55524 case X86ISD::BRCOND:
55525 case X86ISD::CMOV:
55526 CCOpNo = 2;
55527 break;
55528 }
55529
55530 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55531 if (CC != X86::COND_E && CC != X86::COND_NE)
55532 return false;
55533 }
55534
55535 return true;
55536}
55537
55538static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55539 // Only handle test patterns.
55540 if (!isNullConstant(N->getOperand(1)))
55541 return SDValue();
55542
55543 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55544 // and use its flags directly.
55545 // TODO: Maybe we should try promoting compares that only use the zero flag
55546 // first if we can prove the upper bits with computeKnownBits?
55547 SDLoc dl(N);
55548 SDValue Op = N->getOperand(0);
55549 EVT VT = Op.getValueType();
55550
55551 // If we have a constant logical shift that's only used in a comparison
55552 // against zero, turn it into an equivalent AND. This allows turning it into
55553 // a TEST instruction later.
55554 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55555 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55556 onlyZeroFlagUsed(SDValue(N, 0))) {
55557 unsigned BitWidth = VT.getSizeInBits();
55558 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55559 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55560 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55561 APInt Mask = Op.getOpcode() == ISD::SRL
55562 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55563 : APInt::getLowBitsSet(BitWidth, MaskBits);
55564 if (Mask.isSignedIntN(32)) {
55565 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55566 DAG.getConstant(Mask, dl, VT));
55567 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55568 DAG.getConstant(0, dl, VT));
55569 }
55570 }
55571 }
55572
55573 // Peek through any zero-extend if we're only testing for a zero result.
55574 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55575 SDValue Src = Op.getOperand(0);
55576 EVT SrcVT = Src.getValueType();
55577 if (SrcVT.getScalarSizeInBits() >= 8 &&
55578 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55579 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55580 DAG.getConstant(0, dl, SrcVT));
55581 }
55582
55583 // Look for a truncate.
55584 if (Op.getOpcode() != ISD::TRUNCATE)
55585 return SDValue();
55586
55587 SDValue Trunc = Op;
55588 Op = Op.getOperand(0);
55589
55590 // See if we can compare with zero against the truncation source,
55591 // which should help using the Z flag from many ops. Only do this for
55592 // i32 truncated op to prevent partial-reg compares of promoted ops.
55593 EVT OpVT = Op.getValueType();
55594 APInt UpperBits =
55595 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55596 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55597 onlyZeroFlagUsed(SDValue(N, 0))) {
55598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55599 DAG.getConstant(0, dl, OpVT));
55600 }
55601
55602 // After this the truncate and arithmetic op must have a single use.
55603 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55604 return SDValue();
55605
55606 unsigned NewOpc;
55607 switch (Op.getOpcode()) {
55608 default: return SDValue();
55609 case ISD::AND:
55610 // Skip AND with a constant. We have special handling for AND with an
55611 // immediate during isel to generate TEST instructions.
55612 if (isa<ConstantSDNode>(Op.getOperand(1)))
55613 return SDValue();
55614 NewOpc = X86ISD::AND;
55615 break;
55616 case ISD::OR: NewOpc = X86ISD::OR; break;
55617 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55618 case ISD::ADD:
55619 // If the carry or overflow flag is used, we can't truncate.
55620 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55621 return SDValue();
55622 NewOpc = X86ISD::ADD;
55623 break;
55624 case ISD::SUB:
55625 // If the carry or overflow flag is used, we can't truncate.
55626 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55627 return SDValue();
55628 NewOpc = X86ISD::SUB;
55629 break;
55630 }
55631
55632 // We found an op we can narrow. Truncate its inputs.
55633 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55634 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55635
55636 // Use a X86 specific opcode to avoid DAG combine messing with it.
55637 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55638 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55639
55640 // For AND, keep a CMP so that we can match the test pattern.
55641 if (NewOpc == X86ISD::AND)
55642 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55643 DAG.getConstant(0, dl, VT));
55644
55645 // Return the flags.
55646 return Op.getValue(1);
55647}
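// Illustrative sketch, not part of the LLVM sources: with only the zero flag
// observed, a constant logical shift compared against zero is equivalent to
// masking the bits that survive the shift, which is the AND/TEST rewrite above
// (the real combine additionally requires the mask to fit a signed 32-bit
// immediate). 64-bit sketch with ShAmt < 64:
#include <cassert>
#include <cstdint>

static void checkShiftVsMask(uint64_t X, unsigned ShAmt) {
  assert(ShAmt < 64);
  uint64_t HighMask = ~uint64_t(0) << ShAmt; // bits kept by SRL
  uint64_t LowMask = ~uint64_t(0) >> ShAmt;  // bits kept by SHL
  assert(((X >> ShAmt) == 0) == ((X & HighMask) == 0));
  assert(((X << ShAmt) == 0) == ((X & LowMask) == 0));
}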
55648
55649static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55650 TargetLowering::DAGCombinerInfo &DCI) {
55651 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55652 "Expected X86ISD::ADD or X86ISD::SUB");
55653
55654 SDLoc DL(N);
55655 SDValue LHS = N->getOperand(0);
55656 SDValue RHS = N->getOperand(1);
55657 MVT VT = LHS.getSimpleValueType();
55658 bool IsSub = X86ISD::SUB == N->getOpcode();
55659 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55660
55661 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55662 if (!N->hasAnyUseOfValue(1)) {
55663 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55664 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55665 }
55666
55667 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55668 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55669 SDValue Ops[] = {N0, N1};
55670 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55671 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55672 SDValue Op(N, 0);
55673 if (Negate)
55674 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55675 DCI.CombineTo(GenericAddSub, Op);
55676 }
55677 };
55678 MatchGeneric(LHS, RHS, false);
55679 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55680
55681 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55682 // EFLAGS result doesn't change.
55683 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55684 /*ZeroSecondOpOnly*/ true);
55685}
55686
55687static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55688 SDValue LHS = N->getOperand(0);
55689 SDValue RHS = N->getOperand(1);
55690 SDValue BorrowIn = N->getOperand(2);
55691
55692 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55693 MVT VT = N->getSimpleValueType(0);
55694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55695 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55696 }
55697
55698 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55699 // iff the flag result is dead.
55700 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55701 !N->hasAnyUseOfValue(1))
55702 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55703 LHS.getOperand(1), BorrowIn);
55704
55705 return SDValue();
55706}
55707
55708// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55709static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55710 TargetLowering::DAGCombinerInfo &DCI) {
55711 SDValue LHS = N->getOperand(0);
55712 SDValue RHS = N->getOperand(1);
55713 SDValue CarryIn = N->getOperand(2);
55714 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55715 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55716
55717 // Canonicalize constant to RHS.
55718 if (LHSC && !RHSC)
55719 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55720 CarryIn);
55721
55722 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55723 // the result is either zero or one (depending on the input carry bit).
55724 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55725 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55726 // We don't have a good way to replace an EFLAGS use, so only do this when
55727 // dead right now.
55728 SDValue(N, 1).use_empty()) {
55729 SDLoc DL(N);
55730 EVT VT = N->getValueType(0);
55731 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55732 SDValue Res1 = DAG.getNode(
55733 ISD::AND, DL, VT,
55734 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55735 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55736 DAG.getConstant(1, DL, VT));
55737 return DCI.CombineTo(N, Res1, CarryOut);
55738 }
55739
55740 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55741 // iff the flag result is dead.
55742 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55743 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55744 SDLoc DL(N);
55745 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55746 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55747 DAG.getConstant(0, DL, LHS.getValueType()),
55748 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55749 }
55750
55751 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55752 MVT VT = N->getSimpleValueType(0);
55753 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55754 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55755 }
55756
55757 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55758 // iff the flag result is dead.
55759 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55760 !N->hasAnyUseOfValue(1))
55761 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55762 LHS.getOperand(1), CarryIn);
55763
55764 return SDValue();
55765}
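// Illustrative sketch, not part of the LLVM sources: the value part of
// ADC(C1, C2, Carry) -> ADC(0, C1+C2, Carry) is plain reassociation, which is
// why the fold above is only done when the flag result is dead.
#include <cassert>
#include <cstdint>

static void checkADCConstantFold(uint64_t C1, uint64_t C2, bool Carry) {
  uint64_t Before = C1 + C2 + (Carry ? 1 : 0);      // ADC(C1, C2, Carry)
  uint64_t After = 0 + (C1 + C2) + (Carry ? 1 : 0); // ADC(0, C1+C2, Carry)
  assert(Before == After);
}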
55766
55767static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55768 const SDLoc &DL, EVT VT,
55769 const X86Subtarget &Subtarget) {
55770 // Example of pattern we try to detect:
55771 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55772 //(add (build_vector (extract_elt t, 0),
55773 // (extract_elt t, 2),
55774 // (extract_elt t, 4),
55775 // (extract_elt t, 6)),
55776 // (build_vector (extract_elt t, 1),
55777 // (extract_elt t, 3),
55778 // (extract_elt t, 5),
55779 // (extract_elt t, 7)))
55780
55781 if (!Subtarget.hasSSE2())
55782 return SDValue();
55783
55784 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55785 Op1.getOpcode() != ISD::BUILD_VECTOR)
55786 return SDValue();
55787
55788 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55789 VT.getVectorNumElements() < 4 ||
55790 !isPowerOf2_32(VT.getVectorNumElements()))
55791 return SDValue();
55792
55793 // Check if one of Op0,Op1 is of the form:
55794 // (build_vector (extract_elt Mul, 0),
55795 // (extract_elt Mul, 2),
55796 // (extract_elt Mul, 4),
55797 // ...
55798 // the other is of the form:
55799 // (build_vector (extract_elt Mul, 1),
55800 // (extract_elt Mul, 3),
55801 // (extract_elt Mul, 5),
55802 // ...
55803 // and identify Mul.
55804 SDValue Mul;
55805 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55806 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55807 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55808 // TODO: Be more tolerant to undefs.
55809 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55810 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55811 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55812 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55813 return SDValue();
55814 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55815 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55816 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55817 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55818 if (!Const0L || !Const1L || !Const0H || !Const1H)
55819 return SDValue();
55820 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55821 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55822 // Commutativity of mul allows factors of a product to reorder.
55823 if (Idx0L > Idx1L)
55824 std::swap(Idx0L, Idx1L);
55825 if (Idx0H > Idx1H)
55826 std::swap(Idx0H, Idx1H);
55827 // Commutativity of add allows pairs of factors to reorder.
55828 if (Idx0L > Idx0H) {
55829 std::swap(Idx0L, Idx0H);
55830 std::swap(Idx1L, Idx1H);
55831 }
55832 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55833 Idx1H != 2 * i + 3)
55834 return SDValue();
55835 if (!Mul) {
55836 // First time an extract_elt's source vector is visited. Must be a MUL
55837 // with twice the number of vector elements of the BUILD_VECTOR.
55838 // Both extracts must be from same MUL.
55839 Mul = Op0L->getOperand(0);
55840 if (Mul->getOpcode() != ISD::MUL ||
55841 Mul.getValueType().getVectorNumElements() != 2 * e)
55842 return SDValue();
55843 }
55844 // Check that the extract is from the same MUL previously seen.
55845 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55846 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55847 return SDValue();
55848 }
55849
55850 // Check if the Mul source can be safely shrunk.
55851 ShrinkMode Mode;
55852 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55853 Mode == ShrinkMode::MULU16)
55854 return SDValue();
55855
55856 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55857 VT.getVectorNumElements() * 2);
55858 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55859 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55860
55861 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55862 ArrayRef<SDValue> Ops) {
55863 EVT InVT = Ops[0].getValueType();
55864 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55865 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55866 InVT.getVectorNumElements() / 2);
55867 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55868 };
55869 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55870}
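// Illustrative sketch, not part of the LLVM sources: per i32 lane the matched
// pattern computes x[2i]*y[2i] + x[2i+1]*y[2i+1] on sign-extended i16 inputs,
// which is PMADDWD's per-lane semantics (the 64-bit accumulator below sidesteps
// the instruction's single 0x8000*0x8000 wrap-around corner case).
#include <cassert>
#include <cstdint>

static int64_t pmaddwdLane(int16_t X0, int16_t X1, int16_t Y0, int16_t Y1) {
  return int64_t(X0) * Y0 + int64_t(X1) * Y1; // add adjacent signed products
}

static void checkMatchedPattern(int16_t X0, int16_t X1, int16_t Y0, int16_t Y1) {
  int64_t Even = int64_t(X0) * Y0; // extract_elt(mul(sext x, sext y), 2*i)
  int64_t Odd = int64_t(X1) * Y1;  // extract_elt(mul(sext x, sext y), 2*i+1)
  assert(Even + Odd == pmaddwdLane(X0, X1, Y0, Y1));
}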
55871
55872// Attempt to turn this pattern into PMADDWD.
55873// (add (mul (sext (build_vector)), (sext (build_vector))),
55874// (mul (sext (build_vector)), (sext (build_vector)))
55875static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55876 const SDLoc &DL, EVT VT,
55877 const X86Subtarget &Subtarget) {
55878 if (!Subtarget.hasSSE2())
55879 return SDValue();
55880
55881 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55882 return SDValue();
55883
55884 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55885 VT.getVectorNumElements() < 4 ||
55886 !isPowerOf2_32(VT.getVectorNumElements()))
55887 return SDValue();
55888
55889 SDValue N00 = N0.getOperand(0);
55890 SDValue N01 = N0.getOperand(1);
55891 SDValue N10 = N1.getOperand(0);
55892 SDValue N11 = N1.getOperand(1);
55893
55894 // All inputs need to be sign extends.
55895 // TODO: Support ZERO_EXTEND from known positive?
55896 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55897 N01.getOpcode() != ISD::SIGN_EXTEND ||
55898 N10.getOpcode() != ISD::SIGN_EXTEND ||
55899 N11.getOpcode() != ISD::SIGN_EXTEND)
55900 return SDValue();
55901
55902 // Peek through the extends.
55903 N00 = N00.getOperand(0);
55904 N01 = N01.getOperand(0);
55905 N10 = N10.getOperand(0);
55906 N11 = N11.getOperand(0);
55907
55908 // Must be extending from vXi16.
55909 EVT InVT = N00.getValueType();
55910 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55911 N10.getValueType() != InVT || N11.getValueType() != InVT)
55912 return SDValue();
55913
55914 // All inputs should be build_vectors.
55915 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55916 N01.getOpcode() != ISD::BUILD_VECTOR ||
55917 N10.getOpcode() != ISD::BUILD_VECTOR ||
55918 N11.getOpcode() != ISD::BUILD_VECTOR)
55919 return SDValue();
55920
55921 // For each element, we need to ensure we have an odd element from one vector
55922 // multiplied by the odd element of another vector and the even element from
55923 // one of the same vectors being multiplied by the even element from the
55924 // other vector. So we need to make sure that for each element i, this
55925 // operation is performed:
55926 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55927 SDValue In0, In1;
55928 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55929 SDValue N00Elt = N00.getOperand(i);
55930 SDValue N01Elt = N01.getOperand(i);
55931 SDValue N10Elt = N10.getOperand(i);
55932 SDValue N11Elt = N11.getOperand(i);
55933 // TODO: Be more tolerant to undefs.
55934 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55935 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55936 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55937 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55938 return SDValue();
55939 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55940 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55941 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55942 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55943 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55944 return SDValue();
55945 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55946 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55947 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55948 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55949 // Add is commutative so indices can be reordered.
55950 if (IdxN00 > IdxN10) {
55951 std::swap(IdxN00, IdxN10);
55952 std::swap(IdxN01, IdxN11);
55953 }
55954 // N0 indices must be the even element. N1 indices must be the next odd element.
55955 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55956 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55957 return SDValue();
55958 SDValue N00In = N00Elt.getOperand(0);
55959 SDValue N01In = N01Elt.getOperand(0);
55960 SDValue N10In = N10Elt.getOperand(0);
55961 SDValue N11In = N11Elt.getOperand(0);
55962
55963 // First time we find an input capture it.
55964 if (!In0) {
55965 In0 = N00In;
55966 In1 = N01In;
55967
55968 // The input vectors must be at least as wide as the output.
55969 // If they are larger than the output, we extract a subvector below.
55970 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55971 In1.getValueSizeInBits() < VT.getSizeInBits())
55972 return SDValue();
55973 }
55974 // Mul is commutative so the input vectors can be in any order.
55975 // Canonicalize to make the compares easier.
55976 if (In0 != N00In)
55977 std::swap(N00In, N01In);
55978 if (In0 != N10In)
55979 std::swap(N10In, N11In);
55980 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55981 return SDValue();
55982 }
55983
55984 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55985 ArrayRef<SDValue> Ops) {
55986 EVT OpVT = Ops[0].getValueType();
55987 assert(OpVT.getScalarType() == MVT::i16 &&
55988 "Unexpected scalar element type");
55989 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55990 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55991 OpVT.getVectorNumElements() / 2);
55992 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55993 };
55994
55995 // If the output is narrower than an input, extract the low part of the input
55996 // vector.
55997 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55998 VT.getVectorNumElements() * 2);
55999 if (OutVT16.bitsLT(In0.getValueType())) {
56000 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56001 DAG.getIntPtrConstant(0, DL));
56002 }
56003 if (OutVT16.bitsLT(In1.getValueType())) {
56004 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56005 DAG.getIntPtrConstant(0, DL));
56006 }
56007 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56008 PMADDBuilder);
56009}
56010
56011// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56012 // If the upper element in each pair of both VPMADDWDs is zero then we can
56013 // merge the operand elements and use the implicit add of VPMADDWD.
56014// TODO: Add support for VPMADDUBSW (which isn't commutable).
56015static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
56016 const SDLoc &DL, EVT VT) {
56017 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56018 return SDValue();
56019
56020 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56021 if (VT.getSizeInBits() > 128)
56022 return SDValue();
56023
56024 unsigned NumElts = VT.getVectorNumElements();
56025 MVT OpVT = N0.getOperand(0).getSimpleValueType();
56026 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
56027 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56028
56029 bool Op0HiZero =
56030 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56031 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56032 bool Op1HiZero =
56033 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56034 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56035
56036 // TODO: Check for zero lower elements once we have actual codegen that
56037 // creates them.
56038 if (!Op0HiZero || !Op1HiZero)
56039 return SDValue();
56040
56041 // Create a shuffle mask packing the lower elements from each VPMADDWD.
56042 SmallVector<int> Mask;
56043 for (int i = 0; i != (int)NumElts; ++i) {
56044 Mask.push_back(2 * i);
56045 Mask.push_back(2 * (i + NumElts));
56046 }
56047
56048 SDValue LHS =
56049 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56050 SDValue RHS =
56051 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56052 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56053}
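// Illustrative sketch, not part of the LLVM sources: when the odd element of
// every operand pair is zero, each VPMADDWD lane is a single product, so two
// lanes can share one VPMADDWD after the interleaving shuffle built above.
#include <cassert>
#include <cstdint>

static void checkMergedPMADDWDLane(int16_t X, int16_t Y, int16_t Z, int16_t W) {
  int64_t Lane0 = int64_t(X) * Y + int64_t(0) * 0;  // pmaddwd(x, y), odd elt zero
  int64_t Lane1 = int64_t(Z) * W + int64_t(0) * 0;  // pmaddwd(z, w), odd elt zero
  int64_t Merged = int64_t(X) * Y + int64_t(Z) * W; // pmaddwd(shuf(x,z), shuf(y,w))
  assert(Lane0 + Lane1 == Merged);
}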
56054
56055/// CMOV of constants requires materializing constant operands in registers.
56056/// Try to fold those constants into an 'add' instruction to reduce instruction
56057 /// count. We do this with CMOV rather than the generic 'select' because there are
56058/// earlier folds that may be used to turn select-of-constants into logic hacks.
56059static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
56060 const X86Subtarget &Subtarget) {
56061 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
56062 // better because we eliminate 1-2 instructions. This transform is still
56063 // an improvement without zero operands because we trade 2 move constants and
56064 // 1 add for 2 adds (LEA) as long as the constants can be represented as
56065 // immediate asm operands (fit in 32-bits).
56066 auto isSuitableCmov = [](SDValue V) {
56067 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
56068 return false;
56069 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
56070 !isa<ConstantSDNode>(V.getOperand(1)))
56071 return false;
56072 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
56073 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
56074 V.getConstantOperandAPInt(1).isSignedIntN(32));
56075 };
56076
56077 // Match an appropriate CMOV as the first operand of the add.
56078 SDValue Cmov = N->getOperand(0);
56079 SDValue OtherOp = N->getOperand(1);
56080 if (!isSuitableCmov(Cmov))
56081 std::swap(Cmov, OtherOp);
56082 if (!isSuitableCmov(Cmov))
56083 return SDValue();
56084
56085 // Don't remove a load folding opportunity for the add. That would neutralize
56086 // any improvements from removing constant materializations.
56087 if (X86::mayFoldLoad(OtherOp, Subtarget))
56088 return SDValue();
56089
56090 EVT VT = N->getValueType(0);
56091 SDLoc DL(N);
56092 SDValue FalseOp = Cmov.getOperand(0);
56093 SDValue TrueOp = Cmov.getOperand(1);
56094
56095 // We will push the add through the select, but we can potentially do better
56096 // if we know there is another add in the sequence and this is pointer math.
56097 // In that case, we can absorb an add into the trailing memory op and avoid
56098 // a 3-operand LEA which is likely slower than a 2-operand LEA.
56099 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
56100 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
56101 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
56102 all_of(N->uses(), [&](SDNode *Use) {
56103 auto *MemNode = dyn_cast<MemSDNode>(Use);
56104 return MemNode && MemNode->getBasePtr().getNode() == N;
56105 })) {
56106 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
56107 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
56108 // it is possible that choosing op1 might be better.
56109 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
56110 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
56111 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
56112 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
56113 Cmov.getOperand(2), Cmov.getOperand(3));
56114 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
56115 }
56116
56117 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
56118 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
56119 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
56120 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
56121 Cmov.getOperand(3));
56122}
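// Illustrative sketch, not part of the LLVM sources: pushing the add through the
// CMOV is just distributing '+' over a select, so the constants can later fold
// into LEA/add immediates (wrapping 64-bit arithmetic).
#include <cassert>
#include <cstdint>

static void checkAddThroughCmov(bool Cond, uint64_t C1, uint64_t C2, uint64_t X) {
  uint64_t Before = (Cond ? C1 : C2) + X;      // add(cmov(C1, C2), X)
  uint64_t After = Cond ? (X + C1) : (X + C2); // cmov(add(X, C1), add(X, C2))
  assert(Before == After);
}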
56123
56124static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
56125 TargetLowering::DAGCombinerInfo &DCI,
56126 const X86Subtarget &Subtarget) {
56127 EVT VT = N->getValueType(0);
56128 SDValue Op0 = N->getOperand(0);
56129 SDValue Op1 = N->getOperand(1);
56130 SDLoc DL(N);
56131
56132 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
56133 return Select;
56134
56135 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
56136 return MAdd;
56137 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
56138 return MAdd;
56139 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
56140 return MAdd;
56141
56142 // Try to synthesize horizontal adds from adds of shuffles.
56143 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56144 return V;
56145
56146 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
56147 // (sub Y, (sext (vXi1 X))).
56148 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
56149 // generic DAG combine without a legal type check, but adding this there
56150 // caused regressions.
56151 if (VT.isVector()) {
56152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56153 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
56154 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56155 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
56156 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
56157 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
56158 }
56159
56160 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
56161 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56162 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
56163 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
56164 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
56165 }
56166 }
56167
56168 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
56169 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
56170 X86::isZeroNode(Op0.getOperand(1))) {
56171 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
56172 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
56173 Op0.getOperand(0), Op0.getOperand(2));
56174 }
56175
56176 return combineAddOrSubToADCOrSBB(N, DAG);
56177}
56178
56179// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
56180// condition comes from the subtract node that produced -X. This matches the
56181// cmov expansion for absolute value. By swapping the operands we convert abs
56182// to nabs.
56183static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
56184 SDValue N0 = N->getOperand(0);
56185 SDValue N1 = N->getOperand(1);
56186
56187 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
56188 return SDValue();
56189
56190 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
56191 if (CC != X86::COND_S && CC != X86::COND_NS)
56192 return SDValue();
56193
56194 // Condition should come from a negate operation.
56195 SDValue Cond = N1.getOperand(3);
56196 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
56197 return SDValue();
56198 assert(Cond.getResNo() == 1 && "Unexpected result number");
56199
56200 // Get the X and -X from the negate.
56201 SDValue NegX = Cond.getValue(0);
56202 SDValue X = Cond.getOperand(1);
56203
56204 SDValue FalseOp = N1.getOperand(0);
56205 SDValue TrueOp = N1.getOperand(1);
56206
56207 // Cmov operands should be X and NegX. Order doesn't matter.
56208 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
56209 return SDValue();
56210
56211 // Build a new CMOV with the operands swapped.
56212 SDLoc DL(N);
56213 MVT VT = N->getSimpleValueType(0);
56214 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
56215 N1.getOperand(2), Cond);
56216 // Convert sub to add.
56217 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
56218}
56219
56220static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
56221 SDValue Op0 = N->getOperand(0);
56222 SDValue Op1 = N->getOperand(1);
56223
56224 // (sub C (zero_extend (setcc)))
56225 // =>
56226 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
56227 // Don't disturb (sub 0 setcc), which is easily done with neg.
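// E.g. (sub 5, (zext (setcc E))) -> (add (zext (setcc NE)), 4): both give 4
// when the original setcc is true and 5 when it is false.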
56228 EVT VT = N->getValueType(0);
56229 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
56230 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
56231 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
56232 Op1.getOperand(0).hasOneUse()) {
56233 SDValue SetCC = Op1.getOperand(0);
56234 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
56235 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
56236 uint64_t NewImm = Op0C->getZExtValue() - 1;
56237 SDLoc DL(Op1);
56238 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
56239 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
56240 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
56241 DAG.getConstant(NewImm, DL, VT));
56242 }
56243
56244 return SDValue();
56245}
56246
56247static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
56248 TargetLowering::DAGCombinerInfo &DCI,
56249 const X86Subtarget &Subtarget) {
56250 SDValue Op0 = N->getOperand(0);
56251 SDValue Op1 = N->getOperand(1);
56252
56253 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
56254 auto IsNonOpaqueConstant = [&](SDValue Op) {
56255 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
56256 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
56257 return !Cst->isOpaque();
56258 return true;
56259 }
56260 return false;
56261 };
56262
56263 // X86 can't encode an immediate LHS of a sub. See if we can push the
56264 // negation into a preceding instruction. If the RHS of the sub is a XOR with
56265 // one use and a constant, invert the immediate, saving one register.
56266 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
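// This holds because xor(X, ~C2) == ~xor(X, C2) == -xor(X, C2) - 1, so adding
// C1 + 1 yields C1 - xor(X, C2).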
56267 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56268 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56269 SDLoc DL(N);
56270 EVT VT = Op0.getValueType();
56271 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56272 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56273 SDValue NewAdd =
56274 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56275 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56276 }
56277
56278 if (SDValue V = combineSubABS(N, DAG))
56279 return V;
56280
56281 // Try to synthesize horizontal subs from subs of shuffles.
56282 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56283 return V;
56284
56285 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56286 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56287 X86::isZeroNode(Op1.getOperand(1))) {
56288 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56289 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56290 Op1.getOperand(0), Op1.getOperand(2));
56291 }
56292
56293 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56294 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
56295 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56296 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56297 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56298 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56299 Op1.getOperand(1), Op1.getOperand(2));
56300 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56301 Op1.getOperand(0));
56302 }
56303
56304 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56305 return V;
56306
56307 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56308 return V;
56309
56310 return combineSubSetcc(N, DAG);
56311}
56312
56313static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56314 const X86Subtarget &Subtarget) {
56315 MVT VT = N->getSimpleValueType(0);
56316 SDLoc DL(N);
56317
56318 if (N->getOperand(0) == N->getOperand(1)) {
56319 if (N->getOpcode() == X86ISD::PCMPEQ)
56320 return DAG.getConstant(-1, DL, VT);
56321 if (N->getOpcode() == X86ISD::PCMPGT)
56322 return DAG.getConstant(0, DL, VT);
56323 }
56324
56325 return SDValue();
56326}
56327
56328/// Helper that combines an array of subvector ops as if they were the operands
56330 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56330/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56331static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56332 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56333 TargetLowering::DAGCombinerInfo &DCI,
56334 const X86Subtarget &Subtarget) {
56335 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56337
56338 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56339 return DAG.getUNDEF(VT);
56340
56341 if (llvm::all_of(Ops, [](SDValue Op) {
56342 return ISD::isBuildVectorAllZeros(Op.getNode());
56343 }))
56344 return getZeroVector(VT, Subtarget, DAG, DL);
56345
56346 SDValue Op0 = Ops[0];
56347 bool IsSplat = llvm::all_equal(Ops);
56348
56349 // Repeated subvectors.
56350 if (IsSplat &&
56351 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56352 // If this broadcast is inserted into both halves, use a larger broadcast.
56353 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56354 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56355
56356 // If this simple subvector load or scalar/subvector broadcast_load is inserted
56357 // into both halves, use a larger broadcast_load. Update other uses to use
56358 // an extracted subvector.
56359 if (ISD::isNormalLoad(Op0.getNode()) ||
56360 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56361 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56362 auto *Mem = cast<MemSDNode>(Op0);
56363 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56364 ? X86ISD::VBROADCAST_LOAD
56365 : X86ISD::SUBV_BROADCAST_LOAD;
56366 if (SDValue BcastLd =
56367 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56368 SDValue BcastSrc =
56369 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56370 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56371 return BcastLd;
56372 }
56373 }
56374
56375 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56376 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56377 (Subtarget.hasAVX2() ||
56378 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56379 VT.getScalarType(), Subtarget)))
56380 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56381 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56382 Op0.getOperand(0),
56383 DAG.getIntPtrConstant(0, DL)));
56384
56385 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56386 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56387 (Subtarget.hasAVX2() ||
56388 (EltSizeInBits >= 32 &&
56389 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56390 Op0.getOperand(0).getValueType() == VT.getScalarType())
56391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56392
56393 // concat_vectors(extract_subvector(broadcast(x)),
56394 // extract_subvector(broadcast(x))) -> broadcast(x)
56395 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56396 Op0.getOperand(0).getValueType() == VT) {
56397 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56398 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56399 return Op0.getOperand(0);
56400 }
56401 }
56402
56403 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56404 // Only handle concats of subvector high halves, which vperm2x128 is best at.
56405 // TODO: This should go in combineX86ShufflesRecursively eventually.
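// (Immediate 0x31 below selects the high 128-bit lane of each source.)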
56406 if (VT.is256BitVector() && Ops.size() == 2) {
56407 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56408 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56409 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56410 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56411 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56412 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56413 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56414 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56415 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56416 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56417 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56418 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56419 DAG.getBitcast(VT, Src0.getOperand(0)),
56420 DAG.getBitcast(VT, Src1.getOperand(0)),
56421 DAG.getTargetConstant(0x31, DL, MVT::i8));
56422 }
56423 }
56424 }
56425
56426 // Repeated opcode.
56427 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56428 // but it currently struggles with different vector widths.
56429 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56430 return Op.getOpcode() == Op0.getOpcode();
56431 })) {
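// ConcatSubOperand rebuilds operand I of every subvector op as one wide
// concat; IsConcatFree checks that operand Op of each subvector op is already
// the matching extract_subvector (chunk I) of a wide vector of type VT, so
// concatenating those operands back is essentially free.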
56432 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56433 SmallVector<SDValue> Subs;
56434 for (SDValue SubOp : SubOps)
56435 Subs.push_back(SubOp.getOperand(I));
56436 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56437 };
56438 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56439 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56440 SDValue Sub = SubOps[I].getOperand(Op);
56441 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56442 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56443 Sub.getOperand(0).getValueType() != VT ||
56444 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56445 return false;
56446 }
56447 return true;
56448 };
56449
56450 unsigned NumOps = Ops.size();
56451 switch (Op0.getOpcode()) {
56452 case X86ISD::VBROADCAST: {
56453 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56454 return Op.getOperand(0).getValueType().is128BitVector();
56455 })) {
56456 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56457 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56458 ConcatSubOperand(VT, Ops, 0),
56459 ConcatSubOperand(VT, Ops, 0));
56460 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56461 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56462 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56463 : X86ISD::PSHUFD,
56464 DL, VT, ConcatSubOperand(VT, Ops, 0),
56465 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56466 }
56467 break;
56468 }
56469 case X86ISD::MOVDDUP:
56470 case X86ISD::MOVSHDUP:
56471 case X86ISD::MOVSLDUP: {
56472 if (!IsSplat)
56473 return DAG.getNode(Op0.getOpcode(), DL, VT,
56474 ConcatSubOperand(VT, Ops, 0));
56475 break;
56476 }
56477 case X86ISD::SHUFP: {
56478 // Add SHUFPD support if/when necessary.
56479 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56480 llvm::all_of(Ops, [Op0](SDValue Op) {
56481 return Op.getOperand(2) == Op0.getOperand(2);
56482 })) {
56483 return DAG.getNode(Op0.getOpcode(), DL, VT,
56484 ConcatSubOperand(VT, Ops, 0),
56485 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56486 }
56487 break;
56488 }
56489 case X86ISD::PSHUFHW:
56490 case X86ISD::PSHUFLW:
56491 case X86ISD::PSHUFD:
56492 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56493 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56494 return DAG.getNode(Op0.getOpcode(), DL, VT,
56495 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56496 }
56497 [[fallthrough]];
56498 case X86ISD::VPERMILPI:
56499 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56500 (VT.is256BitVector() ||
56501 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56502 all_of(Ops, [&Op0](SDValue Op) {
56503 return Op0.getOperand(1) == Op.getOperand(1);
56504 })) {
56505 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56506 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56507 Res =
56508 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56509 return DAG.getBitcast(VT, Res);
56510 }
56511 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56512 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56513 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56514 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
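// (Each v2f64 VPERMILPI uses two immediate bits, so the two subvector
// selectors are packed into bits [1:0] and [3:2] of the v4f64 immediate.)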
56515 return DAG.getNode(Op0.getOpcode(), DL, VT,
56516 ConcatSubOperand(VT, Ops, 0),
56517 DAG.getTargetConstant(Idx, DL, MVT::i8));
56518 }
56519 break;
56520 case X86ISD::PSHUFB:
56521 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56522 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56523 return DAG.getNode(Op0.getOpcode(), DL, VT,
56524 ConcatSubOperand(VT, Ops, 0),
56525 ConcatSubOperand(VT, Ops, 1));
56526 }
56527 break;
56528 case X86ISD::VPERMV:
56529 if (!IsSplat && NumOps == 2 &&
56530 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56531 MVT OpVT = Op0.getSimpleValueType();
56532 int NumSrcElts = OpVT.getVectorNumElements();
56533 SmallVector<int, 64> ConcatMask;
56534 for (unsigned i = 0; i != NumOps; ++i) {
56535 SmallVector<int, 64> SubMask;
56536 SmallVector<SDValue, 2> SubOps;
56537 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56538 SubMask))
56539 break;
56540 for (int M : SubMask) {
56541 if (0 <= M)
56542 M += i * NumSrcElts;
56543 ConcatMask.push_back(M);
56544 }
56545 }
56546 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56547 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56548 Ops[1].getOperand(1), DAG, DL);
56549 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56550 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56551 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56552 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56553 }
56554 }
56555 break;
56556 case X86ISD::VPERMV3:
56557 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56558 MVT OpVT = Op0.getSimpleValueType();
56559 int NumSrcElts = OpVT.getVectorNumElements();
56560 SmallVector<int, 64> ConcatMask;
56561 for (unsigned i = 0; i != NumOps; ++i) {
56562 SmallVector<int, 64> SubMask;
56563 SmallVector<SDValue, 2> SubOps;
56564 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56565 SubMask))
56566 break;
56567 for (int M : SubMask) {
56568 if (0 <= M) {
56569 M += M < NumSrcElts ? 0 : NumSrcElts;
56570 M += i * NumSrcElts;
56571 }
56572 ConcatMask.push_back(M);
56573 }
56574 }
56575 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56576 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56577 Ops[1].getOperand(0), DAG, DL);
56578 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56579 Ops[1].getOperand(2), DAG, DL);
56580 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56581 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56582 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56583 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56584 }
56585 }
56586 break;
56587 case ISD::TRUNCATE:
56588 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56589 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56590 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56591 SrcVT == Ops[1].getOperand(0).getValueType() &&
56592 Subtarget.useAVX512Regs() &&
56593 Subtarget.getPreferVectorWidth() >= 512 &&
56594 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56595 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56596 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56597 ConcatSubOperand(NewSrcVT, Ops, 0));
56598 }
56599 }
56600 break;
56601 case X86ISD::VSHLI:
56602 case X86ISD::VSRLI:
56603 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56604 // TODO: Move this to LowerShiftByScalarImmediate?
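// (Shifting an i64 element left by 32 moves its low 32 bits into the high
// half and zeroes the low half, which the v8i32 shuffles with a zero vector
// below reproduce; the logical right-shift case is the mirror image.)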
56605 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56606 llvm::all_of(Ops, [](SDValue Op) {
56607 return Op.getConstantOperandAPInt(1) == 32;
56608 })) {
56609 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56610 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56611 if (Op0.getOpcode() == X86ISD::VSHLI) {
56612 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56613 {8, 0, 8, 2, 8, 4, 8, 6});
56614 } else {
56615 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56616 {1, 8, 3, 8, 5, 8, 7, 8});
56617 }
56618 return DAG.getBitcast(VT, Res);
56619 }
56620 [[fallthrough]];
56621 case X86ISD::VSRAI:
56622 case X86ISD::VSHL:
56623 case X86ISD::VSRL:
56624 case X86ISD::VSRA:
56625 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56626 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56627 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56628 llvm::all_of(Ops, [Op0](SDValue Op) {
56629 return Op0.getOperand(1) == Op.getOperand(1);
56630 })) {
56631 return DAG.getNode(Op0.getOpcode(), DL, VT,
56632 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56633 }
56634 break;
56635 case X86ISD::VPERMI:
56636 case X86ISD::VROTLI:
56637 case X86ISD::VROTRI:
56638 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56639 llvm::all_of(Ops, [Op0](SDValue Op) {
56640 return Op0.getOperand(1) == Op.getOperand(1);
56641 })) {
56642 return DAG.getNode(Op0.getOpcode(), DL, VT,
56643 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56644 }
56645 break;
56646 case ISD::AND:
56647 case ISD::OR:
56648 case ISD::XOR:
56649 case X86ISD::ANDNP:
56650 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56651 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56652 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56653 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56654 NumOps * SrcVT.getVectorNumElements());
56655 return DAG.getNode(Op0.getOpcode(), DL, VT,
56656 ConcatSubOperand(SrcVT, Ops, 0),
56657 ConcatSubOperand(SrcVT, Ops, 1));
56658 }
56659 break;
56660 case X86ISD::GF2P8AFFINEQB:
56661 if (!IsSplat &&
56662 (VT.is256BitVector() ||
56663 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56664 llvm::all_of(Ops, [Op0](SDValue Op) {
56665 return Op0.getOperand(2) == Op.getOperand(2);
56666 })) {
56667 return DAG.getNode(Op0.getOpcode(), DL, VT,
56668 ConcatSubOperand(VT, Ops, 0),
56669 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56670 }
56671 break;
56672 case ISD::ADD:
56673 case ISD::SUB:
56674 case ISD::MUL:
56675 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56676 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56677 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56678 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56679 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56680 NumOps * SrcVT.getVectorNumElements());
56681 return DAG.getNode(Op0.getOpcode(), DL, VT,
56682 ConcatSubOperand(SrcVT, Ops, 0),
56683 ConcatSubOperand(SrcVT, Ops, 1));
56684 }
56685 break;
56686 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
56687 // their latencies are short, we don't replace them here.
56688 case ISD::FDIV:
56689 if (!IsSplat && (VT.is256BitVector() ||
56690 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56691 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56692 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56693 NumOps * SrcVT.getVectorNumElements());
56694 return DAG.getNode(Op0.getOpcode(), DL, VT,
56695 ConcatSubOperand(SrcVT, Ops, 0),
56696 ConcatSubOperand(SrcVT, Ops, 1));
56697 }
56698 break;
56699 case X86ISD::HADD:
56700 case X86ISD::HSUB:
56701 case X86ISD::FHADD:
56702 case X86ISD::FHSUB:
56703 case X86ISD::PACKSS:
56704 case X86ISD::PACKUS:
56705 if (!IsSplat && VT.is256BitVector() &&
56706 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56707 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56708 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56709 NumOps * SrcVT.getVectorNumElements());
56710 return DAG.getNode(Op0.getOpcode(), DL, VT,
56711 ConcatSubOperand(SrcVT, Ops, 0),
56712 ConcatSubOperand(SrcVT, Ops, 1));
56713 }
56714 break;
56715 case X86ISD::PALIGNR:
56716 if (!IsSplat &&
56717 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56718 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56719 llvm::all_of(Ops, [Op0](SDValue Op) {
56720 return Op0.getOperand(2) == Op.getOperand(2);
56721 })) {
56722 return DAG.getNode(Op0.getOpcode(), DL, VT,
56723 ConcatSubOperand(VT, Ops, 0),
56724 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56725 }
56726 break;
56727 case ISD::VSELECT:
56728 if (!IsSplat && Subtarget.hasAVX512() &&
56729 (VT.is256BitVector() ||
56730 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56731 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56732 EVT SelVT = Ops[0].getOperand(0).getValueType();
56733 if (SelVT.getVectorElementType() == MVT::i1) {
56734 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56735 Ops.size() * SelVT.getVectorNumElements());
56736 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56737 return DAG.getNode(Op0.getOpcode(), DL, VT,
56738 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56739 ConcatSubOperand(VT, Ops, 1),
56740 ConcatSubOperand(VT, Ops, 2));
56741 }
56742 }
56743 [[fallthrough]];
56744 case X86ISD::BLENDV:
56745 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56746 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56747 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56748 EVT SelVT = Ops[0].getOperand(0).getValueType();
56749 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56750 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56751 return DAG.getNode(Op0.getOpcode(), DL, VT,
56752 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56753 ConcatSubOperand(VT, Ops, 1),
56754 ConcatSubOperand(VT, Ops, 2));
56755 }
56756 break;
56757 }
56758 }
56759
56760 // Fold subvector loads into one.
56761 // If needed, look through bitcasts to get to the load.
56762 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56763 unsigned Fast;
56764 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56765 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56766 *FirstLd->getMemOperand(), &Fast) &&
56767 Fast) {
56768 if (SDValue Ld =
56769 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56770 return Ld;
56771 }
56772 }
56773
56774 // Attempt to fold target constant loads.
56775 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56776 SmallVector<APInt> EltBits;
56777 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56778 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56779 APInt OpUndefElts;
56780 SmallVector<APInt> OpEltBits;
56781 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56782 OpEltBits, true, false))
56783 break;
56784 EltBits.append(OpEltBits);
56785 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56786 }
56787 if (EltBits.size() == VT.getVectorNumElements())
56788 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56789 }
56790
56791 return SDValue();
56792}
56793
56794static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56795 TargetLowering::DAGCombinerInfo &DCI,
56796 const X86Subtarget &Subtarget) {
56797 EVT VT = N->getValueType(0);
56798 EVT SrcVT = N->getOperand(0).getValueType();
56799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56800 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56801
56802 if (VT.getVectorElementType() == MVT::i1) {
56803 // Attempt to constant fold.
56804 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56805 APInt Constant = APInt::getZero(VT.getSizeInBits());
56806 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56807 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56808 if (!C) break;
56809 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56810 if (I == (E - 1)) {
56811 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56812 if (TLI.isTypeLegal(IntVT))
56813 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56814 }
56815 }
56816
56817 // Don't do anything else for i1 vectors.
56818 return SDValue();
56819 }
56820
56821 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56822 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56823 DCI, Subtarget))
56824 return R;
56825 }
56826
56827 return SDValue();
56828}
56829
56830static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56831 TargetLowering::DAGCombinerInfo &DCI,
56832 const X86Subtarget &Subtarget) {
56833 if (DCI.isBeforeLegalizeOps())
56834 return SDValue();
56835
56836 MVT OpVT = N->getSimpleValueType(0);
56837
56838 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56839
56840 SDLoc dl(N);
56841 SDValue Vec = N->getOperand(0);
56842 SDValue SubVec = N->getOperand(1);
56843
56844 uint64_t IdxVal = N->getConstantOperandVal(2);
56845 MVT SubVecVT = SubVec.getSimpleValueType();
56846
56847 if (Vec.isUndef() && SubVec.isUndef())
56848 return DAG.getUNDEF(OpVT);
56849
56850 // Inserting undefs/zeros into zeros/undefs yields a zero vector.
56851 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56852 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56853 return getZeroVector(OpVT, Subtarget, DAG, dl);
56854
56855 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56856 // If we're inserting into a zero vector and then into a larger zero vector,
56857 // just insert into the larger zero vector directly.
56858 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56859 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56860 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56861 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56862 getZeroVector(OpVT, Subtarget, DAG, dl),
56863 SubVec.getOperand(1),
56864 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56865 }
56866
56867 // If we're inserting into a zero vector, our input was extracted from an
56868 // insert into a zero vector of the same type, and the extraction was at
56869 // least as large as the original insertion, just insert the original
56870 // subvector into a zero vector.
56871 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56872 isNullConstant(SubVec.getOperand(1)) &&
56873 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56874 SDValue Ins = SubVec.getOperand(0);
56875 if (isNullConstant(Ins.getOperand(2)) &&
56876 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56877 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56878 SubVecVT.getFixedSizeInBits())
56879 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56880 getZeroVector(OpVT, Subtarget, DAG, dl),
56881 Ins.getOperand(1), N->getOperand(2));
56882 }
56883 }
56884
56885 // Stop here if this is an i1 vector.
56886 if (IsI1Vector)
56887 return SDValue();
56888
56889 // Eliminate an intermediate vector widening:
56890 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56891 // insert_subvector X, Y, Idx
56892 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56893 // there?
56894 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56895 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56896 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56897 SubVec.getOperand(1), N->getOperand(2));
56898
56899 // If this is an insert of an extract, combine to a shuffle. Don't do this
56900 // if the insert or extract can be represented with a subregister operation.
56901 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56902 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56903 (IdxVal != 0 ||
56904 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56905 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56906 if (ExtIdxVal != 0) {
56907 int VecNumElts = OpVT.getVectorNumElements();
56908 int SubVecNumElts = SubVecVT.getVectorNumElements();
56909 SmallVector<int, 64> Mask(VecNumElts);
56910 // First create an identity shuffle mask.
56911 for (int i = 0; i != VecNumElts; ++i)
56912 Mask[i] = i;
56913 // Now insert the extracted portion.
56914 for (int i = 0; i != SubVecNumElts; ++i)
56915 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
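// E.g. inserting the high v4i32 half (ExtIdxVal == 4) of a v8i32 source
// at IdxVal == 0 gives the mask {12,13,14,15,4,5,6,7}.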
56916
56917 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56918 }
56919 }
56920
56921 // Match concat_vector style patterns.
56922 SmallVector<SDValue, 2> SubVectorOps;
56923 if (collectConcatOps(N, SubVectorOps, DAG)) {
56924 if (SDValue Fold =
56925 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56926 return Fold;
56927
56928 // If we're inserting all zeros into the upper half, change this to
56929 // a concat with zero. We will match this to a move
56930 // with implicit upper bit zeroing during isel.
56931 // We do this here because we don't want combineConcatVectorOps to
56932 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56933 if (SubVectorOps.size() == 2 &&
56934 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56935 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56936 getZeroVector(OpVT, Subtarget, DAG, dl),
56937 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56938 }
56939
56940 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56941 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56942 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56943
56944 // If this is a broadcast load inserted into an upper undef, use a larger
56945 // broadcast load.
56946 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56947 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56948 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56949 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56950 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56951 SDValue BcastLd =
56952 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56953 MemIntr->getMemoryVT(),
56954 MemIntr->getMemOperand());
56955 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56956 return BcastLd;
56957 }
56958
56959 // If we're splatting the lower half subvector of a full vector load into the
56960 // upper half, attempt to create a subvector broadcast.
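// E.g. if Vec is a v4f64 load of address P and SubVec reloads the v2f64 at P
// into the upper half, the whole node is a splat of the low 128 bits at P
// and becomes a SUBV_BROADCAST_LOAD.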
56961 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56962 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56963 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56964 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56965 if (VecLd && SubLd &&
56966 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56967 SubVec.getValueSizeInBits() / 8, 0))
56968 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56969 SubLd, 0, DAG);
56970 }
56971
56972 return SDValue();
56973}
56974
56975/// If we are extracting a subvector of a vector select and the select condition
56976/// is composed of concatenated vectors, try to narrow the select width. This
56977/// is a common pattern for AVX1 integer code because 256-bit selects may be
56978/// legal, but there is almost no integer math/logic available for 256-bit.
56979/// This function should only be called with legal types (otherwise, the calls
56980/// to get simple value types will assert).
56981static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56982 SDValue Sel = Ext->getOperand(0);
56983 SmallVector<SDValue, 4> CatOps;
56984 if (Sel.getOpcode() != ISD::VSELECT ||
56985 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56986 return SDValue();
56987
56988 // Note: We assume simple value types because this should only be called with
56989 // legal operations/types.
56990 // TODO: This can be extended to handle extraction to 256-bits.
56991 MVT VT = Ext->getSimpleValueType(0);
56992 if (!VT.is128BitVector())
56993 return SDValue();
56994
56995 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56996 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56997 return SDValue();
56998
56999 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
57000 MVT SelVT = Sel.getSimpleValueType();
57001 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
57002 "Unexpected vector type with legal operations");
57003
57004 unsigned SelElts = SelVT.getVectorNumElements();
57005 unsigned CastedElts = WideVT.getVectorNumElements();
57006 unsigned ExtIdx = Ext->getConstantOperandVal(1);
57007 if (SelElts % CastedElts == 0) {
57008 // The select has the same or more (narrower) elements than the extract
57009 // operand. The extraction index gets scaled by that factor.
57010 ExtIdx *= (SelElts / CastedElts);
57011 } else if (CastedElts % SelElts == 0) {
57012 // The select has less (wider) elements than the extract operand. Make sure
57013 // that the extraction index can be divided evenly.
57014 unsigned IndexDivisor = CastedElts / SelElts;
57015 if (ExtIdx % IndexDivisor != 0)
57016 return SDValue();
57017 ExtIdx /= IndexDivisor;
57018 } else {
57019 llvm_unreachable("Element count of simple vector types are not divisible?");
57020 }
57021
57022 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
57023 unsigned NarrowElts = SelElts / NarrowingFactor;
57024 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
57025 SDLoc DL(Ext);
57026 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
57027 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
57028 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
57029 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
57030 return DAG.getBitcast(VT, NarrowSel);
57031}
57032
57033static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
57034 TargetLowering::DAGCombinerInfo &DCI,
57035 const X86Subtarget &Subtarget) {
57036 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
57037 // eventually get combined/lowered into ANDNP) with a concatenated operand,
57038 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
57039 // We let generic combining take over from there to simplify the
57040 // insert/extract and 'not'.
57041 // This pattern emerges during AVX1 legalization. We handle it before lowering
57042 // to avoid complications like splitting constant vector loads.
57043
57044 // Capture the original wide type in the likely case that we need to bitcast
57045 // back to this type.
57046 if (!N->getValueType(0).isSimple())
57047 return SDValue();
57048
57049 MVT VT = N->getSimpleValueType(0);
57050 SDValue InVec = N->getOperand(0);
57051 unsigned IdxVal = N->getConstantOperandVal(1);
57052 SDValue InVecBC = peekThroughBitcasts(InVec);
57053 EVT InVecVT = InVec.getValueType();
57054 unsigned SizeInBits = VT.getSizeInBits();
57055 unsigned InSizeInBits = InVecVT.getSizeInBits();
57056 unsigned NumSubElts = VT.getVectorNumElements();
57057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57058
57059 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
57060 TLI.isTypeLegal(InVecVT) &&
57061 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
57062 auto isConcatenatedNot = [](SDValue V) {
57063 V = peekThroughBitcasts(V);
57064 if (!isBitwiseNot(V))
57065 return false;
57066 SDValue NotOp = V->getOperand(0);
57067 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
57068 };
57069 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
57070 isConcatenatedNot(InVecBC.getOperand(1))) {
57071 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57072 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
57073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
57074 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57075 }
57076 }
57077
57078 if (DCI.isBeforeLegalizeOps())
57079 return SDValue();
57080
57081 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
57082 return V;
57083
57084 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
57085 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57086
57087 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
57088 if (VT.getScalarType() == MVT::i1)
57089 return DAG.getConstant(1, SDLoc(N), VT);
57090 return getOnesVector(VT, DAG, SDLoc(N));
57091 }
57092
57093 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
57094 return DAG.getBuildVector(VT, SDLoc(N),
57095 InVec->ops().slice(IdxVal, NumSubElts));
57096
57097 // If we are extracting from an insert into a larger vector, replace with a
57098 // smaller insert if we don't access less than the original subvector. Don't
57099 // do this for i1 vectors.
57100 // TODO: Relax the matching indices requirement?
57101 if (VT.getVectorElementType() != MVT::i1 &&
57102 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
57103 IdxVal == InVec.getConstantOperandVal(2) &&
57104 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
57105 SDLoc DL(N);
57106 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57107 InVec.getOperand(0), N->getOperand(1));
57108 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57109 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
57110 InVec.getOperand(1),
57111 DAG.getVectorIdxConstant(NewIdxVal, DL));
57112 }
57113
57114 // If we're extracting an upper subvector from a broadcast, we should just
57115 // extract the lowest subvector instead, which should allow
57116 // SimplifyDemandedVectorElts to do more simplifications.
57117 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
57118 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
57119 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
57120 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57121
57122 // If we're extracting a broadcasted subvector, just use the lowest subvector.
57123 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57124 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57125 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57126
57127 // Attempt to extract from the source of a shuffle vector.
57128 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
57129 SmallVector<int, 32> ShuffleMask;
57130 SmallVector<int, 32> ScaledMask;
57131 SmallVector<SDValue, 2> ShuffleInputs;
57132 unsigned NumSubVecs = InSizeInBits / SizeInBits;
57133 // Decode the shuffle mask and scale it so it's shuffling subvectors.
57134 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
57135 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
57136 unsigned SubVecIdx = IdxVal / NumSubElts;
57137 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
57138 return DAG.getUNDEF(VT);
57139 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
57140 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57141 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
57142 if (Src.getValueSizeInBits() == InSizeInBits) {
57143 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
57144 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
57145 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
57146 SDLoc(N), SizeInBits);
57147 }
57148 }
57149 }
57150
57151 // If we're extracting the lowest subvector and we're the only user,
57152 // we may be able to perform this with a smaller vector width.
57153 unsigned InOpcode = InVec.getOpcode();
57154 if (InVec.hasOneUse()) {
57155 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
57156 // v2f64 CVTDQ2PD(v4i32).
57157 if (InOpcode == ISD::SINT_TO_FP &&
57158 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57159 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
57160 }
57161 // v2f64 CVTUDQ2PD(v4i32).
57162 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
57163 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57164 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
57165 }
57166 // v2f64 CVTPS2PD(v4f32).
57167 if (InOpcode == ISD::FP_EXTEND &&
57168 InVec.getOperand(0).getValueType() == MVT::v4f32) {
57169 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
57170 }
57171 }
57172 if (IdxVal == 0 &&
57173 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
57174 (SizeInBits == 128 || SizeInBits == 256) &&
57175 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
57176 SDLoc DL(N);
57177 SDValue Ext = InVec.getOperand(0);
57178 if (Ext.getValueSizeInBits() > SizeInBits)
57179 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
57180 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
57181 return DAG.getNode(ExtOp, DL, VT, Ext);
57182 }
57183 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
57184 InVec.getOperand(0).getValueType().is256BitVector() &&
57185 InVec.getOperand(1).getValueType().is256BitVector() &&
57186 InVec.getOperand(2).getValueType().is256BitVector()) {
57187 SDLoc DL(N);
57188 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
57189 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
57190 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
57191 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
57192 }
57193 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
57194 (VT.is128BitVector() || VT.is256BitVector())) {
57195 SDLoc DL(N);
57196 SDValue InVecSrc = InVec.getOperand(0);
57197 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
57198 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
57199 return DAG.getNode(InOpcode, DL, VT, Ext);
57200 }
57201 if (InOpcode == X86ISD::MOVDDUP &&
57202 (VT.is128BitVector() || VT.is256BitVector())) {
57203 SDLoc DL(N);
57204 SDValue Ext0 =
57205 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57206 return DAG.getNode(InOpcode, DL, VT, Ext0);
57207 }
57208 }
57209
57210 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
57211 // as this is very likely to fold into a shuffle/truncation.
57212 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
57213 InVecVT.getScalarSizeInBits() == 64 &&
57214 InVec.getConstantOperandAPInt(1) == 32) {
57215 SDLoc DL(N);
57216 SDValue Ext =
57217 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57218 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
57219 }
57220
57221 return SDValue();
57222}
57223
57224static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
57225 EVT VT = N->getValueType(0);
57226 SDValue Src = N->getOperand(0);
57227 SDLoc DL(N);
57228
57229 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
57230 // This occurs frequently in our masked scalar intrinsic code and our
57231 // floating point select lowering with AVX512.
57232 // TODO: SimplifyDemandedBits instead?
57233 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
57234 isOneConstant(Src.getOperand(1)))
57235 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
57236
57237 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
57238 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57239 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
57240 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
57241 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
57242 if (C->isZero())
57243 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
57244 Src.getOperand(1));
57245
57246 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
57247 // TODO: Move to DAGCombine/SimplifyDemandedBits?
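// E.g. (v2i64 scalar_to_vector (i64 zext (i32 X))) becomes a v4i32
// scalar_to_vector of X wrapped in VZEXT_MOVL and bitcast back, so only the
// low 32 bits of the scalar need to be transferred.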
57248 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
57249 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
57250 if (Op.getValueType() != MVT::i64)
57251 return SDValue();
57252 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57253 if (Op.getOpcode() == Opc &&
57254 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57255 return Op.getOperand(0);
57256 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57257 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57258 if (Ld->getExtensionType() == Ext &&
57259 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57260 return Op;
57261 if (IsZeroExt) {
57262 KnownBits Known = DAG.computeKnownBits(Op);
57263 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57264 return Op;
57265 }
57266 return SDValue();
57267 };
57268
57269 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57270 return DAG.getBitcast(
57271 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57272 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57273
57274 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57275 return DAG.getBitcast(
57276 VT,
57277 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57278 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57279 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57280 }
57281
57282 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57283 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57284 Src.getOperand(0).getValueType() == MVT::x86mmx)
57285 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57286
57287 // See if we're broadcasting the scalar value, in which case just reuse that.
57289 // Make sure the broadcast uses this exact SDValue, not just the same node.
57289 if (VT.getScalarType() == Src.getValueType())
57290 for (SDNode *User : Src->uses())
57291 if (User->getOpcode() == X86ISD::VBROADCAST &&
57292 Src == User->getOperand(0)) {
57293 unsigned SizeInBits = VT.getFixedSizeInBits();
57294 unsigned BroadcastSizeInBits =
57295 User->getValueSizeInBits(0).getFixedValue();
57296 if (BroadcastSizeInBits == SizeInBits)
57297 return SDValue(User, 0);
57298 if (BroadcastSizeInBits > SizeInBits)
57299 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57300 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57301 // coverage.
57302 }
57303
57304 return SDValue();
57305}
57306
57307// Simplify PMULDQ and PMULUDQ operations.
57308static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57309 TargetLowering::DAGCombinerInfo &DCI,
57310 const X86Subtarget &Subtarget) {
57311 SDValue LHS = N->getOperand(0);
57312 SDValue RHS = N->getOperand(1);
57313
57314 // Canonicalize constant to RHS.
57315 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57316 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57317 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57318
57319 // Multiply by zero.
57320 // Don't return RHS as it may contain UNDEFs.
57321 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57322 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57323
57324 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57326 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57327 return SDValue(N, 0);
57328
57329 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57330 // convert it to any_extend_invec, due to the LegalOperations check, do the
57331 // conversion directly to a vector shuffle manually. This exposes combine
57332 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57333 // combineX86ShufflesRecursively on SSE4.1 targets.
57334 // FIXME: This is basically a hack around several other issues related to
57335 // ANY_EXTEND_VECTOR_INREG.
57336 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57337 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57338 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57339 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57340 SDLoc dl(N);
57341 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57342 LHS.getOperand(0), { 0, -1, 1, -1 });
57343 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57344 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57345 }
57346 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57347 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57348 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57349 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57350 SDLoc dl(N);
57351 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57352 RHS.getOperand(0), { 0, -1, 1, -1 });
57353 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57354 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57355 }
57356
57357 return SDValue();
57358}
57359
57360// Simplify VPMADDUBSW/VPMADDWD operations.
57361static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57362 TargetLowering::DAGCombinerInfo &DCI) {
57363 EVT VT = N->getValueType(0);
57364 SDValue LHS = N->getOperand(0);
57365 SDValue RHS = N->getOperand(1);
57366
57367 // Multiply by zero.
57368 // Don't return LHS/RHS as it may contain UNDEFs.
57369 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57370 ISD::isBuildVectorAllZeros(RHS.getNode()))
57371 return DAG.getConstant(0, SDLoc(N), VT);
57372
57373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57374 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57375 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57376 return SDValue(N, 0);
57377
57378 return SDValue();
57379}
57380
57381static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57382 TargetLowering::DAGCombinerInfo &DCI,
57383 const X86Subtarget &Subtarget) {
57384 EVT VT = N->getValueType(0);
57385 SDValue In = N->getOperand(0);
57386 unsigned Opcode = N->getOpcode();
57387 unsigned InOpcode = In.getOpcode();
57388 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57389 SDLoc DL(N);
57390
57391 // Try to merge vector loads and extend_inreg to an extload.
57392 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57393 In.hasOneUse()) {
57394 auto *Ld = cast<LoadSDNode>(In);
57395 if (Ld->isSimple()) {
57396 MVT SVT = In.getSimpleValueType().getVectorElementType();
57397 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57398 ? ISD::SEXTLOAD
57399 : ISD::ZEXTLOAD;
57400 EVT MemVT = VT.changeVectorElementType(SVT);
57401 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57402 SDValue Load = DAG.getExtLoad(
57403 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57404 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57405 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57406 return Load;
57407 }
57408 }
57409 }
57410
57411 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57412 if (Opcode == InOpcode)
57413 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57414
57415 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57416 // -> EXTEND_VECTOR_INREG(X).
57417 // TODO: Handle non-zero subvector indices.
57418 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57419 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57420 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57421 In.getValueSizeInBits())
57422 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57423
57424 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57425 // TODO: Move to DAGCombine?
57426 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57427 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57428 In.getValueSizeInBits() == VT.getSizeInBits()) {
57429 unsigned NumElts = VT.getVectorNumElements();
57430 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57431 EVT EltVT = In.getOperand(0).getValueType();
57432 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57433 for (unsigned I = 0; I != NumElts; ++I)
57434 Elts[I * Scale] = In.getOperand(I);
57435 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57436 }
57437
57438 // Attempt to combine as a shuffle on SSE41+ targets.
57439 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57440 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57441 Subtarget.hasSSE41()) {
57442 SDValue Op(N, 0);
57443 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57444 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57445 return Res;
57446 }
57447
57448 return SDValue();
57449}
57450
57451static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57452 TargetLowering::DAGCombinerInfo &DCI) {
57453 EVT VT = N->getValueType(0);
57454
57455 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57456 return DAG.getConstant(0, SDLoc(N), VT);
57457
57458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57459 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57460 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57461 return SDValue(N, 0);
57462
57463 return SDValue();
57464}
57465
57466// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57467 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
57468 // extra instructions between the conversions due to going to scalar and back.
57469static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57470 const X86Subtarget &Subtarget) {
57471 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57472 return SDValue();
57473
57474 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57475 return SDValue();
57476
57477 if (N->getValueType(0) != MVT::f32 ||
57478 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57479 return SDValue();
57480
57481 SDLoc dl(N);
57482 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57483 N->getOperand(0).getOperand(0));
57484 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57485 DAG.getTargetConstant(4, dl, MVT::i32));
57486 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57488 DAG.getIntPtrConstant(0, dl));
57489}
57490
57491static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57492 const X86Subtarget &Subtarget) {
57493 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57494 return SDValue();
57495
57496 if (Subtarget.hasFP16())
57497 return SDValue();
57498
57499 bool IsStrict = N->isStrictFPOpcode();
57500 EVT VT = N->getValueType(0);
57501 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57502 EVT SrcVT = Src.getValueType();
57503
57504 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57505 return SDValue();
57506
57507 if (VT.getVectorElementType() != MVT::f32 &&
57508 VT.getVectorElementType() != MVT::f64)
57509 return SDValue();
57510
57511 unsigned NumElts = VT.getVectorNumElements();
57512 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57513 return SDValue();
57514
57515 SDLoc dl(N);
57516
57517 // Convert the input to vXi16.
57518 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57519 Src = DAG.getBitcast(IntVT, Src);
57520
57521 // Widen to at least 8 input elements.
57522 if (NumElts < 8) {
57523 unsigned NumConcats = 8 / NumElts;
57524 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57525 : DAG.getConstant(0, dl, IntVT);
57526 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57527 Ops[0] = Src;
57528 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57529 }
57530
57531 // Destination is vXf32 with at least 4 elements.
57532 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57533 std::max(4U, NumElts));
57534 SDValue Cvt, Chain;
57535 if (IsStrict) {
57536 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57537 {N->getOperand(0), Src});
57538 Chain = Cvt.getValue(1);
57539 } else {
57540 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57541 }
57542
57543 if (NumElts < 4) {
57544 assert(NumElts == 2 && "Unexpected size");
57545 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57546 DAG.getIntPtrConstant(0, dl));
57547 }
57548
57549 if (IsStrict) {
57550 // Extend to the original VT if necessary.
57551 if (Cvt.getValueType() != VT) {
57552 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57553 {Chain, Cvt});
57554 Chain = Cvt.getValue(1);
57555 }
57556 return DAG.getMergeValues({Cvt, Chain}, dl);
57557 }
57558
57559 // Extend to the original VT if necessary.
57560 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57561}
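// Illustrative walk-through (not in the original source): for
// (v2f32 fp_extend (v2f16 Src)) on an F16C-only target the steps above are:
//   bitcast v2f16 -> v2i16, concat with three zero v2i16 vectors -> v8i16,
//   X86ISD::CVTPH2PS v8i16 -> v4f32 (CvtVT uses max(4, NumElts) elements),
//   extract_subvector v4f32 -> v2f32 at index 0.
// A v2f64 result would additionally pass through the trailing FP_EXTEND.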
57562
57563// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57564// from. Limit this to cases where the loads have the same input chain and the
57565// output chains are unused. This avoids any memory ordering issues.
57566static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57567 TargetLowering::DAGCombinerInfo &DCI) {
57568 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57569         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57570        "Unknown broadcast load type");
57571
57572 // Only do this if the chain result is unused.
57573 if (N->hasAnyUseOfValue(1))
57574 return SDValue();
57575
57576 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57577
57578 SDValue Ptr = MemIntrin->getBasePtr();
57579 SDValue Chain = MemIntrin->getChain();
57580 EVT VT = N->getSimpleValueType(0);
57581 EVT MemVT = MemIntrin->getMemoryVT();
57582
57583 // Look at other users of our base pointer and try to find a wider broadcast.
57584 // The input chain and the size of the memory VT must match.
57585 for (SDNode *User : Ptr->uses())
57586 if (User != N && User->getOpcode() == N->getOpcode() &&
57587 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57588 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57589 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57590 MemVT.getSizeInBits() &&
57591 !User->hasAnyUseOfValue(1) &&
57592 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57593 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57594 VT.getSizeInBits());
57595 Extract = DAG.getBitcast(VT, Extract);
57596 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57597 }
57598
57599 return SDValue();
57600}
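// Hypothetical example (not in the original source): if the DAG already holds
// a wider (v8f32 X86ISD::VBROADCAST_LOAD Ptr) whose chain output is unused and
// N is a (v4f32 X86ISD::VBROADCAST_LOAD Ptr) on the same chain, the loop above
// reuses the wider node and returns (v4f32 extract_subvector Wide, 0), leaving
// a single broadcast load.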
57601
57602static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57603 const X86Subtarget &Subtarget) {
57604 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57605 return SDValue();
57606
57607 bool IsStrict = N->isStrictFPOpcode();
57608 EVT VT = N->getValueType(0);
57609 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57610 EVT SrcVT = Src.getValueType();
57611
57612 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57613 SrcVT.getVectorElementType() != MVT::f32)
57614 return SDValue();
57615
57616 SDLoc dl(N);
57617
57618 SDValue Cvt, Chain;
57619 unsigned NumElts = VT.getVectorNumElements();
57620 if (Subtarget.hasFP16()) {
57621 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57622 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57623 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57624 SDValue Cvt0, Cvt1;
57625 SDValue Op0 = Src.getOperand(0);
57626 SDValue Op1 = Src.getOperand(1);
57627 bool IsOp0Strict = Op0->isStrictFPOpcode();
57628 if (Op0.getOpcode() != Op1.getOpcode() ||
57629 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57630 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57631 return SDValue();
57632 }
57633 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57634 if (IsStrict) {
57635 assert(IsOp0Strict && "Op0 must be strict node");
57636 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57637 ? X86ISD::STRICT_CVTSI2P
57638 : X86ISD::STRICT_CVTUI2P;
57639 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57640 {Op0.getOperand(0), Op0.getOperand(1)});
57641 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57642 {Op1.getOperand(0), Op1.getOperand(1)});
57643 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57644 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57645 }
57646 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57647 : X86ISD::CVTUI2P;
57648 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57649 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57650 return DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57651 }
57652 return SDValue();
57653 }
57654
57655 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57656 return SDValue();
57657
57658 // Widen to at least 4 input elements.
57659 if (NumElts < 4)
57660 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57661 DAG.getConstantFP(0.0, dl, SrcVT));
57662
57663 // Destination is v8i16 with at least 8 elements.
57664 EVT CvtVT =
57665 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57666 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57667 if (IsStrict) {
57668 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57669 {N->getOperand(0), Src, Rnd});
57670 Chain = Cvt.getValue(1);
57671 } else {
57672 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57673 }
57674
57675 // Extract down to real number of elements.
57676 if (NumElts < 8) {
57677 EVT IntVT = VT.changeVectorElementTypeToInteger();
57678 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57679 DAG.getIntPtrConstant(0, dl));
57680 }
57681
57682 Cvt = DAG.getBitcast(VT, Cvt);
57683
57684 if (IsStrict)
57685 return DAG.getMergeValues({Cvt, Chain}, dl);
57686
57687 return Cvt;
57688}
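// Illustrative example (not in the original source): on an F16C-only target,
// (v4f16 fp_round (v4f32 Src)) takes the path above as
//   X86ISD::CVTPS2PH v4f32 -> v8i16 with rounding immediate 4 (current
//   direction), extract_subvector v8i16 -> v4i16 at index 0, then
//   bitcast v4i16 -> v4f16.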
57689
57690static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57691 SDValue Src = N->getOperand(0);
57692
57693 // Turn MOVDQ2Q+simple_load into an mmx load.
57694 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57695 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57696
57697 if (LN->isSimple()) {
57698 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57699 LN->getBasePtr(),
57700 LN->getPointerInfo(),
57701 LN->getOriginalAlign(),
57702 LN->getMemOperand()->getFlags());
57703 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57704 return NewLd;
57705 }
57706 }
57707
57708 return SDValue();
57709}
57710
57711static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57712 TargetLowering::DAGCombinerInfo &DCI) {
57713 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57715 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57716 return SDValue(N, 0);
57717
57718 return SDValue();
57719}
57720
57721SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57722 DAGCombinerInfo &DCI) const {
57723 SelectionDAG &DAG = DCI.DAG;
57724 switch (N->getOpcode()) {
57725 default: break;
57726 case ISD::SCALAR_TO_VECTOR:
57727 return combineScalarToVector(N, DAG);
57728 case ISD::EXTRACT_VECTOR_ELT:
57729 case X86ISD::PEXTRW:
57730 case X86ISD::PEXTRB:
57731 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57732 case ISD::CONCAT_VECTORS:
57733 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57734 case ISD::INSERT_SUBVECTOR:
57735 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57736 case ISD::EXTRACT_SUBVECTOR:
57737 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57738 case ISD::VSELECT:
57739 case ISD::SELECT:
57740 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57741 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57742 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57743 case X86ISD::CMP: return combineCMP(N, DAG);
57744 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57745 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57746 case X86ISD::ADD:
57747 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57748 case X86ISD::SBB: return combineSBB(N, DAG);
57749 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57750 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57751 case ISD::SHL: return combineShiftLeft(N, DAG);
57752 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57753 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57754 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57755 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57756 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57757 case X86ISD::BEXTR:
57758 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57759 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57760 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57761 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57762 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57763 case X86ISD::VEXTRACT_STORE:
57764 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57765 case ISD::SINT_TO_FP:
57766 case ISD::STRICT_SINT_TO_FP:
57767 return combineSIntToFP(N, DAG, DCI, Subtarget);
57768 case ISD::UINT_TO_FP:
57769 case ISD::STRICT_UINT_TO_FP:
57770 return combineUIntToFP(N, DAG, Subtarget);
57771 case ISD::FADD:
57772 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57773 case X86ISD::VFCMULC:
57774 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57775 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57776 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57777 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57778 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57779 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57780 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57781 case X86ISD::FXOR:
57782 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57783 case X86ISD::FMIN:
57784 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57785 case ISD::FMINNUM:
57786 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57787 case X86ISD::CVTSI2P:
57788 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57789 case X86ISD::CVTP2SI:
57790 case X86ISD::CVTP2UI:
57791 case X86ISD::STRICT_CVTTP2SI:
57792 case X86ISD::CVTTP2SI:
57793 case X86ISD::STRICT_CVTTP2UI:
57794 case X86ISD::CVTTP2UI:
57795 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57796 case X86ISD::STRICT_CVTPH2PS:
57797 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57798 case X86ISD::BT: return combineBT(N, DAG, DCI);
57799 case ISD::ANY_EXTEND:
57800 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57801 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57802 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57803 case ISD::ANY_EXTEND_VECTOR_INREG:
57804 case ISD::SIGN_EXTEND_VECTOR_INREG:
57805 case ISD::ZERO_EXTEND_VECTOR_INREG:
57806 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57807 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57808 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57809 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57810 case X86ISD::PACKSS:
57811 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57812 case X86ISD::HADD:
57813 case X86ISD::HSUB:
57814 case X86ISD::FHADD:
57815 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57816 case X86ISD::VSHL:
57817 case X86ISD::VSRA:
57818 case X86ISD::VSRL:
57819 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57820 case X86ISD::VSHLI:
57821 case X86ISD::VSRAI:
57822 case X86ISD::VSRLI:
57823 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57824 case ISD::INSERT_VECTOR_ELT:
57825 case X86ISD::PINSRB:
57826 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57827 case X86ISD::SHUFP: // Handle all target specific shuffles
57828 case X86ISD::INSERTPS:
57829 case X86ISD::EXTRQI:
57830 case X86ISD::INSERTQI:
57831 case X86ISD::VALIGN:
57832 case X86ISD::PALIGNR:
57833 case X86ISD::VSHLDQ:
57834 case X86ISD::VSRLDQ:
57835 case X86ISD::BLENDI:
57836 case X86ISD::UNPCKH:
57837 case X86ISD::UNPCKL:
57838 case X86ISD::MOVHLPS:
57839 case X86ISD::MOVLHPS:
57840 case X86ISD::PSHUFB:
57841 case X86ISD::PSHUFD:
57842 case X86ISD::PSHUFHW:
57843 case X86ISD::PSHUFLW:
57844 case X86ISD::MOVSHDUP:
57845 case X86ISD::MOVSLDUP:
57846 case X86ISD::MOVDDUP:
57847 case X86ISD::MOVSS:
57848 case X86ISD::MOVSD:
57849 case X86ISD::MOVSH:
57850 case X86ISD::VBROADCAST:
57851 case X86ISD::VPPERM:
57852 case X86ISD::VPERMI:
57853 case X86ISD::VPERMV:
57854 case X86ISD::VPERMV3:
57855 case X86ISD::VPERMIL2:
57856 case X86ISD::VPERMILPI:
57857 case X86ISD::VPERMILPV:
57858 case X86ISD::VPERM2X128:
57859 case X86ISD::SHUF128:
57860 case X86ISD::VZEXT_MOVL:
57861 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57862 case X86ISD::FMADD_RND:
57863 case X86ISD::FMSUB:
57864 case X86ISD::STRICT_FMSUB:
57865 case X86ISD::FMSUB_RND:
57866 case X86ISD::FNMADD:
57867 case X86ISD::STRICT_FNMADD:
57868 case X86ISD::FNMADD_RND:
57869 case X86ISD::FNMSUB:
57870 case X86ISD::STRICT_FNMSUB:
57871 case X86ISD::FNMSUB_RND:
57872 case ISD::FMA:
57873 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57874 case X86ISD::FMADDSUB_RND:
57875 case X86ISD::FMSUBADD_RND:
57876 case X86ISD::FMADDSUB:
57877 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57878 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57879 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57880 case X86ISD::MGATHER:
57881 case X86ISD::MSCATTER:
57882 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57883 case ISD::MGATHER:
57884 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57885 case X86ISD::PCMPEQ:
57886 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57887 case X86ISD::PMULDQ:
57888 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57889 case X86ISD::VPMADDUBSW:
57890 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57891 case X86ISD::KSHIFTL:
57892 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57893 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57894 case ISD::STRICT_FP_EXTEND:
57895 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57896 case ISD::STRICT_FP_ROUND:
57897 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57898 case X86ISD::VBROADCAST_LOAD:
57899 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57900 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57901 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57902 }
57903
57904 return SDValue();
57905}
57906
57907bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57908 return false;
57909}
57910
57911bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57912 if (!isTypeLegal(VT))
57913 return false;
57914
57915 // There are no vXi8 shifts.
57916 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57917 return false;
57918
57919 // TODO: Almost no 8-bit ops are desirable because they have no actual
57920 // size/speed advantages vs. 32-bit ops, but they do have a major
57921 // potential disadvantage by causing partial register stalls.
57922 //
57923 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57924 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57925 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57926 // check for a constant operand to the multiply.
57927 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57928 return false;
57929
57930 // i16 instruction encodings are longer and some i16 instructions are slow,
57931 // so those are not desirable.
57932 if (VT == MVT::i16) {
57933 switch (Opc) {
57934 default:
57935 break;
57936 case ISD::LOAD:
57937 case ISD::SIGN_EXTEND:
57938 case ISD::ZERO_EXTEND:
57939 case ISD::ANY_EXTEND:
57940 case ISD::SHL:
57941 case ISD::SRA:
57942 case ISD::SRL:
57943 case ISD::SUB:
57944 case ISD::ADD:
57945 case ISD::MUL:
57946 case ISD::AND:
57947 case ISD::OR:
57948 case ISD::XOR:
57949 return false;
57950 }
57951 }
57952
57953 // Any legal type not explicitly accounted for above here is desirable.
57954 return true;
57955}
57956
57957SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57958 SDValue Value, SDValue Addr,
57959 SelectionDAG &DAG) const {
57960 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57961 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57962 if (IsCFProtectionSupported) {
57963 // When control-flow branch protection is enabled, we need to add a
57964 // notrack prefix to the indirect branch.
57965 // To do that we create an NT_BRIND SDNode.
57966 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
57967 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57968 }
57969
57970 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57971}
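// Editorial assumption about the selected assembly (not in the original
// source): with the "cf-protection-branch" module flag set, the NT_BRIND node
// created above is expected to select to an indirect jump carrying the notrack
// prefix, e.g. "notrack jmpq *%rax", instead of a plain "jmpq *%rax" through
// the jump table.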
57972
57973TargetLowering::AndOrSETCCFoldKind
57974X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57975 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57976 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57977 EVT VT = LogicOp->getValueType(0);
57978 EVT OpVT = SETCC0->getOperand(0).getValueType();
57979 if (!VT.isInteger())
57980 return AndOrSETCCFoldKind::None;
57981
57982 if (VT.isVector())
57983 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57984 (isOperationLegal(ISD::ABS, OpVT)
57985 ? AndOrSETCCFoldKind::ABS
57986 : AndOrSETCCFoldKind::None));
57987
57988 // Don't use `NotAnd`: even though `not` is generally shorter code size than
57989 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
57990 // where `NotAnd` applies, `AddAnd` does as well.
57991 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
57992 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57993 return AndOrSETCCFoldKind::AddAnd;
57994}
57995
57996bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57997 EVT VT = Op.getValueType();
57998 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57999 isa<ConstantSDNode>(Op.getOperand(1));
58000
58001 // i16 is legal, but undesirable since i16 instruction encodings are longer
58002 // and some i16 instructions are slow.
58003 // 8-bit multiply-by-constant can usually be expanded to something cheaper
58004 // using LEA and/or other ALU ops.
58005 if (VT != MVT::i16 && !Is8BitMulByConstant)
58006 return false;
58007
58008 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
58009 if (!Op.hasOneUse())
58010 return false;
58011 SDNode *User = *Op->use_begin();
58012 if (!ISD::isNormalStore(User))
58013 return false;
58014 auto *Ld = cast<LoadSDNode>(Load);
58015 auto *St = cast<StoreSDNode>(User);
58016 return Ld->getBasePtr() == St->getBasePtr();
58017 };
58018
58019 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
58020 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
58021 return false;
58022 if (!Op.hasOneUse())
58023 return false;
58024 SDNode *User = *Op->use_begin();
58025 if (User->getOpcode() != ISD::ATOMIC_STORE)
58026 return false;
58027 auto *Ld = cast<AtomicSDNode>(Load);
58028 auto *St = cast<AtomicSDNode>(User);
58029 return Ld->getBasePtr() == St->getBasePtr();
58030 };
58031
58032 bool Commute = false;
58033 switch (Op.getOpcode()) {
58034 default: return false;
58035 case ISD::SIGN_EXTEND:
58036 case ISD::ZERO_EXTEND:
58037 case ISD::ANY_EXTEND:
58038 break;
58039 case ISD::SHL:
58040 case ISD::SRA:
58041 case ISD::SRL: {
58042 SDValue N0 = Op.getOperand(0);
58043 // Look out for (store (shl (load), x)).
58044 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
58045 return false;
58046 break;
58047 }
58048 case ISD::ADD:
58049 case ISD::MUL:
58050 case ISD::AND:
58051 case ISD::OR:
58052 case ISD::XOR:
58053 Commute = true;
58054 [[fallthrough]];
58055 case ISD::SUB: {
58056 SDValue N0 = Op.getOperand(0);
58057 SDValue N1 = Op.getOperand(1);
58058 // Avoid disabling potential load folding opportunities.
58059 if (X86::mayFoldLoad(N1, Subtarget) &&
58060 (!Commute || !isa<ConstantSDNode>(N0) ||
58061 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
58062 return false;
58063 if (X86::mayFoldLoad(N0, Subtarget) &&
58064 ((Commute && !isa<ConstantSDNode>(N1)) ||
58065 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
58066 return false;
58067 if (IsFoldableAtomicRMW(N0, Op) ||
58068 (Commute && IsFoldableAtomicRMW(N1, Op)))
58069 return false;
58070 }
58071 }
58072
58073 PVT = MVT::i32;
58074 return true;
58075}
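// Illustrative consequence of the checks above (not in the original source): a
// register-register i16 add such as (add i16 %a, %b) reports PVT = i32 and is
// promoted, while (store p, (shl (load p), c)) is left as i16 so the load and
// store can fold into a single read-modify-write shift.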
58076
58077//===----------------------------------------------------------------------===//
58078// X86 Inline Assembly Support
58079//===----------------------------------------------------------------------===//
58080
58081// Helper to match an asm string against whitespace-separated pieces.
58082static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
58083 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
58084
58085 for (StringRef Piece : Pieces) {
58086 if (!S.startswith(Piece)) // Check if the piece matches.
58087 return false;
58088
58089 S = S.substr(Piece.size());
58090 StringRef::size_type Pos = S.find_first_not_of(" \t");
58091 if (Pos == 0) // We matched a prefix.
58092 return false;
58093
58094 S = S.substr(Pos);
58095 }
58096
58097 return S.empty();
58098}
58099
58100static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
58101
58102 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
58103 if (llvm::is_contained(AsmPieces, "~{cc}") &&
58104 llvm::is_contained(AsmPieces, "~{flags}") &&
58105 llvm::is_contained(AsmPieces, "~{fpsr}")) {
58106
58107 if (AsmPieces.size() == 3)
58108 return true;
58109 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
58110 return true;
58111 }
58112 }
58113 return false;
58114}
58115
58116bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
58117 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58118
58119 const std::string &AsmStr = IA->getAsmString();
58120
58121 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58122 if (!Ty || Ty->getBitWidth() % 16 != 0)
58123 return false;
58124
58125 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58126 SmallVector<StringRef, 4> AsmPieces;
58127 SplitString(AsmStr, AsmPieces, ";\n");
58128
58129 switch (AsmPieces.size()) {
58130 default: return false;
58131 case 1:
58132 // FIXME: this should verify that we are targeting a 486 or better. If not,
58133 // we will turn this bswap into something that will be lowered to logical
58134 // ops instead of emitting the bswap asm. For now, we don't support 486 or
58135 // lower so don't worry about this.
58136 // bswap $0
58137 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
58138 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
58139 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
58140 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
58141 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
58142 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
58143 // No need to check constraints, nothing other than the equivalent of
58144 // "=r,0" would be valid here.
58145 return IntrinsicLowering::LowerToByteSwap(CI);
58146 }
58147
58148 // rorw $$8, ${0:w} --> llvm.bswap.i16
58149 if (CI->getType()->isIntegerTy(16) &&
58150 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58151 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
58152 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
58153 AsmPieces.clear();
58154 StringRef ConstraintsStr = IA->getConstraintString();
58155 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58156 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58157 if (clobbersFlagRegisters(AsmPieces))
58158 return IntrinsicLowering::LowerToByteSwap(CI);
58159 }
58160 break;
58161 case 3:
58162 if (CI->getType()->isIntegerTy(32) &&
58163 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58164 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
58165 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
58166 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
58167 AsmPieces.clear();
58168 StringRef ConstraintsStr = IA->getConstraintString();
58169 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58170 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58171 if (clobbersFlagRegisters(AsmPieces))
58172 return IntrinsicLowering::LowerToByteSwap(CI);
58173 }
58174
58175 if (CI->getType()->isIntegerTy(64)) {
58176 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58177 if (Constraints.size() >= 2 &&
58178 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
58179 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
58180 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
58181 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
58182 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
58183 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
58184 return IntrinsicLowering::LowerToByteSwap(CI);
58185 }
58186 }
58187 break;
58188 }
58189 return false;
58190}
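// Hypothetical user code matching the patterns above (not part of this file):
//   asm("bswap %0" : "=r"(x) : "0"(x));
// reaches this function as the single piece {"bswap", "$0"} and is replaced by
// a call to llvm.bswap.*; the rorw/rolw form and the three-instruction
// bswap/bswap/xchgl form above are handled the same way once the constraint
// string and flag-clobber checks succeed.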
58191
58192static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
58193 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
58194 .Case("{@cca}", X86::COND_A)
58195 .Case("{@ccae}", X86::COND_AE)
58196 .Case("{@ccb}", X86::COND_B)
58197 .Case("{@ccbe}", X86::COND_BE)
58198 .Case("{@ccc}", X86::COND_B)
58199 .Case("{@cce}", X86::COND_E)
58200 .Case("{@ccz}", X86::COND_E)
58201 .Case("{@ccg}", X86::COND_G)
58202 .Case("{@ccge}", X86::COND_GE)
58203 .Case("{@ccl}", X86::COND_L)
58204 .Case("{@ccle}", X86::COND_LE)
58205 .Case("{@ccna}", X86::COND_BE)
58206 .Case("{@ccnae}", X86::COND_B)
58207 .Case("{@ccnb}", X86::COND_AE)
58208 .Case("{@ccnbe}", X86::COND_A)
58209 .Case("{@ccnc}", X86::COND_AE)
58210 .Case("{@ccne}", X86::COND_NE)
58211 .Case("{@ccnz}", X86::COND_NE)
58212 .Case("{@ccng}", X86::COND_LE)
58213 .Case("{@ccnge}", X86::COND_L)
58214 .Case("{@ccnl}", X86::COND_GE)
58215 .Case("{@ccnle}", X86::COND_G)
58216 .Case("{@ccno}", X86::COND_NO)
58217 .Case("{@ccnp}", X86::COND_NP)
58218 .Case("{@ccns}", X86::COND_NS)
58219 .Case("{@cco}", X86::COND_O)
58220 .Case("{@ccp}", X86::COND_P)
58221 .Case("{@ccs}", X86::COND_S)
58222 .Default(X86::COND_INVALID);
58223 return Cond;
58224}
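// Hypothetical user code (not part of this file): the "{@cc*}" strings above
// come from flag-output inline asm constraints, e.g.
//   bool Eq;
//   asm("cmpq %2, %1" : "=@ccz"(Eq) : "r"(A), "r"(B));
// where the "=@ccz" output reaches the backend as "{@ccz}" and maps to
// X86::COND_E.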
58225
58226/// Given a constraint letter, return the type of constraint for this target.
58227X86TargetLowering::ConstraintType
58228X86TargetLowering::getConstraintType(StringRef Constraint) const {
58229 if (Constraint.size() == 1) {
58230 switch (Constraint[0]) {
58231 case 'R':
58232 case 'q':
58233 case 'Q':
58234 case 'f':
58235 case 't':
58236 case 'u':
58237 case 'y':
58238 case 'x':
58239 case 'v':
58240 case 'l':
58241 case 'k': // AVX512 masking registers.
58242 return C_RegisterClass;
58243 case 'a':
58244 case 'b':
58245 case 'c':
58246 case 'd':
58247 case 'S':
58248 case 'D':
58249 case 'A':
58250 return C_Register;
58251 case 'I':
58252 case 'J':
58253 case 'K':
58254 case 'N':
58255 case 'G':
58256 case 'L':
58257 case 'M':
58258 return C_Immediate;
58259 case 'C':
58260 case 'e':
58261 case 'Z':
58262 return C_Other;
58263 default:
58264 break;
58265 }
58266 }
58267 else if (Constraint.size() == 2) {
58268 switch (Constraint[0]) {
58269 default:
58270 break;
58271 case 'Y':
58272 switch (Constraint[1]) {
58273 default:
58274 break;
58275 case 'z':
58276 return C_Register;
58277 case 'i':
58278 case 'm':
58279 case 'k':
58280 case 't':
58281 case '2':
58282 return C_RegisterClass;
58283 }
58284 }
58285 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58286 return C_Other;
58287 return TargetLowering::getConstraintType(Constraint);
58288}
58289
58290/// Examine constraint type and operand type and determine a weight value.
58291/// This object must already have been set up with the operand type
58292/// and the current alternative constraint selected.
58293TargetLowering::ConstraintWeight
58294 X86TargetLowering::getSingleConstraintMatchWeight(
58295 AsmOperandInfo &info, const char *constraint) const {
58296 ConstraintWeight weight = CW_Invalid;
58297 Value *CallOperandVal = info.CallOperandVal;
58298 // If we don't have a value, we can't do a match,
58299 // but allow it at the lowest weight.
58300 if (!CallOperandVal)
58301 return CW_Default;
58302 Type *type = CallOperandVal->getType();
58303 // Look at the constraint type.
58304 switch (*constraint) {
58305 default:
58306 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58307 [[fallthrough]];
58308 case 'R':
58309 case 'q':
58310 case 'Q':
58311 case 'a':
58312 case 'b':
58313 case 'c':
58314 case 'd':
58315 case 'S':
58316 case 'D':
58317 case 'A':
58318 if (CallOperandVal->getType()->isIntegerTy())
58319 weight = CW_SpecificReg;
58320 break;
58321 case 'f':
58322 case 't':
58323 case 'u':
58324 if (type->isFloatingPointTy())
58325 weight = CW_SpecificReg;
58326 break;
58327 case 'y':
58328 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58329 weight = CW_SpecificReg;
58330 break;
58331 case 'Y':
58332 if (StringRef(constraint).size() != 2)
58333 break;
58334 switch (constraint[1]) {
58335 default:
58336 return CW_Invalid;
58337 // XMM0
58338 case 'z':
58339 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58340 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58341 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58342 return CW_SpecificReg;
58343 return CW_Invalid;
58344 // Conditional OpMask regs (AVX512)
58345 case 'k':
58346 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58347 return CW_Register;
58348 return CW_Invalid;
58349 // Any MMX reg
58350 case 'm':
58351 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58352 return weight;
58353 return CW_Invalid;
58354 // Any SSE reg when ISA >= SSE2, same as 'x'
58355 case 'i':
58356 case 't':
58357 case '2':
58358 if (!Subtarget.hasSSE2())
58359 return CW_Invalid;
58360 break;
58361 }
58362 break;
58363 case 'v':
58364 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58365 weight = CW_Register;
58366 [[fallthrough]];
58367 case 'x':
58368 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58369 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58370 weight = CW_Register;
58371 break;
58372 case 'k':
58373 // Enable conditional vector operations using %k<#> registers.
58374 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58375 weight = CW_Register;
58376 break;
58377 case 'I':
58378 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58379 if (C->getZExtValue() <= 31)
58380 weight = CW_Constant;
58381 }
58382 break;
58383 case 'J':
58384 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58385 if (C->getZExtValue() <= 63)
58386 weight = CW_Constant;
58387 }
58388 break;
58389 case 'K':
58390 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58391 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58392 weight = CW_Constant;
58393 }
58394 break;
58395 case 'L':
58396 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58397 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58398 weight = CW_Constant;
58399 }
58400 break;
58401 case 'M':
58402 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58403 if (C->getZExtValue() <= 3)
58404 weight = CW_Constant;
58405 }
58406 break;
58407 case 'N':
58408 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58409 if (C->getZExtValue() <= 0xff)
58410 weight = CW_Constant;
58411 }
58412 break;
58413 case 'G':
58414 case 'C':
58415 if (isa<ConstantFP>(CallOperandVal)) {
58416 weight = CW_Constant;
58417 }
58418 break;
58419 case 'e':
58420 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58421 if ((C->getSExtValue() >= -0x80000000LL) &&
58422 (C->getSExtValue() <= 0x7fffffffLL))
58423 weight = CW_Constant;
58424 }
58425 break;
58426 case 'Z':
58427 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58428 if (C->getZExtValue() <= 0xffffffff)
58429 weight = CW_Constant;
58430 }
58431 break;
58432 }
58433 return weight;
58434}
58435
58436/// Try to replace an X constraint, which matches anything, with another that
58437/// has more specific requirements based on the type of the corresponding
58438/// operand.
58439const char *X86TargetLowering::
58440LowerXConstraint(EVT ConstraintVT) const {
58441 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58442 // 'f' like normal targets.
58443 if (ConstraintVT.isFloatingPoint()) {
58444 if (Subtarget.hasSSE1())
58445 return "x";
58446 }
58447
58448 return TargetLowering::LowerXConstraint(ConstraintVT);
58449}
58450
58451// Lower @cc targets via setcc.
58452SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58453 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58454 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58455 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58456 if (Cond == X86::COND_INVALID)
58457 return SDValue();
58458 // Check that return type is valid.
58459 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58460 OpInfo.ConstraintVT.getSizeInBits() < 8)
58461 report_fatal_error("Glue output operand is of invalid type");
58462
58463 // Get EFLAGS register. Only update chain when copyfrom is glued.
58464 if (Glue.getNode()) {
58465 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58466 Chain = Glue.getValue(1);
58467 } else
58468 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58469 // Extract CC code.
58470 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58471 // Zero-extend to the constraint's integer type.
58472 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58473
58474 return Result;
58475}
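// Illustrative shape of the result (not in the original source): for an i32
// "=@ccz" output the sequence built above is roughly
//   Glue   = CopyFromReg(Chain, EFLAGS)          // read EFLAGS
//   CC     = X86ISD::SETCC(COND_E, Glue)         // i8 flag materialization
//   Result = zero_extend CC to i32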
58476
58477/// Lower the specified operand into the Ops vector.
58478/// If it is invalid, don't add anything to Ops.
58479void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58480 std::string &Constraint,
58481 std::vector<SDValue>&Ops,
58482 SelectionDAG &DAG) const {
58483 SDValue Result;
58484
58485 // Only support length 1 constraints for now.
58486 if (Constraint.length() > 1) return;
58487
58488 char ConstraintLetter = Constraint[0];
58489 switch (ConstraintLetter) {
58490 default: break;
58491 case 'I':
58492 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58493 if (C->getZExtValue() <= 31) {
58494 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58495 Op.getValueType());
58496 break;
58497 }
58498 }
58499 return;
58500 case 'J':
58501 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58502 if (C->getZExtValue() <= 63) {
58503 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58504 Op.getValueType());
58505 break;
58506 }
58507 }
58508 return;
58509 case 'K':
58510 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58511 if (isInt<8>(C->getSExtValue())) {
58512 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58513 Op.getValueType());
58514 break;
58515 }
58516 }
58517 return;
58518 case 'L':
58519 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58520 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58521 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58522 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58523 Op.getValueType());
58524 break;
58525 }
58526 }
58527 return;
58528 case 'M':
58529 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58530 if (C->getZExtValue() <= 3) {
58531 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58532 Op.getValueType());
58533 break;
58534 }
58535 }
58536 return;
58537 case 'N':
58538 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58539 if (C->getZExtValue() <= 255) {
58540 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58541 Op.getValueType());
58542 break;
58543 }
58544 }
58545 return;
58546 case 'O':
58547 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58548 if (C->getZExtValue() <= 127) {
58549 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58550 Op.getValueType());
58551 break;
58552 }
58553 }
58554 return;
58555 case 'e': {
58556 // 32-bit signed value
58557 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58558 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58559 C->getSExtValue())) {
58560 // Widen to 64 bits here to get it sign extended.
58561 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58562 break;
58563 }
58564 // FIXME gcc accepts some relocatable values here too, but only in certain
58565 // memory models; it's complicated.
58566 }
58567 return;
58568 }
58569 case 'Z': {
58570 // 32-bit unsigned value
58571 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58572 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58573 C->getZExtValue())) {
58574 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58575 Op.getValueType());
58576 break;
58577 }
58578 }
58579 // FIXME gcc accepts some relocatable values here too, but only in certain
58580 // memory models; it's complicated.
58581 return;
58582 }
58583 case 'i': {
58584 // Literal immediates are always ok.
58585 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58586 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58587 BooleanContent BCont = getBooleanContents(MVT::i64);
58588 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58589 : ISD::SIGN_EXTEND;
58590 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58591 : CST->getSExtValue();
58592 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58593 break;
58594 }
58595
58596 // In any sort of PIC mode, addresses need to be computed at runtime by
58597 // adding in a register or via some sort of table lookup, so they can't
58598 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58599 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58600 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58601 return;
58602
58603 // If we are in non-pic codegen mode, we allow the address of a global (with
58604 // an optional displacement) to be used with 'i'.
58605 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58606 // If we require an extra load to get this address, as in PIC mode, we
58607 // can't accept it.
58608 if (isGlobalStubReference(
58609 Subtarget.classifyGlobalReference(GA->getGlobal())))
58610 return;
58611 break;
58612 }
58613 }
58614
58615 if (Result.getNode()) {
58616 Ops.push_back(Result);
58617 return;
58618 }
58619 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58620}
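// Hypothetical user code (not part of this file): for
//   asm("shll %1, %0" : "+r"(x) : "I"(5));
// the 'I' case above accepts the constant (5 <= 31) and emits it as a target
// constant, while a value such as 40 falls through the early return and is
// rejected later as an invalid operand for the constraint.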
58621
58622/// Check if \p RC is a general purpose register class.
58623/// I.e., GR* or one of their variant.
58624static bool isGRClass(const TargetRegisterClass &RC) {
58625 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58626 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58627 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58628 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58629 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58630}
58631
58632/// Check if \p RC is a vector register class.
58633/// I.e., FR* / VR* or one of their variant.
58634static bool isFRClass(const TargetRegisterClass &RC) {
58635 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58636 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58637 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58638 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58639 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58640 RC.hasSuperClassEq(&X86::VR512RegClass);
58641}
58642
58643/// Check if \p RC is a mask register class.
58644/// I.e., VK* or one of their variant.
58645static bool isVKClass(const TargetRegisterClass &RC) {
58646 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58647 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58648 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58649 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58650 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58651 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58652 RC.hasSuperClassEq(&X86::VK64RegClass);
58653}
58654
58655std::pair<unsigned, const TargetRegisterClass *>
58656X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58657 StringRef Constraint,
58658 MVT VT) const {
58659 // First, see if this is a constraint that directly corresponds to an LLVM
58660 // register class.
58661 if (Constraint.size() == 1) {
58662 // GCC Constraint Letters
58663 switch (Constraint[0]) {
58664 default: break;
58665 // 'A' means [ER]AX + [ER]DX.
58666 case 'A':
58667 if (Subtarget.is64Bit())
58668 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58669 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58670        "Expecting 64, 32 or 16 bit subtarget");
58671 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58672
58673 // TODO: Slight differences here in allocation order and leaving
58674 // RIP in the class. Do they matter any more here than they do
58675 // in the normal allocation?
58676 case 'k':
58677 if (Subtarget.hasAVX512()) {
58678 if (VT == MVT::i1)
58679 return std::make_pair(0U, &X86::VK1RegClass);
58680 if (VT == MVT::i8)
58681 return std::make_pair(0U, &X86::VK8RegClass);
58682 if (VT == MVT::i16)
58683 return std::make_pair(0U, &X86::VK16RegClass);
58684 }
58685 if (Subtarget.hasBWI()) {
58686 if (VT == MVT::i32)
58687 return std::make_pair(0U, &X86::VK32RegClass);
58688 if (VT == MVT::i64)
58689 return std::make_pair(0U, &X86::VK64RegClass);
58690 }
58691 break;
58692 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58693 if (Subtarget.is64Bit()) {
58694 if (VT == MVT::i8 || VT == MVT::i1)
58695 return std::make_pair(0U, &X86::GR8RegClass);
58696 if (VT == MVT::i16)
58697 return std::make_pair(0U, &X86::GR16RegClass);
58698 if (VT == MVT::i32 || VT == MVT::f32)
58699 return std::make_pair(0U, &X86::GR32RegClass);
58700 if (VT != MVT::f80 && !VT.isVector())
58701 return std::make_pair(0U, &X86::GR64RegClass);
58702 break;
58703 }
58704 [[fallthrough]];
58705 // 32-bit fallthrough
58706 case 'Q': // Q_REGS
58707 if (VT == MVT::i8 || VT == MVT::i1)
58708 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58709 if (VT == MVT::i16)
58710 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58711 if (VT == MVT::i32 || VT == MVT::f32 ||
58712 (!VT.isVector() && !Subtarget.is64Bit()))
58713 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58714 if (VT != MVT::f80 && !VT.isVector())
58715 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58716 break;
58717 case 'r': // GENERAL_REGS
58718 case 'l': // INDEX_REGS
58719 if (VT == MVT::i8 || VT == MVT::i1)
58720 return std::make_pair(0U, &X86::GR8RegClass);
58721 if (VT == MVT::i16)
58722 return std::make_pair(0U, &X86::GR16RegClass);
58723 if (VT == MVT::i32 || VT == MVT::f32 ||
58724 (!VT.isVector() && !Subtarget.is64Bit()))
58725 return std::make_pair(0U, &X86::GR32RegClass);
58726 if (VT != MVT::f80 && !VT.isVector())
58727 return std::make_pair(0U, &X86::GR64RegClass);
58728 break;
58729 case 'R': // LEGACY_REGS
58730 if (VT == MVT::i8 || VT == MVT::i1)
58731 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58732 if (VT == MVT::i16)
58733 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58734 if (VT == MVT::i32 || VT == MVT::f32 ||
58735 (!VT.isVector() && !Subtarget.is64Bit()))
58736 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58737 if (VT != MVT::f80 && !VT.isVector())
58738 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58739 break;
58740 case 'f': // FP Stack registers.
58741 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58742 // value to the correct fpstack register class.
58743 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58744 return std::make_pair(0U, &X86::RFP32RegClass);
58745 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58746 return std::make_pair(0U, &X86::RFP64RegClass);
58747 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58748 return std::make_pair(0U, &X86::RFP80RegClass);
58749 break;
58750 case 'y': // MMX_REGS if MMX allowed.
58751 if (!Subtarget.hasMMX()) break;
58752 return std::make_pair(0U, &X86::VR64RegClass);
58753 case 'v':
58754 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58755 if (!Subtarget.hasSSE1()) break;
58756 bool VConstraint = (Constraint[0] == 'v');
58757
58758 switch (VT.SimpleTy) {
58759 default: break;
58760 // Scalar SSE types.
58761 case MVT::f16:
58762 if (VConstraint && Subtarget.hasFP16())
58763 return std::make_pair(0U, &X86::FR16XRegClass);
58764 break;
58765 case MVT::f32:
58766 case MVT::i32:
58767 if (VConstraint && Subtarget.hasVLX())
58768 return std::make_pair(0U, &X86::FR32XRegClass);
58769 return std::make_pair(0U, &X86::FR32RegClass);
58770 case MVT::f64:
58771 case MVT::i64:
58772 if (VConstraint && Subtarget.hasVLX())
58773 return std::make_pair(0U, &X86::FR64XRegClass);
58774 return std::make_pair(0U, &X86::FR64RegClass);
58775 case MVT::i128:
58776 if (Subtarget.is64Bit()) {
58777 if (VConstraint && Subtarget.hasVLX())
58778 return std::make_pair(0U, &X86::VR128XRegClass);
58779 return std::make_pair(0U, &X86::VR128RegClass);
58780 }
58781 break;
58782 // Vector types and fp128.
58783 case MVT::v8f16:
58784 if (!Subtarget.hasFP16())
58785 break;
58786 [[fallthrough]];
58787 case MVT::f128:
58788 case MVT::v16i8:
58789 case MVT::v8i16:
58790 case MVT::v4i32:
58791 case MVT::v2i64:
58792 case MVT::v4f32:
58793 case MVT::v2f64:
58794 if (VConstraint && Subtarget.hasVLX())
58795 return std::make_pair(0U, &X86::VR128XRegClass);
58796 return std::make_pair(0U, &X86::VR128RegClass);
58797 // AVX types.
58798 case MVT::v16f16:
58799 if (!Subtarget.hasFP16())
58800 break;
58801 [[fallthrough]];
58802 case MVT::v32i8:
58803 case MVT::v16i16:
58804 case MVT::v8i32:
58805 case MVT::v4i64:
58806 case MVT::v8f32:
58807 case MVT::v4f64:
58808 if (VConstraint && Subtarget.hasVLX())
58809 return std::make_pair(0U, &X86::VR256XRegClass);
58810 if (Subtarget.hasAVX())
58811 return std::make_pair(0U, &X86::VR256RegClass);
58812 break;
58813 case MVT::v32f16:
58814 if (!Subtarget.hasFP16())
58815 break;
58816 [[fallthrough]];
58817 case MVT::v64i8:
58818 case MVT::v32i16:
58819 case MVT::v8f64:
58820 case MVT::v16f32:
58821 case MVT::v16i32:
58822 case MVT::v8i64:
58823 if (!Subtarget.hasAVX512()) break;
58824 if (VConstraint)
58825 return std::make_pair(0U, &X86::VR512RegClass);
58826 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58827 }
58828 break;
58829 }
58830 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58831 switch (Constraint[1]) {
58832 default:
58833 break;
58834 case 'i':
58835 case 't':
58836 case '2':
58837 return getRegForInlineAsmConstraint(TRI, "x", VT);
58838 case 'm':
58839 if (!Subtarget.hasMMX()) break;
58840 return std::make_pair(0U, &X86::VR64RegClass);
58841 case 'z':
58842 if (!Subtarget.hasSSE1()) break;
58843 switch (VT.SimpleTy) {
58844 default: break;
58845 // Scalar SSE types.
58846 case MVT::f16:
58847 if (!Subtarget.hasFP16())
58848 break;
58849 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58850 case MVT::f32:
58851 case MVT::i32:
58852 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58853 case MVT::f64:
58854 case MVT::i64:
58855 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58856 case MVT::v8f16:
58857 if (!Subtarget.hasFP16())
58858 break;
58859 [[fallthrough]];
58860 case MVT::f128:
58861 case MVT::v16i8:
58862 case MVT::v8i16:
58863 case MVT::v4i32:
58864 case MVT::v2i64:
58865 case MVT::v4f32:
58866 case MVT::v2f64:
58867 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58868 // AVX types.
58869 case MVT::v16f16:
58870 if (!Subtarget.hasFP16())
58871 break;
58872 [[fallthrough]];
58873 case MVT::v32i8:
58874 case MVT::v16i16:
58875 case MVT::v8i32:
58876 case MVT::v4i64:
58877 case MVT::v8f32:
58878 case MVT::v4f64:
58879 if (Subtarget.hasAVX())
58880 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58881 break;
58882 case MVT::v32f16:
58883 if (!Subtarget.hasFP16())
58884 break;
58885 [[fallthrough]];
58886 case MVT::v64i8:
58887 case MVT::v32i16:
58888 case MVT::v8f64:
58889 case MVT::v16f32:
58890 case MVT::v16i32:
58891 case MVT::v8i64:
58892 if (Subtarget.hasAVX512())
58893 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58894 break;
58895 }
58896 break;
58897 case 'k':
58898 // This register class doesn't allocate k0 for masked vector operations.
58899 if (Subtarget.hasAVX512()) {
58900 if (VT == MVT::i1)
58901 return std::make_pair(0U, &X86::VK1WMRegClass);
58902 if (VT == MVT::i8)
58903 return std::make_pair(0U, &X86::VK8WMRegClass);
58904 if (VT == MVT::i16)
58905 return std::make_pair(0U, &X86::VK16WMRegClass);
58906 }
58907 if (Subtarget.hasBWI()) {
58908 if (VT == MVT::i32)
58909 return std::make_pair(0U, &X86::VK32WMRegClass);
58910 if (VT == MVT::i64)
58911 return std::make_pair(0U, &X86::VK64WMRegClass);
58912 }
58913 break;
58914 }
58915 }
58916
58917 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58918 return std::make_pair(0U, &X86::GR32RegClass);
58919
58920 // Use the default implementation in TargetLowering to convert the register
58921 // constraint into a member of a register class.
58922 std::pair<Register, const TargetRegisterClass*> Res;
58923 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58924
58925 // Not found as a standard register?
58926 if (!Res.second) {
58927 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58928 // to/from f80.
58929 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58930 // Map "st(0)" .. "st(7)" to the corresponding FP0 .. FP7 registers.
58931 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58932 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58933 Constraint[3] == '(' &&
58934 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58935 Constraint[5] == ')' && Constraint[6] == '}') {
58936 // st(7) is not allocatable and thus not a member of RFP80. Return
58937 // singleton class in cases where we have a reference to it.
58938 if (Constraint[4] == '7')
58939 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58940 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58941 &X86::RFP80RegClass);
58942 }
58943
58944 // GCC allows "st(0)" to be called just plain "st".
58945 if (StringRef("{st}").equals_insensitive(Constraint))
58946 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58947 }
58948
58949 // flags -> EFLAGS
58950 if (StringRef("{flags}").equals_insensitive(Constraint))
58951 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58952
58953 // dirflag -> DF
58954 // Only allow for clobber.
58955 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58956 VT == MVT::Other)
58957 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58958
58959 // fpsr -> FPSW
58960 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58961 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58962
58963 return Res;
58964 }
58965
58966 // Make sure it isn't a register that requires 64-bit mode.
58967 if (!Subtarget.is64Bit() &&
58968 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58969 TRI->getEncodingValue(Res.first) >= 8) {
58970 // Register requires REX prefix, but we're in 32-bit mode.
58971 return std::make_pair(0, nullptr);
58972 }
58973
58974 // Make sure it isn't a register that requires AVX512.
58975 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58976 TRI->getEncodingValue(Res.first) & 0x10) {
58977 // Register requires EVEX prefix.
58978 return std::make_pair(0, nullptr);
58979 }
58980
58981 // Otherwise, check to see if this is a register class of the wrong value
58982 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58983 // turn into {ax},{dx}.
58984 // MVT::Other is used to specify clobber names.
58985 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58986 return Res; // Correct type already, nothing to do.
58987
58988 // Get a matching integer register of the correct size, i.e. "ax" with
58989 // MVT::i32 should return "eax". This should even work for things like
58990 // getting 64-bit integer registers when given an f64 type.
58991 const TargetRegisterClass *Class = Res.second;
58992 // The generic code will match the first register class that contains the
58993 // given register. Thus, based on the ordering of the tablegened file,
58994 // the "plain" GR classes might not come first.
58995 // Therefore, use a helper method.
58996 if (isGRClass(*Class)) {
58997 unsigned Size = VT.getSizeInBits();
58998 if (Size == 1) Size = 8;
58999 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
59000 return std::make_pair(0, nullptr);
59001 Register DestReg = getX86SubSuperRegister(Res.first, Size);
59002 if (DestReg.isValid()) {
59003 bool is64Bit = Subtarget.is64Bit();
59004 const TargetRegisterClass *RC =
59005 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
59006 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
59007 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
59008 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
59009 if (Size == 64 && !is64Bit) {
59010 // Model GCC's behavior here and select a fixed pair of 32-bit
59011 // registers.
59012 switch (DestReg) {
59013 case X86::RAX:
59014 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59015 case X86::RDX:
59016 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
59017 case X86::RCX:
59018 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
59019 case X86::RBX:
59020 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
59021 case X86::RSI:
59022 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
59023 case X86::RDI:
59024 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
59025 case X86::RBP:
59026 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
59027 default:
59028 return std::make_pair(0, nullptr);
59029 }
59030 }
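// For example, "{rax}" with an i64 operand in 32-bit mode resolves to EAX in
// GR32_ADRegClass; as the comment above says, the class encodes the fixed
// 32-bit register pair (EAX/EDX in this case) that GCC would use for the
// 64-bit value.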
59031 if (RC && RC->contains(DestReg))
59032 return std::make_pair(DestReg, RC);
59033 return Res;
59034 }
59035 // No register found/type mismatch.
59036 return std::make_pair(0, nullptr);
59037 } else if (isFRClass(*Class)) {
59038 // Handle references to XMM physical registers that got mapped into the
59039 // wrong class. This can happen with constraints like {xmm0} where the
59040 // target independent register mapper will just pick the first match it can
59041 // find, ignoring the required type.
59042
59043 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
59044 if (VT == MVT::f16)
59045 Res.second = &X86::FR16XRegClass;
59046 else if (VT == MVT::f32 || VT == MVT::i32)
59047 Res.second = &X86::FR32XRegClass;
59048 else if (VT == MVT::f64 || VT == MVT::i64)
59049 Res.second = &X86::FR64XRegClass;
59050 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59051 Res.second = &X86::VR128XRegClass;
59052 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59053 Res.second = &X86::VR256XRegClass;
59054 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59055 Res.second = &X86::VR512RegClass;
59056 else {
59057 // Type mismatch and not a clobber: return an error.
59058 Res.first = 0;
59059 Res.second = nullptr;
59060 }
59061 } else if (isVKClass(*Class)) {
59062 if (VT == MVT::i1)
59063 Res.second = &X86::VK1RegClass;
59064 else if (VT == MVT::i8)
59065 Res.second = &X86::VK8RegClass;
59066 else if (VT == MVT::i16)
59067 Res.second = &X86::VK16RegClass;
59068 else if (VT == MVT::i32)
59069 Res.second = &X86::VK32RegClass;
59070 else if (VT == MVT::i64)
59071 Res.second = &X86::VK64RegClass;
59072 else {
59073 // Type mismatch and not a clobber: return an error.
59074 Res.first = 0;
59075 Res.second = nullptr;
59076 }
59077 }
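// As with the FR classes above, only Res.second is adjusted to a mask-register
// class that is legal for VT (e.g. "{k1}" used as i16 ends up in VK16RegClass);
// the physical register picked by the generic matcher stays in Res.first.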
59078
59079 return Res;
59080}
59081
59082bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59083 // Integer division on x86 is expensive. However, when aggressively optimizing
59084 // for code size, we prefer to use a div instruction, as it is usually smaller
59085 // than the alternative sequence.
59086 // The exception to this is vector division. Since x86 doesn't have vector
59087 // integer division, leaving the division as-is is a loss even in terms of
59088 // size, because it will have to be scalarized, while the alternative code
59089 // sequence can be performed in vector form.
59090 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59091 return OptSize && !VT.isVector();
59092}
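// Note (summary, not an exhaustive list of callers): the generic DAG combines
// consult this hook before expanding a division by a constant, so returning
// true under minsize keeps the small div/idiv instruction instead of the
// larger multiply-by-magic-constant sequence.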
59093
59094void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59095 if (!Subtarget.is64Bit())
59096 return;
59097
59098 // Update IsSplitCSR in X86MachineFunctionInfo.
59099 X86MachineFunctionInfo *AFI =
59100 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59101 AFI->setIsSplitCSR(true);
59102}
59103
59104void X86TargetLowering::insertCopiesSplitCSR(
59105 MachineBasicBlock *Entry,
59106 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59107 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59108 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59109 if (!IStart)
59110 return;
59111
59112 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59113 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59114 MachineBasicBlock::iterator MBBI = Entry->begin();
59115 for (const MCPhysReg *I = IStart; *I; ++I) {
59116 const TargetRegisterClass *RC = nullptr;
59117 if (X86::GR64RegClass.contains(*I))
59118 RC = &X86::GR64RegClass;
59119 else
59120 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59121
59122 Register NewVR = MRI->createVirtualRegister(RC);
59123 // Create copy from CSR to a virtual register.
59124 // FIXME: this currently does not emit CFI pseudo-instructions, it works
59125 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59126 // nounwind. If we want to generalize this later, we may need to emit
59127 // CFI pseudo-instructions.
59128 assert(
59129 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59130 "Function should be nounwind in insertCopiesSplitCSR!");
59131 Entry->addLiveIn(*I);
59132 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
59133 .addReg(*I);
59134
59135 // Insert the copy-back instructions right before the terminator.
59136 for (auto *Exit : Exits)
59137 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
59138 TII->get(TargetOpcode::COPY), *I)
59139 .addReg(NewVR);
59140 }
59141}
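// Taken together, initializeSplitCSR and insertCopiesSplitCSR implement the
// split-CSR scheme used for CXX_FAST_TLS functions: each callee-saved GR64 is
// copied into a virtual register at the entry block and copied back before
// every exit, so the register allocator, rather than the prologue/epilogue,
// decides where (and whether) the register is actually spilled.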
59142
59143bool X86TargetLowering::supportSwiftError() const {
59144 return Subtarget.is64Bit();
59145}
59146
59147MachineInstr *
59148X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59149 MachineBasicBlock::instr_iterator &MBBI,
59150 const TargetInstrInfo *TII) const {
59151 assert(MBBI->isCall() && MBBI->getCFIType() &&
59152 "Invalid call instruction for a KCFI check");
59153
59154 MachineFunction &MF = *MBB.getParent();
59155 // If the call target is a memory operand, unfold it and use R11 for the
59156 // call, so KCFI_CHECK won't have to recompute the address.
59157 switch (MBBI->getOpcode()) {
59158 case X86::CALL64m:
59159 case X86::CALL64m_NT:
59160 case X86::TAILJMPm64:
59161 case X86::TAILJMPm64_REX: {
59162 MachineBasicBlock::instr_iterator OrigCall = MBBI;
59163 SmallVector<MachineInstr *, 2> NewMIs;
59164 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59165 /*UnfoldStore=*/false, NewMIs))
59166 report_fatal_error("Failed to unfold memory operand for a KCFI check");
59167 for (auto *NewMI : NewMIs)
59168 MBBI = MBB.insert(OrigCall, NewMI);
59169 assert(MBBI->isCall() &&
59170 "Unexpected instruction after memory operand unfolding");
59171 if (OrigCall->shouldUpdateCallSiteInfo())
59172 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59173 MBBI->setCFIType(MF, OrigCall->getCFIType());
59174 OrigCall->eraseFromParent();
59175 break;
59176 }
59177 default:
59178 break;
59179 }
59180
59181 MachineOperand &Target = MBBI->getOperand(0);
59182 Register TargetReg;
59183 switch (MBBI->getOpcode()) {
59184 case X86::CALL64r:
59185 case X86::CALL64r_NT:
59186 case X86::TAILJMPr64:
59187 case X86::TAILJMPr64_REX:
59188 assert(Target.isReg() && "Unexpected target operand for an indirect call");
59189 Target.setIsRenamable(false);
59190 TargetReg = Target.getReg();
59191 break;
59192 case X86::CALL64pcrel32:
59193 case X86::TAILJMPd64:
59194 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59195 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59196 // 64-bit indirect thunk calls.
59197 assert(StringRef(Target.getSymbolName()).endswith("_r11") &&
59198 "Unexpected register for an indirect thunk call");
59199 TargetReg = X86::R11;
59200 break;
59201 default:
59202 llvm_unreachable("Unexpected CFI call opcode");
59203 break;
59204 }
59205
59206 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK))
59207 .addReg(TargetReg)
59208 .addImm(MBBI->getCFIType())
59209 .getInstr();
59210}
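// The KCFI_CHECK pseudo built here carries only the call-target register and
// the expected KCFI type id; it is expanded later (outside this function) into
// the actual compare-and-trap against the type hash emitted in front of the
// callee. This hook therefore just normalizes the call so its target lives in
// a register (R11 for unfolded memory calls and for indirect thunks).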
59211
59212/// Returns true if stack probing through a function call is requested.
59213bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59214 return !getStackProbeSymbolName(MF).empty();
59215}
59216
59217/// Returns true if stack probing through inline assembly is requested.
59218bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59219
59220 // No inline stack probes on Windows; it has its own mechanism.
59221 if (Subtarget.isOSWindows() ||
59222 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59223 return false;
59224
59225 // If the function specifically requests inline stack probes, emit them.
59226 if (MF.getFunction().hasFnAttribute("probe-stack"))
59227 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59228 "inline-asm";
59229
59230 return false;
59231}
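// For reference, a function requests inline probes via the IR attribute
// "probe-stack"="inline-asm", e.g. (hypothetical IR):
//   define void @f() #0 { ... }
//   attributes #0 = { "probe-stack"="inline-asm" }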
59232
59233/// Returns the name of the symbol used to emit stack probes or the empty
59234/// string if not applicable.
59235StringRef
59236X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
59237 // Inline stack probes disable the stack probe call.
59238 if (hasInlineStackProbe(MF))
59239 return "";
59240
59241 // If the function specifically requests stack probes, emit them.
59242 if (MF.getFunction().hasFnAttribute("probe-stack"))
59243 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59244
59245 // Generally, if we aren't on Windows, the platform ABI does not include
59246 // support for stack probes, so don't emit them.
59247 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59248 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59249 return "";
59250
59251 // We need a stack probe to conform to the Windows ABI. Choose the right
59252 // symbol.
59253 if (Subtarget.is64Bit())
59254 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59255 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59256}
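// Summary of the probe symbols chosen above:
//   64-bit Cygwin/MinGW  -> "___chkstk_ms"
//   64-bit other Windows -> "__chkstk"
//   32-bit Cygwin/MinGW  -> "_alloca"
//   32-bit other Windows -> "_chkstk"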
59257
59258unsigned
59259X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59260 // The default stack probe size is 4096 if the function has no stackprobesize
59261 // attribute.
59262 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59263 4096);
59264}
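// For example, a function carrying "stack-probe-size"="8192" is probed every
// 8192 bytes of stack growth, while functions without the attribute fall back
// to the 4096-byte default above.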
59265
59266Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59267 if (ML->isInnermost() &&
59268 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59269 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59270 return TargetLowering::getPrefLoopAlignment();
59271}