/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp

1

//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//

2

//

3

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

4

// See https://llvm.org/LICENSE.txt for license information.

5

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

6

//

7

//===----------------------------------------------------------------------===//

8

//

9

// This file defines the interfaces that X86 uses to lower LLVM code into a

10

// selection DAG.

11

//

12

//===----------------------------------------------------------------------===//

13

14

#include "X86ISelLowering.h"

15

#include "MCTargetDesc/X86ShuffleDecode.h"

16

#include "X86.h"

17

#include "X86CallingConv.h"

18

#include "X86FrameLowering.h"

19

#include "X86InstrBuilder.h"

20

#include "X86IntrinsicsInfo.h"

21

#include "X86MachineFunctionInfo.h"

22

#include "X86TargetMachine.h"

23

#include "X86TargetObjectFile.h"

24

#include "llvm/ADT/SmallBitVector.h"

25

#include "llvm/ADT/SmallSet.h"

26

#include "llvm/ADT/Statistic.h"

27

#include "llvm/ADT/StringExtras.h"

28

#include "llvm/ADT/StringSwitch.h"

29

#include "llvm/Analysis/BlockFrequencyInfo.h"

30

#include "llvm/Analysis/EHPersonalities.h"

31

#include "llvm/Analysis/ProfileSummaryInfo.h"

32

#include "llvm/Analysis/VectorUtils.h"

33

#include "llvm/CodeGen/IntrinsicLowering.h"

34

#include "llvm/CodeGen/MachineFrameInfo.h"

35

#include "llvm/CodeGen/MachineFunction.h"

36

#include "llvm/CodeGen/MachineInstrBuilder.h"

37

#include "llvm/CodeGen/MachineJumpTableInfo.h"

38

#include "llvm/CodeGen/MachineModuleInfo.h"

39

#include "llvm/CodeGen/MachineRegisterInfo.h"

40

#include "llvm/CodeGen/TargetLowering.h"

41

#include "llvm/CodeGen/WinEHFuncInfo.h"

42

#include "llvm/IR/CallingConv.h"

43

#include "llvm/IR/Constants.h"

44

#include "llvm/IR/DerivedTypes.h"

45

#include "llvm/IR/DiagnosticInfo.h"

46

#include "llvm/IR/Function.h"

47

#include "llvm/IR/GlobalAlias.h"

48

#include "llvm/IR/GlobalVariable.h"

49

#include "llvm/IR/Instructions.h"

50

#include "llvm/IR/Intrinsics.h"

51

#include "llvm/MC/MCAsmInfo.h"

52

#include "llvm/MC/MCContext.h"

53

#include "llvm/MC/MCExpr.h"

54

#include "llvm/MC/MCSymbol.h"

55

#include "llvm/Support/CommandLine.h"

56

#include "llvm/Support/Debug.h"

57

#include "llvm/Support/ErrorHandling.h"

58

#include "llvm/Support/KnownBits.h"

59

#include "llvm/Support/MathExtras.h"

60

#include "llvm/Target/TargetOptions.h"

61

#include <algorithm>

62

#include <bitset>

63

#include <cctype>

64

#include <numeric>

65

using namespace llvm;

66

67

// Category name this file uses for debug output and statistics.
#define DEBUG_TYPE "x86-isel"

68

69

// Statistic counter named "NumTailCalls", described as "Number of tail calls".
STATISTIC(NumTailCalls, "Number of tail calls");

70

71

/// Hidden integer option "x86-experimental-pref-loop-alignment".
/// The value is a log2 byte count and is initialized to 4.
static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc(
        // Adjacent literals are concatenated, so the separating space must be
        // part of one of them.
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "(the last x86-experimental-pref-loop-alignment bits"
        " of the loop header PC will be 0)."),
    cl::Hidden);

78

79

/// Hidden boolean option "mul-constant-optimization", initialized to true.
/// Its help text describes it as replacing 'mul x, Const' with more effective
/// instructions such as SHIFT or LEA.
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

84

85

/// Hidden boolean option "x86-experimental-unordered-atomic-isel",
/// initialized to false. Its help text describes it as using LoadSDNode and
/// StoreSDNode in place of AtomicSDNode for unordered atomic loads and stores.
static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);

91

92

/// Call this when the user attempts to do something unsupported, like

93

/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike

94

/// report_fatal_error, so calling code should attempt to recover without

95

/// crashing.

96

static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,

97

const char *Msg) {

98

MachineFunction &MF = DAG.getMachineFunction();

99

DAG.getContext()->diagnose(

100

DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));

101

}

102

103

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,

104

const X86Subtarget &STI)

105

: TargetLowering(TM), Subtarget(STI) {

106

bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();

107

X86ScalarSSEf64 = Subtarget.hasSSE2();

108

X86ScalarSSEf32 = Subtarget.hasSSE1();

109

MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

110

111

// Set up the TargetLowering object.

112

113

// X86 is weird. It always uses i8 for shift amounts and setcc results.

114

setBooleanContents(ZeroOrOneBooleanContent);

115

// X86-SSE is even stranger. It uses -1 or 0 for vector masks.

116

setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

117

118

// For 64-bit, since we have so many registers, use the ILP scheduler.

119

// For 32-bit, use the register pressure specific scheduling.

120

// For Atom, always use ILP scheduling.

121

if (Subtarget.isAtom())

122

setSchedulingPreference(Sched::ILP);

123

else if (Subtarget.is64Bit())

124

setSchedulingPreference(Sched::ILP);

125

else

126

setSchedulingPreference(Sched::RegPressure);

127

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

128

setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

129

130

// Bypass expensive divides and use cheaper ones.

131

if (TM.getOptLevel() >= CodeGenOpt::Default) {

132

if (Subtarget.hasSlowDivide32())

133

addBypassSlowDiv(32, 8);

134

if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())

135

addBypassSlowDiv(64, 32);

136

}

137

138

if (Subtarget.isTargetWindowsMSVC() ||

139

Subtarget.isTargetWindowsItanium()) {

140

// Setup Windows compiler runtime calls.

141

setLibcallName(RTLIB::SDIV_I64, "_alldiv");

142

setLibcallName(RTLIB::UDIV_I64, "_aulldiv");

143

setLibcallName(RTLIB::SREM_I64, "_allrem");

144

setLibcallName(RTLIB::UREM_I64, "_aullrem");

145

setLibcallName(RTLIB::MUL_I64, "_allmul");

146

setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);

147

setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);

148

setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);

149

setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);

150

setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

151

}

152

153

if (Subtarget.getTargetTriple().isOSMSVCRT()) {

154

// MSVCRT doesn't have powi; fall back to pow

155

setLibcallName(RTLIB::POWI_F32, nullptr);

156

setLibcallName(RTLIB::POWI_F64, nullptr);

157

}

158

159

// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to

160

// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.

161

// FIXME: Should we be limiting the atomic size on other configs? Default is

162

// 1024.

163

if (!Subtarget.hasCmpxchg8b())

164

setMaxAtomicSizeInBitsSupported(32);

165

166

// Set up the register classes.

167

addRegisterClass(MVT::i8, &X86::GR8RegClass);

168

addRegisterClass(MVT::i16, &X86::GR16RegClass);

169

addRegisterClass(MVT::i32, &X86::GR32RegClass);

170

if (Subtarget.is64Bit())

171

addRegisterClass(MVT::i64, &X86::GR64RegClass);

172

173

for (MVT VT : MVT::integer_valuetypes())

174

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

175

176

// We don't accept any truncstore of integer registers.

177

setTruncStoreAction(MVT::i64, MVT::i32, Expand);

178

setTruncStoreAction(MVT::i64, MVT::i16, Expand);

179

setTruncStoreAction(MVT::i64, MVT::i8 , Expand);

180

setTruncStoreAction(MVT::i32, MVT::i16, Expand);

181

setTruncStoreAction(MVT::i32, MVT::i8 , Expand);

182

setTruncStoreAction(MVT::i16, MVT::i8, Expand);

183

184

setTruncStoreAction(MVT::f64, MVT::f32, Expand);

185

186

// SETOEQ and SETUNE require checking two conditions.

187

for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {

188

setCondCodeAction(ISD::SETOEQ, VT, Expand);

189

setCondCodeAction(ISD::SETUNE, VT, Expand);

190

}

191

192

// Integer absolute.

193

if (Subtarget.hasCMov()) {

194

setOperationAction(ISD::ABS , MVT::i16 , Custom);

195

setOperationAction(ISD::ABS , MVT::i32 , Custom);

196

if (Subtarget.is64Bit())

197

setOperationAction(ISD::ABS , MVT::i64 , Custom);

198

}

199

200

// Funnel shifts.

201

for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {

202

// For slow shld targets we only lower for code size.

203

LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

204

205

setOperationAction(ShiftOp , MVT::i8 , Custom);

206

setOperationAction(ShiftOp , MVT::i16 , Custom);

207

setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);

208

if (Subtarget.is64Bit())

209

setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);

210

}

211

212

if (!Subtarget.useSoftFloat()) {

213

// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this

214

// operation.

215

setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);

216

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);

217

setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

218

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);

219

// We have an algorithm for SSE2, and we turn this into a 64-bit

220

// FILD or VCVTUSI2SS/SD for other targets.

221

setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);

222

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);

223

// We have an algorithm for SSE2->double, and we turn this into a

224

// 64-bit FILD followed by conditional FADD for other targets.

225

setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

226

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

227

228

// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have

229

// this operation.

230

setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

231

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);

232

// SSE has no i16 to fp conversion, only i32. We promote in the handler

233

// to allow f80 to use i16 and f64 to use i16 with sse1 only

234

setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);

235

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);

236

// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not

237

setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

238

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);

239

// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64

240

// are Legal, f80 is custom lowered.

241

setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

242

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

243

244

// Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have

245

// this operation.

246

setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

247

// FIXME: This doesn't generate invalid exception when it should. PR44019.

248

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);

249

setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);

250

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);

251

setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

252

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);

253

// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64

254

// are Legal, f80 is custom lowered.

255

setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);

256

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

257

258

// Handle FP_TO_UINT by promoting the destination to a larger signed

259

// conversion.

260

setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);

261

// FIXME: This doesn't generate invalid exception when it should. PR44019.

262

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);

263

setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

264

// FIXME: This doesn't generate invalid exception when it should. PR44019.

265

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);

266

setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

267

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);

268

setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

269

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

270

271

setOperationAction(ISD::LRINT, MVT::f32, Custom);

272

setOperationAction(ISD::LRINT, MVT::f64, Custom);

273

setOperationAction(ISD::LLRINT, MVT::f32, Custom);

274

setOperationAction(ISD::LLRINT, MVT::f64, Custom);

275

276

if (!Subtarget.is64Bit()) {

277

setOperationAction(ISD::LRINT, MVT::i64, Custom);

278

setOperationAction(ISD::LLRINT, MVT::i64, Custom);

279

}

280

}

281

282

// Handle address space casts between mixed sized pointers.

283

setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);

284

setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

285

286

// TODO: when we have SSE, these could be more efficient, by using movd/movq.

287

if (!X86ScalarSSEf64) {

288

setOperationAction(ISD::BITCAST , MVT::f32 , Expand);

289

setOperationAction(ISD::BITCAST , MVT::i32 , Expand);

290

if (Subtarget.is64Bit()) {

291

setOperationAction(ISD::BITCAST , MVT::f64 , Expand);

292

// Without SSE, i64->f64 goes through memory.

293

setOperationAction(ISD::BITCAST , MVT::i64 , Expand);

294

}

295

} else if (!Subtarget.is64Bit())

296

setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

297

298

// Scalar integer divide and remainder are lowered to use operations that

299

// produce two results, to match the available instructions. This exposes

300

// the two-result form to trivial CSE, which is able to combine x/y and x%y

301

// into a single instruction.

302

//

303

// Scalar integer multiply-high is also lowered to use two-result

304

// operations, to match the available instructions. However, plain multiply

305

// (low) operations are left as Legal, as there are single-result

306

// instructions for this in x86. Using the two-result multiply instructions

307

// when both high and low results are needed must be arranged by dagcombine.

308

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

309

setOperationAction(ISD::MULHS, VT, Expand);

310

setOperationAction(ISD::MULHU, VT, Expand);

311

setOperationAction(ISD::SDIV, VT, Expand);

312

setOperationAction(ISD::UDIV, VT, Expand);

313

setOperationAction(ISD::SREM, VT, Expand);

314

setOperationAction(ISD::UREM, VT, Expand);

315

}

316

317

setOperationAction(ISD::BR_JT , MVT::Other, Expand);

318

setOperationAction(ISD::BRCOND , MVT::Other, Custom);

319

for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,

320

MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

321

setOperationAction(ISD::BR_CC, VT, Expand);

322

setOperationAction(ISD::SELECT_CC, VT, Expand);

323

}

324

if (Subtarget.is64Bit())

325

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);

326

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);

327

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);

328

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);

329

330

setOperationAction(ISD::FREM , MVT::f32 , Expand);

331

setOperationAction(ISD::FREM , MVT::f64 , Expand);

332

setOperationAction(ISD::FREM , MVT::f80 , Expand);

333

setOperationAction(ISD::FREM , MVT::f128 , Expand);

334

setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

335

336

// Promote the i8 variants and force them on up to i32 which has a shorter

337

// encoding.

338

setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);

339

setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);

340

if (!Subtarget.hasBMI()) {

341

setOperationAction(ISD::CTTZ , MVT::i16 , Custom);

342

setOperationAction(ISD::CTTZ , MVT::i32 , Custom);

343

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);

344

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);

345

if (Subtarget.is64Bit()) {

346

setOperationAction(ISD::CTTZ , MVT::i64 , Custom);

347

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);

348

}

349

}

350

351

if (Subtarget.hasLZCNT()) {

352

// When promoting the i8 variants, force them to i32 for a shorter

353

// encoding.

354

setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);

355

setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);

356

} else {

357

for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {

358

if (VT == MVT::i64 && !Subtarget.is64Bit())

359

continue;

360

setOperationAction(ISD::CTLZ , VT, Custom);

361

setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);

362

}

363

}

364

365

for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,

366

ISD::STRICT_FP_TO_FP16}) {

367

// Special handling for half-precision floating point conversions.

368

// If we don't have F16C support, then lower half float conversions

369

// into library calls.

370

setOperationAction(

371

Op, MVT::f32,

372

(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);

373

// There's never any support for operations beyond MVT::f32.

374

setOperationAction(Op, MVT::f64, Expand);

375

setOperationAction(Op, MVT::f80, Expand);

376

setOperationAction(Op, MVT::f128, Expand);

377

}

378

379

setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);

380

setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);

381

setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);

382

setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);

383

setTruncStoreAction(MVT::f32, MVT::f16, Expand);

384

setTruncStoreAction(MVT::f64, MVT::f16, Expand);

385

setTruncStoreAction(MVT::f80, MVT::f16, Expand);

386

setTruncStoreAction(MVT::f128, MVT::f16, Expand);

387

388

setOperationAction(ISD::PARITY, MVT::i8, Custom);

389

if (Subtarget.hasPOPCNT()) {

390

setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);

391

} else {

392

setOperationAction(ISD::CTPOP , MVT::i8 , Expand);

393

setOperationAction(ISD::CTPOP , MVT::i16 , Expand);

394

setOperationAction(ISD::CTPOP , MVT::i32 , Expand);

395

if (Subtarget.is64Bit())

396

setOperationAction(ISD::CTPOP , MVT::i64 , Expand);

397

else

398

setOperationAction(ISD::CTPOP , MVT::i64 , Custom);

399

400

setOperationAction(ISD::PARITY, MVT::i16, Custom);

401

setOperationAction(ISD::PARITY, MVT::i32, Custom);

402

if (Subtarget.is64Bit())

403

setOperationAction(ISD::PARITY, MVT::i64, Custom);

404

}

405

406

setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

407

408

if (!Subtarget.hasMOVBE())

409

setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

410

411

// X86 wants to expand cmov itself.

412

for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {

413

setOperationAction(ISD::SELECT, VT, Custom);

414

setOperationAction(ISD::SETCC, VT, Custom);

415

setOperationAction(ISD::STRICT_FSETCC, VT, Custom);

416

setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);

417

}

418

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

419

if (VT == MVT::i64 && !Subtarget.is64Bit())

420

continue;

421

setOperationAction(ISD::SELECT, VT, Custom);

422

setOperationAction(ISD::SETCC, VT, Custom);

423

}

424

425

// Custom action for SELECT MMX and expand action for SELECT_CC MMX

426

setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);

427

setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

428

429

setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

430

// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since

431

// LLVM/Clang supports zero-cost DWARF and SEH exception handling.

432

setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);

433

setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

434

setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);

435

if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)

436

setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

437

438

// Darwin ABI issue.

439

for (auto VT : { MVT::i32, MVT::i64 }) {

440

if (VT == MVT::i64 && !Subtarget.is64Bit())

441

continue;

442

setOperationAction(ISD::ConstantPool , VT, Custom);

443

setOperationAction(ISD::JumpTable , VT, Custom);

444

setOperationAction(ISD::GlobalAddress , VT, Custom);

445

setOperationAction(ISD::GlobalTLSAddress, VT, Custom);

446

setOperationAction(ISD::ExternalSymbol , VT, Custom);

447

setOperationAction(ISD::BlockAddress , VT, Custom);

448

}

449

450

// 64-bit shl, sra, srl (iff 32-bit x86)

451

for (auto VT : { MVT::i32, MVT::i64 }) {

452

if (VT == MVT::i64 && !Subtarget.is64Bit())

453

continue;

454

setOperationAction(ISD::SHL_PARTS, VT, Custom);

455

setOperationAction(ISD::SRA_PARTS, VT, Custom);

456

setOperationAction(ISD::SRL_PARTS, VT, Custom);

457

}

458

459

if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())

460

setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

461

462

setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

463

464

// Expand certain atomics

465

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

466

setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);

467

setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);

468

setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);

469

setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);

470

setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);

471

setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);

472

setOperationAction(ISD::ATOMIC_STORE, VT, Custom);

473

}

474

475

if (!Subtarget.is64Bit())

476

setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

477

478

if (Subtarget.hasCmpxchg16b()) {

479

setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);

480

}

481

482

// FIXME - use subtarget debug flags

483

if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&

484

!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&

485

TM.Options.ExceptionModel != ExceptionHandling::SjLj) {

486

setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);

487

}

488

489

setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);

490

setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

491

492

setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);

493

setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

494

495

setOperationAction(ISD::TRAP, MVT::Other, Legal);

496

setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

497

498

// VASTART needs to be custom lowered to use the VarArgsFrameIndex

499

setOperationAction(ISD::VASTART , MVT::Other, Custom);

500

setOperationAction(ISD::VAEND , MVT::Other, Expand);

501

bool Is64Bit = Subtarget.is64Bit();

502

setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);

503

setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

504

505

setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);

506

setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

507

508

setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

509

510

// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.

511

setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);

512

setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

513

514

if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {

515

// f32 and f64 use SSE.

516

// Set up the FP register classes.

517

addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass

518

: &X86::FR32RegClass);

519

addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass

520

: &X86::FR64RegClass);

521

522

// Disable f32->f64 extload as we can only generate this in one instruction

523

// under optsize. So it's easier to pattern match (fpext (load)) for that

524

// case instead of needing to emit 2 instructions for extload in the

525

// non-optsize case.

526

setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

527

528

for (auto VT : { MVT::f32, MVT::f64 }) {

529

// Use ANDPD to simulate FABS.

530

setOperationAction(ISD::FABS, VT, Custom);

531

532

// Use XORP to simulate FNEG.

533

setOperationAction(ISD::FNEG, VT, Custom);

534

535

// Use ANDPD and ORPD to simulate FCOPYSIGN.

536

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

537

538

// These might be better off as horizontal vector ops.

539

setOperationAction(ISD::FADD, VT, Custom);

540

setOperationAction(ISD::FSUB, VT, Custom);

541

542

// We don't support sin/cos/fmod

543

setOperationAction(ISD::FSIN , VT, Expand);

544

setOperationAction(ISD::FCOS , VT, Expand);

545

setOperationAction(ISD::FSINCOS, VT, Expand);

546

}

547

548

// Lower this to MOVMSK plus an AND.

549

setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);

550

setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

551

552

} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&

553

(UseX87 || Is64Bit)) {

554

// Use SSE for f32, x87 for f64.

555

// Set up the FP register classes.

556

addRegisterClass(MVT::f32, &X86::FR32RegClass);

557

if (UseX87)

558

addRegisterClass(MVT::f64, &X86::RFP64RegClass);

559

560

// Use ANDPS to simulate FABS.

561

setOperationAction(ISD::FABS , MVT::f32, Custom);

562

563

// Use XORP to simulate FNEG.

564

setOperationAction(ISD::FNEG , MVT::f32, Custom);

565

566

if (UseX87)

567

setOperationAction(ISD::UNDEF, MVT::f64, Expand);

568

569

// Use ANDPS and ORPS to simulate FCOPYSIGN.

570

if (UseX87)

571

setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

572

setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

573

574

// We don't support sin/cos/fmod

575

setOperationAction(ISD::FSIN , MVT::f32, Expand);

576

setOperationAction(ISD::FCOS , MVT::f32, Expand);

577

setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

578

579

if (UseX87) {

580

// Always expand sin/cos functions even though x87 has an instruction.

581

setOperationAction(ISD::FSIN, MVT::f64, Expand);

582

setOperationAction(ISD::FCOS, MVT::f64, Expand);

583

setOperationAction(ISD::FSINCOS, MVT::f64, Expand);

584

}

585

} else if (UseX87) {

586

// f32 and f64 in x87.

587

// Set up the FP register classes.

588

addRegisterClass(MVT::f64, &X86::RFP64RegClass);

589

addRegisterClass(MVT::f32, &X86::RFP32RegClass);

590

591

for (auto VT : { MVT::f32, MVT::f64 }) {

592

setOperationAction(ISD::UNDEF, VT, Expand);

593

setOperationAction(ISD::FCOPYSIGN, VT, Expand);

594

595

// Always expand sin/cos functions even though x87 has an instruction.

596

setOperationAction(ISD::FSIN , VT, Expand);

597

setOperationAction(ISD::FCOS , VT, Expand);

598

setOperationAction(ISD::FSINCOS, VT, Expand);

599

}

600

}

601

602

// Expand FP32 immediates into loads from the stack, save special cases.

603

if (isTypeLegal(MVT::f32)) {

604

if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {

605

addLegalFPImmediate(APFloat(+0.0f)); // FLD0

606

addLegalFPImmediate(APFloat(+1.0f)); // FLD1

607

addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS

608

addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS

609

} else // SSE immediates.

610

addLegalFPImmediate(APFloat(+0.0f)); // xorps

611

}

612

// Expand FP64 immediates into loads from the stack, save special cases.

613

if (isTypeLegal(MVT::f64)) {

614

if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {

615

addLegalFPImmediate(APFloat(+0.0)); // FLD0

616

addLegalFPImmediate(APFloat(+1.0)); // FLD1

617

addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS

618

addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

619

} else // SSE immediates.

620

addLegalFPImmediate(APFloat(+0.0)); // xorpd

621

}

622

// Handle constrained floating-point operations of scalar.

623

setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);

624

setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);

625

setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);

626

setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);

627

setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);

628

setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);

629

setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);

630

setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);

631

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);

632

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);

633

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);

634

setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);

635

setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);

636

637

// We don't support FMA.

638

setOperationAction(ISD::FMA, MVT::f64, Expand);

639

setOperationAction(ISD::FMA, MVT::f32, Expand);

640

641

// f80 always uses X87.

642

if (UseX87) {

643

addRegisterClass(MVT::f80, &X86::RFP80RegClass);

644

setOperationAction(ISD::UNDEF, MVT::f80, Expand);

645

setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);

646

{

647

APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());

648

addLegalFPImmediate(TmpFlt); // FLD0

649

TmpFlt.changeSign();

650

addLegalFPImmediate(TmpFlt); // FLD0/FCHS

651

652

bool ignored;

653

APFloat TmpFlt2(+1.0);

654

TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,

655

&ignored);

656

addLegalFPImmediate(TmpFlt2); // FLD1

657

TmpFlt2.changeSign();

658

addLegalFPImmediate(TmpFlt2); // FLD1/FCHS

659

}

660

661

// Always expand sin/cos functions even though x87 has an instruction.

662

setOperationAction(ISD::FSIN , MVT::f80, Expand);

663

setOperationAction(ISD::FCOS , MVT::f80, Expand);

664

setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

665

666

setOperationAction(ISD::FFLOOR, MVT::f80, Expand);

667

setOperationAction(ISD::FCEIL, MVT::f80, Expand);

668

setOperationAction(ISD::FTRUNC, MVT::f80, Expand);

669

setOperationAction(ISD::FRINT, MVT::f80, Expand);

670

setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);

671

setOperationAction(ISD::FMA, MVT::f80, Expand);

672

setOperationAction(ISD::LROUND, MVT::f80, Expand);

673

setOperationAction(ISD::LLROUND, MVT::f80, Expand);

674

setOperationAction(ISD::LRINT, MVT::f80, Custom);

675

setOperationAction(ISD::LLRINT, MVT::f80, Custom);

676

677

// Handle constrained floating-point operations of scalar.

678

setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);

679

setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);

680

setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);

681

setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);

682

setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);

683

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);

684

// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten

685

// as Custom.

686

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);

687

}

688

689

// f128 uses xmm registers, but most operations require libcalls.

690

if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {

691

addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass

692

: &X86::VR128RegClass);

693

694

addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

695

696

setOperationAction(ISD::FADD, MVT::f128, LibCall);

697

setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);

698

setOperationAction(ISD::FSUB, MVT::f128, LibCall);

699

setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);

700

setOperationAction(ISD::FDIV, MVT::f128, LibCall);

701

setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);

702

setOperationAction(ISD::FMUL, MVT::f128, LibCall);

703

setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);

704

setOperationAction(ISD::FMA, MVT::f128, LibCall);

705

setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);

706

707

setOperationAction(ISD::FABS, MVT::f128, Custom);

708

setOperationAction(ISD::FNEG, MVT::f128, Custom);

709

setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

710

711

setOperationAction(ISD::FSIN, MVT::f128, LibCall);

712

setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);

713

setOperationAction(ISD::FCOS, MVT::f128, LibCall);

714

setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);

715

setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);

716

// No STRICT_FSINCOS

717

setOperationAction(ISD::FSQRT, MVT::f128, LibCall);

718

setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

719

720

setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

721

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);

722

// We need to custom handle any FP_ROUND with an f128 input, but

723

// LegalizeDAG uses the result type to know when to run a custom handler.

724

// So we have to list all legal floating point result types here.

725

if (isTypeLegal(MVT::f32)) {

726

setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);

727

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);

728

}

729

if (isTypeLegal(MVT::f64)) {

730

setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

731

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

732

}

733

if (isTypeLegal(MVT::f80)) {

734

setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);

735

setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);

736

}

737

738

setOperationAction(ISD::SETCC, MVT::f128, Custom);

739

740

setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);

741

setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);

742

setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);

743

setTruncStoreAction(MVT::f128, MVT::f32, Expand);

744

setTruncStoreAction(MVT::f128, MVT::f64, Expand);

745

setTruncStoreAction(MVT::f128, MVT::f80, Expand);

746

}

747

748

// Always use a library call for pow.

749

setOperationAction(ISD::FPOW , MVT::f32 , Expand);

750

setOperationAction(ISD::FPOW , MVT::f64 , Expand);

751

setOperationAction(ISD::FPOW , MVT::f80 , Expand);

752

setOperationAction(ISD::FPOW , MVT::f128 , Expand);

753

754

setOperationAction(ISD::FLOG, MVT::f80, Expand);

755

setOperationAction(ISD::FLOG2, MVT::f80, Expand);

756

setOperationAction(ISD::FLOG10, MVT::f80, Expand);

757

setOperationAction(ISD::FEXP, MVT::f80, Expand);

758

setOperationAction(ISD::FEXP2, MVT::f80, Expand);

759

setOperationAction(ISD::FMINNUM, MVT::f80, Expand);

760

setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

761

762

// Some FP actions are always expanded for vector types.

763

for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,

764

MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {

765

setOperationAction(ISD::FSIN, VT, Expand);

766

setOperationAction(ISD::FSINCOS, VT, Expand);

767

setOperationAction(ISD::FCOS, VT, Expand);

768

setOperationAction(ISD::FREM, VT, Expand);

769

setOperationAction(ISD::FCOPYSIGN, VT, Expand);

770

setOperationAction(ISD::FPOW, VT, Expand);

771

setOperationAction(ISD::FLOG, VT, Expand);

772

setOperationAction(ISD::FLOG2, VT, Expand);

773

setOperationAction(ISD::FLOG10, VT, Expand);

774

setOperationAction(ISD::FEXP, VT, Expand);

775

setOperationAction(ISD::FEXP2, VT, Expand);

776

}

777

778

// First set operation action for all vector types to either promote

779

// (for widening) or expand (for scalarization). Then we will selectively

780

// turn on ones that can be effectively codegen'd.

781

for (MVT VT : MVT::fixedlen_vector_valuetypes()) {

782

setOperationAction(ISD::SDIV, VT, Expand);

783

setOperationAction(ISD::UDIV, VT, Expand);

784

setOperationAction(ISD::SREM, VT, Expand);

785

setOperationAction(ISD::UREM, VT, Expand);

786

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);

787

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);

788

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);

789

setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);

790

setOperationAction(ISD::FMA, VT, Expand);

791

setOperationAction(ISD::FFLOOR, VT, Expand);

792

setOperationAction(ISD::FCEIL, VT, Expand);

793

setOperationAction(ISD::FTRUNC, VT, Expand);

794

setOperationAction(ISD::FRINT, VT, Expand);

795

setOperationAction(ISD::FNEARBYINT, VT, Expand);

796

setOperationAction(ISD::SMUL_LOHI, VT, Expand);

797

setOperationAction(ISD::MULHS, VT, Expand);

798

setOperationAction(ISD::UMUL_LOHI, VT, Expand);

799

setOperationAction(ISD::MULHU, VT, Expand);

800

setOperationAction(ISD::SDIVREM, VT, Expand);

801

setOperationAction(ISD::UDIVREM, VT, Expand);

802

setOperationAction(ISD::CTPOP, VT, Expand);

803

setOperationAction(ISD::CTTZ, VT, Expand);

804

setOperationAction(ISD::CTLZ, VT, Expand);

805

setOperationAction(ISD::ROTL, VT, Expand);

806

setOperationAction(ISD::ROTR, VT, Expand);

807

setOperationAction(ISD::BSWAP, VT, Expand);

808

setOperationAction(ISD::SETCC, VT, Expand);

809

setOperationAction(ISD::FP_TO_UINT, VT, Expand);

810

setOperationAction(ISD::FP_TO_SINT, VT, Expand);

811

setOperationAction(ISD::UINT_TO_FP, VT, Expand);

812

setOperationAction(ISD::SINT_TO_FP, VT, Expand);

813

setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);

814

setOperationAction(ISD::TRUNCATE, VT, Expand);

815

setOperationAction(ISD::SIGN_EXTEND, VT, Expand);

816

setOperationAction(ISD::ZERO_EXTEND, VT, Expand);

817

setOperationAction(ISD::ANY_EXTEND, VT, Expand);

818

setOperationAction(ISD::SELECT_CC, VT, Expand);

819

for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {

820

setTruncStoreAction(InnerVT, VT, Expand);

821

822

setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);

823

setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

824

825

// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like

826

// types, we have to deal with them whether we ask for Expansion or not.

827

// Setting Expand causes its own optimisation problems though, so leave

828

// them legal.

829

if (VT.getVectorElementType() == MVT::i1)

830

setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

831

832

// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are

833

// split/scalarized right now.

834

if (VT.getVectorElementType() == MVT::f16)

835

setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

836

}

837

}

838

839

// FIXME: In order to prevent SSE instructions being expanded to MMX ones

840

// with -msoft-float, disable use of MMX as well.

841

if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {

842

addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);

843

// No operations on x86mmx supported, everything uses intrinsics.

844

}

845

846

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {

847

addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass

848

: &X86::VR128RegClass);

849

850

setOperationAction(ISD::FNEG, MVT::v4f32, Custom);

851

setOperationAction(ISD::FABS, MVT::v4f32, Custom);

852

setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);

853

setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

854

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);

855

setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);

856

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

857

setOperationAction(ISD::SELECT, MVT::v4f32, Custom);

858

859

setOperationAction(ISD::LOAD, MVT::v2f32, Custom);

860

setOperationAction(ISD::STORE, MVT::v2f32, Custom);

861

862

setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);

863

setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);

864

setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);

865

setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);

866

setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);

867

}

868

869

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {

870

addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass

871

: &X86::VR128RegClass);

872

873

// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM

874

// registers cannot be used even for integer operations.

875

addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass

876

: &X86::VR128RegClass);

877

addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass

878

: &X86::VR128RegClass);

879

addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass

880

: &X86::VR128RegClass);

881

addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass

882

: &X86::VR128RegClass);

883

884

for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,

885

MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {

886

setOperationAction(ISD::SDIV, VT, Custom);

887

setOperationAction(ISD::SREM, VT, Custom);

888

setOperationAction(ISD::UDIV, VT, Custom);

889

setOperationAction(ISD::UREM, VT, Custom);

890

}

891

892

setOperationAction(ISD::MUL, MVT::v2i8, Custom);

893

setOperationAction(ISD::MUL, MVT::v4i8, Custom);

894

setOperationAction(ISD::MUL, MVT::v8i8, Custom);

895

896

setOperationAction(ISD::MUL, MVT::v16i8, Custom);

897

setOperationAction(ISD::MUL, MVT::v4i32, Custom);

898

setOperationAction(ISD::MUL, MVT::v2i64, Custom);

899

setOperationAction(ISD::MULHU, MVT::v4i32, Custom);

900

setOperationAction(ISD::MULHS, MVT::v4i32, Custom);

901

setOperationAction(ISD::MULHU, MVT::v16i8, Custom);

902

setOperationAction(ISD::MULHS, MVT::v16i8, Custom);

903

setOperationAction(ISD::MULHU, MVT::v8i16, Legal);

904

setOperationAction(ISD::MULHS, MVT::v8i16, Legal);

905

setOperationAction(ISD::MUL, MVT::v8i16, Legal);

906

setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

907

setOperationAction(ISD::FABS, MVT::v2f64, Custom);

908

setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

909

910

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

911

setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);

912

setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);

913

setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);

914

setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);

915

}

916

917

setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);

918

setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);

919

setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);

920

setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);

921

setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);

922

setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);

923

setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);

924

setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);

925

setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);

926

setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);

927

setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);

928

setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);

929

930

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);

931

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);

932

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

933

934

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

935

setOperationAction(ISD::SETCC, VT, Custom);

936

setOperationAction(ISD::STRICT_FSETCC, VT, Custom);

937

setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);

938

setOperationAction(ISD::CTPOP, VT, Custom);

939

setOperationAction(ISD::ABS, VT, Custom);

940

941

// The condition codes aren't legal in SSE/AVX and under AVX512 we use

942

// setcc all the way to isel and prefer SETGT in some isel patterns.

943

setCondCodeAction(ISD::SETLT, VT, Custom);

944

setCondCodeAction(ISD::SETLE, VT, Custom);

945

}

946

947

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {

948

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

949

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

950

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

951

setOperationAction(ISD::VSELECT, VT, Custom);

952

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

953

}

954

955

for (auto VT : { MVT::v2f64, MVT::v2i64 }) {

956

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

957

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

958

setOperationAction(ISD::VSELECT, VT, Custom);

959

960

if (VT == MVT::v2i64 && !Subtarget.is64Bit())

961

continue;

962

963

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

964

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

965

}

966

967

// Custom lower selects for v2f64 and the 128-bit integer vector types.

968

setOperationAction(ISD::SELECT, MVT::v2f64, Custom);

969

setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

970

setOperationAction(ISD::SELECT, MVT::v4i32, Custom);

971

setOperationAction(ISD::SELECT, MVT::v8i16, Custom);

972

setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

973

974

setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);

975

setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

976

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);

977

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);

978

979

// Custom legalize these to avoid over promotion or custom promotion.

980

for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {

981

setOperationAction(ISD::FP_TO_SINT, VT, Custom);

982

setOperationAction(ISD::FP_TO_UINT, VT, Custom);

983

setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);

984

setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);

985

}

986

987

setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

988

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);

989

setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

990

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);

991

992

setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

993

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);

994

995

setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

996

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);

997

998

// Fast v2f32 SINT_TO_FP/UINT_TO_FP( v2i32 ) custom conversion.

999

setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);

1000

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);

1001

setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

1002

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);

1003

1004

setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);

1005

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);

1006

setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

1007

setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);

1008

1009

// We want to legalize this to an f64 load rather than an i64 load on

1010

// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for

1011

// store.

1012

setOperationAction(ISD::LOAD, MVT::v2i32, Custom);

1013

setOperationAction(ISD::LOAD, MVT::v4i16, Custom);

1014

setOperationAction(ISD::LOAD, MVT::v8i8, Custom);

1015

setOperationAction(ISD::STORE, MVT::v2i32, Custom);

1016

setOperationAction(ISD::STORE, MVT::v4i16, Custom);

1017

setOperationAction(ISD::STORE, MVT::v8i8, Custom);

1018

1019

setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);

1020

setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);

1021

setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

1022

if (!Subtarget.hasAVX512())

1023

setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

1024

1025

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);

1026

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);

1027

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

1028

1029

setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

1030

1031

setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);

1032

setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);

1033

setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);

1034

setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);

1035

setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);

1036

setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);

1037

1038

// In the customized shift lowering, the legal v4i32/v2i64 cases

1039

// in AVX2 will be recognized.

1040

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

1041

setOperationAction(ISD::SRL, VT, Custom);

1042

setOperationAction(ISD::SHL, VT, Custom);

1043

setOperationAction(ISD::SRA, VT, Custom);

1044

}

1045

1046

setOperationAction(ISD::ROTL, MVT::v4i32, Custom);

1047

setOperationAction(ISD::ROTL, MVT::v8i16, Custom);

1048

1049

// With 512-bit registers or AVX512VL+BW, expanding (and promoting the

1050

// shifts) is better.

1051

if (!Subtarget.useAVX512Regs() &&

1052

!(Subtarget.hasBWI() && Subtarget.hasVLX()))

1053

setOperationAction(ISD::ROTL, MVT::v16i8, Custom);

1054

1055

setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);

1056

setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);

1057

setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);

1058

setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);

1059

setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);

1060

}

1061

1062

if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {

1063

setOperationAction(ISD::ABS, MVT::v16i8, Legal);

1064

setOperationAction(ISD::ABS, MVT::v8i16, Legal);

1065

setOperationAction(ISD::ABS, MVT::v4i32, Legal);

1066

setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);

1067

setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);

1068

setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);

1069

setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);

1070

setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

1071

1072

// These might be better off as horizontal vector ops.

1073

setOperationAction(ISD::ADD, MVT::i16, Custom);

1074

setOperationAction(ISD::ADD, MVT::i32, Custom);

1075

setOperationAction(ISD::SUB, MVT::i16, Custom);

1076

setOperationAction(ISD::SUB, MVT::i32, Custom);

1077

}

1078

1079

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {

1080

for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {

1081

setOperationAction(ISD::FFLOOR, RoundedTy, Legal);

1082

setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);

1083

setOperationAction(ISD::FCEIL, RoundedTy, Legal);

1084

setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);

1085

setOperationAction(ISD::FTRUNC, RoundedTy, Legal);

1086

setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);

1087

setOperationAction(ISD::FRINT, RoundedTy, Legal);

1088

setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);

1089

setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);

1090

setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);

1091

setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);

1092

setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);

1093

1094

setOperationAction(ISD::FROUND, RoundedTy, Custom);

1095

}

1096

1097

setOperationAction(ISD::SMAX, MVT::v16i8, Legal);

1098

setOperationAction(ISD::SMAX, MVT::v4i32, Legal);

1099

setOperationAction(ISD::UMAX, MVT::v8i16, Legal);

1100

setOperationAction(ISD::UMAX, MVT::v4i32, Legal);

1101

setOperationAction(ISD::SMIN, MVT::v16i8, Legal);

1102

setOperationAction(ISD::SMIN, MVT::v4i32, Legal);

1103

setOperationAction(ISD::UMIN, MVT::v8i16, Legal);

1104

setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

1105

1106

// FIXME: Do we need to handle scalar-to-vector here?

1107

setOperationAction(ISD::MUL, MVT::v4i32, Legal);

1108

1109

// We directly match byte blends in the backend as they match the VSELECT

1110

// condition form.

1111

setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

1112

1113

// SSE41 brings specific instructions for doing vector sign extend even in

1114

// cases where we don't have SRA.

1115

for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

1116

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);

1117

setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);

1118

}

1119

1120

// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X

1121

for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {

1122

setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);

1123

setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);

1124

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);

1125

setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);

1126

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);

1127

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);

1128

}

1129

1130

// i8 vectors are custom because the source register and source

1131

// memory operand types are not the same width.

1132

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);

1133

1134

if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {

1135

// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can

1136

// do the pre and post work in the vector domain.

1137

setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);

1138

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);

1139

// We need to mark SINT_TO_FP as Custom even though we want to expand it

1140

// so that DAG combine doesn't try to turn it into uint_to_fp.

1141

setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);

1142

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);

1143

}

1144

}

1145

1146

if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {

1147

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

1148

MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })

1149

setOperationAction(ISD::ROTL, VT, Custom);

1150

1151

// XOP can efficiently perform BITREVERSE with VPPERM.

1152

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })

1153

setOperationAction(ISD::BITREVERSE, VT, Custom);

1154

1155

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

1156

MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })

1157

setOperationAction(ISD::BITREVERSE, VT, Custom);

1158

}

1159

1160

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {

1161

bool HasInt256 = Subtarget.hasInt256();

1162

1163

addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass

1164

: &X86::VR256RegClass);

1165

addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass

1166

: &X86::VR256RegClass);

1167

addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass

1168

: &X86::VR256RegClass);

1169

addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass

1170

: &X86::VR256RegClass);

1171

addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass

1172

: &X86::VR256RegClass);

1173

addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass

1174

: &X86::VR256RegClass);

1175

1176

for (auto VT : { MVT::v8f32, MVT::v4f64 }) {

1177

setOperationAction(ISD::FFLOOR, VT, Legal);

1178

setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);

1179

setOperationAction(ISD::FCEIL, VT, Legal);

1180

setOperationAction(ISD::STRICT_FCEIL, VT, Legal);

1181

setOperationAction(ISD::FTRUNC, VT, Legal);

1182

setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);

1183

setOperationAction(ISD::FRINT, VT, Legal);

1184

setOperationAction(ISD::STRICT_FRINT, VT, Legal);

1185

setOperationAction(ISD::FNEARBYINT, VT, Legal);

1186

setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

1187

setOperationAction(ISD::FROUNDEVEN, VT, Legal);

1188

setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);

1189

1190

setOperationAction(ISD::FROUND, VT, Custom);

1191

1192

setOperationAction(ISD::FNEG, VT, Custom);

1193

setOperationAction(ISD::FABS, VT, Custom);

1194

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

1195

}

1196

1197

// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted

1198

// even though v8i16 is a legal type.

1199

setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);

1200

setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);

1201

setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);

1202

setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);

1203

setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

1204

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);

1205

1206

setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);

1207

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);

1208

1209

setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);

1210

setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);

1211

setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);

1212

setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);

1213

setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);

1214

setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);

1215

setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);

1216

setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);

1217

setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);

1218

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);

1219

setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);

1220

setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);

1221

1222

if (!Subtarget.hasAVX512())

1223

setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

1224

1225

// In the customized shift lowering, the legal v8i32/v4i64 cases

1226

// in AVX2 will be recognized.

1227

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1228

setOperationAction(ISD::SRL, VT, Custom);

1229

setOperationAction(ISD::SHL, VT, Custom);

1230

setOperationAction(ISD::SRA, VT, Custom);

1231

}

1232

1233

// These types need custom splitting if their input is a 128-bit vector.

1234

setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

1235

setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);

1236

setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);

1237

setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);

1238

1239

setOperationAction(ISD::ROTL, MVT::v8i32, Custom);

1240

setOperationAction(ISD::ROTL, MVT::v16i16, Custom);

1241

1242

// With BWI, expanding (and promoting the shifts) is better.

1243

if (!Subtarget.useBWIRegs())

1244

setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

1245

1246

setOperationAction(ISD::SELECT, MVT::v4f64, Custom);

1247

setOperationAction(ISD::SELECT, MVT::v4i64, Custom);

1248

setOperationAction(ISD::SELECT, MVT::v8i32, Custom);

1249

setOperationAction(ISD::SELECT, MVT::v16i16, Custom);

1250

setOperationAction(ISD::SELECT, MVT::v32i8, Custom);

1251

setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

1252

1253

for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1254

setOperationAction(ISD::SIGN_EXTEND, VT, Custom);

1255

setOperationAction(ISD::ZERO_EXTEND, VT, Custom);

1256

setOperationAction(ISD::ANY_EXTEND, VT, Custom);

1257

}

1258

1259

setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);

1260

setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);

1261

setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

1262

setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

1263

1264

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1265

setOperationAction(ISD::SETCC, VT, Custom);

1266

setOperationAction(ISD::STRICT_FSETCC, VT, Custom);

1267

setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);

1268

setOperationAction(ISD::CTPOP, VT, Custom);

1269

setOperationAction(ISD::CTLZ, VT, Custom);

1270

1271

// The condition codes aren't legal in SSE/AVX and under AVX512 we use

1272

// setcc all the way to isel and prefer SETGT in some isel patterns.

1273

setCondCodeAction(ISD::SETLT, VT, Custom);

1274

setCondCodeAction(ISD::SETLE, VT, Custom);

1275

}

1276

1277

if (Subtarget.hasAnyFMA()) {

1278

for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,

1279

MVT::v2f64, MVT::v4f64 }) {

1280

setOperationAction(ISD::FMA, VT, Legal);

1281

setOperationAction(ISD::STRICT_FMA, VT, Legal);

1282

}

1283

}

1284

1285

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1286

setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);

1287

setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);

1288

}

1289

1290

setOperationAction(ISD::MUL, MVT::v4i64, Custom);

1291

setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);

1292

setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);

1293

setOperationAction(ISD::MUL, MVT::v32i8, Custom);

1294

1295

setOperationAction(ISD::MULHU, MVT::v8i32, Custom);

1296

setOperationAction(ISD::MULHS, MVT::v8i32, Custom);

1297

setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);

1298

setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);

1299

setOperationAction(ISD::MULHU, MVT::v32i8, Custom);

1300

setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

1301

1302

setOperationAction(ISD::ABS, MVT::v4i64, Custom);

1303

setOperationAction(ISD::SMAX, MVT::v4i64, Custom);

1304

setOperationAction(ISD::UMAX, MVT::v4i64, Custom);

1305

setOperationAction(ISD::SMIN, MVT::v4i64, Custom);

1306

setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

1307

1308

setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);

1309

setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);

1310

setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);

1311

setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);

1312

setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

1313

setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

1314

setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

1315

setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

1316

1317

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {

1318

setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);

1319

setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);

1320

setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);

1321

setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);

1322

setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);

1323

}

1324

1325

for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {

1326

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);

1327

setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);

1328

}

1329

1330

if (HasInt256) {

1331

// The custom lowering for UINT_TO_FP for v8i32 becomes interesting

1332

// when we have a 256bit-wide blend with immediate.

1333

setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

1334

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);

1335

1336

// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X

1337

for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {

1338

setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);

1339

setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);

1340

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);

1341

setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);

1342

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);

1343

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);

1344

}

1345

}

1346

1347

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1348

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {

1349

setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);

1350

setOperationAction(ISD::MSTORE, VT, Legal);

1351

}

1352

1353

// Extract subvector is special because the value type

1354

// (result) is 128-bit but the source is 256-bit wide.

1355

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

1356

MVT::v4f32, MVT::v2f64 }) {

1357

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

1358

}

1359

1360

// Custom lower several nodes for 256-bit types.

1361

for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,

1362

MVT::v8f32, MVT::v4f64 }) {

1363

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1364

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1365

setOperationAction(ISD::VSELECT, VT, Custom);

1366

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1367

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1368

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

1369

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

1370

setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);

1371

setOperationAction(ISD::STORE, VT, Custom);

1372

}

1373

1374

if (HasInt256) {

1375

setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

1376

1377

// Custom legalize 2x32 to get a little better code.

1378

setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);

1379

setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

1380

1381

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1382

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })

1383

setOperationAction(ISD::MGATHER, VT, Custom);

1384

}

1385

}

1386

1387

// This block controls legalization of the mask vector sizes that are

1388

// available with AVX512. 512-bit vectors are in a separate block controlled

1389

// by useAVX512Regs.

1390

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {

1391

addRegisterClass(MVT::v1i1, &X86::VK1RegClass);

1392

addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

1393

addRegisterClass(MVT::v4i1, &X86::VK4RegClass);

1394

addRegisterClass(MVT::v8i1, &X86::VK8RegClass);

1395

addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

1396

1397

setOperationAction(ISD::SELECT, MVT::v1i1, Custom);

1398

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);

1399

setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

1400

1401

setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);

1402

setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);

1403

setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);

1404

setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);

1405

setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);

1406

setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);

1407

setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);

1408

setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);

1409

setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);

1410

setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

1411

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);

1412

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);

1413

1414

// There is no byte sized k-register load or store without AVX512DQ.

1415

if (!Subtarget.hasDQI()) {

1416

setOperationAction(ISD::LOAD, MVT::v1i1, Custom);

1417

setOperationAction(ISD::LOAD, MVT::v2i1, Custom);

1418

setOperationAction(ISD::LOAD, MVT::v4i1, Custom);

1419

setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

1420

1421

setOperationAction(ISD::STORE, MVT::v1i1, Custom);

1422

setOperationAction(ISD::STORE, MVT::v2i1, Custom);

1423

setOperationAction(ISD::STORE, MVT::v4i1, Custom);

1424

setOperationAction(ISD::STORE, MVT::v8i1, Custom);

1425

}

1426

1427

// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.

1428

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

1429

setOperationAction(ISD::SIGN_EXTEND, VT, Custom);

1430

setOperationAction(ISD::ZERO_EXTEND, VT, Custom);

1431

setOperationAction(ISD::ANY_EXTEND, VT, Custom);

1432

}

1433

1434

for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {

1435

setOperationAction(ISD::ADD, VT, Custom);

1436

setOperationAction(ISD::SUB, VT, Custom);

1437

setOperationAction(ISD::MUL, VT, Custom);

1438

setOperationAction(ISD::UADDSAT, VT, Custom);

1439

setOperationAction(ISD::SADDSAT, VT, Custom);

1440

setOperationAction(ISD::USUBSAT, VT, Custom);

1441

setOperationAction(ISD::SSUBSAT, VT, Custom);

1442

setOperationAction(ISD::VSELECT, VT, Expand);

1443

}

1444

1445

for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {

1446

setOperationAction(ISD::SETCC, VT, Custom);

1447

setOperationAction(ISD::STRICT_FSETCC, VT, Custom);

1448

setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);

1449

setOperationAction(ISD::SELECT, VT, Custom);

1450

setOperationAction(ISD::TRUNCATE, VT, Custom);

1451

1452

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1453

setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);

1454

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1455

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

1456

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1457

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1458

}

1459

1460

for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })

1461

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

1462

}

1463

1464

// This block controls legalization for 512-bit operations with 32/64 bit

1465

// elements. 512-bits can be disabled based on prefer-vector-width and

1466

// required-vector-width function attributes.

1467

if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {

1468

bool HasBWI = Subtarget.hasBWI();

1469

1470

addRegisterClass(MVT::v16i32, &X86::VR512RegClass);

1471

addRegisterClass(MVT::v16f32, &X86::VR512RegClass);

1472

addRegisterClass(MVT::v8i64, &X86::VR512RegClass);

1473

addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

1474

addRegisterClass(MVT::v32i16, &X86::VR512RegClass);

1475

addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

1476

1477

for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {

1478

setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);

1479

setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);

1480

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);

1481

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);

1482

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);

1483

if (HasBWI)

1484

setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);

1485

}

1486

1487

for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {

1488

setOperationAction(ISD::FNEG, VT, Custom);

1489

setOperationAction(ISD::FABS, VT, Custom);

1490

setOperationAction(ISD::FMA, VT, Legal);

1491

setOperationAction(ISD::STRICT_FMA, VT, Legal);

1492

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

1493

}

1494

1495

for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {

1496

setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);

1497

setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);

1498

setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);

1499

setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);

1500

}

1501

setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);

1502

setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);

1503

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);

1504

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);

1505

setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);

1506

setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

1507

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);

1508

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);

1509

1510

setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);

1511

setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);

1512

setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);

1513

setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);

1514

setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);

1515

setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);

1516

setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);

1517

setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);

1518

setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);

1519

setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);

1520

setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);

1521

setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);

1522

1523

setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);

1524

setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);

1525

setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);

1526

setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);

1527

setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

1528

if (HasBWI)

1529

setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

1530

1531

// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE

1532

// to 512-bit rather than use the AVX2 instructions so that we can use

1533

// k-masks.

1534

if (!Subtarget.hasVLX()) {

1535

for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1536

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {

1537

setOperationAction(ISD::MLOAD, VT, Custom);

1538

setOperationAction(ISD::MSTORE, VT, Custom);

1539

}

1540

}

1541

1542

setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);

1543

setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);

1544

setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);

1545

setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);

1546

setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);

1547

setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);

1548

setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);

1549

setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);

1550

setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);

1551

setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);

1552

setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);

1553

setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);

1554

setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

1555

1556

if (HasBWI) {

1557

// Extends from v64i1 masks to 512-bit vectors.

1558

setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);

1559

setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);

1560

setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

1561

}

1562

1563

for (auto VT : { MVT::v16f32, MVT::v8f64 }) {

1564

setOperationAction(ISD::FFLOOR, VT, Legal);

1565

setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);

1566

setOperationAction(ISD::FCEIL, VT, Legal);

1567

setOperationAction(ISD::STRICT_FCEIL, VT, Legal);

1568

setOperationAction(ISD::FTRUNC, VT, Legal);

1569

setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);

1570

setOperationAction(ISD::FRINT, VT, Legal);

1571

setOperationAction(ISD::STRICT_FRINT, VT, Legal);

1572

setOperationAction(ISD::FNEARBYINT, VT, Legal);

1573

setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

1574

setOperationAction(ISD::FROUNDEVEN, VT, Legal);

1575

setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);

1576

1577

setOperationAction(ISD::FROUND, VT, Custom);

1578

}

1579

1580

for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {

1581

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);

1582

setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);

1583

}

1584

1585

setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);

1586

setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);

1587

setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);

1588

setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);

1589

1590

setOperationAction(ISD::MUL, MVT::v8i64, Custom);

1591

setOperationAction(ISD::MUL, MVT::v16i32, Legal);

1592

setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);

1593

setOperationAction(ISD::MUL, MVT::v64i8, Custom);

1594

1595

setOperationAction(ISD::MULHU, MVT::v16i32, Custom);

1596

setOperationAction(ISD::MULHS, MVT::v16i32, Custom);

1597

setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);

1598

setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);

1599

setOperationAction(ISD::MULHS, MVT::v64i8, Custom);

1600

setOperationAction(ISD::MULHU, MVT::v64i8, Custom);

1601

1602

setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

1603

1604

for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {

1605

setOperationAction(ISD::SRL, VT, Custom);

1606

setOperationAction(ISD::SHL, VT, Custom);

1607

setOperationAction(ISD::SRA, VT, Custom);

1608

setOperationAction(ISD::SETCC, VT, Custom);

1609

1610

// The condition codes aren't legal in SSE/AVX and under AVX512 we use

1611

// setcc all the way to isel and prefer SETGT in some isel patterns.

1612

setCondCodeAction(ISD::SETLT, VT, Custom);

1613

setCondCodeAction(ISD::SETLE, VT, Custom);

1614

}

1615

for (auto VT : { MVT::v16i32, MVT::v8i64 }) {

1616

setOperationAction(ISD::SMAX, VT, Legal);

1617

setOperationAction(ISD::UMAX, VT, Legal);

1618

setOperationAction(ISD::SMIN, VT, Legal);

1619

setOperationAction(ISD::UMIN, VT, Legal);

1620

setOperationAction(ISD::ABS, VT, Legal);

1621

setOperationAction(ISD::CTPOP, VT, Custom);

1622

setOperationAction(ISD::ROTL, VT, Custom);

1623

setOperationAction(ISD::ROTR, VT, Custom);

1624

setOperationAction(ISD::STRICT_FSETCC, VT, Custom);

1625

setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);

1626

}

1627

1628

for (auto VT : { MVT::v64i8, MVT::v32i16 }) {

1629

setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);

1630

setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);

1631

setOperationAction(ISD::CTLZ, VT, Custom);

1632

setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);

1633

setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);

1634

setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);

1635

setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);

1636

setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);

1637

setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);

1638

setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);

1639

setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);

1640

}

1641

1642

if (Subtarget.hasDQI()) {

1643

setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);

1644

setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);

1645

setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);

1646

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);

1647

setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);

1648

setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);

1649

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);

1650

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);

1651

1652

setOperationAction(ISD::MUL, MVT::v8i64, Legal);

1653

}

1654

1655

if (Subtarget.hasCDI()) {

1656

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1657

for (auto VT : { MVT::v16i32, MVT::v8i64} ) {

1658

setOperationAction(ISD::CTLZ, VT, Legal);

1659

}

1660

} // Subtarget.hasCDI()

1661

1662

if (Subtarget.hasVPOPCNTDQ()) {

1663

for (auto VT : { MVT::v16i32, MVT::v8i64 })

1664

setOperationAction(ISD::CTPOP, VT, Legal);

1665

}

1666

1667

// Extract subvector is special because the value type

1668

// (result) is 256-bit but the source is 512-bit wide.

1669

// 128-bit was made Legal under AVX1.

1670

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,

1671

MVT::v8f32, MVT::v4f64 })

1672

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

1673

1674

for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,

1675

MVT::v16f32, MVT::v8f64 }) {

1676

setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);

1677

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

1678

setOperationAction(ISD::SELECT, VT, Custom);

1679

setOperationAction(ISD::VSELECT, VT, Custom);

1680

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1681

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1682

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1683

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

1684

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1685

}

1686

1687

for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {

1688

setOperationAction(ISD::MLOAD, VT, Legal);

1689

setOperationAction(ISD::MSTORE, VT, Legal);

1690

setOperationAction(ISD::MGATHER, VT, Custom);

1691

setOperationAction(ISD::MSCATTER, VT, Custom);

1692

}

1693

if (HasBWI) {

1694

for (auto VT : { MVT::v64i8, MVT::v32i16 }) {

1695

setOperationAction(ISD::MLOAD, VT, Legal);

1696

setOperationAction(ISD::MSTORE, VT, Legal);

1697

}

1698

} else {

1699

setOperationAction(ISD::STORE, MVT::v32i16, Custom);

1700

setOperationAction(ISD::STORE, MVT::v64i8, Custom);

1701

}

1702

1703

if (Subtarget.hasVBMI2()) {

1704

for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {

1705

setOperationAction(ISD::FSHL, VT, Custom);

1706

setOperationAction(ISD::FSHR, VT, Custom);

1707

}

1708

}

1709

}// useAVX512Regs

1710

1711

// This block controls legalization for operations that don't have

1712

// pre-AVX512 equivalents. Without VLX we use 512-bit operations for

1713

// narrower widths.

1714

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {

1715

// These operations are handled on non-VLX by artificially widening in

1716

// isel patterns.

1717

1718

setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,

1719

Subtarget.hasVLX() ? Legal : Custom);

1720

setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,

1721

Subtarget.hasVLX() ? Legal : Custom);

1722

setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);

1723

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,

1724

Subtarget.hasVLX() ? Legal : Custom);

1725

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,

1726

Subtarget.hasVLX() ? Legal : Custom);

1727

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);

1728

setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,

1729

Subtarget.hasVLX() ? Legal : Custom);

1730

setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,

1731

Subtarget.hasVLX() ? Legal : Custom);

1732

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,

1733

Subtarget.hasVLX() ? Legal : Custom);

1734

setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,

1735

Subtarget.hasVLX() ? Legal : Custom);

1736

1737

if (Subtarget.hasDQI()) {

1738

// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.

1739

// v2f32 UINT_TO_FP is already custom under SSE2.

1740

assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&((isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom
(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"
) ? static_cast<void> (0) : __assert_fail ("isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && \"Unexpected operation action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 1742, __PRETTY_FUNCTION__))

1741

isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&((isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom
(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"
) ? static_cast<void> (0) : __assert_fail ("isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && \"Unexpected operation action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 1742, __PRETTY_FUNCTION__))

1742

"Unexpected operation action!")((isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom
(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"
) ? static_cast<void> (0) : __assert_fail ("isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && \"Unexpected operation action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 1742, __PRETTY_FUNCTION__));

1743

// v2i64 FP_TO_S/UINT(v2f32) custom conversion.

1744

setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);

1745

setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);

1746

setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);

1747

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);

1748

}

1749

1750

for (auto VT : { MVT::v2i64, MVT::v4i64 }) {

1751

setOperationAction(ISD::SMAX, VT, Legal);

1752

setOperationAction(ISD::UMAX, VT, Legal);

1753

setOperationAction(ISD::SMIN, VT, Legal);

1754

setOperationAction(ISD::UMIN, VT, Legal);

1755

setOperationAction(ISD::ABS, VT, Legal);

1756

}

1757

1758

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {

1759

setOperationAction(ISD::ROTL, VT, Custom);

1760

setOperationAction(ISD::ROTR, VT, Custom);

1761

}

1762

1763

// Custom legalize 2x32 to get a little better code.

1764

setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);

1765

setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);

1766

1767

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1768

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })

1769

setOperationAction(ISD::MSCATTER, VT, Custom);

1770

1771

if (Subtarget.hasDQI()) {

1772

for (auto VT : { MVT::v2i64, MVT::v4i64 }) {

1773

setOperationAction(ISD::SINT_TO_FP, VT,

1774

Subtarget.hasVLX() ? Legal : Custom);

1775

setOperationAction(ISD::UINT_TO_FP, VT,

1776

Subtarget.hasVLX() ? Legal : Custom);

1777

setOperationAction(ISD::STRICT_SINT_TO_FP, VT,

1778

Subtarget.hasVLX() ? Legal : Custom);

1779

setOperationAction(ISD::STRICT_UINT_TO_FP, VT,

1780

Subtarget.hasVLX() ? Legal : Custom);

1781

setOperationAction(ISD::FP_TO_SINT, VT,

1782

Subtarget.hasVLX() ? Legal : Custom);

1783

setOperationAction(ISD::FP_TO_UINT, VT,

1784

Subtarget.hasVLX() ? Legal : Custom);

1785

setOperationAction(ISD::STRICT_FP_TO_SINT, VT,

1786

Subtarget.hasVLX() ? Legal : Custom);

1787

setOperationAction(ISD::STRICT_FP_TO_UINT, VT,

1788

Subtarget.hasVLX() ? Legal : Custom);

1789

setOperationAction(ISD::MUL, VT, Legal);

1790

}

1791

}

1792

1793

if (Subtarget.hasCDI()) {

1794

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {

1795

setOperationAction(ISD::CTLZ, VT, Legal);

1796

}

1797

} // Subtarget.hasCDI()

1798

1799

if (Subtarget.hasVPOPCNTDQ()) {

1800

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })

1801

setOperationAction(ISD::CTPOP, VT, Legal);

1802

}

1803

}

1804

1805

// This block controls legalization of v32i1/v64i1 which are available with

1806

// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with

1807

// useBWIRegs.

1808

if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {

1809

addRegisterClass(MVT::v32i1, &X86::VK32RegClass);

1810

addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

1811

1812

for (auto VT : { MVT::v32i1, MVT::v64i1 }) {

1813

setOperationAction(ISD::ADD, VT, Custom);

1814

setOperationAction(ISD::SUB, VT, Custom);

1815

setOperationAction(ISD::MUL, VT, Custom);

1816

setOperationAction(ISD::VSELECT, VT, Expand);

1817

setOperationAction(ISD::UADDSAT, VT, Custom);

1818

setOperationAction(ISD::SADDSAT, VT, Custom);

1819

setOperationAction(ISD::USUBSAT, VT, Custom);

1820

setOperationAction(ISD::SSUBSAT, VT, Custom);

1821

1822

setOperationAction(ISD::TRUNCATE, VT, Custom);

1823

setOperationAction(ISD::SETCC, VT, Custom);

1824

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1825

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1826

setOperationAction(ISD::SELECT, VT, Custom);

1827

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1828

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1829

setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);

1830

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

1831

}

1832

1833

for (auto VT : { MVT::v16i1, MVT::v32i1 })

1834

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

1835

1836

// Extends from v32i1 masks to 256-bit vectors.

1837

setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);

1838

setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);

1839

setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);

1840

1841

for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {

1842

setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);

1843

setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);

1844

}

1845

1846

// These operations are handled on non-VLX by artificially widening in

1847

// isel patterns.

1848

// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

1849

1850

if (Subtarget.hasBITALG()) {

1851

for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })

1852

setOperationAction(ISD::CTPOP, VT, Legal);

1853

}

1854

}

1855

1856

if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {

1857

setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);

1858

setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);

1859

setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);

1860

setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);

1861

setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

1862

1863

setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);

1864

setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);

1865

setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);

1866

setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);

1867

setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

1868

1869

if (Subtarget.hasBWI()) {

1870

setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);

1871

setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

1872

}

1873

1874

if (Subtarget.hasVBMI2()) {

1875

// TODO: Make these legal even without VLX?

1876

for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,

1877

MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1878

setOperationAction(ISD::FSHL, VT, Custom);

1879

setOperationAction(ISD::FSHR, VT, Custom);

1880

}

1881

}

1882

1883

setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);

1884

setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);

1885

setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);

1886

}

1887

1888

// We want to custom lower some of our intrinsics.

1889

setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

1890

setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

1891

setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

1892

if (!Subtarget.is64Bit()) {

1893

setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);

1894

}

1895

1896

// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't

1897

// handle type legalization for these operations here.

1898

//

1899

// FIXME: We really should do custom legalization for addition and

1900

// subtraction on x86-32 once PR3203 is fixed. We really can't do much better

1901

// than generic legalization for 64-bit multiplication-with-overflow, though.

1902

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

1903

if (VT == MVT::i64 && !Subtarget.is64Bit())

1904

continue;

1905

// Add/Sub/Mul with overflow operations are custom lowered.

1906

setOperationAction(ISD::SADDO, VT, Custom);

1907

setOperationAction(ISD::UADDO, VT, Custom);

1908

setOperationAction(ISD::SSUBO, VT, Custom);

1909

setOperationAction(ISD::USUBO, VT, Custom);

1910

setOperationAction(ISD::SMULO, VT, Custom);

1911

setOperationAction(ISD::UMULO, VT, Custom);

1912

1913

// Support carry in as value rather than glue.

1914

setOperationAction(ISD::ADDCARRY, VT, Custom);

1915

setOperationAction(ISD::SUBCARRY, VT, Custom);

1916

setOperationAction(ISD::SETCCCARRY, VT, Custom);

1917

}

1918

1919

if (!Subtarget.is64Bit()) {

1920

// These libcalls are not available in 32-bit.

1921

setLibcallName(RTLIB::SHL_I128, nullptr);

1922

setLibcallName(RTLIB::SRL_I128, nullptr);

1923

setLibcallName(RTLIB::SRA_I128, nullptr);

1924

setLibcallName(RTLIB::MUL_I128, nullptr);

1925

}

1926

1927

// Combine sin / cos into _sincos_stret if it is available.

1928

if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&

1929

getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {

1930

setOperationAction(ISD::FSINCOS, MVT::f64, Custom);

1931

setOperationAction(ISD::FSINCOS, MVT::f32, Custom);

1932

}

1933

1934

if (Subtarget.isTargetWin64()) {

1935

setOperationAction(ISD::SDIV, MVT::i128, Custom);

1936

setOperationAction(ISD::UDIV, MVT::i128, Custom);

1937

setOperationAction(ISD::SREM, MVT::i128, Custom);

1938

setOperationAction(ISD::UREM, MVT::i128, Custom);

1939

}

1940

1941

// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`

1942

// is. We should promote the value to 64-bits to solve this.

1943

// This is what the CRT headers do - `fmodf` is an inline header

1944

// function casting to f64 and calling `fmod`.

1945

if (Subtarget.is32Bit() &&

1946

(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))

1947

for (ISD::NodeType Op :

1948

{ISD::FCEIL, ISD::STRICT_FCEIL,

1949

ISD::FCOS, ISD::STRICT_FCOS,

1950

ISD::FEXP, ISD::STRICT_FEXP,

1951

ISD::FFLOOR, ISD::STRICT_FFLOOR,

1952

ISD::FREM, ISD::STRICT_FREM,

1953

ISD::FLOG, ISD::STRICT_FLOG,

1954

ISD::FLOG10, ISD::STRICT_FLOG10,

1955

ISD::FPOW, ISD::STRICT_FPOW,

1956

ISD::FSIN, ISD::STRICT_FSIN})

1957

if (isOperationExpand(Op, MVT::f32))

1958

setOperationAction(Op, MVT::f32, Promote);

1959

1960

// We have target-specific dag combine patterns for the following nodes:

1961

setTargetDAGCombine(ISD::VECTOR_SHUFFLE);

1962

setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);

1963

setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

1964

setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

1965

setTargetDAGCombine(ISD::CONCAT_VECTORS);

1966

setTargetDAGCombine(ISD::INSERT_SUBVECTOR);

1967

setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);

1968

setTargetDAGCombine(ISD::BITCAST);

1969

setTargetDAGCombine(ISD::VSELECT);

1970

setTargetDAGCombine(ISD::SELECT);

1971

setTargetDAGCombine(ISD::SHL);

1972

setTargetDAGCombine(ISD::SRA);

1973

setTargetDAGCombine(ISD::SRL);

1974

setTargetDAGCombine(ISD::OR);

1975

setTargetDAGCombine(ISD::AND);

1976

setTargetDAGCombine(ISD::ADD);

1977

setTargetDAGCombine(ISD::FADD);

1978

setTargetDAGCombine(ISD::FSUB);

1979

setTargetDAGCombine(ISD::FNEG);

1980

setTargetDAGCombine(ISD::FMA);

1981

setTargetDAGCombine(ISD::STRICT_FMA);

1982

setTargetDAGCombine(ISD::FMINNUM);

1983

setTargetDAGCombine(ISD::FMAXNUM);

1984

setTargetDAGCombine(ISD::SUB);

1985

setTargetDAGCombine(ISD::LOAD);

1986

setTargetDAGCombine(ISD::MLOAD);

1987

setTargetDAGCombine(ISD::STORE);

1988

setTargetDAGCombine(ISD::MSTORE);

1989

setTargetDAGCombine(ISD::TRUNCATE);

1990

setTargetDAGCombine(ISD::ZERO_EXTEND);

1991

setTargetDAGCombine(ISD::ANY_EXTEND);

1992

setTargetDAGCombine(ISD::SIGN_EXTEND);

1993

setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);

1994

setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);

1995

setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);

1996

setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);

1997

setTargetDAGCombine(ISD::SINT_TO_FP);

1998

setTargetDAGCombine(ISD::UINT_TO_FP);

1999

setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);

2000

setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);

2001

setTargetDAGCombine(ISD::SETCC);

2002

setTargetDAGCombine(ISD::MUL);

2003

setTargetDAGCombine(ISD::XOR);

2004

setTargetDAGCombine(ISD::MSCATTER);

2005

setTargetDAGCombine(ISD::MGATHER);

2006

setTargetDAGCombine(ISD::FP16_TO_FP);

2007

setTargetDAGCombine(ISD::FP_EXTEND);

2008

setTargetDAGCombine(ISD::STRICT_FP_EXTEND);

2009

setTargetDAGCombine(ISD::FP_ROUND);

2010

2011

computeRegisterProperties(Subtarget.getRegisterInfo());

2012

2013

MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores

2014

MaxStoresPerMemsetOptSize = 8;

2015

MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores

2016

MaxStoresPerMemcpyOptSize = 4;

2017

MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores

2018

MaxStoresPerMemmoveOptSize = 4;

2019

2020

// TODO: These control memcmp expansion in CGP and could be raised higher, but

2021

// that needs to be benchmarked and balanced with the potential use of vector

2022

// load/store types (PR33329, PR33914).

2023

MaxLoadsPerMemcmp = 2;

2024

MaxLoadsPerMemcmpOptSize = 2;

2025

2026

// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).

2027

setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));

2028

2029

// An out-of-order CPU can speculatively execute past a predictable branch,

2030

// but a conditional move could be stalled by an expensive earlier operation.

2031

PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();

2032

EnableExtLdPromotion = true;

2033

setPrefFunctionAlignment(Align(16));

2034

2035

verifyIntrinsicTables();

2036

2037

// Default to having -disable-strictnode-mutation on

2038

IsStrictFPEnabled = true;

2039

}

2040

2041

// This has so far only been implemented for 64-bit MachO.

2042

bool X86TargetLowering::useLoadStackGuardNode() const {

2043

return Subtarget.isTargetMachO() && Subtarget.is64Bit();

2044

}

2045

2046

bool X86TargetLowering::useStackGuardXorFP() const {

2047

// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.

2048

return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();

2049

}

2050

2051

/// Wrap \p Val in the XOR*_FP machine node used for the stack guard:
/// XOR64_FP on 64-bit subtargets, XOR32_FP otherwise.
///
/// \returns result #0 of the new node, typed as the target's pointer type.
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  const unsigned Opcode = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  const EVT PointerVT = getPointerTy(DAG.getDataLayout());
  return SDValue(DAG.getMachineNode(Opcode, DL, PointerVT, Val), 0);
}

2058

2059

/// Pick the legalization strategy for the vector type \p VT:
///  - v32i1 / v64i1 are split when AVX512 is available without BWI;
///  - vectors with more than one element and a non-i1 element type are
///    widened;
///  - everything else defers to the target-independent default.
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
  const bool IsWideMask = VT == MVT::v32i1 || VT == MVT::v64i1;
  if (IsWideMask && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return TypeSplitVector;

  // Single-element vectors and i1 vectors keep the generic behaviour.
  if (VT.getVectorNumElements() == 1 || VT.getVectorElementType() == MVT::i1)
    return TargetLoweringBase::getPreferredVectorAction(VT);

  return TypeWidenVector;
}

2071

2072

static std::pair<MVT, unsigned>

2073

handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,

2074

const X86Subtarget &Subtarget) {

2075

// v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling

2076

// convention is one that uses k registers.

2077

if (NumElts == 2)

2078

return {MVT::v2i64, 1};

2079

if (NumElts == 4)

2080

return {MVT::v4i32, 1};

2081

if (NumElts == 8 && CC != CallingConv::X86_RegCall &&

2082

CC != CallingConv::Intel_OCL_BI)

2083

return {MVT::v8i16, 1};

2084

if (NumElts == 16 && CC != CallingConv::X86_RegCall &&

2085

CC != CallingConv::Intel_OCL_BI)

2086

return {MVT::v16i8, 1};

2087

// v32i1 passes in ymm unless we have BWI and the calling convention is

2088

// regcall.

2089

if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))

2090

return {MVT::v32i8, 1};

2091

// Split v64i1 vectors if we don't have v64i8 available.

2092

if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {

2093

if (Subtarget.useAVX512Regs())

2094

return {MVT::v64i8, 1};

2095

return {MVT::v32i8, 2};

2096

}

2097

2098

// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.

2099

if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||

2100

NumElts > 64)

2101

return {MVT::i8, NumElts};

2102

2103

return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};

2104

}

2105

2106

/// Return the register type used to pass \p VT under calling convention
/// \p CC. vXi1 vectors on AVX512 subtargets are first offered to
/// handleMaskRegisterForCallingConv; whenever that helper yields no valid
/// type, and for every other value type, the generic implementation decides.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  const bool IsAVX512Mask = VT.isVector() &&
                            VT.getVectorElementType() == MVT::i1 &&
                            Subtarget.hasAVX512();
  if (IsAVX512Mask) {
    const std::pair<MVT, unsigned> MaskInfo = handleMaskRegisterForCallingConv(
        VT.getVectorNumElements(), CC, Subtarget);
    if (MaskInfo.first != MVT::INVALID_SIMPLE_VALUE_TYPE)
      return MaskInfo.first;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

2123

2124

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,

2125

CallingConv::ID CC,

2126

EVT VT) const {

2127

if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&

2128

Subtarget.hasAVX512()) {

2129

unsigned NumElts = VT.getVectorNumElements();

2130

2131

MVT RegisterVT;

2132

unsigned NumRegisters;

2133

std::tie(RegisterVT, NumRegisters) =

2134

handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);

2135

if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)

2136

return NumRegisters;

2137

}

2138

2139

return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

2140

}

2141

2142

unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(

2143

LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,

2144

unsigned &NumIntermediates, MVT &RegisterVT) const {

2145

// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.

2146

if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&

2147

Subtarget.hasAVX512() &&

2148

(!isPowerOf2_32(VT.getVectorNumElements()) ||

2149

(VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||

2150

VT.getVectorNumElements() > 64)) {

2151

RegisterVT = MVT::i8;

2152

IntermediateVT = MVT::i1;

2153

NumIntermediates = VT.getVectorNumElements();

2154

return NumIntermediates;

2155

}

2156

2157

// Split v64i1 vectors if we don't have v64i8 available.

2158

if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&

2159

CC != CallingConv::X86_RegCall) {

2160

RegisterVT = MVT::v32i8;

2161

IntermediateVT = MVT::v32i1;

2162

NumIntermediates = 2;

2163

return 2;

2164

}

2165

2166

return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,

2167

NumIntermediates, RegisterVT);

2168

}

2169

2170

/// Return the type produced by a SETCC whose operands have type \p VT.
///
/// Scalars compare to i8. Vector compares yield a vXi1 mask (same element
/// count as \p VT) when AVX512 is available and the legalized operand type
/// supports mask compares; otherwise the result is \p VT with its elements
/// changed to integers.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext &Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (!Subtarget.hasAVX512())
    return VT.changeVectorElementTypeToInteger();

  const unsigned NumElts = VT.getVectorNumElements();
  // Built lazily so the mask type is only created when it is actually used.
  auto MakeMaskVT = [&Context, NumElts] {
    return EVT::getVectorVT(Context, MVT::i1, NumElts);
  };

  // Figure out what this type will be legalized to.
  EVT LegalVT = VT;
  while (getTypeAction(Context, LegalVT) != TypeLegal)
    LegalVT = getTypeToTransformTo(Context, LegalVT);

  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
  if (LegalVT.getSimpleVT().is512BitVector())
    return MakeMaskVT();

  if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
    // If we legalized to less than a 512-bit vector, then we will use a vXi1
    // compare for vXi32/vXi64 for sure. If we have BWI we will also support
    // vXi16/vXi8.
    const MVT ScalarVT = LegalVT.getSimpleVT().getVectorElementType();
    if (Subtarget.hasBWI() || ScalarVT.getSizeInBits() >= 32)
      return MakeMaskVT();
  }

  return VT.changeVectorElementTypeToInteger();
}

2200

2201

/// Helper for getByValTypeAlignment to determine

2202

/// the desired ByVal argument alignment.

2203

static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {

2204

if (MaxAlign == 16)

2205

return;

2206

if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {

2207

if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)

2208

MaxAlign = Align(16);

2209

} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {

2210

Align EltAlign;

2211

getMaxByValAlign(ATy->getElementType(), EltAlign);

2212

if (EltAlign > MaxAlign)

2213

MaxAlign = EltAlign;

2214

} else if (StructType *STy = dyn_cast<StructType>(Ty)) {

2215

for (auto *EltTy : STy->elements()) {

2216

Align EltAlign;

2217

getMaxByValAlign(EltTy, EltAlign);

2218

if (EltAlign > MaxAlign)

2219

MaxAlign = EltAlign;

2220

if (MaxAlign == 16)

2221

break;

2222

}

2223

}

2224

}

2225

2226

/// Return the desired alignment for ByVal aggregate

2227

/// function arguments in the caller parameter area. For X86, aggregates

2228

/// that contain SSE vectors are placed at 16-byte boundaries while the rest

2229

/// are at 4-byte boundaries.

2230

unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,

2231

const DataLayout &DL) const {

2232

if (Subtarget.is64Bit()) {

2233

// Max of 8 and alignment of type.

2234

Align TyAlign = DL.getABITypeAlign(Ty);

2235

if (TyAlign > 8)

2236

return TyAlign.value();

2237

return 8;

2238

}

2239

2240

Align Alignment(4);

2241

if (Subtarget.hasSSE1())

2242

getMaxByValAlign(Ty, Alignment);

2243

return Alignment.value();

2244

}

2245

2246

/// It returns EVT::Other if the type should be determined using generic

2247

/// target-independent logic.

2248

/// For vector ops we check that the overall size isn't larger than our

2249

/// preferred vector width.

2250

EVT X86TargetLowering::getOptimalMemOpType(

2251

const MemOp &Op, const AttributeList &FuncAttributes) const {

2252

if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {

2253

if (Op.size() >= 16 &&

2254

(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {

2255

// FIXME: Check if unaligned 64-byte accesses are slow.

2256

if (Op.size() >= 64 && Subtarget.hasAVX512() &&

2257

(Subtarget.getPreferVectorWidth() >= 512)) {

2258

return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;

2259

}

2260

// FIXME: Check if unaligned 32-byte accesses are slow.

2261

if (Op.size() >= 32 && Subtarget.hasAVX() &&

2262

(Subtarget.getPreferVectorWidth() >= 256)) {

2263

// Although this isn't a well-supported type for AVX1, we'll let

2264

// legalization and shuffle lowering produce the optimal codegen. If we

2265

// choose an optimal type with a vector element larger than a byte,

2266

// getMemsetStores() may create an intermediate splat (using an integer

2267

// multiply) before we splat as a vector.

2268

return MVT::v32i8;

2269

}

2270

if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))

2271

return MVT::v16i8;

2272

// TODO: Can SSE1 handle a byte vector?

2273

// If we have SSE1 registers we should be able to use them.

2274

if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&

2275

(Subtarget.getPreferVectorWidth() >= 128))

2276

return MVT::v4f32;

2277

} else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&

2278

Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {

2279

// Do not use f64 to lower memcpy if source is string constant. It's

2280

// better to use i32 to avoid the loads.

2281

// Also, do not use f64 to lower memset unless this is a memset of zeros.

2282

// The gymnastics of splatting a byte value into an XMM register and then

2283

// only using 8-byte stores (because this is a CPU with slow unaligned

2284

// 16-byte accesses) makes that a loser.

2285

return MVT::f64;

2286

}

2287

}

2288

// This is a compromise. If we reach here, unaligned accesses may be slow on

2289

// this target. However, creating smaller, aligned accesses could be even

2290

// slower and would certainly be a lot more code.

2291

if (Subtarget.is64Bit() && Op.size() >= 8)

2292

return MVT::i64;

2293

return MVT::i32;

2294

}

2295

2296

/// Report whether \p VT may be used as a memory-operation type. f32 and f64
/// are allowed only when the matching X86ScalarSSEf32 / X86ScalarSSEf64 flag
/// is set; every other type is accepted.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  switch (VT.SimpleTy) {
  case MVT::f32:
    return X86ScalarSSEf32;
  case MVT::f64:
    return X86ScalarSSEf64;
  default:
    return true;
  }
}

2303

2304

bool X86TargetLowering::allowsMisalignedMemoryAccesses(

2305

EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,

2306

bool *Fast) const {

2307

if (Fast) {

2308

switch (VT.getSizeInBits()) {

2309

default:

2310

// 8-byte and under are always assumed to be fast.

2311

*Fast = true;

2312

break;

2313

case 128:

2314

*Fast = !Subtarget.isUnalignedMem16Slow();

2315

break;

2316

case 256:

2317

*Fast = !Subtarget.isUnalignedMem32Slow();

2318

break;

2319

// TODO: What about AVX-512 (512-bit) accesses?

2320

}

2321

}

2322

// NonTemporal vector memory ops must be aligned.

2323

if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {

2324

// NT loads can only be vector aligned, so if its less aligned than the

2325

// minimum vector size (which we can split the vector down to), we might as

2326

// well use a regular unaligned vector load.

2327

// We don't have any NT loads pre-SSE41.

2328

if (!!(Flags & MachineMemOperand::MOLoad))

2329

return (Align < 16 || !Subtarget.hasSSE41());

2330

return false;

2331

}

2332

// Misaligned accesses of any size are always allowed.

2333

return true;

2334

}

2335

2336

/// Return the entry encoding for a jump table in the

2337

/// current function. The returned value is a member of the

2338

/// MachineJumpTableInfo::JTEntryKind enum.

2339

unsigned X86TargetLowering::getJumpTableEncoding() const {

2340

// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF

2341

// symbol.

2342

if (isPositionIndependent() && Subtarget.isPICStyleGOT())

2343

return MachineJumpTableInfo::EK_Custom32;

2344

2345

// Otherwise, use the normal jump table encoding heuristics.

2346

return TargetLowering::getJumpTableEncoding();

2347

}

2348

2349

bool X86TargetLowering::useSoftFloat() const {

2350

return Subtarget.useSoftFloat();

2351

}

2352

2353

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,

2354

ArgListTy &Args) const {

2355

2356

// Only relabel X86-32 for C / Stdcall CCs.

2357

if (Subtarget.is64Bit())

2358

return;

2359

if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)

2360

return;

2361

unsigned ParamRegs = 0;

2362

if (auto *M = MF->getFunction().getParent())

2363

ParamRegs = M->getNumberRegisterParameters();

2364

2365

// Mark the first N int arguments as having reg

2366

for (unsigned Idx = 0; Idx < Args.size(); Idx++) {

2367

Type *T = Args[Idx].Ty;

2368

if (T->isIntOrPtrTy())

2369

if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {

2370

unsigned numRegs = 1;

2371

if (MF->getDataLayout().getTypeAllocSize(T) > 4)

2372

numRegs = 2;

2373

if (ParamRegs < numRegs)

2374

return;

2375

ParamRegs -= numRegs;

2376

Args[Idx].IsInReg = true;

2377

}

2378

}

2379

}

2380

2381

/// Produce the expression for one custom-encoded jump table entry: a
/// reference to the symbol of \p MBB using the @GOTOFF variant kind.
///
/// \p MJTI and \p uid are not used by this implementation. Must only be
/// called when the function is position independent and the subtarget uses
/// GOT-style PIC, which is asserted.
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT() &&
         "Custom jump table entries are only expected for GOT-style PIC");
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

2391

2392

/// Returns relocation base for the given PIC jumptable.

2393

SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,

2394

SelectionDAG &DAG) const {

2395

if (!Subtarget.is64Bit())

2396

// This doesn't have SDLoc associated with it, but is not really the

2397

// same as a Register.

2398

return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),

2399

getPointerTy(DAG.getDataLayout()));

2400

return Table;

2401

}

2402

2403

/// This returns the relocation base for the given PIC jumptable,

2404

/// the same as getPICJumpTableRelocBase, but as an MCExpr.

2405

const MCExpr *X86TargetLowering::

2406

getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,

2407

MCContext &Ctx) const {

2408

// X86-64 uses RIP relative addressing based on the jump table label.

2409

if (Subtarget.isPICStyleRIPRel())

2410

return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

2411

2412

// Otherwise, the reference is relative to the PIC base.

2413

return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

2414

}

2415

2416

std::pair<const TargetRegisterClass *, uint8_t>

2417

X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,

2418

MVT VT) const {

2419

const TargetRegisterClass *RRC = nullptr;

2420

uint8_t Cost = 1;

2421

switch (VT.SimpleTy) {

2422

default:

2423

return TargetLowering::findRepresentativeClass(TRI, VT);

2424

case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:

2425

RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;

2426

break;

2427

case MVT::x86mmx:

2428

RRC = &X86::VR64RegClass;

2429

break;

2430

case MVT::f32: case MVT::f64:

2431

case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:

2432

case MVT::v4f32: case MVT::v2f64:

2433

case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:

2434

case MVT::v8f32: case MVT::v4f64:

2435

case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:

2436

case MVT::v16f32: case MVT::v8f64:

2437

RRC = &X86::VR128XRegClass;

2438

break;

2439

}

2440

return std::make_pair(RRC, Cost);

2441

}

2442

2443

unsigned X86TargetLowering::getAddressSpace() const {

2444

if (Subtarget.is64Bit())

2445

return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;

2446

return 256;

2447

}

2448

2449

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {

2450

return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||

2451

(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));

2452

}

2453

2454

/// Build a constant pointer whose address is \p Offset: the i32 constant
/// \p Offset converted with inttoptr to a pointer, in address space
/// \p AddressSpace, to an i8* value.
static Constant *SegmentOffset(IRBuilder<> &IRB, unsigned Offset,
                               unsigned AddressSpace) {
  LLVMContext &Ctx = IRB.getContext();
  Constant *OffsetVal = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
  return ConstantExpr::getIntToPtr(OffsetVal, SlotPtrTy);
}

2460

2461

/// Return the IR value that addresses the stack guard.
///
/// glibc, bionic, and Fuchsia have a special slot for the stack guard in
/// tcbhead_t; use it instead of the usual global variable (see
/// sysdeps/{i386,x86_64}/nptl/tls.h). Every other target uses the generic
/// implementation.
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return TargetLowering::getIRStackGuard(IRB);

  unsigned Offset;
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    Offset = 0x10;
  } else {
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    Offset = Subtarget.is64Bit() ? 0x28 : 0x14;
  }
  return SegmentOffset(IRB, Offset, getAddressSpace());
}

2479

2480

/// Insert into \p M the declarations the stack protector needs.
///
/// For the Windows MSVC and Windows Itanium environments this declares the
/// `__security_cookie` global and the `__security_check_cookie` function.
/// Targets with a dedicated TLS stack-guard slot need no declarations; all
/// others use the generic implementation.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();

  // MSVC CRT provides functionalities for stack protection.
  if (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()) {
    LLVMContext &Ctx = M.getContext();

    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee CheckCookie =
        M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(Ctx),
                              Type::getInt8PtrTy(Ctx));
    if (Function *CheckFn = dyn_cast<Function>(CheckCookie.getCallee())) {
      CheckFn->setCallingConv(CallingConv::X86_FastCall);
      CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (!hasStackGuardSlotTLS(TT))
    TargetLowering::insertSSPDeclarations(M);
}

2503

2504

/// Return the stack guard value for \p M: the `__security_cookie` global in
/// the Windows MSVC and Windows Itanium environments, otherwise whatever the
/// generic implementation provides.
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();
  const bool UsesMSVCCookie =
      TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment();
  if (!UsesMSVCCookie)
    return TargetLowering::getSDagStackGuard(M);

  // MSVC CRT has a global variable holding security cookie.
  return M.getGlobalVariable("__security_cookie");
}

2512

2513

/// Return the function that validates the stack guard for \p M: the
/// `__security_check_cookie` function in the Windows MSVC and Windows
/// Itanium environments, otherwise whatever the generic implementation
/// provides.
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  const Triple &TT = Subtarget.getTargetTriple();
  const bool UsesMSVCCookie =
      TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment();
  if (!UsesMSVCCookie)
    return TargetLowering::getSSPStackGuardCheck(M);

  // MSVC CRT has a function to validate security cookie.
  return M.getFunction("__security_check_cookie");
}

2521

2522

/// Return the location of the SafeStack (unsafe stack) pointer.
///
/// Contiki uses the default location helper, Android and Fuchsia use fixed
/// segment-relative TLS slots, and all other targets defer to the generic
/// implementation.
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  const bool IsAndroid = Subtarget.isTargetAndroid();
  if (!IsAndroid && !Subtarget.isTargetFuchsia())
    return TargetLowering::getSafeStackPointerLocation(IRB);

  unsigned Offset;
  if (IsAndroid) {
    // Android provides a fixed TLS slot for the SafeStack pointer. See the
    // definition of TLS_SLOT_SAFESTACK in
    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
    //
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48. %gs:0x24 on i386.
    Offset = Subtarget.is64Bit() ? 0x48 : 0x24;
  } else {
    // Fuchsia is similar.
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    Offset = 0x18;
  }
  return SegmentOffset(IRB, Offset, getAddressSpace());
}

2544

2545

//===----------------------------------------------------------------------===//

2546

// Return Value Calling Convention Implementation

2547

//===----------------------------------------------------------------------===//

2548

2549

bool X86TargetLowering::CanLowerReturn(

2550

CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,

2551

const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {

2552

SmallVector<CCValAssign, 16> RVLocs;

2553

CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);

2554

return CCInfo.CheckReturn(Outs, RetCC_X86);

2555

}

2556

2557

/// Return the zero-terminated list of scratch registers, which is just R11
/// regardless of the calling convention.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchList[] = {X86::R11, 0};
  return ScratchList;
}

2561

2562

/// Lowers masks values (v*i1) to the local register values

2563

/// \returns DAG node after lowering to register type

2564

static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,

2565

const SDLoc &Dl, SelectionDAG &DAG) {

2566

EVT ValVT = ValArg.getValueType();

2567

2568

if (ValVT == MVT::v1i1)

2569

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,

2570

DAG.getIntPtrConstant(0, Dl));

2571

2572

if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||

2573

(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {

2574

// Two stage lowering might be required

2575

// bitcast: v8i1 -> i8 / v16i1 -> i16

2576

// anyextend: i8 -> i32 / i16 -> i32

2577

EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;

2578

SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);

2579

if (ValLoc == MVT::i32)

2580

ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);

2581

return ValToCopy;

2582

}

2583

2584

if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||

2585

(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {

2586

// One stage lowering is required

2587

// bitcast: v32i1 -> i32 / v64i1 -> i64

2588

return DAG.getBitcast(ValLoc, ValArg);

2589

}

2590

2591

return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);

2592

}

2593

2594

/// Breaks v64i1 value into two registers and adds the new node to the DAG

2595

static void Passv64i1ArgInRegs(

2596

const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,

2597

SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,

2598

CCValAssign &NextVA, const X86Subtarget &Subtarget) {

2599

assert(Subtarget.hasBWI() && "Expected AVX512BW target!")((Subtarget.hasBWI() && "Expected AVX512BW target!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"Expected AVX512BW target!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2599, __PRETTY_FUNCTION__));

2600

assert(Subtarget.is32Bit() && "Expecting 32 bit target")((Subtarget.is32Bit() && "Expecting 32 bit target") ?
static_cast<void> (0) : __assert_fail ("Subtarget.is32Bit() && \"Expecting 32 bit target\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2600, __PRETTY_FUNCTION__));

2601

assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value")((Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"
) ? static_cast<void> (0) : __assert_fail ("Arg.getValueType() == MVT::i64 && \"Expecting 64 bit value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2601, __PRETTY_FUNCTION__));

2602

assert(VA.isRegLoc() && NextVA.isRegLoc() &&((VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"
) ? static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The value should reside in two registers\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2603, __PRETTY_FUNCTION__))

2603

"The value should reside in two registers")((VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"
) ? static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The value should reside in two registers\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2603, __PRETTY_FUNCTION__));

2604

2605

// Before splitting the value we cast it to i64

2606

Arg = DAG.getBitcast(MVT::i64, Arg);

2607

2608

// Splitting the value into two i32 types

2609

SDValue Lo, Hi;

2610

Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,

2611

DAG.getConstant(0, Dl, MVT::i32));

2612

Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,

2613

DAG.getConstant(1, Dl, MVT::i32));

2614

2615

// Attach the two i32 types into corresponding registers

2616

RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));

2617

RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));

2618

}

2619

2620

SDValue

2621

X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,

2622

bool isVarArg,

2623

const SmallVectorImpl<ISD::OutputArg> &Outs,

2624

const SmallVectorImpl<SDValue> &OutVals,

2625

const SDLoc &dl, SelectionDAG &DAG) const {

2626

MachineFunction &MF = DAG.getMachineFunction();

2627

X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

2628

2629

// In some cases we need to disable registers from the default CSR list.

2630

// For example, when they are used for argument passing.

2631

bool ShouldDisableCalleeSavedRegister =

2632

CallConv == CallingConv::X86_RegCall ||

2633

MF.getFunction().hasFnAttribute("no_caller_saved_registers");

2634

2635

if (CallConv == CallingConv::X86_INTR && !Outs.empty())

2636

report_fatal_error("X86 interrupts may not return any value");

2637

2638

SmallVector<CCValAssign, 16> RVLocs;

2639

CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());

2640

CCInfo.AnalyzeReturn(Outs, RetCC_X86);

2641

2642

SmallVector<std::pair<Register, SDValue>, 4> RetVals;

2643

for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;

2644

++I, ++OutsIndex) {

2645

CCValAssign &VA = RVLocs[I];

2646

assert(VA.isRegLoc() && "Can only return in registers!")((VA.isRegLoc() && "Can only return in registers!") ?
static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && \"Can only return in registers!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2646, __PRETTY_FUNCTION__));

2647

2648

// Add the register to the CalleeSaveDisableRegs list.

2649

if (ShouldDisableCalleeSavedRegister)

2650

MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

2651

2652

SDValue ValToCopy = OutVals[OutsIndex];

2653

EVT ValVT = ValToCopy.getValueType();

2654

2655

// Promote values to the appropriate types.

2656

if (VA.getLocInfo() == CCValAssign::SExt)

2657

ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);

2658

else if (VA.getLocInfo() == CCValAssign::ZExt)

2659

ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);

2660

else if (VA.getLocInfo() == CCValAssign::AExt) {

2661

if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)

2662

ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);

2663

else

2664

ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);

2665

}

2666

else if (VA.getLocInfo() == CCValAssign::BCvt)

2667

ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

2668

2669

assert(VA.getLocInfo() != CCValAssign::FPExt &&((VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."
) ? static_cast<void> (0) : __assert_fail ("VA.getLocInfo() != CCValAssign::FPExt && \"Unexpected FP-extend for return value.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2670, __PRETTY_FUNCTION__))

2670

"Unexpected FP-extend for return value.")((VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."
) ? static_cast<void> (0) : __assert_fail ("VA.getLocInfo() != CCValAssign::FPExt && \"Unexpected FP-extend for return value.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2670, __PRETTY_FUNCTION__));

2671

2672

// Report an error if we have attempted to return a value via an XMM

2673

// register and SSE was disabled.

2674

if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {

2675

errorUnsupported(DAG, dl, "SSE register return with SSE disabled");

2676

VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.

2677

} else if (!Subtarget.hasSSE2() &&

2678

X86::FR64XRegClass.contains(VA.getLocReg()) &&

2679

ValVT == MVT::f64) {

2680

// When returning a double via an XMM register, report an error if SSE2 is

2681

// not enabled.

2682

errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");

2683

VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.

2684

}

2685

2686

// Returns in ST0/ST1 are handled specially: these are pushed as operands to

2687

// the RET instruction and handled by the FP Stackifier.

2688

if (VA.getLocReg() == X86::FP0 ||

2689

VA.getLocReg() == X86::FP1) {

2690

// If this is a copy from an xmm register to ST(0), use an FPExtend to

2691

// change the value to the FP stack register class.

2692

if (isScalarFPTypeInSSEReg(VA.getValVT()))

2693

ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);

2694

RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));

2695

// Don't emit a copytoreg.

2696

continue;

2697

}

2698

2699

// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64

2700

// which is returned in RAX / RDX.

2701

if (Subtarget.is64Bit()) {

2702

if (ValVT == MVT::x86mmx) {

2703

if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {

2704

ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);

2705

ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,

2706

ValToCopy);

2707

// If we don't have SSE2 available, convert to v4f32 so the generated

2708

// register is legal.

2709

if (!Subtarget.hasSSE2())

2710

ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);

2711

}

2712

}

2713

}

2714

2715

if (VA.needsCustom()) {

2716

assert(VA.getValVT() == MVT::v64i1 &&((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2717, __PRETTY_FUNCTION__))

2717

"Currently the only custom case is when we split v64i1 to 2 regs")((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2717, __PRETTY_FUNCTION__));

2718

2719

Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],

2720

Subtarget);

2721

2722

// Add the second register to the CalleeSaveDisableRegs list.

2723

if (ShouldDisableCalleeSavedRegister)

2724

MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());

2725

} else {

2726

RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));

2727

}

2728

}

2729

2730

SDValue Flag;

2731

SmallVector<SDValue, 6> RetOps;

2732

RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

2733

// Operand #1 = Bytes To Pop

2734

RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,

2735

MVT::i32));

2736

2737

// Copy the result values into the output registers.

2738

for (auto &RetVal : RetVals) {

2739

if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {

2740

RetOps.push_back(RetVal.second);

2741

continue; // Don't emit a copytoreg.

2742

}

2743

2744

Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);

2745

Flag = Chain.getValue(1);

2746

RetOps.push_back(

2747

DAG.getRegister(RetVal.first, RetVal.second.getValueType()));

2748

}

2749

2750

// Swift calling convention does not require we copy the sret argument

2751

// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

2752

2753

// All x86 ABIs require that for returning structs by value we copy

2754

// the sret argument into %rax/%eax (depending on ABI) for the return.

2755

// We saved the argument into a virtual register in the entry block,

2756

// so now we copy the value out and into %rax/%eax.

2757

//

2758

// Checking Function.hasStructRetAttr() here is insufficient because the IR

2759

// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is

2760

// false, then an sret argument may be implicitly inserted in the SelDAG. In

2761

// either case FuncInfo->setSRetReturnReg() will have been called.

2762

if (Register SRetReg = FuncInfo->getSRetReturnReg()) {

2763

// When we have both sret and another return value, we should use the

2764

// original Chain stored in RetOps[0], instead of the current Chain updated

2765

// in the above loop. If we only have sret, RetOps[0] equals to Chain.

2766

2767

// For the case of sret and another return value, we have

2768

// Chain_0 at the function entry

2769

// Chain_1 = getCopyToReg(Chain_0) in the above loop

2770

// If we use Chain_1 in getCopyFromReg, we will have

2771

// Val = getCopyFromReg(Chain_1)

2772

// Chain_2 = getCopyToReg(Chain_1, Val) from below

2773

2774

// getCopyToReg(Chain_0) will be glued together with

2775

// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be

2776

// in Unit B, and we will have cyclic dependency between Unit A and Unit B:

2777

// Data dependency from Unit B to Unit A due to usage of Val in

2778

// getCopyToReg(Chain_1, Val)

2779

// Chain dependency from Unit A to Unit B

2780

2781

// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.

2782

SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,

2783

getPointerTy(MF.getDataLayout()));

2784

2785

Register RetValReg

2786

= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?

2787

X86::RAX : X86::EAX;

2788

Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);

2789

Flag = Chain.getValue(1);

2790

2791

// RAX/EAX now acts like a return value.

2792

RetOps.push_back(

2793

DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

2794

2795

// Add the returned register to the CalleeSaveDisableRegs list.

2796

if (ShouldDisableCalleeSavedRegister)

2797

MF.getRegInfo().disableCalleeSavedRegister(RetValReg);

2798

}

2799

2800

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

2801

const MCPhysReg *I =

2802

TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());

2803

if (I) {

2804

for (; *I; ++I) {

2805

if (X86::GR64RegClass.contains(*I))

2806

RetOps.push_back(DAG.getRegister(*I, MVT::i64));

2807

else

2808

llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2808);

2809

}

2810

}

2811

2812

RetOps[0] = Chain; // Update chain.

2813

2814

// Add the flag if we have it.

2815

if (Flag.getNode())

2816

RetOps.push_back(Flag);

2817

2818

X86ISD::NodeType opcode = X86ISD::RET_FLAG;

2819

if (CallConv == CallingConv::X86_INTR)

2820

opcode = X86ISD::IRET;

2821

return DAG.getNode(opcode, dl, MVT::Other, RetOps);

2822

}

2823

2824

/// Determine whether the only consumer of \p N's result is the function
/// return, reached either through a CopyToReg or through an FP_EXTEND.
///
/// \param N      Node whose single result is inspected.
/// \param Chain  [in,out] Overwritten on success with the chain operand of the
///               CopyToReg (or left at its incoming value when the user is an
///               FP_EXTEND). Not modified when false is returned.
/// \returns true when every user of the intermediate node is an
///          X86ISD::RET_FLAG that returns nothing else.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  // The node must produce exactly one value, and that value exactly one use.
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDNode *Copy = *N->use_begin();
  SDValue TCChain = Chain;

  const unsigned CopyOpc = Copy->getOpcode();
  if (CopyOpc == ISD::CopyToReg) {
    // A glued copy is conservatively treated as unsafe for a tail call.
    const unsigned LastOpIdx = Copy->getNumOperands() - 1;
    if (Copy->getOperand(LastOpIdx).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (CopyOpc != ISD::FP_EXTEND) {
    return false;
  }

  bool HasRet = false;
  for (SDNode *User : Copy->uses()) {
    if (User->getOpcode() != X86ISD::RET_FLAG)
      return false;

    // Returning more than one value rules out a tail call (see PR19530): more
    // than four operands is always too many, and exactly four is acceptable
    // only when the last one is glue.
    const unsigned NumOps = User->getNumOperands();
    if (NumOps > 4)
      return false;
    if (NumOps == 4 && User->getOperand(NumOps - 1).getValueType() != MVT::Glue)
      return false;

    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

2860

2861

/// Pick the type a return value of type \p VT is widened to.
///
/// The minimum is the register type of i8 for i1 (and, outside Darwin, for i8
/// and i16), and the register type of i32 otherwise. \p VT is returned
/// unchanged when it is already at least that wide. \p ExtendKind is not
/// consulted.
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();

  // The ABI does not require i1, i8 or i16 to be extended.
  //
  // Darwin is the exception for i8/i16: code in the wild relies on Clang's old
  // behaviour of always extending those return values, so that is kept for now
  // (PR26665).
  const bool SmallIsEnough =
      VT == MVT::i1 || (!IsDarwin && (VT == MVT::i8 || VT == MVT::i16));
  MVT ReturnMVT = SmallIsEnough ? MVT::i8 : MVT::i32;

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  if (VT.bitsLT(MinVT))
    return MinVT;
  return VT;
}

2878

2879

/// Reads two 32 bit registers and creates a 64 bit mask value.

2880

/// \param VA The current 32 bit value that need to be assigned.

2881

/// \param NextVA The next 32 bit value that need to be assigned.

2882

/// \param Root The parent DAG node.

2883

/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for

2884

/// glue purposes. In the case the DAG is already using

2885

/// physical register instead of virtual, we should glue

2886

/// our new SDValue to InFlag SDvalue.

2887

/// \return a new SDvalue of size 64bit.

2888

static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,

2889

SDValue &Root, SelectionDAG &DAG,

2890

const SDLoc &Dl, const X86Subtarget &Subtarget,

2891

SDValue *InFlag = nullptr) {

2892

assert((Subtarget.hasBWI()) && "Expected AVX512BW target!")(((Subtarget.hasBWI()) && "Expected AVX512BW target!"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasBWI()) && \"Expected AVX512BW target!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2892, __PRETTY_FUNCTION__));

2893

assert(Subtarget.is32Bit() && "Expecting 32 bit target")((Subtarget.is32Bit() && "Expecting 32 bit target") ?
static_cast<void> (0) : __assert_fail ("Subtarget.is32Bit() && \"Expecting 32 bit target\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2893, __PRETTY_FUNCTION__));

2894

assert(VA.getValVT() == MVT::v64i1 &&((VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Expecting first location of 64 bit width type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2895, __PRETTY_FUNCTION__))

2895

"Expecting first location of 64 bit width type")((VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Expecting first location of 64 bit width type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2895, __PRETTY_FUNCTION__));

2896

assert(NextVA.getValVT() == VA.getValVT() &&((NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"
) ? static_cast<void> (0) : __assert_fail ("NextVA.getValVT() == VA.getValVT() && \"The locations should have the same type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2897, __PRETTY_FUNCTION__))

2897

"The locations should have the same type")((NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"
) ? static_cast<void> (0) : __assert_fail ("NextVA.getValVT() == VA.getValVT() && \"The locations should have the same type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2897, __PRETTY_FUNCTION__));

2898

assert(VA.isRegLoc() && NextVA.isRegLoc() &&((VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"
) ? static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The values should reside in two registers\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2899, __PRETTY_FUNCTION__))

2899

"The values should reside in two registers")((VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"
) ? static_cast<void> (0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The values should reside in two registers\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2899, __PRETTY_FUNCTION__));

2900

2901

SDValue Lo, Hi;

2902

SDValue ArgValueLo, ArgValueHi;

2903

2904

MachineFunction &MF = DAG.getMachineFunction();

2905

const TargetRegisterClass *RC = &X86::GR32RegClass;

2906

2907

// Read a 32 bit value from the registers.

2908

if (nullptr == InFlag) {

2909

// When no physical register is present,

2910

// create an intermediate virtual register.

2911

Register Reg = MF.addLiveIn(VA.getLocReg(), RC);

2912

ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);

2913

Reg = MF.addLiveIn(NextVA.getLocReg(), RC);

2914

ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);

2915

} else {

2916

// When a physical register is available read the value from it and glue

2917

// the reads together.

2918

ArgValueLo =

2919

DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);

2920

*InFlag = ArgValueLo.getValue(2);

2921

ArgValueHi =

2922

DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);

2923

*InFlag = ArgValueHi.getValue(2);

2924

}

2925

2926

// Convert the i32 type into v32i1 type.

2927

Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

2928

2929

// Convert the i32 type into v32i1 type.

2930

Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

2931

2932

// Concatenate the two values together.

2933

return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);

2934

}

2935

2936

/// The function will lower a register of various sizes (8/16/32/64)

2937

/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)

2938

/// \returns a DAG node contains the operand after lowering to mask type.

2939

static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,

2940

const EVT &ValLoc, const SDLoc &Dl,

2941

SelectionDAG &DAG) {

2942

SDValue ValReturned = ValArg;

2943

2944

if (ValVT == MVT::v1i1)

2945

return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

2946

2947

if (ValVT == MVT::v64i1) {

2948

// In 32 bit machine, this case is handled by getv64i1Argument

2949

assert(ValLoc == MVT::i64 && "Expecting only i64 locations")((ValLoc == MVT::i64 && "Expecting only i64 locations"
) ? static_cast<void> (0) : __assert_fail ("ValLoc == MVT::i64 && \"Expecting only i64 locations\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2949, __PRETTY_FUNCTION__));

2950

// In 64 bit machine, There is no need to truncate the value only bitcast

2951

} else {

2952

MVT maskLen;

2953

switch (ValVT.getSimpleVT().SimpleTy) {

2954

case MVT::v8i1:

2955

maskLen = MVT::i8;

2956

break;

2957

case MVT::v16i1:

2958

maskLen = MVT::i16;

2959

break;

2960

case MVT::v32i1:

2961

maskLen = MVT::i32;

2962

break;

2963

default:

2964

llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2964);

2965

}

2966

2967

ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);

2968

}

2969

return DAG.getBitcast(ValVT, ValReturned);

2970

}

2971

2972

/// Lower the result values of a call into the

2973

/// appropriate copies out of appropriate physical registers.

2974

///

2975

SDValue X86TargetLowering::LowerCallResult(

2976

SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,

2977

const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,

2978

SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,

2979

uint32_t *RegMask) const {

2980

2981

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

2982

// Assign locations to each value returned by this call.

2983

SmallVector<CCValAssign, 16> RVLocs;

2984

CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,

2985

*DAG.getContext());

2986

CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

2987

2988

// Copy all of the result registers out of their specified physreg.

2989

for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;

2990

++I, ++InsIndex) {

2991

CCValAssign &VA = RVLocs[I];

2992

EVT CopyVT = VA.getLocVT();

2993

2994

// In some calling conventions we need to remove the used registers

2995

// from the register mask.

2996

if (RegMask) {

2997

for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);

2998

SubRegs.isValid(); ++SubRegs)

2999

RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

3000

}

3001

3002

// Report an error if there was an attempt to return FP values via XMM

3003

// registers.

3004

if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {

3005

errorUnsupported(DAG, dl, "SSE register return with SSE disabled");

3006

if (VA.getLocReg() == X86::XMM1)

3007

VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.

3008

else

3009

VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.

3010

} else if (!Subtarget.hasSSE2() &&

3011

X86::FR64XRegClass.contains(VA.getLocReg()) &&

3012

CopyVT == MVT::f64) {

3013

errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");

3014

if (VA.getLocReg() == X86::XMM1)

3015

VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.

3016

else

3017

VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.

3018

}

3019

3020

// If we prefer to use the value in xmm registers, copy it out as f80 and

3021

// use a truncate to move it from fp stack reg to xmm reg.

3022

bool RoundAfterCopy = false;

3023

if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&

3024

isScalarFPTypeInSSEReg(VA.getValVT())) {

3025

if (!Subtarget.hasX87())

3026

report_fatal_error("X87 register return with X87 disabled");

3027

CopyVT = MVT::f80;

3028

RoundAfterCopy = (CopyVT != VA.getLocVT());

3029

}

3030

3031

SDValue Val;

3032

if (VA.needsCustom()) {

3033

assert(VA.getValVT() == MVT::v64i1 &&((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3034, __PRETTY_FUNCTION__))

3034

"Currently the only custom case is when we split v64i1 to 2 regs")((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3034, __PRETTY_FUNCTION__));

3035

Val =

3036

getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);

3037

} else {

3038

Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)

3039

.getValue(1);

3040

Val = Chain.getValue(0);

3041

InFlag = Chain.getValue(2);

3042

}

3043

3044

if (RoundAfterCopy)

3045

Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,

3046

// This truncation won't change the value.

3047

DAG.getIntPtrConstant(1, dl));

3048

3049

if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {

3050

if (VA.getValVT().isVector() &&

3051

((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||

3052

(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {

3053

// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8

3054

Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);

3055

} else

3056

Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);

3057

}

3058

3059

if (VA.getLocInfo() == CCValAssign::BCvt)

3060

Val = DAG.getBitcast(VA.getValVT(), Val);

3061

3062

InVals.push_back(Val);

3063

}

3064

3065

return Chain;

3066

}

3067

3068

//===----------------------------------------------------------------------===//

3069

// C & StdCall & Fast Calling Convention implementation

3070

//===----------------------------------------------------------------------===//

3071

// StdCall calling convention seems to be standard for many Windows' API

3072

// routines and around. It differs from C calling convention just a little:

3073

// callee should clean up the stack, not caller. Symbols should be also

3074

// decorated in some fancy way :) It doesn't support any vector arguments.

3075

// For info on fast calling convention see Fast Calling Convention (tail call)

3076

// implementation LowerX86_32FastCCCallTo.

3077

3078

/// Classification of struct-return (sret) handling, as produced by
/// callIsStructReturn() for calls and argsAreStructReturn() for functions.
enum StructReturnType {
  /// There are no arguments, or the first argument is not flagged sret.
  NotStructReturn,
  /// The first argument is sret and is flagged inreg, or the target is MCU.
  RegStructReturn,
  /// The first argument is sret and neither of the conditions above holds.
  StackStructReturn
};

3085

/// Determines whether a call uses struct return semantics, judging by the
/// flags of its first outgoing argument.
/// \param Outs  Outgoing arguments of the call.
/// \param IsMCU When true, an sret argument is always classified as
///              RegStructReturn.
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
  // Only the first argument can carry sret.
  if (Outs.empty() || !Outs[0].Flags.isSRet())
    return NotStructReturn;

  const ISD::ArgFlagsTy &SRetFlags = Outs[0].Flags;
  return (SRetFlags.isInReg() || IsMCU) ? RegStructReturn : StackStructReturn;
}

3097

3098

/// Determines whether a function uses struct return semantics.

3099

static StructReturnType

3100

argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {

3101

if (Ins.empty())

3102

return NotStructReturn;

3103

3104

const ISD::ArgFlagsTy &Flags = Ins[0].Flags;

3105

if (!Flags.isSRet())

3106

return NotStructReturn;

3107

if (Flags.isInReg() || IsMCU)

3108

return RegStructReturn;

3109

return StackStructReturn;

3110

}

3111

3112

/// Make a copy of an aggregate at address specified by "Src" to address

3113

/// "Dst" with size and alignment information specified by the specific

3114

/// parameter attribute. The copy will be passed as a byval function parameter.

3115

static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,

3116

SDValue Chain, ISD::ArgFlagsTy Flags,

3117

SelectionDAG &DAG, const SDLoc &dl) {

3118

SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);

3119

3120

return DAG.getMemcpy(

3121

Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),

3122

/*isVolatile*/ false, /*AlwaysInline=*/true,

3123

/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());

3124

}

3125

3126

/// Return true if the calling convention is one that we can guarantee TCO for.

3127

static bool canGuaranteeTCO(CallingConv::ID CC) {

3128

return (CC == CallingConv::Fast || CC == CallingConv::GHC ||

3129

CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||

3130

CC == CallingConv::HHVM || CC == CallingConv::Tail);

3131

}

3132

3133

/// Return true if we might ever do TCO for calls with this calling convention.

3134

static bool mayTailCallThisCC(CallingConv::ID CC) {

3135

switch (CC) {

3136

// C calling conventions:

3137

case CallingConv::C:

3138

case CallingConv::Win64:

3139

case CallingConv::X86_64_SysV:

3140

// Callee pop conventions:

3141

case CallingConv::X86_ThisCall:

3142

case CallingConv::X86_StdCall:

3143

case CallingConv::X86_VectorCall:

3144

case CallingConv::X86_FastCall:

3145

// Swift:

3146

case CallingConv::Swift:

3147

return true;

3148

default:

3149

return canGuaranteeTCO(CC);

3150

}

3151

}

3152

3153

/// Return true if the function is being made into a tailcall target by

3154

/// changing its ABI.

3155

static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {

3156

return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;

3157

}

3158

3159

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {

3160

if (!CI->isTailCall())

3161

return false;

3162

3163

CallingConv::ID CalleeCC = CI->getCallingConv();

3164

if (!mayTailCallThisCC(CalleeCC))

3165

return false;

3166

3167

return true;

3168

}

3169

3170

/// Create the nodes that read incoming argument \p i from its stack slot.
///
/// Three shapes of result are produced:
///  - byval arguments yield the frame index of a fixed object covering them;
///  - copy-elision candidates yield a load from a mutable fixed object that
///    spans the whole argument (or from an existing object for later parts);
///  - everything else yields a load from a fresh fixed object, followed by a
///    SCALAR_TO_VECTOR or TRUNCATE when an i1-based value was extended in
///    memory.
///
/// \param Chain    Chain the loads are attached to.
/// \param CallConv Calling convention; decides whether slots must be mutable.
/// \param Ins      Incoming argument descriptions.
/// \param VA       Memory location assigned to the argument.
/// \param MFI      Frame info in which fixed objects are created.
/// \param i        Index into \p Ins of the argument being lowered.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  ISD::ArgFlagsTy Flags = Ins[i].Flags;

  // In case of tail call optimization mark all arguments mutable, since they
  // could be overwritten by lowering of arguments in case of a tail call.
  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  bool ForceMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !(ForceMutable || Flags.isByVal());
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  const bool PassedIndirectly = VA.getLocInfo() == CCValAssign::Indirect;
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  // The type actually loaded from the slot.
  EVT ValVT =
      (PassedIndirectly || ExtendedInMem) ? VA.getLocVT() : VA.getValVT();

  if (Flags.isByVal()) {
    // Don't create zero-sized stack objects.
    const unsigned ByValSize = Flags.getByValSize();
    const unsigned Bytes = ByValSize == 0 ? 1 : ByValSize;

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    return DAG.getFrameIndex(FI, PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts don't match the vector element size, then we can't
  // elide the copy. The parts will have padding between them instead of being
  // packed like a vector.
  bool ScalarizedAndExtendedVector =
      ArgVT.isVector() && !VA.getLocVT().isVector() &&
      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();

  // This is an argument in memory. Copy elision is possible only when the
  // argument is passed directly in memory without any extension; large vector
  // types, for example, may be passed indirectly by pointer.
  const bool CanElideCopy = Flags.isCopyElisionCandidate() &&
                            !PassedIndirectly && !ExtendedInMem &&
                            !ScalarizedAndExtendedVector;
  if (CanElideCopy) {
    if (Ins[i].PartOffset == 0) {
      // One-part value, or the first part of a multi-part value: create a stack
      // object for the entire argument value type and return a load from our
      // portion of it. This assumes that if the first part of an argument is in
      // memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      SDValue PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    }

    // Not the first piece of an argument in memory. Look for an existing fixed
    // stack object that fully contains this part; if there is one, assume it
    // was created by the PartOffset == 0 case above and load from the
    // appropriate offset into it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int FI = MFI.getObjectIndexBegin();
    while (MFI.isFixedObjectIndex(FI)) {
      int64_t ObjBegin = MFI.getObjectOffset(FI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
      ++FI;
    }
    // FI is still a fixed-object index only if the search above succeeded;
    // otherwise fall through to the generic path below.
    if (MFI.isFixedObjectIndex(FI)) {
      SDValue Addr =
          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
      return DAG.getLoad(
          ValVT, dl, Chain, Addr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                            Ins[i].PartOffset));
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  switch (VA.getLocInfo()) {
  case CCValAssign::ZExt:
    MFI.setObjectZExt(FI, true);
    break;
  case CCValAssign::SExt:
    MFI.setObjectSExt(FI, true);
    break;
  default:
    break;
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  if (!ExtendedInMem)
    return Val;

  // The slot held the extended location type; narrow back to the value type.
  if (VA.getValVT().isVector())
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val);
  return DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}

3286

3287

// FIXME: Get this from tablegen.

3288

static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,

3289

const X86Subtarget &Subtarget) {

3290

assert(Subtarget.is64Bit())((Subtarget.is64Bit()) ? static_cast<void> (0) : __assert_fail
("Subtarget.is64Bit()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3290, __PRETTY_FUNCTION__));

3291

3292

if (Subtarget.isCallingConvWin64(CallConv)) {

3293

static const MCPhysReg GPR64ArgRegsWin64[] = {

3294

X86::RCX, X86::RDX, X86::R8, X86::R9

3295

};

3296

return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));

3297

}

3298

3299

static const MCPhysReg GPR64ArgRegs64Bit[] = {

3300

X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9

3301

};

3302

return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));

3303

}

3304

3305

// FIXME: Get this from tablegen.

3306

static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,

3307

CallingConv::ID CallConv,

3308

const X86Subtarget &Subtarget) {

3309

assert(Subtarget.is64Bit())((Subtarget.is64Bit()) ? static_cast<void> (0) : __assert_fail
("Subtarget.is64Bit()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3309, __PRETTY_FUNCTION__));

3310

if (Subtarget.isCallingConvWin64(CallConv)) {

3311

// The XMM registers which might contain var arg parameters are shadowed

3312

// in their paired GPR. So we only need to save the GPR to their home

3313

// slots.

3314

// TODO: __vectorcall will change this.

3315

return None;

3316

}

3317

3318

const Function &F = MF.getFunction();

3319

bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);

3320

bool isSoftFloat = Subtarget.useSoftFloat();

3321

assert(!(isSoftFloat && NoImplicitFloatOps) &&((!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(isSoftFloat && NoImplicitFloatOps) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3322, __PRETTY_FUNCTION__))

3322

"SSE register cannot be used when SSE is disabled!")((!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(isSoftFloat && NoImplicitFloatOps) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3322, __PRETTY_FUNCTION__));

3323

if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())

3324

// Kernel mode asks for SSE to be disabled, so there are no XMM argument

3325

// registers.

3326

return None;

3327

3328

static const MCPhysReg XMMArgRegs64Bit[] = {

3329

X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,

3330

X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7

3331

};

3332

return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));

3333

}

3334

3335

#ifndef NDEBUG

3336

static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {

3337

return llvm::is_sorted(

3338

ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {

3339

return A.getValNo() < B.getValNo();

3340

});

3341

}

3342

#endif

3343

3344

namespace {

3345

/// This is a helper class for lowering variable arguments parameters.

3346

class VarArgsLoweringHelper {

3347

public:

3348

VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,

3349

SelectionDAG &DAG, const X86Subtarget &Subtarget,

3350

CallingConv::ID CallConv, CCState &CCInfo)

3351

: FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),

3352

TheMachineFunction(DAG.getMachineFunction()),

3353

TheFunction(TheMachineFunction.getFunction()),

3354

FrameInfo(TheMachineFunction.getFrameInfo()),

3355

FrameLowering(*Subtarget.getFrameLowering()),

3356

TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),

3357

CCInfo(CCInfo) {}

3358

3359

// Lower variable arguments parameters.

3360

void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

3361

3362

private:

3363

void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

3364

3365

void forwardMustTailParameters(SDValue &Chain);

3366

3367

bool is64Bit() { return Subtarget.is64Bit(); }

3368

bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }

3369

3370

X86MachineFunctionInfo *FuncInfo;

3371

const SDLoc &DL;

3372

SelectionDAG &DAG;

3373

const X86Subtarget &Subtarget;

3374

MachineFunction &TheMachineFunction;

3375

const Function &TheFunction;

3376

MachineFrameInfo &FrameInfo;

3377

const TargetFrameLowering &FrameLowering;

3378

const TargetLowering &TargLowering;

3379

CallingConv::ID CallConv;

3380

CCState &CCInfo;

3381

};

3382

} // namespace

3383

3384

void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(

3385

SDValue &Chain, unsigned StackSize) {

3386

// If the function takes variable number of arguments, make a frame index for

3387

// the start of the first vararg value... for expansion of llvm.va_start. We

3388

// can skip this if there are no va_start calls.

3389

if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&

3390

CallConv != CallingConv::X86_ThisCall)) {

3391

FuncInfo->setVarArgsFrameIndex(

3392

FrameInfo.CreateFixedObject(1, StackSize, true));

3393

}

3394

3395

// Figure out if XMM registers are in use.

3396

assert(!(Subtarget.useSoftFloat() &&((!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute
(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3398, __PRETTY_FUNCTION__))

3397

TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&((!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute
(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3398, __PRETTY_FUNCTION__))

3398

"SSE register cannot be used when SSE is disabled!")((!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute
(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(Subtarget.useSoftFloat() && TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3398, __PRETTY_FUNCTION__));

3399

3400

// 64-bit calling conventions support varargs and register parameters, so we

3401

// have to do extra work to spill them in the prologue.

3402

if (is64Bit()) {

3403

// Find the first unallocated argument registers.

3404

ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);

3405

ArrayRef<MCPhysReg> ArgXMMs =

3406

get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);

3407

unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);

3408

unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);

3409

3410

assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&((!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(NumXMMRegs && !Subtarget.hasSSE1()) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3411, __PRETTY_FUNCTION__))

3411

"SSE register cannot be used when SSE is disabled!")((!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"
) ? static_cast<void> (0) : __assert_fail ("!(NumXMMRegs && !Subtarget.hasSSE1()) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3411, __PRETTY_FUNCTION__));

3412

3413

if (isWin64()) {

3414

// Get to the caller-allocated home save location. Add 8 to account

3415

// for the return address.

3416

int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;

3417

FuncInfo->setRegSaveFrameIndex(

3418

FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));

3419

// Fixup to set vararg frame on shadow area (4 x i64).

3420

if (NumIntRegs < 4)

3421

FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());

3422

} else {

3423

// For X86-64, if there are vararg parameters that are passed via

3424

// registers, then we must store them to their spots on the stack so

3425

// they may be loaded by dereferencing the result of va_next.

3426

FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);

3427

FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);

3428

FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(

3429

ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));

3430

}

3431

3432

SmallVector<SDValue, 6>

3433

LiveGPRs; // list of SDValue for GPR registers keeping live input value

3434

SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers

3435

// keeping live input value

3436

SDValue ALVal; // if applicable keeps SDValue for %al register

3437

3438

// Gather all the live in physical registers.

3439

for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {

3440

Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);

3441

LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));

3442

}

3443

const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);

3444

if (!AvailableXmms.empty()) {

3445

Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);

3446

ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);

3447

for (MCPhysReg Reg : AvailableXmms) {

3448

Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);

3449

LiveXMMRegs.push_back(

3450

DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));

3451

}

3452

}

3453

3454

// Store the integer parameter registers.

3455

SmallVector<SDValue, 8> MemOps;

3456

SDValue RSFIN =

3457

DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),

3458

TargLowering.getPointerTy(DAG.getDataLayout()));

3459

unsigned Offset = FuncInfo->getVarArgsGPOffset();

3460

for (SDValue Val : LiveGPRs) {

3461

SDValue FIN = DAG.getNode(ISD::ADD, DL,

3462

TargLowering.getPointerTy(DAG.getDataLayout()),

3463

RSFIN, DAG.getIntPtrConstant(Offset, DL));

3464

SDValue Store =

3465

DAG.getStore(Val.getValue(1), DL, Val, FIN,

3466

MachinePointerInfo::getFixedStack(

3467

DAG.getMachineFunction(),

3468

FuncInfo->getRegSaveFrameIndex(), Offset));

3469

MemOps.push_back(Store);

3470

Offset += 8;

3471

}

3472

3473

// Now store the XMM (fp + vector) parameter registers.

3474

if (!LiveXMMRegs.empty()) {

3475

SmallVector<SDValue, 12> SaveXMMOps;

3476

SaveXMMOps.push_back(Chain);

3477

SaveXMMOps.push_back(ALVal);

3478

SaveXMMOps.push_back(

3479

DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));

3480

SaveXMMOps.push_back(

3481

DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));

3482

SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),

3483

LiveXMMRegs.end());

3484

MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,

3485

MVT::Other, SaveXMMOps));

3486

}

3487

3488

if (!MemOps.empty())

3489

Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);

3490

}

3491

}

3492

3493

/// Record the registers that must be forwarded to a musttail call in a
/// variadic function and copy each of them from its live-in register into a
/// fresh virtual register, threading the copies through \p Chain.
void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
  // Find the largest legal vector type.
  // FIXME: Only some x86_32 calling conventions support AVX512.
  const bool CCAllows512BitRegs = is64Bit() ||
                                  CallConv == CallingConv::X86_VectorCall ||
                                  CallConv == CallingConv::Intel_OCL_BI;
  MVT WidestVecVT = MVT::Other;
  if (Subtarget.useAVX512Regs() && CCAllows512BitRegs)
    WidestVecVT = MVT::v16f32;
  else if (Subtarget.hasAVX())
    WidestVecVT = MVT::v8f32;
  else if (Subtarget.hasSSE2())
    WidestVecVT = MVT::v4f32;

  // We forward some GPRs and some vector types.
  SmallVector<MVT, 2> RegParmTypes;
  RegParmTypes.push_back(is64Bit() ? MVT::i64 : MVT::i32);
  if (WidestVecVT != MVT::Other)
    RegParmTypes.push_back(WidestVecVT);

  // Compute the set of forwarded registers. The rest are scratch.
  SmallVectorImpl<ForwardedRegister> &Forwards =
      FuncInfo->getForwardedMustTailRegParms();
  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

  // Forward AL for SysV x86_64 targets, since it is used for varargs.
  const bool IsSysV64 = is64Bit() && !isWin64();
  if (IsSysV64 && !CCInfo.isAllocated(X86::AL)) {
    Register ALLiveIn =
        TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
    Forwards.push_back(ForwardedRegister(ALLiveIn, X86::AL, MVT::i8));
  }

  // Copy all forwards from physical to virtual registers.
  MachineRegisterInfo &RegInfo = TheMachineFunction.getRegInfo();
  for (ForwardedRegister &Fwd : Forwards) {
    // FIXME: Can we use a less constrained schedule?
    SDValue LiveInVal = DAG.getCopyFromReg(Chain, DL, Fwd.VReg, Fwd.VT);
    // From here on the entry refers to the new virtual register instead of
    // the live-in one.
    Fwd.VReg =
        RegInfo.createVirtualRegister(TargLowering.getRegClassFor(Fwd.VT));
    Chain = DAG.getCopyToReg(Chain, DL, Fwd.VReg, LiveInVal);
  }
}

3533

3534

void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,

3535

unsigned StackSize) {

3536

// Set FrameIndex to the 0xAAAAAAA value to mark unset state.

3537

// If necessary, it would be set into the correct value later.

3538

FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);

3539

FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);

3540

3541

if (FrameInfo.hasVAStart())

3542

createVarArgAreaAndStoreRegisters(Chain, StackSize);

3543

3544

if (FrameInfo.hasMustTailInVarArgFunc())

3545

forwardMustTailParameters(Chain);

3546

}

3547

3548

SDValue X86TargetLowering::LowerFormalArguments(

3549

SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,

3550

const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,

3551

SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

3552

MachineFunction &MF = DAG.getMachineFunction();

3553

X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

3554

3555

const Function &F = MF.getFunction();

3556

if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&

3557

F.getName() == "main")

3558

FuncInfo->setForceFramePointer(true);

3559

3560

MachineFrameInfo &MFI = MF.getFrameInfo();

3561

bool Is64Bit = Subtarget.is64Bit();

3562

bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

3563

3564

assert(((!(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe"
) ? static_cast<void> (0) : __assert_fail ("!(IsVarArg && canGuaranteeTCO(CallConv)) && \"Var args not supported with calling conv' regcall, fastcc, ghc or hipe\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3566, __PRETTY_FUNCTION__))

3565

!(IsVarArg && canGuaranteeTCO(CallConv)) &&((!(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe"
) ? static_cast<void> (0) : __assert_fail ("!(IsVarArg && canGuaranteeTCO(CallConv)) && \"Var args not supported with calling conv' regcall, fastcc, ghc or hipe\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3566, __PRETTY_FUNCTION__))

3566

"Var args not supported with calling conv' regcall, fastcc, ghc or hipe")((!(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe"
) ? static_cast<void> (0) : __assert_fail ("!(IsVarArg && canGuaranteeTCO(CallConv)) && \"Var args not supported with calling conv' regcall, fastcc, ghc or hipe\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3566, __PRETTY_FUNCTION__));

3567

3568

// Assign locations to all of the incoming arguments.

3569

SmallVector<CCValAssign, 16> ArgLocs;

3570

CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

3571

3572

// Allocate shadow area for Win64.

3573

if (IsWin64)

3574

CCInfo.AllocateStack(32, Align(8));

3575

3576

CCInfo.AnalyzeArguments(Ins, CC_X86);

3577

3578

// In vectorcall calling convention a second pass is required for the HVA

3579

// types.

3580

if (CallingConv::X86_VectorCall == CallConv) {

3581

CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);

3582

}

3583

3584

// The next loop assumes that the locations are in the same order of the

3585

// input arguments.

3586

assert(isSortedByValueNo(ArgLocs) &&((isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"
) ? static_cast<void> (0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3587, __PRETTY_FUNCTION__))

3587

"Argument Location list must be sorted before lowering")((isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"
) ? static_cast<void> (0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3587, __PRETTY_FUNCTION__));

3588

3589

SDValue ArgValue;

3590

for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;

3591

++I, ++InsIndex) {

3592

assert(InsIndex < Ins.size() && "Invalid Ins index")((InsIndex < Ins.size() && "Invalid Ins index") ? static_cast
<void> (0) : __assert_fail ("InsIndex < Ins.size() && \"Invalid Ins index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3592, __PRETTY_FUNCTION__));

3593

CCValAssign &VA = ArgLocs[I];

3594

3595

if (VA.isRegLoc()) {

3596

EVT RegVT = VA.getLocVT();

3597

if (VA.needsCustom()) {

3598

assert(((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3600, __PRETTY_FUNCTION__))

3599

VA.getValVT() == MVT::v64i1 &&((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3600, __PRETTY_FUNCTION__))

3600

"Currently the only custom case is when we split v64i1 to 2 regs")((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3600, __PRETTY_FUNCTION__));

3601

3602

// v64i1 values, in regcall calling convention, that are

3603

// compiled to 32 bit arch, are split up into two registers.

3604

ArgValue =

3605

getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);

3606

} else {

3607

const TargetRegisterClass *RC;

3608

if (RegVT == MVT::i8)

3609

RC = &X86::GR8RegClass;

3610

else if (RegVT == MVT::i16)

3611

RC = &X86::GR16RegClass;

3612

else if (RegVT == MVT::i32)

3613

RC = &X86::GR32RegClass;

3614

else if (Is64Bit && RegVT == MVT::i64)

3615

RC = &X86::GR64RegClass;

3616

else if (RegVT == MVT::f32)

3617

RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;

3618

else if (RegVT == MVT::f64)

3619

RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;

3620

else if (RegVT == MVT::f80)

3621

RC = &X86::RFP80RegClass;

3622

else if (RegVT == MVT::f128)

3623

RC = &X86::VR128RegClass;

3624

else if (RegVT.is512BitVector())

3625

RC = &X86::VR512RegClass;

3626

else if (RegVT.is256BitVector())

3627

RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;

3628

else if (RegVT.is128BitVector())

3629

RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;

3630

else if (RegVT == MVT::x86mmx)

3631

RC = &X86::VR64RegClass;

3632

else if (RegVT == MVT::v1i1)

3633

RC = &X86::VK1RegClass;

3634

else if (RegVT == MVT::v8i1)

3635

RC = &X86::VK8RegClass;

3636

else if (RegVT == MVT::v16i1)

3637

RC = &X86::VK16RegClass;

3638

else if (RegVT == MVT::v32i1)

3639

RC = &X86::VK32RegClass;

3640

else if (RegVT == MVT::v64i1)

3641

RC = &X86::VK64RegClass;

3642

else

3643

llvm_unreachable("Unknown argument type!")::llvm::llvm_unreachable_internal("Unknown argument type!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3643);

3644

3645

Register Reg = MF.addLiveIn(VA.getLocReg(), RC);

3646

ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

3647

}

3648

3649

// If this is an 8 or 16-bit value, it is really passed promoted to 32

3650

// bits. Insert an assert[sz]ext to capture this, then truncate to the

3651

// right size.

3652

if (VA.getLocInfo() == CCValAssign::SExt)

3653

ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,

3654

DAG.getValueType(VA.getValVT()));

3655

else if (VA.getLocInfo() == CCValAssign::ZExt)

3656

ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,

3657

DAG.getValueType(VA.getValVT()));

3658

else if (VA.getLocInfo() == CCValAssign::BCvt)

3659

ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

3660

3661

if (VA.isExtInLoc()) {

3662

// Handle MMX values passed in XMM regs.

3663

if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)

3664

ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);

3665

else if (VA.getValVT().isVector() &&

3666

VA.getValVT().getScalarType() == MVT::i1 &&

3667

((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||

3668

(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {

3669

// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8

3670

ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);

3671

} else

3672

ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);

3673

}

3674

} else {

3675

assert(VA.isMemLoc())((VA.isMemLoc()) ? static_cast<void> (0) : __assert_fail
("VA.isMemLoc()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3675, __PRETTY_FUNCTION__));

3676

ArgValue =

3677

LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);

3678

}

3679

3680

// If value is passed via pointer - do a load.

3681

if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())

3682

ArgValue =

3683

DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

3684

3685

InVals.push_back(ArgValue);

3686

}

3687

3688

for (unsigned I = 0, E = Ins.size(); I != E; ++I) {

3689

// Swift calling convention does not require we copy the sret argument

3690

// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.

3691

if (CallConv == CallingConv::Swift)

3692

continue;

3693

3694

// All x86 ABIs require that for returning structs by value we copy the

3695

// sret argument into %rax/%eax (depending on ABI) for the return. Save

3696

// the argument into a virtual register so that we can access it from the

3697

// return points.

3698

if (Ins[I].Flags.isSRet()) {

3699

Register Reg = FuncInfo->getSRetReturnReg();

3700

if (!Reg) {

3701

MVT PtrTy = getPointerTy(DAG.getDataLayout());

3702

Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));

3703

FuncInfo->setSRetReturnReg(Reg);

3704

}

3705

SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);

3706

Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);

3707

break;

3708

}

3709

}

3710

3711

unsigned StackSize = CCInfo.getNextStackOffset();

3712

// Align stack specially for tail calls.

3713

if (shouldGuaranteeTCO(CallConv,

3714

MF.getTarget().Options.GuaranteedTailCallOpt))

3715

StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

3716

3717

if (IsVarArg)

3718

VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)

3719

.lowerVarArgsParameters(Chain, StackSize);

3720

3721

// Some CCs need callee pop.

3722

if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,

3723

MF.getTarget().Options.GuaranteedTailCallOpt)) {

3724

FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.

3725

} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {

3726

// X86 interrupts must pop the error code (and the alignment padding) if

3727

// present.

3728

FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);

3729

} else {

3730

FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.

3731

// If this is an sret function, the return should pop the hidden pointer.

3732

if (!Is64Bit && !canGuaranteeTCO(CallConv) &&

3733

!Subtarget.getTargetTriple().isOSMSVCRT() &&

3734

argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)

3735

FuncInfo->setBytesToPopOnReturn(4);

3736

}

3737

3738

if (!Is64Bit) {

3739

// RegSaveFrameIndex is X86-64 only.

3740

FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);

3741

}

3742

3743

FuncInfo->setArgumentStackSize(StackSize);

3744

3745

if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {

3746

EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());

3747

if (Personality == EHPersonality::CoreCLR) {

3748

assert(Is64Bit)((Is64Bit) ? static_cast<void> (0) : __assert_fail ("Is64Bit"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3748, __PRETTY_FUNCTION__));

3749

// TODO: Add a mechanism to frame lowering that will allow us to indicate

3750

// that we'd prefer this slot be allocated towards the bottom of the frame

3751

// (i.e. near the stack pointer after allocating the frame). Every

3752

// funclet needs a copy of this slot in its (mostly empty) frame, and the

3753

// offset from the bottom of this and each funclet's frame must be the

3754

// same, so the size of funclets' (mostly empty) frames is dictated by

3755

// how far this slot is from the bottom (since they allocate just enough

3756

// space to accommodate holding this slot at the correct offset).

3757

int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);

3758

EHInfo->PSPSymFrameIdx = PSPSymFI;

3759

}

3760

}

3761

3762

if (CallConv == CallingConv::X86_RegCall ||

3763

F.hasFnAttribute("no_caller_saved_registers")) {

3764

MachineRegisterInfo &MRI = MF.getRegInfo();

3765

for (std::pair<Register, Register> Pair : MRI.liveins())

3766

MRI.disableCalleeSavedRegister(Pair.first);

3767

}

3768

3769

return Chain;

3770

}

3771

3772

/// Emit the memory operation that places outgoing argument \p Arg at the
/// stack location described by \p VA (an offset added to \p StackPtr): a
/// byval copy when \p isByVal is set, otherwise a plain store.
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags,
                                            bool isByVal) const {
  const unsigned SlotOffset = VA.getLocMemOffset();
  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl,
                                 getPointerTy(DAG.getDataLayout()), StackPtr,
                                 OffsetNode);

  if (!isByVal)
    return DAG.getStore(
        Chain, dl, Arg, SlotAddr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));

  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
}

3789

3790

/// Emit a load of return address if tail call

3791

/// optimization is performed and it is required.

3792

SDValue X86TargetLowering::EmitTailCallLoadRetAddr(

3793

SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,

3794

bool Is64Bit, int FPDiff, const SDLoc &dl) const {

3795

// Adjust the Return address stack slot.

3796

EVT VT = getPointerTy(DAG.getDataLayout());

3797

OutRetAddr = getReturnAddressFrameIndex(DAG);

3798

3799

// Load the "old" Return address.

3800

OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());

3801

return SDValue(OutRetAddr.getNode(), 1);

3802

}

3803

3804

/// Emit a store of the return address if tail call

3805

/// optimization is performed and it is required (FPDiff!=0).

3806

static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,

3807

SDValue Chain, SDValue RetAddrFrIdx,

3808

EVT PtrVT, unsigned SlotSize,

3809

int FPDiff, const SDLoc &dl) {

3810

// Store the return address to the appropriate stack slot.

3811

if (!FPDiff) return Chain;

3812

// Calculate the new stack slot for the return address.

3813

int NewReturnAddrFI =

3814

MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,

3815

false);

3816

SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);

3817

Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,

3818

MachinePointerInfo::getFixedStack(

3819

DAG.getMachineFunction(), NewReturnAddrFI));

3820

return Chain;

3821

}

3822

3823

/// Returns a vector_shuffle mask for an movs{s|d}, movd

3824

/// operation of specified width.

3825

static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,

3826

SDValue V2) {

3827

unsigned NumElems = VT.getVectorNumElements();

3828

SmallVector<int, 8> Mask;

3829

Mask.push_back(NumElems);

3830

for (unsigned i = 1; i != NumElems; ++i)

3831

Mask.push_back(i);

3832

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

3833

}

3834

3835

SDValue

3836

X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,

3837

SmallVectorImpl<SDValue> &InVals) const {

3838

SelectionDAG &DAG = CLI.DAG;

3839

SDLoc &dl = CLI.DL;

3840

SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;

3841

SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;

3842

SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;

3843

SDValue Chain = CLI.Chain;

3844

SDValue Callee = CLI.Callee;

3845

CallingConv::ID CallConv = CLI.CallConv;

3846

bool &isTailCall = CLI.IsTailCall;

3847

bool isVarArg = CLI.IsVarArg;

3848

3849

MachineFunction &MF = DAG.getMachineFunction();

3850

bool Is64Bit = Subtarget.is64Bit();

3851

bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

3852

StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());

3853

bool IsSibcall = false;

3854

bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||

3855

CallConv == CallingConv::Tail;

3856

X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();

3857

const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);

3858

const Function *Fn = CI ? CI->getCalledFunction() : nullptr;

3859

bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||

3860

(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

3861

const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);

3862

bool HasNoCfCheck =

3863

(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());

3864

const Module *M = MF.getMMI().getModule();

3865

Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");

3866

3867

MachineFunction::CallSiteInfo CSInfo;

3868

if (CallConv == CallingConv::X86_INTR)

3869

report_fatal_error("X86 interrupts may not be called directly");

3870

3871

if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {

3872

// If we are using a GOT, disable tail calls to external symbols with

3873

// default visibility. Tail calling such a symbol requires using a GOT

3874

// relocation, which forces early binding of the symbol. This breaks code

3875

// that require lazy function symbol resolution. Using musttail or

3876

// GuaranteedTailCallOpt will override this.

3877

GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);

3878

if (!G || (!G->getGlobal()->hasLocalLinkage() &&

3879

G->getGlobal()->hasDefaultVisibility()))

3880

isTailCall = false;

3881

}

3882

3883

bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();

3884

if (IsMustTail) {

3885

// Force this to be a tail call. The verifier rules are enough to ensure

3886

// that we can lower this successfully without moving the return address

3887

// around.

3888

isTailCall = true;

3889

} else if (isTailCall) {

3890

// Check if it's really possible to do a tail call.

3891

isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,

3892

isVarArg, SR != NotStructReturn,

3893

MF.getFunction().hasStructRetAttr(), CLI.RetTy,

3894

Outs, OutVals, Ins, DAG);

3895

3896

// Sibcalls are automatically detected tailcalls which do not require

3897

// ABI changes.

3898

if (!IsGuaranteeTCO && isTailCall)

3899

IsSibcall = true;

3900

3901

if (isTailCall)

3902

++NumTailCalls;

3903

}

3904

3905

assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&((!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe"
) ? static_cast<void> (0) : __assert_fail ("!(isVarArg && canGuaranteeTCO(CallConv)) && \"Var args not supported with calling convention fastcc, ghc or hipe\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3906, __PRETTY_FUNCTION__))

3906

"Var args not supported with calling convention fastcc, ghc or hipe")((!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe"
) ? static_cast<void> (0) : __assert_fail ("!(isVarArg && canGuaranteeTCO(CallConv)) && \"Var args not supported with calling convention fastcc, ghc or hipe\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3906, __PRETTY_FUNCTION__));

3907

3908

// Analyze operands of the call, assigning locations to each operand.

3909

SmallVector<CCValAssign, 16> ArgLocs;

3910

CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

3911

3912

// Allocate shadow area for Win64.

3913

if (IsWin64)

3914

CCInfo.AllocateStack(32, Align(8));

3915

3916

CCInfo.AnalyzeArguments(Outs, CC_X86);

3917

3918

// In vectorcall calling convention a second pass is required for the HVA

3919

// types.

3920

if (CallingConv::X86_VectorCall == CallConv) {

3921

CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);

3922

}

3923

3924

// Get a count of how many bytes are to be pushed on the stack.

3925

unsigned NumBytes = CCInfo.getAlignedCallFrameSize();

3926

if (IsSibcall)

3927

// This is a sibcall. The memory operands are available in caller's

3928

// own caller's stack.

3929

NumBytes = 0;

3930

else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))

3931

NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

3932

3933

int FPDiff = 0;

3934

if (isTailCall && !IsSibcall && !IsMustTail) {

3935

// Lower arguments at fp - stackoffset + fpdiff.

3936

unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

3937

3938

FPDiff = NumBytesCallerPushed - NumBytes;

3939

3940

// Set the delta of movement of the returnaddr stackslot.

3941

// But only set if delta is greater than previous delta.

3942

if (FPDiff < X86Info->getTCReturnAddrDelta())

3943

X86Info->setTCReturnAddrDelta(FPDiff);

3944

}

3945

3946

unsigned NumBytesToPush = NumBytes;

3947

unsigned NumBytesToPop = NumBytes;

3948

3949

// If we have an inalloca argument, all stack space has already been allocated

3950

// for us and be right at the top of the stack. We don't support multiple

3951

// arguments passed in memory when using inalloca.

3952

if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {

3953

NumBytesToPush = 0;

3954

if (!ArgLocs.back().isMemLoc())

3955

report_fatal_error("cannot use inalloca attribute on a register "

3956

"parameter");

3957

if (ArgLocs.back().getLocMemOffset() != 0)

3958

report_fatal_error("any parameter with the inalloca attribute must be "

3959

"the only memory argument");

3960

} else if (CLI.IsPreallocated) {

3961

assert(ArgLocs.back().isMemLoc() &&((ArgLocs.back().isMemLoc() && "cannot use preallocated attribute on a register "
"parameter") ? static_cast<void> (0) : __assert_fail (
"ArgLocs.back().isMemLoc() && \"cannot use preallocated attribute on a register \" \"parameter\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3963, __PRETTY_FUNCTION__))

3962

"cannot use preallocated attribute on a register "((ArgLocs.back().isMemLoc() && "cannot use preallocated attribute on a register "
"parameter") ? static_cast<void> (0) : __assert_fail (
"ArgLocs.back().isMemLoc() && \"cannot use preallocated attribute on a register \" \"parameter\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3963, __PRETTY_FUNCTION__))

3963

"parameter")((ArgLocs.back().isMemLoc() && "cannot use preallocated attribute on a register "
"parameter") ? static_cast<void> (0) : __assert_fail (
"ArgLocs.back().isMemLoc() && \"cannot use preallocated attribute on a register \" \"parameter\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3963, __PRETTY_FUNCTION__));

3964

SmallVector<size_t, 4> PreallocatedOffsets;

3965

for (size_t i = 0; i < CLI.OutVals.size(); ++i) {

3966

if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {

3967

PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());

3968

}

3969

}

3970

auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();

3971

size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);

3972

MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);

3973

MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);

3974

NumBytesToPush = 0;

3975

}

3976

3977

if (!IsSibcall && !IsMustTail)

3978

Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,

3979

NumBytes - NumBytesToPush, dl);

3980

3981

SDValue RetAddrFrIdx;

3982

// Load return address for tail calls.

3983

if (isTailCall && FPDiff)

3984

Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,

3985

Is64Bit, FPDiff, dl);

3986

3987

SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;

3988

SmallVector<SDValue, 8> MemOpChains;

3989

SDValue StackPtr;

3990

3991

// The next loop assumes that the locations are in the same order of the

3992

// input arguments.

3993

assert(isSortedByValueNo(ArgLocs) &&((isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"
) ? static_cast<void> (0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3994, __PRETTY_FUNCTION__))

3994

"Argument Location list must be sorted before lowering")((isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"
) ? static_cast<void> (0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3994, __PRETTY_FUNCTION__));

3995

3996

// Walk the register/memloc assignments, inserting copies/loads. In the case

3997

// of tail call optimization arguments are handled later.

3998

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

3999

for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;

4000

++I, ++OutIndex) {

4001

assert(OutIndex < Outs.size() && "Invalid Out index")((OutIndex < Outs.size() && "Invalid Out index") ?
static_cast<void> (0) : __assert_fail ("OutIndex < Outs.size() && \"Invalid Out index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4001, __PRETTY_FUNCTION__));

4002

// Skip inalloca/preallocated arguments, they have already been written.

4003

ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;

4004

if (Flags.isInAlloca() || Flags.isPreallocated())

4005

continue;

4006

4007

CCValAssign &VA = ArgLocs[I];

4008

EVT RegVT = VA.getLocVT();

4009

SDValue Arg = OutVals[OutIndex];

4010

bool isByVal = Flags.isByVal();

4011

4012

// Promote the value if needed.

4013

switch (VA.getLocInfo()) {

4014

default: llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4014);

4015

case CCValAssign::Full: break;

4016

case CCValAssign::SExt:

4017

Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);

4018

break;

4019

case CCValAssign::ZExt:

4020

Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);

4021

break;

4022

case CCValAssign::AExt:

4023

if (Arg.getValueType().isVector() &&

4024

Arg.getValueType().getVectorElementType() == MVT::i1)

4025

Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);

4026

else if (RegVT.is128BitVector()) {

4027

// Special case: passing MMX values in XMM registers.

4028

Arg = DAG.getBitcast(MVT::i64, Arg);

4029

Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);

4030

Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);

4031

} else

4032

Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);

4033

break;

4034

case CCValAssign::BCvt:

4035

Arg = DAG.getBitcast(RegVT, Arg);

4036

break;

4037

case CCValAssign::Indirect: {

4038

if (isByVal) {

4039

// Memcpy the argument to a temporary stack slot to prevent

4040

// the caller from seeing any modifications the callee may make

4041

// as guaranteed by the `byval` attribute.

4042

int FrameIdx = MF.getFrameInfo().CreateStackObject(

4043

Flags.getByValSize(),

4044

std::max(Align(16), Flags.getNonZeroByValAlign()), false);

4045

SDValue StackSlot =

4046

DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));

4047

Chain =

4048

CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);

4049

// From now on treat this as a regular pointer

4050

Arg = StackSlot;

4051

isByVal = false;

4052

} else {

4053

// Store the argument.

4054

SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());

4055

int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();

4056

Chain = DAG.getStore(

4057

Chain, dl, Arg, SpillSlot,

4058

MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));

4059

Arg = SpillSlot;

4060

}

4061

break;

4062

}

4063

}

4064

4065

if (VA.needsCustom()) {

4066

assert(VA.getValVT() == MVT::v64i1 &&((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4067, __PRETTY_FUNCTION__))

4067

"Currently the only custom case is when we split v64i1 to 2 regs")((VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"
) ? static_cast<void> (0) : __assert_fail ("VA.getValVT() == MVT::v64i1 && \"Currently the only custom case is when we split v64i1 to 2 regs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4067, __PRETTY_FUNCTION__));

4068

// Split v64i1 value into two registers

4069

Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);

4070

} else if (VA.isRegLoc()) {

4071

RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));

4072

const TargetOptions &Options = DAG.getTarget().Options;

4073

if (Options.EmitCallSiteInfo)

4074

CSInfo.emplace_back(VA.getLocReg(), I);

4075

if (isVarArg && IsWin64) {

4076

// Win64 ABI requires argument XMM reg to be copied to the corresponding

4077

// shadow reg if callee is a varargs function.

4078

Register ShadowReg;

4079

switch (VA.getLocReg()) {

4080

case X86::XMM0: ShadowReg = X86::RCX; break;

4081

case X86::XMM1: ShadowReg = X86::RDX; break;

4082

case X86::XMM2: ShadowReg = X86::R8; break;

4083

case X86::XMM3: ShadowReg = X86::R9; break;

4084

}

4085

if (ShadowReg)

4086

RegsToPass.push_back(std::make_pair(ShadowReg, Arg));

4087

}

4088

} else if (!IsSibcall && (!isTailCall || isByVal)) {

4089

assert(VA.isMemLoc())((VA.isMemLoc()) ? static_cast<void> (0) : __assert_fail
("VA.isMemLoc()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4089, __PRETTY_FUNCTION__));

4090

if (!StackPtr.getNode())

4091

StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),

4092

getPointerTy(DAG.getDataLayout()));

4093

MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,

4094

dl, DAG, VA, Flags, isByVal));

4095

}

4096

}

4097

4098

if (!MemOpChains.empty())

4099

Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

4100

4101

if (Subtarget.isPICStyleGOT()) {

4102

// ELF / PIC requires GOT in the EBX register before function calls via PLT

4103

// GOT pointer.

4104

if (!isTailCall) {

4105

RegsToPass.push_back(std::make_pair(

4106

Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),

4107

getPointerTy(DAG.getDataLayout()))));

4108

} else {

4109

// If we are tail calling and generating PIC/GOT style code load the

4110

// address of the callee into ECX. The value in ecx is used as target of

4111

// the tail jump. This is done to circumvent the ebx/callee-saved problem

4112

// for tail calls on PIC/GOT architectures. Normally we would just put the

4113

// address of GOT into ebx and then call target@PLT. But for tail calls

4114

// ebx would be restored (since ebx is callee saved) before jumping to the

4115

// target@PLT.

4116

4117

// Note: The actual moving to ECX is done further down.

4118

GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);

4119

if (G && !G->getGlobal()->hasLocalLinkage() &&

4120

G->getGlobal()->hasDefaultVisibility())

4121

Callee = LowerGlobalAddress(Callee, DAG);

4122

else if (isa<ExternalSymbolSDNode>(Callee))

4123

Callee = LowerExternalSymbol(Callee, DAG);

4124

}

4125

}

4126

4127

if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {

4128

// From AMD64 ABI document:

4129

// For calls that may call functions that use varargs or stdargs

4130

// (prototype-less calls or calls to functions containing ellipsis (...) in

4131

// the declaration) %al is used as hidden argument to specify the number

4132

// of SSE registers used. The contents of %al do not need to match exactly

4133

// the number of registers, but must be an upper bound on the number of SSE

4134

// registers used and is in the range 0 - 8 inclusive.

4135

4136

// Count the number of XMM registers allocated.

4137

static const MCPhysReg XMMArgRegs[] = {

4138

X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,

4139

X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7

4140

};

4141

unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);

4142

assert((Subtarget.hasSSE1() || !NumXMMRegs)(((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasSSE1() || !NumXMMRegs) && \"SSE registers cannot be used when SSE is disabled\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4143, __PRETTY_FUNCTION__))

4143

&& "SSE registers cannot be used when SSE is disabled")(((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasSSE1() || !NumXMMRegs) && \"SSE registers cannot be used when SSE is disabled\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4143, __PRETTY_FUNCTION__));

4144

RegsToPass.push_back(std::make_pair(Register(X86::AL),

4145

DAG.getConstant(NumXMMRegs, dl,

4146

MVT::i8)));

4147

}

4148

4149

if (isVarArg && IsMustTail) {

4150

const auto &Forwards = X86Info->getForwardedMustTailRegParms();

4151

for (const auto &F : Forwards) {

4152

SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);

4153

RegsToPass.push_back(std::make_pair(F.PReg, Val));

4154

}

4155

}

4156

4157

// For tail calls lower the arguments to the 'real' stack slots. Sibcalls

4158

// don't need this because the eligibility check rejects calls that require

4159

// shuffling arguments passed in memory.

4160

if (!IsSibcall && isTailCall) {

4161

// Force all the incoming stack arguments to be loaded from the stack

4162

// before any new outgoing arguments are stored to the stack, because the

4163

// outgoing stack slots may alias the incoming argument stack slots, and

4164

// the alias isn't otherwise explicit. This is slightly more conservative

4165

// than necessary, because it means that each store effectively depends

4166

// on every argument instead of just those arguments it would clobber.

4167

SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

4168

4169

SmallVector<SDValue, 8> MemOpChains2;

4170

SDValue FIN;

4171

int FI = 0;

4172

for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;

4173

++I, ++OutsIndex) {

4174

CCValAssign &VA = ArgLocs[I];

4175

4176

if (VA.isRegLoc()) {

4177

if (VA.needsCustom()) {

4178

assert((CallConv == CallingConv::X86_RegCall) &&(((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"
) ? static_cast<void> (0) : __assert_fail ("(CallConv == CallingConv::X86_RegCall) && \"Expecting custom case only in regcall calling convention\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4179, __PRETTY_FUNCTION__))

4179

"Expecting custom case only in regcall calling convention")(((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"
) ? static_cast<void> (0) : __assert_fail ("(CallConv == CallingConv::X86_RegCall) && \"Expecting custom case only in regcall calling convention\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4179, __PRETTY_FUNCTION__));

4180

// This means that we are in special case where one argument was

4181

// passed through two register locations - Skip the next location

4182

++I;

4183

}

4184

4185

continue;

4186

}

4187

4188

assert(VA.isMemLoc())((VA.isMemLoc()) ? static_cast<void> (0) : __assert_fail
("VA.isMemLoc()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4188, __PRETTY_FUNCTION__));

4189

SDValue Arg = OutVals[OutsIndex];

4190

ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;

4191

// Skip inalloca/preallocated arguments. They don't require any work.

4192

if (Flags.isInAlloca() || Flags.isPreallocated())

4193

continue;

4194

// Create frame index.

4195

int32_t Offset = VA.getLocMemOffset()+FPDiff;

4196

uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;

4197

FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

4198

FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

4199

4200

if (Flags.isByVal()) {

4201

// Copy relative to framepointer.

4202

SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);

4203

if (!StackPtr.getNode())

4204

StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),

4205

getPointerTy(DAG.getDataLayout()));

4206

Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),

4207

StackPtr, Source);

4208

4209

MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,

4210

ArgChain,

4211

Flags, DAG, dl));

4212

} else {

4213

// Store relative to framepointer.

4214

MemOpChains2.push_back(DAG.getStore(

4215

ArgChain, dl, Arg, FIN,

4216

MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));

4217

}

4218

}

4219

4220

if (!MemOpChains2.empty())

4221

Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

4222

4223

// Store the return address to the appropriate stack slot.

4224

Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,

4225

getPointerTy(DAG.getDataLayout()),

4226

RegInfo->getSlotSize(), FPDiff, dl);

4227

}

4228

4229

// Build a sequence of copy-to-reg nodes chained together with token chain

4230

// and flag operands which copy the outgoing args into registers.

4231

SDValue InFlag;

4232

for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {

4233

Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,

4234

RegsToPass[i].second, InFlag);

4235

InFlag = Chain.getValue(1);

4236

}

4237

4238

if (DAG.getTarget().getCodeModel() == CodeModel::Large) {

4239

assert(Is64Bit && "Large code model is only legal in 64-bit mode.")((Is64Bit && "Large code model is only legal in 64-bit mode."
) ? static_cast<void> (0) : __assert_fail ("Is64Bit && \"Large code model is only legal in 64-bit mode.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4239, __PRETTY_FUNCTION__));

4240

// In the 64-bit large code model, we have to make all calls

4241

// through a register, since the call instruction's 32-bit

4242

// pc-relative offset may not be large enough to hold the whole

4243

// address.

4244

} else if (Callee->getOpcode() == ISD::GlobalAddress ||

4245

Callee->getOpcode() == ISD::ExternalSymbol) {

4246

// Lower direct calls to global addresses and external symbols. Setting

4247

// ForCall to true here has the effect of removing WrapperRIP when possible

4248

// to allow direct calls to be selected without first materializing the

4249

// address into a register.

4250

Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);

4251

} else if (Subtarget.isTarget64BitILP32() &&

4252

Callee->getValueType(0) == MVT::i32) {

4253

// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI

4254

Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);

4255

}

4256

4257

// Returns a chain & a flag for retval copy to use.

4258

SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

4259

SmallVector<SDValue, 8> Ops;

4260

4261

if (!IsSibcall && isTailCall && !IsMustTail) {

4262

Chain = DAG.getCALLSEQ_END(Chain,

4263

DAG.getIntPtrConstant(NumBytesToPop, dl, true),

4264

DAG.getIntPtrConstant(0, dl, true), InFlag, dl);

4265

InFlag = Chain.getValue(1);

4266

}

4267

4268

Ops.push_back(Chain);

4269

Ops.push_back(Callee);

4270

4271

if (isTailCall)

4272

Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

4273

4274

// Add argument registers to the end of the list so that they are known live

4275

// into the call.

4276

for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)

4277

Ops.push_back(DAG.getRegister(RegsToPass[i].first,

4278

RegsToPass[i].second.getValueType()));

4279

4280

// Add a register mask operand representing the call-preserved registers.

4281

// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we

4282

// set X86_INTR calling convention because it has the same CSR mask

4283

// (same preserved registers).

4284

const uint32_t *Mask = RegInfo->getCallPreservedMask(

4285

MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);

4286

assert(Mask && "Missing call preserved mask for calling convention")((Mask && "Missing call preserved mask for calling convention"
) ? static_cast<void> (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4286, __PRETTY_FUNCTION__));

4287

4288

// If this is an invoke in a 32-bit function using a funclet-based

4289

// personality, assume the function clobbers all registers. If an exception

4290

// is thrown, the runtime will not restore CSRs.

4291

// FIXME: Model this more precisely so that we can register allocate across

4292

// the normal edge and spill and fill across the exceptional edge.

4293

if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {

4294

const Function &CallerFn = MF.getFunction();

4295

EHPersonality Pers =

4296

CallerFn.hasPersonalityFn()

4297

? classifyEHPersonality(CallerFn.getPersonalityFn())

4298

: EHPersonality::Unknown;

4299

if (isFuncletEHPersonality(Pers))

4300

Mask = RegInfo->getNoPreservedMask();

4301

}

4302

4303

// Define a new register mask from the existing mask.

4304

uint32_t *RegMask = nullptr;

4305

4306

// In some calling conventions we need to remove the used physical registers

4307

// from the reg mask.

4308

if (CallConv == CallingConv::X86_RegCall || HasNCSR) {

4309

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

4310

4311

// Allocate a new Reg Mask and copy Mask.

4312

RegMask = MF.allocateRegMask();

4313

unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());

4314

memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);

4315

4316

// Make sure all sub registers of the argument registers are reset

4317

// in the RegMask.

4318

for (auto const &RegPair : RegsToPass)

4319

for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);

4320

SubRegs.isValid(); ++SubRegs)

4321

RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

4322

4323

// Create the RegMask Operand according to our updated mask.

4324

Ops.push_back(DAG.getRegisterMask(RegMask));

4325

} else {

4326

// Create the RegMask Operand according to the static mask.

4327

Ops.push_back(DAG.getRegisterMask(Mask));

4328

}

4329

4330

if (InFlag.getNode())

4331

Ops.push_back(InFlag);

4332

4333

if (isTailCall) {

4334

// We used to do:

4335

//// If this is the first return lowered for this function, add the regs

4336

//// to the liveout set for the function.

4337

// This isn't right, although it's probably harmless on x86; liveouts

4338

// should be computed from returns not tail calls. Consider a void

4339

// function making a tail call to a function returning int.

4340

MF.getFrameInfo().setHasTailCall();

4341

SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);

4342

DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));

4343

return Ret;

4344

}

4345

4346

if (HasNoCfCheck && IsCFProtectionSupported) {

4347

Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);

4348

} else {

4349

Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);

4350

}

4351

InFlag = Chain.getValue(1);

4352

DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);

4353

DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

4354

4355

// Save heapallocsite metadata.

4356

if (CLI.CB)

4357

if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))

4358

DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);

4359

4360

// Create the CALLSEQ_END node.

4361

unsigned NumBytesForCalleeToPop;

4362

if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,

4363

DAG.getTarget().Options.GuaranteedTailCallOpt))

4364

NumBytesForCalleeToPop = NumBytes; // Callee pops everything

4365

else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&

4366

!Subtarget.getTargetTriple().isOSMSVCRT() &&

4367

SR == StackStructReturn)

4368

// If this is a call to a struct-return function, the callee

4369

// pops the hidden struct pointer, so we have to push it back.

4370

// This is common for Darwin/X86, Linux & Mingw32 targets.

4371

// For MSVC Win32 targets, the caller pops the hidden struct pointer.

4372

NumBytesForCalleeToPop = 4;

4373

else

4374

NumBytesForCalleeToPop = 0; // Callee pops nothing.

4375

4376

// Returns a flag for retval copy to use.

4377

if (!IsSibcall) {

4378

Chain = DAG.getCALLSEQ_END(Chain,

4379

DAG.getIntPtrConstant(NumBytesToPop, dl, true),

4380

DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,

4381

true),

4382

InFlag, dl);

4383

InFlag = Chain.getValue(1);

4384

}

4385

4386

// Handle result values, copying them out of physregs into vregs that we

4387

// return.

4388

return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,

4389

InVals, RegMask);

4390

}

4391

4392

//===----------------------------------------------------------------------===//

4393

// Fast Calling Convention (tail call) implementation

4394

//===----------------------------------------------------------------------===//

4395

4396

// Like stdcall, the callee cleans up the arguments, except that ECX is

4397

// reserved for storing the tail called function address. Only 2 registers are

4398

// free for argument passing (inreg). Tail call optimization is performed

4399

// provided:

4400

// * tailcallopt is enabled

4401

// * caller/callee are fastcc

4402

// On X86_64 architecture with GOT-style position independent code only local

4403

// (within module) calls are supported at the moment.

4404

// To keep the stack aligned according to platform abi the function

4405

// GetAlignedArgumentStackSize ensures that argument delta is always multiples

4406

// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)

4407

// If a tail called function callee has more arguments than the caller the

4408

// caller needs to make sure that there is room to move the RETADDR to. This is

4409

// achieved by reserving an area the size of the argument delta right after the

4410

// original RETADDR, but before the saved framepointer or the spilled registers

4411

// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)

4412

// stack layout:

4413

// arg1

4414

// arg2

4415

// RETADDR

4416

// [ new RETADDR

4417

// move area ]

4418

// (possible EBP)

4419

// ESI

4420

// EDI

4421

// local1 ..

4422

4423

/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align

4424

/// requirement.

4425

unsigned

4426

X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,

4427

SelectionDAG &DAG) const {

4428

const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();

4429

const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();

4430

assert(StackSize % SlotSize == 0 &&((StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"
) ? static_cast<void> (0) : __assert_fail ("StackSize % SlotSize == 0 && \"StackSize must be a multiple of SlotSize\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4431, __PRETTY_FUNCTION__))

4431

"StackSize must be a multiple of SlotSize")((StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"
) ? static_cast<void> (0) : __assert_fail ("StackSize % SlotSize == 0 && \"StackSize must be a multiple of SlotSize\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4431, __PRETTY_FUNCTION__));

4432

return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;

4433

}

4434

4435

/// Return true if the given stack call argument is already available in the

4436

/// same position (relatively) of the caller's incoming argument stack.

4437

static

4438

bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,

4439

MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,

4440

const X86InstrInfo *TII, const CCValAssign &VA) {

4441

unsigned Bytes = Arg.getValueSizeInBits() / 8;

4442

4443

for (;;) {

4444

// Look through nodes that don't alter the bits of the incoming value.

4445

unsigned Op = Arg.getOpcode();

4446

if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {

4447

Arg = Arg.getOperand(0);

4448

continue;

4449

}

4450

if (Op == ISD::TRUNCATE) {

4451

const SDValue &TruncInput = Arg.getOperand(0);

4452

if (TruncInput.getOpcode() == ISD::AssertZext &&

4453

cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==

4454

Arg.getValueType()) {

4455

Arg = TruncInput.getOperand(0);

4456

continue;

4457

}

4458

}

4459

break;

4460

}

4461

4462

int FI = INT_MAX2147483647;

4463

if (Arg.getOpcode() == ISD::CopyFromReg) {

4464

Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();

4465

if (!Register::isVirtualRegister(VR))

4466

return false;

4467

MachineInstr *Def = MRI->getVRegDef(VR);

4468

if (!Def)

4469

return false;

4470

if (!Flags.isByVal()) {

4471

if (!TII->isLoadFromStackSlot(*Def, FI))

4472

return false;

4473

} else {

4474

unsigned Opcode = Def->getOpcode();

4475

if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||

4476

Opcode == X86::LEA64_32r) &&

4477

Def->getOperand(1).isFI()) {

4478

FI = Def->getOperand(1).getIndex();

4479

Bytes = Flags.getByValSize();

4480

} else

4481

return false;

4482

}

4483

} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {

4484

if (Flags.isByVal())

4485

// ByVal argument is passed in as a pointer but it's now being

4486

// dereferenced. e.g.

4487

// define @foo(%struct.X* %A) {

4488

// tail call @bar(%struct.X* byval %A)

4489

// }

4490

return false;

4491

SDValue Ptr = Ld->getBasePtr();

4492

FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);

4493

if (!FINode)

4494

return false;

4495

FI = FINode->getIndex();

4496

} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {

4497

FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);

4498

FI = FINode->getIndex();

4499

Bytes = Flags.getByValSize();

4500

} else

4501

return false;

4502

4503

assert(FI != INT_MAX)((FI != 2147483647) ? static_cast<void> (0) : __assert_fail
("FI != INT_MAX", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4503, __PRETTY_FUNCTION__));

4504

if (!MFI.isFixedObjectIndex(FI))

4505

return false;

4506

4507

if (Offset != MFI.getObjectOffset(FI))

4508

return false;

4509

4510

// If this is not byval, check that the argument stack object is immutable.

4511

// inalloca and argument copy elision can create mutable argument stack

4512

// objects. Byval objects can be mutated, but a byval call intends to pass the

4513

// mutated memory.

4514

if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))

4515

return false;

4516

4517

if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {

4518

// If the argument location is wider than the argument type, check that any

4519

// extension flags match.

4520

if (Flags.isZExt() != MFI.isObjectZExt(FI) ||

4521

Flags.isSExt() != MFI.isObjectSExt(FI)) {

4522

return false;

4523

}

4524

}

4525

4526

return Bytes == MFI.getObjectSize(FI);

4527

}

4528

4529

/// Check whether the call is eligible for tail call optimization. Targets

4530

/// that want to do tail call optimization should implement this function.

4531

bool X86TargetLowering::IsEligibleForTailCallOptimization(

4532

SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,

4533

bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,

4534

const SmallVectorImpl<ISD::OutputArg> &Outs,

4535

const SmallVectorImpl<SDValue> &OutVals,

4536

const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {

4537

if (!mayTailCallThisCC(CalleeCC))

4538

return false;

4539

4540

// If -tailcallopt is specified, make fastcc functions tail-callable.

4541

MachineFunction &MF = DAG.getMachineFunction();

4542

const Function &CallerF = MF.getFunction();

4543

4544

// If the function return type is x86_fp80 and the callee return type is not,

4545

// then the FP_EXTEND of the call result is not a nop. It's not safe to

4546

// perform a tailcall optimization here.

4547

if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())

4548

return false;

4549

4550

CallingConv::ID CallerCC = CallerF.getCallingConv();

4551

bool CCMatch = CallerCC == CalleeCC;

4552

bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);

4553

bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

4554

bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||

4555

CalleeCC == CallingConv::Tail;

4556

4557

// Win64 functions have extra shadow space for argument homing. Don't do the

4558

// sibcall if the caller and callee have mismatched expectations for this

4559

// space.

4560

if (IsCalleeWin64 != IsCallerWin64)

4561

return false;

4562

4563

if (IsGuaranteeTCO) {

4564

if (canGuaranteeTCO(CalleeCC) && CCMatch)

4565

return true;

4566

return false;

4567

}

4568

4569

// Look for obvious safe cases to perform tail call optimization that do not

4570

// require ABI changes. This is what gcc calls sibcall.

4571

4572

// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to

4573

// emit a special epilogue.

4574

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

4575

if (RegInfo->needsStackRealignment(MF))

4576

return false;

4577

4578

// Also avoid sibcall optimization if either caller or callee uses struct

4579

// return semantics.

4580

if (isCalleeStructRet || isCallerStructRet)

4581

return false;

4582

4583

// Do not sibcall optimize vararg calls unless all arguments are passed via

4584

// registers.

4585

LLVMContext &C = *DAG.getContext();

4586

if (isVarArg && !Outs.empty()) {

4587

// Optimizing for varargs on Win64 is unlikely to be safe without

4588

// additional testing.

4589

if (IsCalleeWin64 || IsCallerWin64)

4590

return false;

4591

4592

SmallVector<CCValAssign, 16> ArgLocs;

4593

CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

4594

4595

CCInfo.AnalyzeCallOperands(Outs, CC_X86);

4596

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)

4597

if (!ArgLocs[i].isRegLoc())

4598

return false;

4599

}

4600

4601

// If the call result is in ST0 / ST1, it needs to be popped off the x87

4602

// stack. Therefore, if it's not used by the call it is not safe to optimize

4603

// this into a sibcall.

4604

bool Unused = false;

4605

for (unsigned i = 0, e = Ins.size(); i != e; ++i) {

4606

if (!Ins[i].Used) {

4607

Unused = true;

4608

break;

4609

}

4610

}

4611

if (Unused) {

4612

SmallVector<CCValAssign, 16> RVLocs;

4613

CCState CCInfo(CalleeCC, false, MF, RVLocs, C);

4614

CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

4615

for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {

4616

CCValAssign &VA = RVLocs[i];

4617

if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)

4618

return false;

4619

}

4620

}

4621

4622

// Check that the call results are passed in the same way.

4623

if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,

4624

RetCC_X86, RetCC_X86))

4625

return false;

4626

// The callee has to preserve all registers the caller needs to preserve.

4627

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

4628

const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

4629

if (!CCMatch) {

4630

const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);

4631

if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

4632

return false;

4633

}

4634

4635

unsigned StackArgsSize = 0;

4636

4637

// If the callee takes no arguments then go on to check the results of the

4638

// call.

4639

if (!Outs.empty()) {

4640

// Check if stack adjustment is needed. For now, do not do this if any

4641

// argument is passed on the stack.

4642

SmallVector<CCValAssign, 16> ArgLocs;

4643

CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

4644

4645

// Allocate shadow area for Win64

4646

if (IsCalleeWin64)

4647

CCInfo.AllocateStack(32, Align(8));

4648

4649

CCInfo.AnalyzeCallOperands(Outs, CC_X86);

4650

StackArgsSize = CCInfo.getNextStackOffset();

4651

4652

if (CCInfo.getNextStackOffset()) {

4653

// Check if the arguments are already laid out in the right way as

4654

// the caller's fixed stack objects.

4655

MachineFrameInfo &MFI = MF.getFrameInfo();

4656

const MachineRegisterInfo *MRI = &MF.getRegInfo();

4657

const X86InstrInfo *TII = Subtarget.getInstrInfo();

4658

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

4659

CCValAssign &VA = ArgLocs[i];

4660

SDValue Arg = OutVals[i];

4661

ISD::ArgFlagsTy Flags = Outs[i].Flags;

4662

if (VA.getLocInfo() == CCValAssign::Indirect)

4663

return false;

4664

if (!VA.isRegLoc()) {

4665

if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,

4666

MFI, MRI, TII, VA))

4667

return false;

4668

}

4669

}

4670

}

4671

4672

bool PositionIndependent = isPositionIndependent();

4673

// If the tailcall address may be in a register, then make sure it's

4674

// possible to register allocate for it. In 32-bit, the call address can

4675

// only target EAX, EDX, or ECX since the tail call must be scheduled after

4676

// callee-saved registers are restored. These happen to be the same

4677

// registers used to pass 'inreg' arguments so watch out for those.

4678

if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&

4679

!isa<ExternalSymbolSDNode>(Callee)) ||

4680

PositionIndependent)) {

4681

unsigned NumInRegs = 0;

4682

// In PIC we need an extra register to formulate the address computation

4683

// for the callee.

4684

unsigned MaxInRegs = PositionIndependent ? 2 : 3;

4685

4686

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

4687

CCValAssign &VA = ArgLocs[i];

4688

if (!VA.isRegLoc())

4689

continue;

4690

Register Reg = VA.getLocReg();

4691

switch (Reg) {

4692

default: break;

4693

case X86::EAX: case X86::EDX: case X86::ECX:

4694

if (++NumInRegs == MaxInRegs)

4695

return false;

4696

break;

4697

}

4698

}

4699

}

4700

4701

const MachineRegisterInfo &MRI = MF.getRegInfo();

4702

if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))

4703

return false;

4704

}

4705

4706

bool CalleeWillPop =

4707

X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,

4708

MF.getTarget().Options.GuaranteedTailCallOpt);

4709

4710

if (unsigned BytesToPop =

4711

MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {

4712

// If we have bytes to pop, the callee must pop them.

4713

bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;

4714

if (!CalleePopMatches)

4715

return false;

4716

} else if (CalleeWillPop && StackArgsSize > 0) {

4717

// If we don't have bytes to pop, make sure the callee doesn't pop any.

4718

return false;

4719

}

4720

4721

return true;

4722

}

4723

4724

/// Create the fast instruction selector by forwarding both arguments to
/// X86::createFastISel.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
  return Selector;
}

4729

4730

//===----------------------------------------------------------------------===//

4731

// Other Lowering Hooks

4732

//===----------------------------------------------------------------------===//

4733

4734

/// Return true if \p Op has exactly one use and its node is a normal load.
static bool MayFoldLoad(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  return ISD::isNormalLoad(Op.getNode());
}

4737

4738

/// Return true if \p Op has exactly one use and that user is a normal store.
static bool MayFoldIntoStore(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  // With a single use, the first entry of the use list is the only user.
  return ISD::isNormalStore(*Op.getNode()->use_begin());
}

4741

4742

/// Return true if \p Op has exactly one use and that user is an
/// ISD::ZERO_EXTEND node.
static bool MayFoldIntoZeroExtend(SDValue Op) {
  return Op.hasOneUse() &&
         Op.getNode()->use_begin()->getOpcode() == ISD::ZERO_EXTEND;
}

4749

4750

static bool isTargetShuffle(unsigned Opcode) {

4751

switch(Opcode) {

4752

default: return false;

4753

case X86ISD::BLENDI:

4754

case X86ISD::PSHUFB:

4755

case X86ISD::PSHUFD:

4756

case X86ISD::PSHUFHW:

4757

case X86ISD::PSHUFLW:

4758

case X86ISD::SHUFP:

4759

case X86ISD::INSERTPS:

4760

case X86ISD::EXTRQI:

4761

case X86ISD::INSERTQI:

4762

case X86ISD::VALIGN:

4763

case X86ISD::PALIGNR:

4764

case X86ISD::VSHLDQ:

4765

case X86ISD::VSRLDQ:

4766

case X86ISD::MOVLHPS:

4767

case X86ISD::MOVHLPS:

4768

case X86ISD::MOVSHDUP:

4769

case X86ISD::MOVSLDUP:

4770

case X86ISD::MOVDDUP:

4771

case X86ISD::MOVSS:

4772

case X86ISD::MOVSD:

4773

case X86ISD::UNPCKL:

4774

case X86ISD::UNPCKH:

4775

case X86ISD::VBROADCAST:

4776

case X86ISD::VPERMILPI:

4777

case X86ISD::VPERMILPV:

4778

case X86ISD::VPERM2X128:

4779

case X86ISD::SHUF128:

4780

case X86ISD::VPERMIL2:

4781

case X86ISD::VPERMI:

4782

case X86ISD::VPPERM:

4783

case X86ISD::VPERMV:

4784

case X86ISD::VPERMV3:

4785

case X86ISD::VZEXT_MOVL:

4786

return true;

4787

}

4788

}

4789

4790

static bool isTargetShuffleVariableMask(unsigned Opcode) {

4791

switch (Opcode) {

4792

default: return false;

4793

// Target Shuffles.

4794

case X86ISD::PSHUFB:

4795

case X86ISD::VPERMILPV:

4796

case X86ISD::VPERMIL2:

4797

case X86ISD::VPPERM:

4798

case X86ISD::VPERMV:

4799

case X86ISD::VPERMV3:

4800

return true;

4801

// 'Faux' Target Shuffles.

4802

case ISD::OR:

4803

case ISD::AND:

4804

case X86ISD::ANDNP:

4805

return true;

4806

}

4807

}

4808

4809

/// Return true if \p Op, after looking through any chain of
/// EXTRACT_SUBVECTOR nodes, is a VBROADCAST or VBROADCAST_LOAD.
static bool isTargetShuffleSplat(SDValue Op) {
  // Walk down to the vector the subvector(s) are extracted from.
  while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR)
    Op = Op.getOperand(0);

  switch (Op.getOpcode()) {
  case X86ISD::VBROADCAST:
  case X86ISD::VBROADCAST_LOAD:
    return true;
  default:
    return false;
  }
}

4815

4816

/// Return a FrameIndex node for the return-address slot. The slot's index is
/// cached in X86MachineFunctionInfo; an index of 0 is treated as "not created
/// yet", in which case a fixed object is created and recorded first.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  int RAIdx = FuncInfo->getRAIndex();
  if (RAIdx == 0) {
    // Create a fixed object of one slot at offset -SlotSize for the return
    // address and remember its index.
    const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
    RAIdx = MF.getFrameInfo().CreateFixedObject(
        SlotSize, -static_cast<int64_t>(SlotSize), false);
    FuncInfo->setRAIndex(RAIdx);
  }

  return DAG.getFrameIndex(RAIdx, getPointerTy(DAG.getDataLayout()));
}

4833

4834

bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,

4835

bool hasSymbolicDisplacement) {

4836

// Offset should fit into 32 bit immediate field.

4837

if (!isInt<32>(Offset))

4838

return false;

4839

4840

// If we don't have a symbolic displacement - we don't have any extra

4841

// restrictions.

4842

if (!hasSymbolicDisplacement)

4843

return true;

4844

4845

// FIXME: Some tweaks might be needed for medium code model.

4846

if (M != CodeModel::Small && M != CodeModel::Kernel)

4847

return false;

4848

4849

// For small code model we assume that latest object is 16MB before end of 31

4850

// bits boundary. We may also accept pretty large negative constants knowing

4851

// that all objects are in the positive half of address space.

4852

if (M == CodeModel::Small && Offset < 16*1024*1024)

4853

return true;

4854

4855

// For kernel code model we know that all object resist in the negative half

4856

// of 32bits address space. We may not accept negative offsets, since they may

4857

// be just off and we may accept pretty large positive ones.

4858

if (M == CodeModel::Kernel && Offset >= 0)

4859

return true;

4860

4861

return false;

4862

}

4863

4864

/// Determines whether the callee is required to pop its own arguments.

4865

/// Callee pop is necessary to support tail calls.

4866

bool X86::isCalleePop(CallingConv::ID CallingConv,

4867

bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {

4868

// If GuaranteeTCO is true, we force some calls to be callee pop so that we

4869

// can guarantee TCO.

4870

if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))

4871

return true;

4872

4873

switch (CallingConv) {

4874

default:

4875

return false;

4876

case CallingConv::X86_StdCall:

4877

case CallingConv::X86_FastCall:

4878

case CallingConv::X86_ThisCall:

4879

case CallingConv::X86_VectorCall:

4880

return !is64Bit;

4881

}

4882

}

4883

4884

/// Return true if the condition is an signed comparison operation.

4885

static bool isX86CCSigned(unsigned X86CC) {

4886

switch (X86CC) {

4887

default:

4888

llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4888);

4889

case X86::COND_E:

4890

case X86::COND_NE:

4891

case X86::COND_B:

4892

case X86::COND_A:

4893

case X86::COND_BE:

4894

case X86::COND_AE:

4895

return false;

4896

case X86::COND_G:

4897

case X86::COND_GE:

4898

case X86::COND_L:

4899

case X86::COND_LE:

4900

return true;

4901

}

4902

}

4903

4904

/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
/// Condition codes outside the ten handled here are a programming error.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  // Equality.
  case ISD::SETEQ:
    return X86::COND_E;
  case ISD::SETNE:
    return X86::COND_NE;
  // Signed ordering.
  case ISD::SETGT:
    return X86::COND_G;
  case ISD::SETGE:
    return X86::COND_GE;
  case ISD::SETLT:
    return X86::COND_L;
  case ISD::SETLE:
    return X86::COND_LE;
  // Unsigned ordering.
  case ISD::SETUGT:
    return X86::COND_A;
  case ISD::SETUGE:
    return X86::COND_AE;
  case ISD::SETULT:
    return X86::COND_B;
  case ISD::SETULE:
    return X86::COND_BE;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}

4919

4920

/// Do a one-to-one translation of a ISD::CondCode to the X86-specific

4921

/// condition code, returning the condition code and the LHS/RHS of the

4922

/// comparison to make.

4923

static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,

4924

bool isFP, SDValue &LHS, SDValue &RHS,

4925

SelectionDAG &DAG) {

4926

if (!isFP) {

4927

if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {

4928

if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {

4929

// X > -1 -> X == 0, jump !sign.

4930

RHS = DAG.getConstant(0, DL, RHS.getValueType());

4931

return X86::COND_NS;

4932

}

4933

if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {

4934

// X < 0 -> X == 0, jump on sign.

4935

return X86::COND_S;

4936

}

4937

if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {

4938

// X >= 0 -> X == 0, jump on !sign.

4939

return X86::COND_NS;

4940

}

4941

if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {

4942

// X < 1 -> X <= 0

4943

RHS = DAG.getConstant(0, DL, RHS.getValueType());

4944

return X86::COND_LE;

4945

}

4946

}

4947

4948

return TranslateIntegerX86CC(SetCCOpcode);

4949

}

4950

4951

// First determine if it is required or is profitable to flip the operands.

4952

4953

// If LHS is a foldable load, but RHS is not, flip the condition.

4954

if (ISD::isNON_EXTLoad(LHS.getNode()) &&

4955

!ISD::isNON_EXTLoad(RHS.getNode())) {

4956

SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);

4957

std::swap(LHS, RHS);

4958

}

4959

4960

switch (SetCCOpcode) {

4961

default: break;

4962

case ISD::SETOLT:

4963

case ISD::SETOLE:

4964

case ISD::SETUGT:

4965

case ISD::SETUGE:

4966

std::swap(LHS, RHS);

4967

break;

4968

}

4969

4970

// On a floating point condition, the flags are set as follows:

4971

// ZF PF CF op

4972

// 0 | 0 | 0 | X > Y

4973

// 0 | 0 | 1 | X < Y

4974

// 1 | 0 | 0 | X == Y

4975

// 1 | 1 | 1 | unordered

4976

switch (SetCCOpcode) {

4977

default: llvm_unreachable("Condcode should be pre-legalized away")::llvm::llvm_unreachable_internal("Condcode should be pre-legalized away"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4977);

4978

case ISD::SETUEQ:

4979

case ISD::SETEQ: return X86::COND_E;

4980

case ISD::SETOLT: // flipped

4981

case ISD::SETOGT:

4982

case ISD::SETGT: return X86::COND_A;

4983

case ISD::SETOLE: // flipped

4984

case ISD::SETOGE:

4985

case ISD::SETGE: return X86::COND_AE;

4986

case ISD::SETUGT: // flipped

4987

case ISD::SETULT:

4988

case ISD::SETLT: return X86::COND_B;

4989

case ISD::SETUGE: // flipped

4990

case ISD::SETULE:

4991

case ISD::SETLE: return X86::COND_BE;

4992

case ISD::SETONE:

4993

case ISD::SETNE: return X86::COND_NE;

4994

case ISD::SETUO: return X86::COND_P;

4995

case ISD::SETO: return X86::COND_NP;

4996

case ISD::SETOEQ:

4997

case ISD::SETUNE: return X86::COND_INVALID;

4998

}

4999

}

5000

5001

/// Is there a floating point cmov for the specific X86 condition code?

5002

/// Current x86 isa includes the following FP cmov instructions:

5003

/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.

5004

static bool hasFPCMov(unsigned X86CC) {

5005

switch (X86CC) {

5006

default:

5007

return false;

5008

case X86::COND_B:

5009

case X86::COND_BE:

5010

case X86::COND_E:

5011

case X86::COND_P:

5012

case X86::COND_A:

5013

case X86::COND_AE:

5014

case X86::COND_NE:

5015

case X86::COND_NP:

5016

return true;

5017

}

5018

}

5019

5020

5021

bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

5022

const CallInst &I,

5023

MachineFunction &MF,

5024

unsigned Intrinsic) const {

5025

5026

const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);

5027

if (!IntrData)

5028

return false;

5029

5030

Info.flags = MachineMemOperand::MONone;

5031

Info.offset = 0;

5032

5033

switch (IntrData->Type) {

5034

case TRUNCATE_TO_MEM_VI8:

5035

case TRUNCATE_TO_MEM_VI16:

5036

case TRUNCATE_TO_MEM_VI32: {

5037

Info.opc = ISD::INTRINSIC_VOID;

5038

Info.ptrVal = I.getArgOperand(0);

5039

MVT VT = MVT::getVT(I.getArgOperand(1)->getType());

5040

MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;

5041

if (IntrData->Type == TRUNCATE_TO_MEM_VI8)

5042

ScalarVT = MVT::i8;

5043

else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)

5044

ScalarVT = MVT::i16;

5045

else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)

5046

ScalarVT = MVT::i32;

5047

5048

Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());

5049

Info.align = Align(1);

5050

Info.flags |= MachineMemOperand::MOStore;

5051

break;

5052

}

5053

case GATHER:

5054

case GATHER_AVX2: {

5055

Info.opc = ISD::INTRINSIC_W_CHAIN;

5056

Info.ptrVal = nullptr;

5057

MVT DataVT = MVT::getVT(I.getType());

5058

MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());

5059

unsigned NumElts = std::min(DataVT.getVectorNumElements(),

5060

IndexVT.getVectorNumElements());

5061

Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);

5062

Info.align = Align(1);

5063

Info.flags |= MachineMemOperand::MOLoad;

5064

break;

5065

}

5066

case SCATTER: {

5067

Info.opc = ISD::INTRINSIC_VOID;

5068

Info.ptrVal = nullptr;

5069

MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());

5070

MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());

5071

unsigned NumElts = std::min(DataVT.getVectorNumElements(),

5072

IndexVT.getVectorNumElements());

5073

Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);

5074

Info.align = Align(1);

5075

Info.flags |= MachineMemOperand::MOStore;

5076

break;

5077

}

5078

default:

5079

return false;

5080

}

5081

5082

return true;

5083

}

5084

5085

/// Returns true if the target can instruction select the

5086

/// specified FP immediate natively. If false, the legalizer will

5087

/// materialize the FP immediate as a load from a constant pool.

5088

bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,

5089

bool ForCodeSize) const {

5090

for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {

5091

if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))

5092

return true;

5093

}

5094

return false;

5095

}

5096

5097

bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,

5098

ISD::LoadExtType ExtTy,

5099

EVT NewVT) const {

5100

assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow")((cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow"
) ? static_cast<void> (0) : __assert_fail ("cast<LoadSDNode>(Load)->isSimple() && \"illegal to narrow\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5100, __PRETTY_FUNCTION__));

5101

5102

// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF

5103

// relocation target a movq or addq instruction: don't let the load shrink.

5104

SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();

5105

if (BasePtr.getOpcode() == X86ISD::WrapperRIP)

5106

if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))

5107

return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

5108

5109

// If this is an (1) AVX vector load with (2) multiple uses and (3) all of

5110

// those uses are extracted directly into a store, then the extract + store

5111

// can be store-folded. Therefore, it's probably not worth splitting the load.

5112

EVT VT = Load->getValueType(0);

5113

if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {

5114

for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {

5115

// Skip uses of the chain value. Result 0 of the node is the load value.

5116

if (UI.getUse().getResNo() != 0)

5117

continue;

5118

5119

// If this use is not an extract + store, it's probably worth splitting.

5120

if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||

5121

UI->use_begin()->getOpcode() != ISD::STORE)

5122

return true;

5123

}

5124

// All non-chain uses are extract + store.

5125

return false;

5126

}

5127

5128

return true;

5129

}

5130

5131

/// Returns true if it is beneficial to convert a load of a constant

5132

/// to just the constant itself.

5133

bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,

5134

Type *Ty) const {

5135

assert(Ty->isIntegerTy())((Ty->isIntegerTy()) ? static_cast<void> (0) : __assert_fail
("Ty->isIntegerTy()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5135, __PRETTY_FUNCTION__));

5136

5137

unsigned BitSize = Ty->getPrimitiveSizeInBits();

5138

if (BitSize == 0 || BitSize > 64)

5139

return false;

5140

return true;

5141

}

5142

5143

/// Return false only when the compared type \p CmpOpVT is floating point
/// (other than f128), the target is 64-bit LP64, and AVX is available.
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
  // If we are using XMM registers in the ABI and the condition of the select
  // is a floating-point compare and we have blendv or conditional move, then
  // it is cheaper to select instead of doing a cross-register move and
  // creating a load that depends on the compare result.
  const bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
  const bool PreferSelect =
      IsFPSetCC && Subtarget.isTarget64BitLP64() && Subtarget.hasAVX();
  return !PreferSelect;
}

5151

5152

/// Return true unless \p VT is a vector type on a target with AVX512.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
  return !IsAVX512Vector;
}

5160

5161

bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,

5162

SDValue C) const {

5163

// TODO: We handle scalars using custom code, but generic combining could make

5164

// that unnecessary.

5165

APInt MulC;

5166

if (!ISD::isConstantSplatVector(C.getNode(), MulC))

5167

return false;

5168

5169

// Find the type this will be legalized too. Otherwise we might prematurely

5170

// convert this to shl+add/sub and then still have to type legalize those ops.

5171

// Another choice would be to defer the decision for illegal types until

5172

// after type legalization. But constant splat vectors of i64 can't make it

5173

// through type legalization on 32-bit targets so we would need to special

5174

// case vXi64.

5175

while (getTypeAction(Context, VT) != TypeLegal)

5176

VT = getTypeToTransformTo(Context, VT);

5177

5178

// If vector multiply is legal, assume that's faster than shl + add/sub.

5179

// TODO: Multiply is a complex op with higher latency and lower throughput in

5180

// most implementations, so this check could be loosened based on type

5181

// and/or a CPU attribute.

5182

if (isOperationLegal(ISD::MUL, VT))

5183

return false;

5184

5185

// shl+add, shl+sub, shl+add+neg

5186

return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||

5187

(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();

5188

}

5189

5190

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,

5191

unsigned Index) const {

5192

if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))

5193

return false;

5194

5195

// Mask vectors support all subregister combinations and operations that

5196

// extract half of vector.

5197

if (ResVT.getVectorElementType() == MVT::i1)

5198

return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&

5199

(Index == ResVT.getVectorNumElements()));

5200

5201

return (Index % ResVT.getVectorNumElements()) == 0;

5202

}

5203

5204

/// Return true if the vector binary operation \p VecOp should be scalarized.
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
  const unsigned Opc = VecOp.getOpcode();

  // Assume target opcodes can't be scalarized.
  // TODO - do we have any exceptions?
  if (Opc >= ISD::BUILTIN_OP_END)
    return false;

  const EVT VecVT = VecOp.getValueType();
  const bool VectorOpSupported = isOperationLegalOrCustomOrPromote(Opc, VecVT);

  // An unsupported vector op is always worth trying as a scalar. If the
  // vector op is supported, only scalarize when the scalar op is supported
  // as well; otherwise the transform may not be worthwhile.
  return !VectorOpSupported ||
         isOperationLegalOrCustomOrPromote(Opc, VecVT.getScalarType());
}

5222

5223

bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,

5224

bool) const {

5225

// TODO: Allow vectors?

5226

if (VT.isVector())

5227

return false;

5228

return VT.isSimple() || !isOperationExpand(Opcode, VT);

5229

}

5230

5231

bool X86TargetLowering::isCheapToSpeculateCttz() const {

5232

// Speculate cttz only if we can directly use TZCNT.

5233

return Subtarget.hasBMI();

5234

}

5235

5236

bool X86TargetLowering::isCheapToSpeculateCtlz() const {

5237

// Speculate ctlz only if we can directly use LZCNT.

5238

return Subtarget.hasLZCNT();

5239

}

5240

5241

bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,

5242

const SelectionDAG &DAG,

5243

const MachineMemOperand &MMO) const {

5244

if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&

5245

BitcastVT.getVectorElementType() == MVT::i1)

5246

return false;

5247

5248

if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)

5249

return false;

5250

5251

// If both types are legal vectors, it's always ok to convert them.

5252

if (LoadVT.isVector() && BitcastVT.isVector() &&

5253

isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))

5254

return true;

5255

5256

return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);

5257

}

5258

5259

bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,

5260

const SelectionDAG &DAG) const {

5261

// Do not merge to float value size (128 bytes) if no implicit

5262

// float attribute is set.

5263

bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(

5264

Attribute::NoImplicitFloat);

5265

5266

if (NoFloat) {

5267

unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;

5268

return (MemVT.getSizeInBits() <= MaxIntSize);

5269

}

5270

// Make sure we don't merge greater than our preferred vector

5271

// width.

5272

if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())

5273

return false;

5274

return true;

5275

}

5276

5277

bool X86TargetLowering::isCtlzFast() const {

5278

return Subtarget.hasFastLZCNT();

5279

}

5280

5281

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(

5282

const Instruction &AndI) const {

5283

return true;

5284

}

5285

5286

/// Return true if an and-not compare is available for \p Y: \p Y must be a
/// non-constant scalar of type i32 or i64 and the target must have BMI.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  const EVT VT = Y.getValueType();

  // There are only 32-bit and 64-bit forms for 'andn'.
  const bool IsAndnWidth = VT == MVT::i32 || VT == MVT::i64;

  return !VT.isVector() && Subtarget.hasBMI() && IsAndnWidth &&
         !isa<ConstantSDNode>(Y);
}

5301

5302

/// Return true if an and-not operation is available for \p Y. Scalars defer
/// to hasAndNotCompare; vectors need SSE1 and at least 128 bits, and any type
/// other than v4i32 additionally needs SSE2.
bool X86TargetLowering::hasAndNot(SDValue Y) const {
  const EVT VT = Y.getValueType();

  // Scalar.
  if (!VT.isVector())
    return hasAndNotCompare(Y);

  // Vector.
  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
    return false;

  return VT == MVT::v4i32 || Subtarget.hasSSE2();
}

5318

5319

/// A bit test ('bt') is available whenever \p X is a scalar integer; \p Y is
/// not inspected.
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  const EVT VT = X.getValueType();
  return VT.isScalarInteger(); // 'bt'
}

5322

5323

bool X86TargetLowering::

5324

shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(

5325

SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,

5326

unsigned OldShiftOpcode, unsigned NewShiftOpcode,

5327

SelectionDAG &DAG) const {

5328

// Does baseline recommend not to perform the fold by default?

5329

if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(

5330

X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))

5331

return false;

5332

// For scalars this transform is always beneficial.

5333

if (X.getValueType().isScalarInteger())

5334

return true;

5335

// If all the shift amounts are identical, then transform is beneficial even

5336

// with rudimentary SSE2 shifts.

5337

if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))

5338

return true;

5339

// If we have AVX2 with it's powerful shift operations, then it's also good.

5340

if (Subtarget.hasAVX2())

5341

return true;

5342

// Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.

5343

return NewShiftOpcode == ISD::SHL;

5344

}

5345

5346

bool X86TargetLowering::shouldFoldConstantShiftPairToMask(

5347

const SDNode *N, CombineLevel Level) const {

5348

assert(((N->getOpcode() == ISD::SHL &&((((N->getOpcode() == ISD::SHL && N->getOperand
(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL
&& N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask") ? static_cast<void> (0) :
__assert_fail ("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5352, __PRETTY_FUNCTION__))

5349

N->getOperand(0).getOpcode() == ISD::SRL) ||((((N->getOpcode() == ISD::SHL && N->getOperand
(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL
&& N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask") ? static_cast<void> (0) :
__assert_fail ("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5352, __PRETTY_FUNCTION__))

5350

(N->getOpcode() == ISD::SRL &&((((N->getOpcode() == ISD::SHL && N->getOperand
(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL
&& N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask") ? static_cast<void> (0) :
__assert_fail ("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5352, __PRETTY_FUNCTION__))

5351

N->getOperand(0).getOpcode() == ISD::SHL)) &&((((N->getOpcode() == ISD::SHL && N->getOperand
(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL
&& N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask") ? static_cast<void> (0) :
__assert_fail ("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5352, __PRETTY_FUNCTION__))

5352

"Expected shift-shift mask")((((N->getOpcode() == ISD::SHL && N->getOperand
(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL
&& N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask") ? static_cast<void> (0) :
__assert_fail ("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5352, __PRETTY_FUNCTION__));

5353

EVT VT = N->getValueType(0);

5354

if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||

5355

(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {

5356

// Only fold if the shift values are equal - so it folds to AND.

5357

// TODO - we should fold if either is a non-uniform vector but we don't do

5358

// the fold for non-splats yet.

5359

return N->getOperand(1) == N->getOperand(0).getOperand(1);

5360

}

5361

return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);

5362

}

5363

5364

/// Return true when a mask should be turned into a variable shift pair for
/// the type of \p Y: never for vectors, and never for i64 when the subtarget
/// is not 64-bit.
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
  const EVT OperandVT = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  const bool IsVectorTy = OperandVT.isVector();

  // 64-bit shifts on 32-bit targets produce really bad bloated code.
  const bool IsI64On32Bit = OperandVT == MVT::i64 && !Subtarget.is64Bit();

  return !IsVectorTy && !IsI64On32Bit;
}

5377

5378

bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,

5379

SDNode *N) const {

5380

if (DAG.getMachineFunction().getFunction().hasMinSize() &&

5381

!Subtarget.isOSWindows())

5382

return false;

5383

return true;

5384

}

5385

5386

/// Return true exactly when \p VT is a legal type.
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  const bool IsLegal = isTypeLegal(VT);
  return IsLegal;
}

5391

5392

/// Pick a type in which an equality compare of \p NumBits bits is fast.
/// Returns the integer type of that width when it is legal, v16i8 / v32i8 for
/// 128 / 256 bits when those vector types are legal, and
/// MVT::INVALID_SIMPLE_VALUE_TYPE otherwise.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  const MVT IntVT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(IntVT))
    return IntVT;

  switch (NumBits) {
  case 128:
    // PMOVMSKB can handle this.
    if (isTypeLegal(MVT::v16i8))
      return MVT::v16i8;
    break;
  case 256:
    // VPMOVMSKB can handle this.
    if (isTypeLegal(MVT::v32i8))
      return MVT::v32i8;
    break;
  default:
    break;
  }

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

5411

5412

/// Val is the undef sentinel value or equal to the specified value.

5413

static bool isUndefOrEqual(int Val, int CmpVal) {

5414

return ((Val == SM_SentinelUndef) || (Val == CmpVal));

5415

}

5416

5417

/// Val is either the undef or zero sentinel value.

5418

static bool isUndefOrZero(int Val) {

5419

return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));

5420

}

5421

5422

/// Return true if every element in Mask, beginning from position Pos and ending

5423

/// in Pos+Size is the undef sentinel value.

5424

static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {

5425

return llvm::all_of(Mask.slice(Pos, Size),

5426

[](int M) { return M == SM_SentinelUndef; });

5427

}

5428

5429

/// Return true if the mask creates a vector whose lower half is undefined.

5430

static bool isUndefLowerHalf(ArrayRef<int> Mask) {

5431

unsigned NumElts = Mask.size();

5432

return isUndefInRange(Mask, 0, NumElts / 2);

5433

}

5434

5435

/// Return true if the mask creates a vector whose upper half is undefined.

5436

static bool isUndefUpperHalf(ArrayRef<int> Mask) {

5437

unsigned NumElts = Mask.size();

5438

return isUndefInRange(Mask, NumElts / 2, NumElts / 2);

5439

}

5440

5441

/// Return true if \p Val falls within the half-open range [Low, Hi), i.e.
/// Low <= Val < Hi.
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}

5445

5446

/// Return true if the value of any element in Mask falls within the specified

5447

/// range (L, H].

5448

static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {

5449

return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });

5450

}

5451

5452

/// Return true if the value of any element in Mask is the zero sentinel value.

5453

static bool isAnyZero(ArrayRef<int> Mask) {

5454

return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

5455

}

5456

5457

/// Return true if the value of any element in Mask is the zero or undef

5458

/// sentinel values.

5459

static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {

5460

return llvm::any_of(Mask, [](int M) {

5461

return M == SM_SentinelZero || M == SM_SentinelUndef;

5462

});

5463

}

5464

5465

/// Return true if Val is undef or if its value falls within the

5466

/// specified range (L, H].

5467

static bool isUndefOrInRange(int Val, int Low, int Hi) {

5468

return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);

5469

}

5470

5471

/// Return true if every element in Mask is undef or if its value

5472

/// falls within the specified range (L, H].

5473

static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {

5474

return llvm::all_of(

5475

Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });

5476

}

5477

5478

/// Return true if Val is undef, zero or if its value falls within the

5479

/// specified range (L, H].

5480

static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {

5481

return isUndefOrZero(Val) || isInRange(Val, Low, Hi);

5482

}

5483

5484

/// Return true if every element in Mask is undef, zero or if its value

5485

/// falls within the specified range (L, H].

5486

static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {

5487

return llvm::all_of(

5488

Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });

5489

}

5490

5491

/// Return true if every element in Mask, beginning

5492

/// from position Pos and ending in Pos + Size, falls within the specified

5493

/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.

5494

static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,

5495

unsigned Size, int Low, int Step = 1) {

5496

for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)

5497

if (!isUndefOrEqual(Mask[i], Low))

5498

return false;

5499

return true;

5500

}

5501

5502

/// Return true if every element in Mask, beginning

5503

/// from position Pos and ending in Pos+Size, falls within the specified

5504

/// sequential range (Low, Low+Size], or is undef or is zero.

5505

static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,

5506

unsigned Size, int Low,

5507

int Step = 1) {

5508

for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)

5509

if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)

5510

return false;

5511

return true;

5512

}

5513

5514

/// Return true if every element in Mask, beginning

5515

/// from position Pos and ending in Pos+Size is undef or is zero.

5516

static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,

5517

unsigned Size) {

5518

return llvm::all_of(Mask.slice(Pos, Size),

5519

[](int M) { return isUndefOrZero(M); });

5520

}

5521

5522

/// Helper function to test whether a shuffle mask could be

5523

/// simplified by widening the elements being shuffled.

5524

///

5525

/// Appends the mask for wider elements in WidenedMask if valid. Otherwise

5526

/// leaves it in an unspecified state.

5527

///

5528

/// NOTE: This must handle normal vector shuffle masks and *target* vector

5529

/// shuffle masks. The latter have the special property of a '-2' representing

5530

/// a zero-ed lane of a vector.

5531

static bool canWidenShuffleElements(ArrayRef<int> Mask,

5532

SmallVectorImpl<int> &WidenedMask) {

5533

WidenedMask.assign(Mask.size() / 2, 0);

5534

for (int i = 0, Size = Mask.size(); i < Size; i += 2) {

5535

int M0 = Mask[i];

5536

int M1 = Mask[i + 1];

5537

5538

// If both elements are undef, its trivial.

5539

if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {

5540

WidenedMask[i / 2] = SM_SentinelUndef;

5541

continue;

5542

}

5543

5544

// Check for an undef mask and a mask value properly aligned to fit with

5545

// a pair of values. If we find such a case, use the non-undef mask's value.

5546

if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {

5547

WidenedMask[i / 2] = M1 / 2;

5548

continue;

5549

}

5550

if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {

5551

WidenedMask[i / 2] = M0 / 2;

5552

continue;

5553

}

5554

5555

// When zeroing, we need to spread the zeroing across both lanes to widen.

5556

if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {

5557

if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&

5558

(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {

5559

WidenedMask[i / 2] = SM_SentinelZero;

5560

continue;

5561

}

5562

return false;

5563

}

5564

5565

// Finally check if the two mask values are adjacent and aligned with

5566

// a pair.

5567

if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {

5568

WidenedMask[i / 2] = M0 / 2;

5569

continue;

5570

}

5571

5572

// Otherwise we can't safely widen the elements used in this shuffle.

5573

return false;

5574

}

5575

assert(WidenedMask.size() == Mask.size() / 2 &&((WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"
) ? static_cast<void> (0) : __assert_fail ("WidenedMask.size() == Mask.size() / 2 && \"Incorrect size of mask after widening the elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5576, __PRETTY_FUNCTION__))

5576

"Incorrect size of mask after widening the elements!")((WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"
) ? static_cast<void> (0) : __assert_fail ("WidenedMask.size() == Mask.size() / 2 && \"Incorrect size of mask after widening the elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5576, __PRETTY_FUNCTION__));

5577

5578

return true;

5579

}

5580

5581

static bool canWidenShuffleElements(ArrayRef<int> Mask,

5582

const APInt &Zeroable,

5583

bool V2IsZero,

5584

SmallVectorImpl<int> &WidenedMask) {

5585

// Create an alternative mask with info about zeroable elements.

5586

// Here we do not set undef elements as zeroable.

5587

SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());

5588

if (V2IsZero) {

5589

assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!")((!Zeroable.isNullValue() && "V2's non-undef elements are used?!"
) ? static_cast<void> (0) : __assert_fail ("!Zeroable.isNullValue() && \"V2's non-undef elements are used?!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5589, __PRETTY_FUNCTION__));

5590

for (int i = 0, Size = Mask.size(); i != Size; ++i)

5591

if (Mask[i] != SM_SentinelUndef && Zeroable[i])

5592

ZeroableMask[i] = SM_SentinelZero;

5593

}

5594

return canWidenShuffleElements(ZeroableMask, WidenedMask);

5595

}

5596

5597

/// Query-only variant: report whether \p Mask can be widened, discarding the
/// widened mask itself.
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
  SmallVector<int, 32> Discarded;
  const bool CanWiden = canWidenShuffleElements(Mask, Discarded);
  return CanWiden;
}

5601

5602

// Attempt to narrow/widen shuffle mask until it matches the target number of

5603

// elements.

5604

static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,

5605

SmallVectorImpl<int> &ScaledMask) {

5606

unsigned NumSrcElts = Mask.size();

5607

assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&((((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts
) == 0) && "Illegal shuffle scale factor") ? static_cast
<void> (0) : __assert_fail ("((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && \"Illegal shuffle scale factor\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5608, __PRETTY_FUNCTION__))

5608

"Illegal shuffle scale factor")((((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts
) == 0) && "Illegal shuffle scale factor") ? static_cast
<void> (0) : __assert_fail ("((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && \"Illegal shuffle scale factor\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5608, __PRETTY_FUNCTION__));

5609

5610

// Narrowing is guaranteed to work.

5611

if (NumDstElts >= NumSrcElts) {

5612

int Scale = NumDstElts / NumSrcElts;

5613

llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);

5614

return true;

5615

}

5616

5617

// We have to repeat the widening until we reach the target size, but we can

5618

// split out the first widening as it sets up ScaledMask for us.

5619

if (canWidenShuffleElements(Mask, ScaledMask)) {

5620

while (ScaledMask.size() > NumDstElts) {

5621

SmallVector<int, 16> WidenedMask;

5622

if (!canWidenShuffleElements(ScaledMask, WidenedMask))

5623

return false;

5624

ScaledMask = std::move(WidenedMask);

5625

}

5626

return true;

5627

}

5628

5629

return false;

5630

}

5631

5632

/// Returns true if Elt is a constant zero or a floating point constant +0.0.

5633

bool X86::isZeroNode(SDValue Elt) {

5634

return isNullConstant(Elt) || isNullFPConstant(Elt);

5635

}

5636

5637

// Build a vector of constants.

5638

// Use an UNDEF node if MaskElt == -1.

5639

// Split 64-bit constants in the 32-bit mode.

5640

static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,

5641

const SDLoc &dl, bool IsMask = false) {

5642

5643

SmallVector<SDValue, 32> Ops;

5644

bool Split = false;

5645

5646

MVT ConstVecVT = VT;

5647

unsigned NumElts = VT.getVectorNumElements();

5648

bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);

5649

if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {

5650

ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);

5651

Split = true;

5652

}

5653

5654

MVT EltVT = ConstVecVT.getVectorElementType();

5655

for (unsigned i = 0; i < NumElts; ++i) {

5656

bool IsUndef = Values[i] < 0 && IsMask;

5657

SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :

5658

DAG.getConstant(Values[i], dl, EltVT);

5659

Ops.push_back(OpNode);

5660

if (Split)

5661

Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :

5662

DAG.getConstant(0, dl, EltVT));

5663

}

5664

SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);

5665

if (Split)

5666

ConstsNode = DAG.getBitcast(VT, ConstsNode);

5667

return ConstsNode;

5668

}

5669

5670

/// Build a constant vector of type \p VT from per-element bit patterns.
///
/// \p Bits holds one APInt per element, each as wide as VT's scalar type.
/// \p Undefs has one bit per element; a set bit makes that element UNDEF.
/// It is only read, hence taken by const reference.
///
/// When i64 is not a legal type and VT has i64 elements, every element is
/// emitted as two i32 constants (low 32 bits first) and the result is bitcast
/// back to VT. f32 / f64 element types are emitted as floating-point
/// constants built from the raw bits.
static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      // An undef element occupies both halves when split.
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Low 32 bits first, then the high 32 bits.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

5710

5711

/// Returns a vector of specified type with all zero elements.

5712

static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,

5713

SelectionDAG &DAG, const SDLoc &dl) {

5714

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5716, __PRETTY_FUNCTION__))

5715

VT.getVectorElementType() == MVT::i1) &&(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5716, __PRETTY_FUNCTION__))

5716

"Unexpected vector type")(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5716, __PRETTY_FUNCTION__));

5717

5718

// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest

5719

// type. This ensures they get CSE'd. But if the integer type is not

5720

// available, use a floating-point +0.0 instead.

5721

SDValue Vec;

5722

if (!Subtarget.hasSSE2() && VT.is128BitVector()) {

5723

Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);

5724

} else if (VT.isFloatingPoint()) {

5725

Vec = DAG.getConstantFP(+0.0, dl, VT);

5726

} else if (VT.getVectorElementType() == MVT::i1) {

5727

assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&(((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type") ? static_cast<void> (0) : __assert_fail
("(Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5728, __PRETTY_FUNCTION__))

5728

"Unexpected vector type")(((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type") ? static_cast<void> (0) : __assert_fail
("(Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5728, __PRETTY_FUNCTION__));

5729

Vec = DAG.getConstant(0, dl, VT);

5730

} else {

5731

unsigned Num32BitElts = VT.getSizeInBits() / 32;

5732

Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));

5733

}

5734

return DAG.getBitcast(VT, Vec);

5735

}

5736

5737

/// Extract the vectorWidth-bit chunk of \p Vec that contains element
/// \p IdxVal. The index is rounded down to the first element of its chunk.
/// A BUILD_VECTOR input yields a smaller BUILD_VECTOR; anything else yields
/// an EXTRACT_SUBVECTOR node.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned NumChunks = VecVT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                  VecVT.getVectorNumElements() / NumChunks);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  const unsigned ChunkStart = IdxVal & ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
    auto ChunkOps = Vec->ops().slice(ChunkStart, ElemsPerChunk);
    return DAG.getBuildVector(ResultVT, dl, ChunkOps);
  }

  SDValue StartIdx = DAG.getIntPtrConstant(ChunkStart, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, StartIdx);
}

5761

5762

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This

5763

/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128

5764

/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4

5765

/// instructions or a simple subregister reference. Idx is an index in the

5766

/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes

5767

/// lowering EXTRACT_VECTOR_ELT operations easier.

5768

static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,

5769

SelectionDAG &DAG, const SDLoc &dl) {

5770

assert((Vec.getValueType().is256BitVector() ||(((Vec.getValueType().is256BitVector() || Vec.getValueType().
is512BitVector()) && "Unexpected vector size!") ? static_cast
<void> (0) : __assert_fail ("(Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5771, __PRETTY_FUNCTION__))

5771

Vec.getValueType().is512BitVector()) && "Unexpected vector size!")(((Vec.getValueType().is256BitVector() || Vec.getValueType().
is512BitVector()) && "Unexpected vector size!") ? static_cast
<void> (0) : __assert_fail ("(Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5771, __PRETTY_FUNCTION__));

5772

return extractSubVector(Vec, IdxVal, DAG, dl, 128);

5773

}

5774

5775

/// Generate a DAG to grab 256-bits from a 512-bit vector.

5776

static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,

5777

SelectionDAG &DAG, const SDLoc &dl) {

5778

assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!")((Vec.getValueType().is512BitVector() && "Unexpected vector size!"
) ? static_cast<void> (0) : __assert_fail ("Vec.getValueType().is512BitVector() && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5778, __PRETTY_FUNCTION__));

5779

return extractSubVector(Vec, IdxVal, DAG, dl, 256);

5780

}

5781

5782

/// Insert \p Vec into \p Result at the vectorWidth-bit chunk that contains
/// element \p IdxVal, returning an INSERT_SUBVECTOR node. The index is rounded
/// down to the first element of its chunk. An undef \p Vec leaves \p Result
/// unchanged.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;

  EVT EltVT = Vec.getValueType().getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  const unsigned ChunkStart = IdxVal & ~(ElemsPerChunk - 1);

  SDValue StartIdx = DAG.getIntPtrConstant(ChunkStart, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     StartIdx);
}

5805

5806

/// Generate a DAG to put 128-bits into a vector > 128 bits. This

5807

/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or

5808

/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a

5809

/// simple superregister reference. Idx is an index in the 128 bits

5810

/// we want. It need not be aligned to a 128-bit boundary. That makes

5811

/// lowering INSERT_VECTOR_ELT operations easier.

5812

static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,

5813

SelectionDAG &DAG, const SDLoc &dl) {

5814

assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")((Vec.getValueType().is128BitVector() && "Unexpected vector size!"
) ? static_cast<void> (0) : __assert_fail ("Vec.getValueType().is128BitVector() && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5814, __PRETTY_FUNCTION__));

5815

return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);

5816

}

5817

5818

/// Widen a vector to a larger size with the same scalar type, with the new

5819

/// elements either zero or undef.

5820

static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,

5821

const X86Subtarget &Subtarget, SelectionDAG &DAG,

5822

const SDLoc &dl) {

5823

assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&((Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type") ? static_cast<void>
(0) : __assert_fail ("Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5825, __PRETTY_FUNCTION__))

5824

Vec.getValueType().getScalarType() == VT.getScalarType() &&((Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type") ? static_cast<void>
(0) : __assert_fail ("Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5825, __PRETTY_FUNCTION__))

5825

"Unsupported vector widening type")((Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type") ? static_cast<void>
(0) : __assert_fail ("Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5825, __PRETTY_FUNCTION__));

5826

SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)

5827

: DAG.getUNDEF(VT);

5828

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,

5829

DAG.getIntPtrConstant(0, dl));

5830

}

5831

5832

/// Widen a vector to a larger size with the same scalar type, with the new

5833

/// elements either zero or undef.

5834

static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,

5835

const X86Subtarget &Subtarget, SelectionDAG &DAG,

5836

const SDLoc &dl, unsigned WideSizeInBits) {

5837

assert(Vec.getValueSizeInBits() < WideSizeInBits &&((Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits
% Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"
) ? static_cast<void> (0) : __assert_fail ("Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5839, __PRETTY_FUNCTION__))

5838

(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&((Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits
% Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"
) ? static_cast<void> (0) : __assert_fail ("Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5839, __PRETTY_FUNCTION__))

5839

"Unsupported vector widening type")((Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits
% Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"
) ? static_cast<void> (0) : __assert_fail ("Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && \"Unsupported vector widening type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5839, __PRETTY_FUNCTION__));

5840

unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();

5841

MVT SVT = Vec.getSimpleValueType().getScalarType();

5842

MVT VT = MVT::getVectorVT(SVT, WideNumElts);

5843

return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);

5844

}

5845

5846

// Helper function to collect subvector ops that are concatenated together,

5847

// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.

5848

// The subvectors in Ops are guaranteed to be the same type.

5849

static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {

5850

assert(Ops.empty() && "Expected an empty ops vector")((Ops.empty() && "Expected an empty ops vector") ? static_cast
<void> (0) : __assert_fail ("Ops.empty() && \"Expected an empty ops vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5850, __PRETTY_FUNCTION__));

5851

5852

if (N->getOpcode() == ISD::CONCAT_VECTORS) {

5853

Ops.append(N->op_begin(), N->op_end());

5854

return true;

5855

}

5856

5857

if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {

5858

SDValue Src = N->getOperand(0);

5859

SDValue Sub = N->getOperand(1);

5860

const APInt &Idx = N->getConstantOperandAPInt(2);

5861

EVT VT = Src.getValueType();

5862

EVT SubVT = Sub.getValueType();

5863

5864

// TODO - Handle more general insert_subvector chains.

5865

if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&

5866

Idx == (VT.getVectorNumElements() / 2)) {

5867

// insert_subvector(insert_subvector(undef, x, lo), y, hi)

5868

if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&

5869

Src.getOperand(1).getValueType() == SubVT &&

5870

isNullConstant(Src.getOperand(2))) {

5871

Ops.push_back(Src.getOperand(1));

5872

Ops.push_back(Sub);

5873

return true;

5874

}

5875

// insert_subvector(x, extract_subvector(x, lo), hi)

5876

if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

5877

Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {

5878

Ops.append(2, Sub);

5879

return true;

5880

}

5881

}

5882

}

5883

5884

return false;

5885

}

5886

5887

static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,

5888

const SDLoc &dl) {

5889

EVT VT = Op.getValueType();

5890

unsigned NumElems = VT.getVectorNumElements();

5891

unsigned SizeInBits = VT.getSizeInBits();

5892

assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&(((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector") ? static_cast<void> (0
) : __assert_fail ("(NumElems % 2) == 0 && (SizeInBits % 2) == 0 && \"Can't split odd sized vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5893, __PRETTY_FUNCTION__))

5893

"Can't split odd sized vector")(((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector") ? static_cast<void> (0
) : __assert_fail ("(NumElems % 2) == 0 && (SizeInBits % 2) == 0 && \"Can't split odd sized vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5893, __PRETTY_FUNCTION__));

5894

5895

SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);

5896

SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);

5897

return std::make_pair(Lo, Hi);

5898

}

5899

5900

// Split an unary integer op into 2 half sized ops.

5901

static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {

5902

EVT VT = Op.getValueType();

5903

5904

// Make sure we only try to split 256/512-bit types to avoid creating

5905

// narrow vectors.

5906

assert((Op.getOperand(0).getValueType().is256BitVector() ||(((Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand
(0).getValueType().is512BitVector()) && (VT.is256BitVector
() || VT.is512BitVector()) && "Unsupported VT!") ? static_cast
<void> (0) : __assert_fail ("(Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand(0).getValueType().is512BitVector()) && (VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5908, __PRETTY_FUNCTION__))

5907

Op.getOperand(0).getValueType().is512BitVector()) &&(((Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand
(0).getValueType().is512BitVector()) && (VT.is256BitVector
() || VT.is512BitVector()) && "Unsupported VT!") ? static_cast
<void> (0) : __assert_fail ("(Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand(0).getValueType().is512BitVector()) && (VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5908, __PRETTY_FUNCTION__))

5908

(VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")(((Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand
(0).getValueType().is512BitVector()) && (VT.is256BitVector
() || VT.is512BitVector()) && "Unsupported VT!") ? static_cast
<void> (0) : __assert_fail ("(Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand(0).getValueType().is512BitVector()) && (VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5908, __PRETTY_FUNCTION__));

5909

assert(Op.getOperand(0).getValueType().getVectorNumElements() ==((Op.getOperand(0).getValueType().getVectorNumElements() == VT
.getVectorNumElements() && "Unexpected VTs!") ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(0).getValueType().getVectorNumElements() == VT.getVectorNumElements() && \"Unexpected VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5911, __PRETTY_FUNCTION__))

5910

VT.getVectorNumElements() &&((Op.getOperand(0).getValueType().getVectorNumElements() == VT
.getVectorNumElements() && "Unexpected VTs!") ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(0).getValueType().getVectorNumElements() == VT.getVectorNumElements() && \"Unexpected VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5911, __PRETTY_FUNCTION__))

5911

"Unexpected VTs!")((Op.getOperand(0).getValueType().getVectorNumElements() == VT
.getVectorNumElements() && "Unexpected VTs!") ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(0).getValueType().getVectorNumElements() == VT.getVectorNumElements() && \"Unexpected VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5911, __PRETTY_FUNCTION__));

5912

5913

SDLoc dl(Op);

5914

5915

// Extract the Lo/Hi vectors

5916

SDValue Lo, Hi;

5917

std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);

5918

5919

EVT LoVT, HiVT;

5920

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

5921

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

5922

DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),

5923

DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));

5924

}

5925

5926

/// Break a binary integer operation into 2 half sized ops and then

5927

/// concatenate the result back.

5928

static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {

5929

EVT VT = Op.getValueType();

5930

5931

// Sanity check that all the types match.

5932

assert(Op.getOperand(0).getValueType() == VT &&((Op.getOperand(0).getValueType() == VT && Op.getOperand
(1).getValueType() == VT && "Unexpected VTs!") ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(0).getValueType() == VT && Op.getOperand(1).getValueType() == VT && \"Unexpected VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5933, __PRETTY_FUNCTION__))

5933

Op.getOperand(1).getValueType() == VT && "Unexpected VTs!")((Op.getOperand(0).getValueType() == VT && Op.getOperand
(1).getValueType() == VT && "Unexpected VTs!") ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(0).getValueType() == VT && Op.getOperand(1).getValueType() == VT && \"Unexpected VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5933, __PRETTY_FUNCTION__));

5934

assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")(((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"
) ? static_cast<void> (0) : __assert_fail ("(VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5934, __PRETTY_FUNCTION__));

5935

5936

SDLoc dl(Op);

5937

5938

// Extract the LHS Lo/Hi vectors

5939

SDValue LHS1, LHS2;

5940

std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);

5941

5942

// Extract the RHS Lo/Hi vectors

5943

SDValue RHS1, RHS2;

5944

std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);

5945

5946

EVT LoVT, HiVT;

5947

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

5948

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

5949

DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),

5950

DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));

5951

}

5952

5953

// Helper for splitting operands of an operation to legal target size and

5954

// apply a function on each part.

5955

// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in

5956

// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for

5957

// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.

5958

// The argument Builder is a function that will be applied on each split part:

5959

// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)

5960

template <typename F>

5961

SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,

5962

const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,

5963

F Builder, bool CheckBWI = true) {

5964

assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2")((Subtarget.hasSSE2() && "Target assumed to support at least SSE2"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Target assumed to support at least SSE2\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5964, __PRETTY_FUNCTION__));

5965

unsigned NumSubs = 1;

5966

if ((CheckBWI && Subtarget.useBWIRegs()) ||

5967

(!CheckBWI && Subtarget.useAVX512Regs())) {

5968

if (VT.getSizeInBits() > 512) {

5969

NumSubs = VT.getSizeInBits() / 512;

5970

assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size")(((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() % 512) == 0 && \"Illegal vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5970, __PRETTY_FUNCTION__));

5971

}

5972

} else if (Subtarget.hasAVX2()) {

5973

if (VT.getSizeInBits() > 256) {

5974

NumSubs = VT.getSizeInBits() / 256;

5975

assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size")(((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() % 256) == 0 && \"Illegal vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5975, __PRETTY_FUNCTION__));

5976

}

5977

} else {

5978

if (VT.getSizeInBits() > 128) {

5979

NumSubs = VT.getSizeInBits() / 128;

5980

assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size")(((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() % 128) == 0 && \"Illegal vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5980, __PRETTY_FUNCTION__));

5981

}

5982

}

5983

5984

if (NumSubs == 1)

5985

return Builder(DAG, DL, Ops);

5986

5987

SmallVector<SDValue, 4> Subs;

5988

for (unsigned i = 0; i != NumSubs; ++i) {

5989

SmallVector<SDValue, 2> SubOps;

5990

for (SDValue Op : Ops) {

5991

EVT OpVT = Op.getValueType();

5992

unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;

5993

unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;

5994

SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));

5995

}

5996

Subs.push_back(Builder(DAG, DL, SubOps));

5997

}

5998

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);

5999

}

6000

6001

/// Insert i1-subvector to i1-vector.

6002

static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,

6003

const X86Subtarget &Subtarget) {

6004

6005

SDLoc dl(Op);

6006

SDValue Vec = Op.getOperand(0);

6007

SDValue SubVec = Op.getOperand(1);

6008

SDValue Idx = Op.getOperand(2);

6009

unsigned IdxVal = Op.getConstantOperandVal(2);

6010

6011

// Inserting undef is a nop. We can just return the original vector.

6012

if (SubVec.isUndef())

6013

return Vec;

6014

6015

if (IdxVal == 0 && Vec.isUndef()) // the operation is legal

6016

return Op;

6017

6018

MVT OpVT = Op.getSimpleValueType();

6019

unsigned NumElems = OpVT.getVectorNumElements();

6020

SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

6021

6022

// Extend to natively supported kshift.

6023

MVT WideOpVT = OpVT;

6024

if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)

6025

WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

6026

6027

// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts

6028

// if necessary.

6029

if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {

6030

// May need to promote to a legal type.

6031

Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

6032

DAG.getConstant(0, dl, WideOpVT),

6033

SubVec, Idx);

6034

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

6035

}

6036

6037

MVT SubVecVT = SubVec.getSimpleValueType();

6038

unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

6039

assert(IdxVal + SubVecNumElems <= NumElems &&((IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT
.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"
) ? static_cast<void> (0) : __assert_fail ("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6041, __PRETTY_FUNCTION__))

6040

IdxVal % SubVecVT.getSizeInBits() == 0 &&((IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT
.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"
) ? static_cast<void> (0) : __assert_fail ("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6041, __PRETTY_FUNCTION__))

6041

"Unexpected index value in INSERT_SUBVECTOR")((IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT
.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"
) ? static_cast<void> (0) : __assert_fail ("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6041, __PRETTY_FUNCTION__));

6042

6043

SDValue Undef = DAG.getUNDEF(WideOpVT);

6044

6045

if (IdxVal == 0) {

6046

// Zero lower bits of the Vec

6047

SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);

6048

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,

6049

ZeroIdx);

6050

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);

6051

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);

6052

// Merge them together, SubVec should be zero extended.

6053

SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

6054

DAG.getConstant(0, dl, WideOpVT),

6055

SubVec, ZeroIdx);

6056

Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

6057

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

6058

}

6059

6060

SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

6061

Undef, SubVec, ZeroIdx);

6062

6063

if (Vec.isUndef()) {

6064

assert(IdxVal != 0 && "Unexpected index")((IdxVal != 0 && "Unexpected index") ? static_cast<
void> (0) : __assert_fail ("IdxVal != 0 && \"Unexpected index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6064, __PRETTY_FUNCTION__));

6065

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

6066

DAG.getTargetConstant(IdxVal, dl, MVT::i8));

6067

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);

6068

}

6069

6070

if (ISD::isBuildVectorAllZeros(Vec.getNode())) {

6071

assert(IdxVal != 0 && "Unexpected index")((IdxVal != 0 && "Unexpected index") ? static_cast<
void> (0) : __assert_fail ("IdxVal != 0 && \"Unexpected index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6071, __PRETTY_FUNCTION__));

6072

NumElems = WideOpVT.getVectorNumElements();

6073

unsigned ShiftLeft = NumElems - SubVecNumElems;

6074

unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

6075

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

6076

DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));

6077

if (ShiftRight != 0)

6078

SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,

6079

DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

6080

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);

6081

}

6082

6083

// Simple case when we put subvector in the upper part

6084

if (IdxVal + SubVecNumElems == NumElems) {

6085

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

6086

DAG.getTargetConstant(IdxVal, dl, MVT::i8));

6087

if (SubVecNumElems * 2 == NumElems) {

6088

// Special case, use legal zero extending insert_subvector. This allows

6089

// isel to optimize when bits are known zero.

6090

Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);

6091

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

6092

DAG.getConstant(0, dl, WideOpVT),

6093

Vec, ZeroIdx);

6094

} else {

6095

// Otherwise use explicit shifts to zero the bits.

6096

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

6097

Undef, Vec, ZeroIdx);

6098

NumElems = WideOpVT.getVectorNumElements();

6099

SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);

6100

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);

6101

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);

6102

}

6103

Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

6104

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

6105

}

6106

6107

// Inserting into the middle is more complicated.

6108

6109

NumElems = WideOpVT.getVectorNumElements();

6110

6111

// Widen the vector if needed.

6112

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

6113

6114

unsigned ShiftLeft = NumElems - SubVecNumElems;

6115

unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

6116

6117

// Do an optimization for the the most frequently used types.

6118

if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {

6119

APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);

6120

Mask0.flipAllBits();

6121

SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));

6122

SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);

6123

Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);

6124

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

6125

DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));

6126

SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,

6127

DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

6128

Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

6129

6130

// Reduce to original width if needed.

6131

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

6132

}

6133

6134

// Clear the upper bits of the subvector and move it to its insert position.

6135

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

6136

DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));

6137

SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,

6138

DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

6139

6140

// Isolate the bits below the insertion point.

6141

unsigned LowShift = NumElems - IdxVal;

6142

SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,

6143

DAG.getTargetConstant(LowShift, dl, MVT::i8));

6144

Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,

6145

DAG.getTargetConstant(LowShift, dl, MVT::i8));

6146

6147

// Isolate the bits after the last inserted bit.

6148

unsigned HighShift = IdxVal + SubVecNumElems;

6149

SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,

6150

DAG.getTargetConstant(HighShift, dl, MVT::i8));

6151

High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,

6152

DAG.getTargetConstant(HighShift, dl, MVT::i8));

6153

6154

// Now OR all 3 pieces together.

6155

Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);

6156

SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

6157

6158

// Reduce to original width if needed.

6159

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);

6160

}

6161

6162

/// Concatenate two vectors of identical type into one vector with twice as
/// many elements: \p V1 becomes the low half and \p V2 the high half.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
                                const SDLoc &dl) {
  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");

  EVT HalfVT = V1.getValueType();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();
  unsigned HalfWidth = HalfVT.getSizeInBits();

  // Result type: same scalar type, double the element count.
  EVT FullVT = EVT::getVectorVT(*DAG.getContext(), HalfVT.getScalarType(),
                                2 * HalfNumElts);

  // Start from undef, drop V1 in at element 0, then V2 at the midpoint.
  SDValue LowFilled =
      insertSubVector(DAG.getUNDEF(FullVT), V1, 0, DAG, dl, HalfWidth);
  return insertSubVector(LowFilled, V2, HalfNumElts, DAG, dl, HalfWidth);
}

6173

6174

/// Returns a vector of specified type with all bits set.

6175

/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.

6176

/// Then bitcast to their original type, ensuring they get CSE'd.

6177

static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {

6178

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
()) && "Expected a 128/256/512-bit vector type") ? static_cast
<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6179, __PRETTY_FUNCTION__))

6179

"Expected a 128/256/512-bit vector type")(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
()) && "Expected a 128/256/512-bit vector type") ? static_cast
<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6179, __PRETTY_FUNCTION__));

6180

6181

APInt Ones = APInt::getAllOnesValue(32);

6182

unsigned NumElts = VT.getSizeInBits() / 32;

6183

SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));

6184

return DAG.getBitcast(VT, Vec);

6185

}

6186

6187

// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.

6188

static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {

6189

switch (Opcode) {

6190

case ISD::ANY_EXTEND:

6191

case ISD::ANY_EXTEND_VECTOR_INREG:

6192

return ISD::ANY_EXTEND_VECTOR_INREG;

6193

case ISD::ZERO_EXTEND:

6194

case ISD::ZERO_EXTEND_VECTOR_INREG:

6195

return ISD::ZERO_EXTEND_VECTOR_INREG;

6196

case ISD::SIGN_EXTEND:

6197

case ISD::SIGN_EXTEND_VECTOR_INREG:

6198

return ISD::SIGN_EXTEND_VECTOR_INREG;

6199

}

6200

llvm_unreachable("Unknown opcode")::llvm::llvm_unreachable_internal("Unknown opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6200);

6201

}

6202

6203

/// Build an any/sign/zero extension of the vector \p In producing type \p VT.
///
/// \p Opcode must be ISD::ANY_EXTEND, ISD::SIGN_EXTEND or ISD::ZERO_EXTEND.
/// When \p In is wider than 128 bits it must be as wide as \p VT, and only the
/// low part that actually feeds the result (never less than 128 bits) is kept.
/// If the element counts of input and result still differ afterwards, the
/// corresponding *_EXTEND_VECTOR_INREG opcode is emitted instead.
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue In, SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (InVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    const unsigned Scale =
        VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    const unsigned UsedBits = static_cast<unsigned>(VT.getSizeInBits()) / Scale;
    In = extractSubVector(In, 0, DAG, DL, std::max(128U, UsedBits));
    InVT = In.getValueType();
  }

  // A mismatch in element count means only the low input lanes are extended.
  const bool SameNumElts =
      VT.getVectorNumElements() == InVT.getVectorNumElements();
  const unsigned ExtOpcode =
      SameNumElts ? Opcode : getOpcode_EXTEND_VECTOR_INREG(Opcode);

  return DAG.getNode(ExtOpcode, DL, VT, In);
}

6227

6228

// Match (xor X, -1) -> X.

6229

// Match extract_subvector(xor X, -1) -> extract_subvector(X).

6230

// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).

6231

static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {

6232

V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);

6233

if (V.getOpcode() == ISD::XOR &&

6234

ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))

6235

return V.getOperand(0);

6236

if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

6237

(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {

6238

if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {

6239

Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);

6240

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),

6241

Not, V.getOperand(1));

6242

}

6243

}

6244

SmallVector<SDValue, 2> CatOps;

6245

if (collectConcatOps(V.getNode(), CatOps)) {

6246

for (SDValue &CatOp : CatOps) {

6247

SDValue NotCat = IsNOT(CatOp, DAG);

6248

if (!NotCat) return SDValue();

6249

CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);

6250

}

6251

return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);

6252

}

6253

return SDValue();

6254

}

6255

6256

/// Append to \p Mask (which must be empty) one shuffle index per element of
/// \p VT, computed independently within each group of
/// 128 / scalar-size elements.
///
/// Within a group, output element i selects source element i/2 of that group;
/// \p Lo == false shifts the selection by half a group. Unless \p Unary is
/// set, odd output elements are offset by the element count of \p VT.
void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                   bool Lo, bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");

  const int NumElts = VT.getVectorNumElements();
  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
  // Selecting the high half shifts every index by half a lane.
  const int HalfLaneOffset = Lo ? 0 : EltsPerLane / 2;

  for (int Idx = 0; Idx < NumElts; ++Idx) {
    const int LaneBase = (Idx / EltsPerLane) * EltsPerLane;
    int Pos = LaneBase + (Idx % EltsPerLane) / 2;
    // Odd elements come from the second input in the binary form.
    if (!Unary)
      Pos += NumElts * (Idx % 2);
    Pos += HalfLaneOffset;
    Mask.push_back(Pos);
  }
}

6269

6270

/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation

6271

/// imposed by AVX and specific to the unary pattern. Example:

6272

/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>

6273

/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>

6274

void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,

6275

bool Lo) {

6276

assert(Mask.empty() && "Expected an empty shuffle mask vector")((Mask.empty() && "Expected an empty shuffle mask vector"
) ? static_cast<void> (0) : __assert_fail ("Mask.empty() && \"Expected an empty shuffle mask vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6276, __PRETTY_FUNCTION__));

6277

int NumElts = VT.getVectorNumElements();

6278

for (int i = 0; i < NumElts; ++i) {

6279

int Pos = i / 2;

6280

Pos += (Lo ? 0 : NumElts / 2);

6281

Mask.push_back(Pos);

6282

}

6283

}

6284

6285

/// Returns a vector_shuffle node for an unpackl operation.

6286

static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,

6287

SDValue V1, SDValue V2) {

6288

SmallVector<int, 8> Mask;

6289

createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);

6290

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

6291

}

6292

6293

/// Returns a vector_shuffle node for an unpackh operation.

6294

static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,

6295

SDValue V1, SDValue V2) {

6296

SmallVector<int, 8> Mask;

6297

createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);

6298

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

6299

}

6300

6301

/// Return a vector_shuffle of the specified vector of zero or undef vector.

6302

/// This produces a shuffle where the low element of V2 is swizzled into the

6303

/// zero/undef vector, landing at element Idx.

6304

/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).

6305

static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,

6306

bool IsZero,

6307

const X86Subtarget &Subtarget,

6308

SelectionDAG &DAG) {

6309

MVT VT = V2.getSimpleValueType();

6310

SDValue V1 = IsZero

6311

? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);

6312

int NumElems = VT.getVectorNumElements();

6313

SmallVector<int, 16> MaskVec(NumElems);

6314

for (int i = 0; i != NumElems; ++i)

6315

// If this is the insertion idx, put the low elt of V2 here.

6316

MaskVec[i] = (i == Idx) ? NumElems : i;

6317

return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);

6318

}

6319

6320

/// Return the IR constant addressed by \p Ptr, or nullptr.
///
/// \p Ptr may be wrapped in X86ISD::Wrapper / X86ISD::WrapperRIP. A constant
/// is returned only if the underlying node is a constant-pool node that is
/// not a machine constant-pool entry and has a zero offset.
static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
  // Strip the X86 address wrapper, if present.
  const unsigned Opc = Ptr.getOpcode();
  if (Opc == X86ISD::Wrapper || Opc == X86ISD::WrapperRIP)
    Ptr = Ptr.getOperand(0);

  if (auto *PoolNode = dyn_cast<ConstantPoolSDNode>(Ptr)) {
    const bool Usable =
        !PoolNode->isMachineConstantPoolEntry() && PoolNode->getOffset() == 0;
    if (Usable)
      return PoolNode->getConstVal();
  }
  return nullptr;
}

6331

6332

/// Return the constant referenced by the base pointer of \p Load, or nullptr
/// if \p Load is null, is not a normal load, or its base pointer does not
/// resolve to a usable constant (see getTargetConstantFromBasePtr).
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
  if (Load && ISD::isNormalLoad(Load))
    return getTargetConstantFromBasePtr(Load->getBasePtr());
  return nullptr;
}

6337

6338

/// Peek through any bitcasts on \p Op and, if the result is a load, return
/// the constant it loads via the LoadSDNode overload; otherwise nullptr.
static const Constant *getTargetConstantFromNode(SDValue Op) {
  SDValue Stripped = peekThroughBitcasts(Op);
  LoadSDNode *MaybeLoad = dyn_cast<LoadSDNode>(Stripped);
  return getTargetConstantFromNode(MaybeLoad);
}

6342

6343

/// Member entry point that forwards to the file-local
/// getTargetConstantFromNode(LoadSDNode *). \p LD must be non-null.
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  const Constant *Loaded = getTargetConstantFromNode(LD);
  return Loaded;
}

6348

6349

// Extract raw constant bits from constant pools.

6350

static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,

6351

APInt &UndefElts,

6352

SmallVectorImpl<APInt> &EltBits,

6353

bool AllowWholeUndefs = true,

6354

bool AllowPartialUndefs = true) {

6355

assert(EltBits.empty() && "Expected an empty EltBits vector")((EltBits.empty() && "Expected an empty EltBits vector"
) ? static_cast<void> (0) : __assert_fail ("EltBits.empty() && \"Expected an empty EltBits vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6355, __PRETTY_FUNCTION__));

6356

6357

Op = peekThroughBitcasts(Op);

6358

6359

EVT VT = Op.getValueType();

6360

unsigned SizeInBits = VT.getSizeInBits();

6361

assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!")(((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"
) ? static_cast<void> (0) : __assert_fail ("(SizeInBits % EltSizeInBits) == 0 && \"Can't split constant!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6361, __PRETTY_FUNCTION__));

6362

unsigned NumElts = SizeInBits / EltSizeInBits;

6363

6364

// Bitcast a source array of element bits to the target size.

6365

auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {

6366

unsigned NumSrcElts = UndefSrcElts.getBitWidth();

6367

unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();

6368

assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&(((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"
) ? static_cast<void> (0) : __assert_fail ("(NumSrcElts * SrcEltSizeInBits) == SizeInBits && \"Constant bit sizes don't match\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6369, __PRETTY_FUNCTION__))

6369

"Constant bit sizes don't match")(((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"
) ? static_cast<void> (0) : __assert_fail ("(NumSrcElts * SrcEltSizeInBits) == SizeInBits && \"Constant bit sizes don't match\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6369, __PRETTY_FUNCTION__));

6370

6371

// Don't split if we don't allow undef bits.

6372

bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;

6373

if (UndefSrcElts.getBoolValue() && !AllowUndefs)

6374

return false;

6375

6376

// If we're already the right size, don't bother bitcasting.

6377

if (NumSrcElts == NumElts) {

6378

UndefElts = UndefSrcElts;

6379

EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());

6380

return true;

6381

}

6382

6383

// Extract all the undef/constant element data and pack into single bitsets.

6384

APInt UndefBits(SizeInBits, 0);

6385

APInt MaskBits(SizeInBits, 0);

6386

6387

for (unsigned i = 0; i != NumSrcElts; ++i) {

6388

unsigned BitOffset = i * SrcEltSizeInBits;

6389

if (UndefSrcElts[i])

6390

UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);

6391

MaskBits.insertBits(SrcEltBits[i], BitOffset);

6392

}

6393

6394

// Split the undef/constant single bitset data into the target elements.

6395

UndefElts = APInt(NumElts, 0);

6396

EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

6397

6398

for (unsigned i = 0; i != NumElts; ++i) {

6399

unsigned BitOffset = i * EltSizeInBits;

6400

APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

6401

6402

// Only treat an element as UNDEF if all bits are UNDEF.

6403

if (UndefEltBits.isAllOnesValue()) {

6404

if (!AllowWholeUndefs)

6405

return false;

6406

UndefElts.setBit(i);

6407

continue;

6408

}

6409

6410

// If only some bits are UNDEF then treat them as zero (or bail if not

6411

// supported).

6412

if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)

6413

return false;

6414

6415

EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);

6416

}

6417

return true;

6418

};

6419

6420

// Collect constant bits and insert into mask/undef bit masks.

6421

auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,

6422

unsigned UndefBitIndex) {

6423

if (!Cst)

6424

return false;

6425

if (isa<UndefValue>(Cst)) {

6426

Undefs.setBit(UndefBitIndex);

6427

return true;

6428

}

6429

if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {

6430

Mask = CInt->getValue();

6431

return true;

6432

}

6433

if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {

6434

Mask = CFP->getValueAPF().bitcastToAPInt();

6435

return true;

6436

}

6437

return false;

6438

};

6439

6440

// Handle UNDEFs.

6441

if (Op.isUndef()) {

6442

APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);

6443

SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));

6444

return CastBitData(UndefSrcElts, SrcEltBits);

6445

}

6446

6447

// Extract scalar constant bits.

6448

if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {

6449

APInt UndefSrcElts = APInt::getNullValue(1);

6450

SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());

6451

return CastBitData(UndefSrcElts, SrcEltBits);

6452

}

6453

if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {

6454

APInt UndefSrcElts = APInt::getNullValue(1);

6455

APInt RawBits = Cst->getValueAPF().bitcastToAPInt();

6456

SmallVector<APInt, 64> SrcEltBits(1, RawBits);

6457

return CastBitData(UndefSrcElts, SrcEltBits);

6458

}

6459

6460

// Extract constant bits from build vector.

6461

if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {

6462

unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();

6463

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

6464

6465

APInt UndefSrcElts(NumSrcElts, 0);

6466

SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));

6467

for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {

6468

const SDValue &Src = Op.getOperand(i);

6469

if (Src.isUndef()) {

6470

UndefSrcElts.setBit(i);

6471

continue;

6472

}

6473

auto *Cst = cast<ConstantSDNode>(Src);

6474

SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);

6475

}

6476

return CastBitData(UndefSrcElts, SrcEltBits);

6477

}

6478

if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {

6479

unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();

6480

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

6481

6482

APInt UndefSrcElts(NumSrcElts, 0);

6483

SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));

6484

for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {

6485

const SDValue &Src = Op.getOperand(i);

6486

if (Src.isUndef()) {

6487

UndefSrcElts.setBit(i);

6488

continue;

6489

}

6490

auto *Cst = cast<ConstantFPSDNode>(Src);

6491

APInt RawBits = Cst->getValueAPF().bitcastToAPInt();

6492

SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);

6493

}

6494

return CastBitData(UndefSrcElts, SrcEltBits);

6495

}

6496

6497

// Extract constant bits from constant pool vector.

6498

if (auto *Cst = getTargetConstantFromNode(Op)) {

6499

Type *CstTy = Cst->getType();

6500

unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();

6501

if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)

6502

return false;

6503

6504

unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();

6505

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

6506

6507

APInt UndefSrcElts(NumSrcElts, 0);

6508

SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));

6509

for (unsigned i = 0; i != NumSrcElts; ++i)

6510

if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],

6511

UndefSrcElts, i))

6512

return false;

6513

6514

return CastBitData(UndefSrcElts, SrcEltBits);

6515

}

6516

6517

// Extract constant bits from a broadcasted constant pool scalar.

6518

if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&

6519

EltSizeInBits <= VT.getScalarSizeInBits()) {

6520

auto *MemIntr = cast<MemIntrinsicSDNode>(Op);

6521

if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())

6522

return false;

6523

6524

SDValue Ptr = MemIntr->getBasePtr();

6525

if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {

6526

unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();

6527

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

6528

6529

APInt UndefSrcElts(NumSrcElts, 0);

6530

SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));

6531

if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {

6532

if (UndefSrcElts[0])

6533

UndefSrcElts.setBits(0, NumSrcElts);

6534

SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);

6535

return CastBitData(UndefSrcElts, SrcEltBits);

6536

}

6537

}

6538

}

6539

6540

// Extract constant bits from a subvector broadcast.

6541

if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {

6542

SmallVector<APInt, 16> SubEltBits;

6543

if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,

6544

UndefElts, SubEltBits, AllowWholeUndefs,

6545

AllowPartialUndefs)) {

6546

UndefElts = APInt::getSplat(NumElts, UndefElts);

6547

while (EltBits.size() < NumElts)

6548

EltBits.append(SubEltBits.begin(), SubEltBits.end());

6549

return true;

6550

}

6551

}

6552

6553

// Extract a rematerialized scalar constant insertion.

6554

if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&

6555

Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&

6556

isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {

6557

unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();

6558

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

6559

6560

APInt UndefSrcElts(NumSrcElts, 0);

6561

SmallVector<APInt, 64> SrcEltBits;

6562

auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));

6563

SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));

6564

SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));

6565

return CastBitData(UndefSrcElts, SrcEltBits);

6566

}

6567

6568

// Insert constant bits from a base and sub vector sources.

6569

if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {

6570

// TODO - support insert_subvector through bitcasts.

6571

if (EltSizeInBits != VT.getScalarSizeInBits())

6572

return false;

6573

6574

APInt UndefSubElts;

6575

SmallVector<APInt, 32> EltSubBits;

6576

if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,

6577

UndefSubElts, EltSubBits,

6578

AllowWholeUndefs, AllowPartialUndefs) &&

6579

getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,

6580

UndefElts, EltBits, AllowWholeUndefs,

6581

AllowPartialUndefs)) {

6582

unsigned BaseIdx = Op.getConstantOperandVal(2);

6583

UndefElts.insertBits(UndefSubElts, BaseIdx);

6584

for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)

6585

EltBits[BaseIdx + i] = EltSubBits[i];

6586

return true;

6587

}

6588

}

6589

6590

// Extract constant bits from a subvector's source.

6591

if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {

6592

// TODO - support extract_subvector through bitcasts.

6593

if (EltSizeInBits != VT.getScalarSizeInBits())

6594

return false;

6595

6596

if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,

6597

UndefElts, EltBits, AllowWholeUndefs,

6598

AllowPartialUndefs)) {

6599

EVT SrcVT = Op.getOperand(0).getValueType();

6600

unsigned NumSrcElts = SrcVT.getVectorNumElements();

6601

unsigned NumSubElts = VT.getVectorNumElements();

6602

unsigned BaseIdx = Op.getConstantOperandVal(1);

6603

UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);

6604

if ((BaseIdx + NumSubElts) != NumSrcElts)

6605

EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());

6606

if (BaseIdx != 0)

6607

EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);

6608

return true;

6609

}

6610

}

6611

6612

// Extract constant bits from shuffle node sources.

6613

if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {

6614

// TODO - support shuffle through bitcasts.

6615

if (EltSizeInBits != VT.getScalarSizeInBits())

6616

return false;

6617

6618

ArrayRef<int> Mask = SVN->getMask();

6619

if ((!AllowWholeUndefs || !AllowPartialUndefs) &&

6620

llvm::any_of(Mask, [](int M) { return M < 0; }))

6621

return false;

6622

6623

APInt UndefElts0, UndefElts1;

6624

SmallVector<APInt, 32> EltBits0, EltBits1;

6625

if (isAnyInRange(Mask, 0, NumElts) &&

6626

!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,

6627

UndefElts0, EltBits0, AllowWholeUndefs,

6628

AllowPartialUndefs))

6629

return false;

6630

if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&

6631

!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,

6632

UndefElts1, EltBits1, AllowWholeUndefs,

6633

AllowPartialUndefs))

6634

return false;

6635

6636

UndefElts = APInt::getNullValue(NumElts);

6637

for (int i = 0; i != (int)NumElts; ++i) {

6638

int M = Mask[i];

6639

if (M < 0) {

6640

UndefElts.setBit(i);

6641

EltBits.push_back(APInt::getNullValue(EltSizeInBits));

6642

} else if (M < (int)NumElts) {

6643

if (UndefElts0[M])

6644

UndefElts.setBit(i);

6645

EltBits.push_back(EltBits0[M]);

6646

} else {

6647

if (UndefElts1[M - NumElts])

6648

UndefElts.setBit(i);

6649

EltBits.push_back(EltBits1[M - NumElts]);

6650

}

6651

}

6652

return true;

6653

}

6654

6655

return false;

6656

}

6657

6658

namespace llvm {

6659

namespace X86 {

6660

bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {

6661

APInt UndefElts;

6662

SmallVector<APInt, 16> EltBits;

6663

if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),

6664

UndefElts, EltBits, true,

6665

AllowPartialUndefs)) {

6666

int SplatIndex = -1;

6667

for (int i = 0, e = EltBits.size(); i != e; ++i) {

6668

if (UndefElts[i])

6669

continue;

6670

if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {

6671

SplatIndex = -1;

6672

break;

6673

}

6674

SplatIndex = i;

6675

}

6676

if (0 <= SplatIndex) {

6677

SplatVal = EltBits[SplatIndex];

6678

return true;

6679

}

6680

}

6681

6682

return false;

6683

}

6684

} // namespace X86

6685

} // namespace llvm

6686

6687

static bool getTargetShuffleMaskIndices(SDValue MaskNode,

6688

unsigned MaskEltSizeInBits,

6689

SmallVectorImpl<uint64_t> &RawMask,

6690

APInt &UndefElts) {

6691

// Extract the raw target constant bits.

6692

SmallVector<APInt, 64> EltBits;

6693

if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,

6694

EltBits, /* AllowWholeUndefs */ true,

6695

/* AllowPartialUndefs */ false))

6696

return false;

6697

6698

// Insert the extracted elements into the mask.

6699

for (const APInt &Elt : EltBits)

6700

RawMask.push_back(Elt.getZExtValue());

6701

6702

return true;

6703

}

6704

6705

/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.

6706

/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.

6707

/// Note: This ignores saturation, so inputs must be checked first.

6708

static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,

6709

bool Unary, unsigned NumStages = 1) {

6710

assert(Mask.empty() && "Expected an empty shuffle mask vector")((Mask.empty() && "Expected an empty shuffle mask vector"
) ? static_cast<void> (0) : __assert_fail ("Mask.empty() && \"Expected an empty shuffle mask vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6710, __PRETTY_FUNCTION__));

6711

unsigned NumElts = VT.getVectorNumElements();

6712

unsigned NumLanes = VT.getSizeInBits() / 128;

6713

unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();

6714

unsigned Offset = Unary ? 0 : NumElts;

6715

unsigned Repetitions = 1u << (NumStages - 1);

6716

unsigned Increment = 1u << NumStages;

6717

assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction")(((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"
) ? static_cast<void> (0) : __assert_fail ("(NumEltsPerLane >> NumStages) > 0 && \"Illegal packing compaction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6717, __PRETTY_FUNCTION__));

6718

6719

for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {

6720

for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {

6721

for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)

6722

Mask.push_back(Elt + (Lane * NumEltsPerLane));

6723

for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)

6724

Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);

6725

}

6726

}

6727

}

6728

6729

// Split the demanded elts of a PACKSS/PACKUS node between its operands.

6730

static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,

6731

APInt &DemandedLHS, APInt &DemandedRHS) {

6732

int NumLanes = VT.getSizeInBits() / 128;

6733

int NumElts = DemandedElts.getBitWidth();

6734

int NumInnerElts = NumElts / 2;

6735

int NumEltsPerLane = NumElts / NumLanes;

6736

int NumInnerEltsPerLane = NumInnerElts / NumLanes;

6737

6738

DemandedLHS = APInt::getNullValue(NumInnerElts);

6739

DemandedRHS = APInt::getNullValue(NumInnerElts);

6740

6741

// Map DemandedElts to the packed operands.

6742

for (int Lane = 0; Lane != NumLanes; ++Lane) {

6743

for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {

6744

int OuterIdx = (Lane * NumEltsPerLane) + Elt;

6745

int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;

6746

if (DemandedElts[OuterIdx])

6747

DemandedLHS.setBit(InnerIdx);

6748

if (DemandedElts[OuterIdx + NumInnerEltsPerLane])

6749

DemandedRHS.setBit(InnerIdx);

6750

}

6751

}

6752

}

6753

6754

// Split the demanded elts of a HADD/HSUB node between its operands.

6755

static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,

6756

APInt &DemandedLHS, APInt &DemandedRHS) {

6757

int NumLanes = VT.getSizeInBits() / 128;

6758

int NumElts = DemandedElts.getBitWidth();

6759

int NumEltsPerLane = NumElts / NumLanes;

6760

int HalfEltsPerLane = NumEltsPerLane / 2;

6761

6762

DemandedLHS = APInt::getNullValue(NumElts);

6763

DemandedRHS = APInt::getNullValue(NumElts);

6764

6765

// Map DemandedElts to the horizontal operands.

6766

for (int Idx = 0; Idx != NumElts; ++Idx) {

6767

if (!DemandedElts[Idx])

6768

continue;

6769

int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;

6770

int LocalIdx = Idx % NumEltsPerLane;

6771

if (LocalIdx < HalfEltsPerLane) {

6772

DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);

6773

DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);

6774

} else {

6775

LocalIdx -= HalfEltsPerLane;

6776

DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);

6777

DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);

6778

}

6779

}

6780

}

6781

6782

/// Calculates the shuffle mask corresponding to the target-specific opcode.

6783

/// If the mask could be calculated, returns it in \p Mask, returns the shuffle

6784

/// operands in \p Ops, and returns true.

6785

/// Sets \p IsUnary to true if only one source is used. Note that this will set

6786

/// IsUnary for shuffles which use a single input multiple times, and in those

6787

/// cases it will adjust the mask to only have indices within that single input.

6788

/// It is an error to call this with non-empty Mask/Ops vectors.

6789

static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,

6790

SmallVectorImpl<SDValue> &Ops,

6791

SmallVectorImpl<int> &Mask, bool &IsUnary) {

6792

unsigned NumElems = VT.getVectorNumElements();

6793

unsigned MaskEltSize = VT.getScalarSizeInBits();

6794

SmallVector<uint64_t, 32> RawMask;

6795

APInt RawUndefs;

6796

uint64_t ImmN;

6797

6798

assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector")((Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"
) ? static_cast<void> (0) : __assert_fail ("Mask.empty() && \"getTargetShuffleMask expects an empty Mask vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6798, __PRETTY_FUNCTION__));

6799

assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector")((Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"
) ? static_cast<void> (0) : __assert_fail ("Ops.empty() && \"getTargetShuffleMask expects an empty Ops vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6799, __PRETTY_FUNCTION__));

6800

6801

IsUnary = false;

6802

bool IsFakeUnary = false;

6803

switch (N->getOpcode()) {

6804

case X86ISD::BLENDI:

6805

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6805, __PRETTY_FUNCTION__));

6806

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6806, __PRETTY_FUNCTION__));

6807

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6808

DecodeBLENDMask(NumElems, ImmN, Mask);

6809

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6810

break;

6811

case X86ISD::SHUFP:

6812

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6812, __PRETTY_FUNCTION__));

6813

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6813, __PRETTY_FUNCTION__));

6814

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6815

DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);

6816

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6817

break;

6818

case X86ISD::INSERTPS:

6819

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6819, __PRETTY_FUNCTION__));

6820

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6820, __PRETTY_FUNCTION__));

6821

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6822

DecodeINSERTPSMask(ImmN, Mask);

6823

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6824

break;

6825

case X86ISD::EXTRQI:

6826

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6826, __PRETTY_FUNCTION__));

6827

if (isa<ConstantSDNode>(N->getOperand(1)) &&

6828

isa<ConstantSDNode>(N->getOperand(2))) {

6829

int BitLen = N->getConstantOperandVal(1);

6830

int BitIdx = N->getConstantOperandVal(2);

6831

DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);

6832

IsUnary = true;

6833

}

6834

break;

6835

case X86ISD::INSERTQI:

6836

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6836, __PRETTY_FUNCTION__));

6837

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6837, __PRETTY_FUNCTION__));

6838

if (isa<ConstantSDNode>(N->getOperand(2)) &&

6839

isa<ConstantSDNode>(N->getOperand(3))) {

6840

int BitLen = N->getConstantOperandVal(2);

6841

int BitIdx = N->getConstantOperandVal(3);

6842

DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);

6843

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6844

}

6845

break;

6846

case X86ISD::UNPCKH:

6847

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6847, __PRETTY_FUNCTION__));

6848

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6848, __PRETTY_FUNCTION__));

6849

DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);

6850

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6851

break;

6852

case X86ISD::UNPCKL:

6853

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6853, __PRETTY_FUNCTION__));

6854

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6854, __PRETTY_FUNCTION__));

6855

DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);

6856

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6857

break;

6858

case X86ISD::MOVHLPS:

6859

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6859, __PRETTY_FUNCTION__));

6860

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6860, __PRETTY_FUNCTION__));

6861

DecodeMOVHLPSMask(NumElems, Mask);

6862

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6863

break;

6864

case X86ISD::MOVLHPS:

6865

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6865, __PRETTY_FUNCTION__));

6866

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6866, __PRETTY_FUNCTION__));

6867

DecodeMOVLHPSMask(NumElems, Mask);

6868

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6869

break;

6870

case X86ISD::VALIGN:

6871

assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&(((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT
::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6872, __PRETTY_FUNCTION__))

6872

"Only 32-bit and 64-bit elements are supported!")(((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT
::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6872, __PRETTY_FUNCTION__));

6873

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6873, __PRETTY_FUNCTION__));

6874

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6874, __PRETTY_FUNCTION__));

6875

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6876

DecodeVALIGNMask(NumElems, ImmN, Mask);

6877

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6878

Ops.push_back(N->getOperand(1));

6879

Ops.push_back(N->getOperand(0));

6880

break;

6881

case X86ISD::PALIGNR:

6882

assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((VT.getScalarType() == MVT::i8 && "Byte vector expected"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6882, __PRETTY_FUNCTION__));

6883

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6883, __PRETTY_FUNCTION__));

6884

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6884, __PRETTY_FUNCTION__));

6885

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6886

DecodePALIGNRMask(NumElems, ImmN, Mask);

6887

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6888

Ops.push_back(N->getOperand(1));

6889

Ops.push_back(N->getOperand(0));

6890

break;

6891

case X86ISD::VSHLDQ:

6892

assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((VT.getScalarType() == MVT::i8 && "Byte vector expected"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6892, __PRETTY_FUNCTION__));

6893

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6893, __PRETTY_FUNCTION__));

6894

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6895

DecodePSLLDQMask(NumElems, ImmN, Mask);

6896

IsUnary = true;

6897

break;

6898

case X86ISD::VSRLDQ:

6899

assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((VT.getScalarType() == MVT::i8 && "Byte vector expected"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6899, __PRETTY_FUNCTION__));

6900

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6900, __PRETTY_FUNCTION__));

6901

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6902

DecodePSRLDQMask(NumElems, ImmN, Mask);

6903

IsUnary = true;

6904

break;

6905

case X86ISD::PSHUFD:

6906

case X86ISD::VPERMILPI:

6907

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6907, __PRETTY_FUNCTION__));

6908

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6909

DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);

6910

IsUnary = true;

6911

break;

6912

case X86ISD::PSHUFHW:

6913

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6913, __PRETTY_FUNCTION__));

6914

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6915

DecodePSHUFHWMask(NumElems, ImmN, Mask);

6916

IsUnary = true;

6917

break;

6918

case X86ISD::PSHUFLW:

6919

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6919, __PRETTY_FUNCTION__));

6920

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6921

DecodePSHUFLWMask(NumElems, ImmN, Mask);

6922

IsUnary = true;

6923

break;

6924

case X86ISD::VZEXT_MOVL:

6925

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6925, __PRETTY_FUNCTION__));

6926

DecodeZeroMoveLowMask(NumElems, Mask);

6927

IsUnary = true;

6928

break;

6929

case X86ISD::VBROADCAST:

6930

// We only decode broadcasts of same-sized vectors, peeking through to

6931

// extracted subvectors is likely to cause hasOneUse issues with

6932

// SimplifyDemandedBits etc.

6933

if (N->getOperand(0).getValueType() == VT) {

6934

DecodeVectorBroadcast(NumElems, Mask);

6935

IsUnary = true;

6936

break;

6937

}

6938

return false;

6939

case X86ISD::VPERMILPV: {

6940

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6940, __PRETTY_FUNCTION__));

6941

IsUnary = true;

6942

SDValue MaskNode = N->getOperand(1);

6943

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,

6944

RawUndefs)) {

6945

DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);

6946

break;

6947

}

6948

return false;

6949

}

6950

case X86ISD::PSHUFB: {

6951

assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((VT.getScalarType() == MVT::i8 && "Byte vector expected"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6951, __PRETTY_FUNCTION__));

6952

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6952, __PRETTY_FUNCTION__));

6953

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6953, __PRETTY_FUNCTION__));

6954

IsUnary = true;

6955

SDValue MaskNode = N->getOperand(1);

6956

if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {

6957

DecodePSHUFBMask(RawMask, RawUndefs, Mask);

6958

break;

6959

}

6960

return false;

6961

}

6962

case X86ISD::VPERMI:

6963

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6963, __PRETTY_FUNCTION__));

6964

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6965

DecodeVPERMMask(NumElems, ImmN, Mask);

6966

IsUnary = true;

6967

break;

6968

case X86ISD::MOVSS:

6969

case X86ISD::MOVSD:

6970

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6970, __PRETTY_FUNCTION__));

6971

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6971, __PRETTY_FUNCTION__));

6972

DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);

6973

break;

6974

case X86ISD::VPERM2X128:

6975

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6975, __PRETTY_FUNCTION__));

6976

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6976, __PRETTY_FUNCTION__));

6977

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6978

DecodeVPERM2X128Mask(NumElems, ImmN, Mask);

6979

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6980

break;

6981

case X86ISD::SHUF128:

6982

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6982, __PRETTY_FUNCTION__));

6983

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6983, __PRETTY_FUNCTION__));

6984

ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);

6985

decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);

6986

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

6987

break;

6988

case X86ISD::MOVSLDUP:

6989

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6989, __PRETTY_FUNCTION__));

6990

DecodeMOVSLDUPMask(NumElems, Mask);

6991

IsUnary = true;

6992

break;

6993

case X86ISD::MOVSHDUP:

6994

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6994, __PRETTY_FUNCTION__));

6995

DecodeMOVSHDUPMask(NumElems, Mask);

6996

IsUnary = true;

6997

break;

6998

case X86ISD::MOVDDUP:

6999

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 6999, __PRETTY_FUNCTION__));

7000

DecodeMOVDDUPMask(NumElems, Mask);

7001

IsUnary = true;

7002

break;

7003

case X86ISD::VPERMIL2: {

7004

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7004, __PRETTY_FUNCTION__));

7005

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7005, __PRETTY_FUNCTION__));

7006

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

7007

SDValue MaskNode = N->getOperand(2);

7008

SDValue CtrlNode = N->getOperand(3);

7009

if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {

7010

unsigned CtrlImm = CtrlOp->getZExtValue();

7011

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,

7012

RawUndefs)) {

7013

DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,

7014

Mask);

7015

break;

7016

}

7017

}

7018

return false;

7019

}

7020

case X86ISD::VPPERM: {

7021

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7021, __PRETTY_FUNCTION__));

7022

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7022, __PRETTY_FUNCTION__));

7023

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

7024

SDValue MaskNode = N->getOperand(2);

7025

if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {

7026

DecodeVPPERMMask(RawMask, RawUndefs, Mask);

7027

break;

7028

}

7029

return false;

7030

}

7031

case X86ISD::VPERMV: {

7032

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((N->getOperand(1).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7032, __PRETTY_FUNCTION__));

7033

IsUnary = true;

7034

// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.

7035

Ops.push_back(N->getOperand(1));

7036

SDValue MaskNode = N->getOperand(0);

7037

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,

7038

RawUndefs)) {

7039

DecodeVPERMVMask(RawMask, RawUndefs, Mask);

7040

break;

7041

}

7042

return false;

7043

}

7044

case X86ISD::VPERMV3: {

7045

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((N->getOperand(0).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7045, __PRETTY_FUNCTION__));

7046

assert(N->getOperand(2).getValueType() == VT && "Unexpected value type")((N->getOperand(2).getValueType() == VT && "Unexpected value type"
) ? static_cast<void> (0) : __assert_fail ("N->getOperand(2).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7046, __PRETTY_FUNCTION__));

7047

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);

7048

// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.

7049

Ops.push_back(N->getOperand(0));

7050

Ops.push_back(N->getOperand(2));

7051

SDValue MaskNode = N->getOperand(1);

7052

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,

7053

RawUndefs)) {

7054

DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);

7055

break;

7056

}

7057

return false;

7058

}

7059

default: llvm_unreachable("unknown target shuffle node")::llvm::llvm_unreachable_internal("unknown target shuffle node"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7059);

7060

}

7061

7062

// Empty mask indicates the decode failed.

7063

if (Mask.empty())

7064

return false;

7065

7066

// Check if we're getting a shuffle mask with zero'd elements.

7067

if (!AllowSentinelZero && isAnyZero(Mask))

7068

return false;

7069

7070

// If we have a fake unary shuffle, the shuffle mask is spread across two

7071

// inputs that are actually the same node. Re-map the mask to always point

7072

// into the first input.

7073

if (IsFakeUnary)

7074

for (int &M : Mask)

7075

if (M >= (int)Mask.size())

7076

M -= Mask.size();

7077

7078

// If we didn't already add operands in the opcode-specific code, default to

7079

// adding 1 or 2 operands starting at 0.

7080

if (Ops.empty()) {

7081

Ops.push_back(N->getOperand(0));

7082

if (!IsUnary || IsFakeUnary)

7083

Ops.push_back(N->getOperand(1));

7084

}

7085

7086

return true;

7087

}

7088

7089

/// Compute whether each element of a shuffle is zeroable.

7090

///

7091

/// A "zeroable" vector shuffle element is one which can be lowered to zero.

7092

/// Either it is an undef element in the shuffle mask, the element of the input

7093

/// referenced is undef, or the element of the input referenced is known to be

7094

/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle

7095

/// as many lanes with this technique as possible to simplify the remaining

7096

/// shuffle.

7097

static void computeZeroableShuffleElements(ArrayRef<int> Mask,

7098

SDValue V1, SDValue V2,

7099

APInt &KnownUndef, APInt &KnownZero) {

7100

int Size = Mask.size();

7101

KnownUndef = KnownZero = APInt::getNullValue(Size);

7102

7103

V1 = peekThroughBitcasts(V1);

7104

V2 = peekThroughBitcasts(V2);

7105

7106

bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());

7107

bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

7108

7109

int VectorSizeInBits = V1.getValueSizeInBits();

7110

int ScalarSizeInBits = VectorSizeInBits / Size;

7111

assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size")((!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"
) ? static_cast<void> (0) : __assert_fail ("!(VectorSizeInBits % ScalarSizeInBits) && \"Illegal shuffle mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7111, __PRETTY_FUNCTION__));

7112

7113

for (int i = 0; i < Size; ++i) {

7114

int M = Mask[i];

7115

// Handle the easy cases.

7116

if (M < 0) {

7117

KnownUndef.setBit(i);

7118

continue;

7119

}

7120

if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {

7121

KnownZero.setBit(i);

7122

continue;

7123

}

7124

7125

// Determine shuffle input and normalize the mask.

7126

SDValue V = M < Size ? V1 : V2;

7127

M %= Size;

7128

7129

// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.

7130

if (V.getOpcode() != ISD::BUILD_VECTOR)

7131

continue;

7132

7133

// If the BUILD_VECTOR has fewer elements then the bitcasted portion of

7134

// the (larger) source element must be UNDEF/ZERO.

7135

if ((Size % V.getNumOperands()) == 0) {

7136

int Scale = Size / V->getNumOperands();

7137

SDValue Op = V.getOperand(M / Scale);

7138

if (Op.isUndef())

7139

KnownUndef.setBit(i);

7140

if (X86::isZeroNode(Op))

7141

KnownZero.setBit(i);

7142

else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {

7143

APInt Val = Cst->getAPIntValue();

7144

Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);

7145

if (Val == 0)

7146

KnownZero.setBit(i);

7147

} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {

7148

APInt Val = Cst->getValueAPF().bitcastToAPInt();

7149

Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);

7150

if (Val == 0)

7151

KnownZero.setBit(i);

7152

}

7153

continue;

7154

}

7155

7156

// If the BUILD_VECTOR has more elements then all the (smaller) source

7157

// elements must be UNDEF or ZERO.

7158

if ((V.getNumOperands() % Size) == 0) {

7159

int Scale = V->getNumOperands() / Size;

7160

bool AllUndef = true;

7161

bool AllZero = true;

7162

for (int j = 0; j < Scale; ++j) {

7163

SDValue Op = V.getOperand((M * Scale) + j);

7164

AllUndef &= Op.isUndef();

7165

AllZero &= X86::isZeroNode(Op);

7166

}

7167

if (AllUndef)

7168

KnownUndef.setBit(i);

7169

if (AllZero)

7170

KnownZero.setBit(i);

7171

continue;

7172

}

7173

}

7174

}

7175

7176

/// Decode a target shuffle mask and inputs and see if any values are

7177

/// known to be undef or zero from their inputs.

7178

/// Returns true if the target shuffle mask was decoded.

7179

/// FIXME: Merge this with computeZeroableShuffleElements?

7180

static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,

7181

SmallVectorImpl<SDValue> &Ops,

7182

APInt &KnownUndef, APInt &KnownZero) {

7183

bool IsUnary;

7184

if (!isTargetShuffle(N.getOpcode()))

7185

return false;

7186

7187

MVT VT = N.getSimpleValueType();

7188

if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))

7189

return false;

7190

7191

int Size = Mask.size();

7192

SDValue V1 = Ops[0];

7193

SDValue V2 = IsUnary ? V1 : Ops[1];

7194

KnownUndef = KnownZero = APInt::getNullValue(Size);

7195

7196

V1 = peekThroughBitcasts(V1);

7197

V2 = peekThroughBitcasts(V2);

7198

7199

assert((VT.getSizeInBits() % Size) == 0 &&(((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() % Size) == 0 && \"Illegal split of shuffle value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7200, __PRETTY_FUNCTION__))

7200

"Illegal split of shuffle value type")(((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() % Size) == 0 && \"Illegal split of shuffle value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7200, __PRETTY_FUNCTION__));

7201

unsigned EltSizeInBits = VT.getSizeInBits() / Size;

7202

7203

// Extract known constant input data.

7204

APInt UndefSrcElts[2];

7205

SmallVector<APInt, 32> SrcEltBits[2];

7206

bool IsSrcConstant[2] = {

7207

getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],

7208

SrcEltBits[0], true, false),

7209

getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],

7210

SrcEltBits[1], true, false)};

7211

7212

for (int i = 0; i < Size; ++i) {

7213

int M = Mask[i];

7214

7215

// Already decoded as SM_SentinelZero / SM_SentinelUndef.

7216

if (M < 0) {

7217

assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!")((isUndefOrZero(M) && "Unknown shuffle sentinel value!"
) ? static_cast<void> (0) : __assert_fail ("isUndefOrZero(M) && \"Unknown shuffle sentinel value!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7217, __PRETTY_FUNCTION__));

7218

if (SM_SentinelUndef == M)

7219

KnownUndef.setBit(i);

7220

if (SM_SentinelZero == M)

7221

KnownZero.setBit(i);

7222

continue;

7223

}

7224

7225

// Determine shuffle input and normalize the mask.

7226

unsigned SrcIdx = M / Size;

7227

SDValue V = M < Size ? V1 : V2;

7228

M %= Size;

7229

7230

// We are referencing an UNDEF input.

7231

if (V.isUndef()) {

7232

KnownUndef.setBit(i);

7233

continue;

7234

}

7235

7236

// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.

7237

// TODO: We currently only set UNDEF for integer types - floats use the same

7238

// registers as vectors and many of the scalar folded loads rely on the

7239

// SCALAR_TO_VECTOR pattern.

7240

if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&

7241

(Size % V.getValueType().getVectorNumElements()) == 0) {

7242

int Scale = Size / V.getValueType().getVectorNumElements();

7243

int Idx = M / Scale;

7244

if (Idx != 0 && !VT.isFloatingPoint())

7245

KnownUndef.setBit(i);

7246

else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))

7247

KnownZero.setBit(i);

7248

continue;

7249

}

7250

7251

// INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF

7252

// base vectors.

7253

if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {

7254

SDValue Vec = V.getOperand(0);

7255

int NumVecElts = Vec.getValueType().getVectorNumElements();

7256

if (Vec.isUndef() && Size == NumVecElts) {

7257

int Idx = V.getConstantOperandVal(2);

7258

int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();

7259

if (M < Idx || (Idx + NumSubElts) <= M)

7260

KnownUndef.setBit(i);

7261

}

7262

continue;

7263

}

7264

7265

// Attempt to extract from the source's constant bits.

7266

if (IsSrcConstant[SrcIdx]) {

7267

if (UndefSrcElts[SrcIdx][M])

7268

KnownUndef.setBit(i);

7269

else if (SrcEltBits[SrcIdx][M] == 0)

7270

KnownZero.setBit(i);

7271

}

7272

}

7273

7274

assert(VT.getVectorNumElements() == (unsigned)Size &&((VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorNumElements() == (unsigned)Size && \"Different mask size from vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7275, __PRETTY_FUNCTION__))

7275

"Different mask size from vector size!")((VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorNumElements() == (unsigned)Size && \"Different mask size from vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7275, __PRETTY_FUNCTION__));

7276

return true;

7277

}

7278

7279

// Replace target shuffle mask elements with known undef/zero sentinels.

7280

static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,

7281

const APInt &KnownUndef,

7282

const APInt &KnownZero,

7283

bool ResolveKnownZeros= true) {

7284

unsigned NumElts = Mask.size();

7285

assert(KnownUndef.getBitWidth() == NumElts &&((KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth
() == NumElts && "Shuffle mask size mismatch") ? static_cast
<void> (0) : __assert_fail ("KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && \"Shuffle mask size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7286, __PRETTY_FUNCTION__))

7286

KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch")((KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth
() == NumElts && "Shuffle mask size mismatch") ? static_cast
<void> (0) : __assert_fail ("KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && \"Shuffle mask size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7286, __PRETTY_FUNCTION__));

7287

7288

for (unsigned i = 0; i != NumElts; ++i) {

7289

if (KnownUndef[i])

7290

Mask[i] = SM_SentinelUndef;

7291

else if (ResolveKnownZeros && KnownZero[i])

7292

Mask[i] = SM_SentinelZero;

7293

}

7294

}

7295

7296

// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.

7297

static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,

7298

APInt &KnownUndef,

7299

APInt &KnownZero) {

7300

unsigned NumElts = Mask.size();

7301

KnownUndef = KnownZero = APInt::getNullValue(NumElts);

7302

7303

for (unsigned i = 0; i != NumElts; ++i) {

7304

int M = Mask[i];

7305

if (SM_SentinelUndef == M)

7306

KnownUndef.setBit(i);

7307

if (SM_SentinelZero == M)

7308

KnownZero.setBit(i);

7309

}

7310

}

7311

7312

// Forward declaration (for getFauxShuffleMask recursive check).

7313

// TODO: Use DemandedElts variant.

7314

static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,

7315

SmallVectorImpl<int> &Mask,

7316

const SelectionDAG &DAG, unsigned Depth,

7317

bool ResolveKnownElts);

7318

7319

// Attempt to decode ops that could be represented as a shuffle mask.

7320

// The decoded shuffle mask may contain a different number of elements to the

7321

// destination value type.

7322

static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,

7323

SmallVectorImpl<int> &Mask,

7324

SmallVectorImpl<SDValue> &Ops,

7325

const SelectionDAG &DAG, unsigned Depth,

7326

bool ResolveKnownElts) {

7327

Mask.clear();

7328

Ops.clear();

7329

7330

MVT VT = N.getSimpleValueType();

7331

unsigned NumElts = VT.getVectorNumElements();

7332

unsigned NumSizeInBits = VT.getSizeInBits();

7333

unsigned NumBitsPerElt = VT.getScalarSizeInBits();

7334

if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)

7335

return false;

7336

assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size")((NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"
) ? static_cast<void> (0) : __assert_fail ("NumElts == DemandedElts.getBitWidth() && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7336, __PRETTY_FUNCTION__));

7337

unsigned NumSizeInBytes = NumSizeInBits / 8;

7338

unsigned NumBytesPerElt = NumBitsPerElt / 8;

7339

7340

unsigned Opcode = N.getOpcode();

7341

switch (Opcode) {

7342

case ISD::VECTOR_SHUFFLE: {

7343

// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.

7344

ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();

7345

if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {

7346

Mask.append(ShuffleMask.begin(), ShuffleMask.end());

7347

Ops.push_back(N.getOperand(0));

7348

Ops.push_back(N.getOperand(1));

7349

return true;

7350

}

7351

return false;

7352

}

7353

case ISD::AND:

7354

case X86ISD::ANDNP: {

7355

// Attempt to decode as a per-byte mask.

7356

APInt UndefElts;

7357

SmallVector<APInt, 32> EltBits;

7358

SDValue N0 = N.getOperand(0);

7359

SDValue N1 = N.getOperand(1);

7360

bool IsAndN = (X86ISD::ANDNP == Opcode);

7361

uint64_t ZeroMask = IsAndN ? 255 : 0;

7362

if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))

7363

return false;

7364

for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {

7365

if (UndefElts[i]) {

7366

Mask.push_back(SM_SentinelUndef);

7367

continue;

7368

}

7369

const APInt &ByteBits = EltBits[i];

7370

if (ByteBits != 0 && ByteBits != 255)

7371

return false;

7372

Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);

7373

}

7374

Ops.push_back(IsAndN ? N1 : N0);

7375

return true;

7376

}

7377

case ISD::OR: {

7378

// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other

7379

// is a valid shuffle index.

7380

SDValue N0 = peekThroughBitcasts(N.getOperand(0));

7381

SDValue N1 = peekThroughBitcasts(N.getOperand(1));

7382

if (!N0.getValueType().isVector() || !N1.getValueType().isVector())

7383

return false;

7384

SmallVector<int, 64> SrcMask0, SrcMask1;

7385

SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;

7386

if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,

7387

true) ||

7388

!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,

7389

true))

7390

return false;

7391

7392

size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());

7393

SmallVector<int, 64> Mask0, Mask1;

7394

narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);

7395

narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);

7396

for (int i = 0; i != (int)MaskSize; ++i) {

7397

if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)

7398

Mask.push_back(SM_SentinelUndef);

7399

else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)

7400

Mask.push_back(SM_SentinelZero);

7401

else if (Mask1[i] == SM_SentinelZero)

7402

Mask.push_back(i);

7403

else if (Mask0[i] == SM_SentinelZero)

7404

Mask.push_back(i + MaskSize);

7405

else

7406

return false;

7407

}

7408

Ops.push_back(N0);

7409

Ops.push_back(N1);

7410

return true;

7411

}

7412

case ISD::INSERT_SUBVECTOR: {

7413

SDValue Src = N.getOperand(0);

7414

SDValue Sub = N.getOperand(1);

7415

EVT SubVT = Sub.getValueType();

7416

unsigned NumSubElts = SubVT.getVectorNumElements();

7417

if (!N->isOnlyUserOf(Sub.getNode()))

7418

return false;

7419

uint64_t InsertIdx = N.getConstantOperandVal(2);

7420

// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).

7421

if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

7422

Sub.getOperand(0).getValueType() == VT) {

7423

uint64_t ExtractIdx = Sub.getConstantOperandVal(1);

7424

for (int i = 0; i != (int)NumElts; ++i)

7425

Mask.push_back(i);

7426

for (int i = 0; i != (int)NumSubElts; ++i)

7427

Mask[InsertIdx + i] = NumElts + ExtractIdx + i;

7428

Ops.push_back(Src);

7429

Ops.push_back(Sub.getOperand(0));

7430

return true;

7431

}

7432

// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).

7433

SmallVector<int, 64> SubMask;

7434

SmallVector<SDValue, 2> SubInputs;

7435

if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,

7436

SubMask, DAG, Depth + 1, ResolveKnownElts))

7437

return false;

7438

7439

// Subvector shuffle inputs must not be larger than the subvector.

7440

if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {

7441

return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();

7442

}))

7443

return false;

7444

7445

if (SubMask.size() != NumSubElts) {

7446

assert(((SubMask.size() % NumSubElts) == 0 ||((((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask
.size()) == 0) && "Illegal submask scale") ? static_cast
<void> (0) : __assert_fail ("((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && \"Illegal submask scale\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7447, __PRETTY_FUNCTION__))

7447

(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale")((((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask
.size()) == 0) && "Illegal submask scale") ? static_cast
<void> (0) : __assert_fail ("((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && \"Illegal submask scale\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7447, __PRETTY_FUNCTION__));

7448

if ((NumSubElts % SubMask.size()) == 0) {

7449

int Scale = NumSubElts / SubMask.size();

7450

SmallVector<int,64> ScaledSubMask;

7451

narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);

7452

SubMask = ScaledSubMask;

7453

} else {

7454

int Scale = SubMask.size() / NumSubElts;

7455

NumSubElts = SubMask.size();

7456

NumElts *= Scale;

7457

InsertIdx *= Scale;

7458

}

7459

}

7460

Ops.push_back(Src);

7461

Ops.append(SubInputs.begin(), SubInputs.end());

7462

if (ISD::isBuildVectorAllZeros(Src.getNode()))

7463

Mask.append(NumElts, SM_SentinelZero);

7464

else

7465

for (int i = 0; i != (int)NumElts; ++i)

7466

Mask.push_back(i);

7467

for (int i = 0; i != (int)NumSubElts; ++i) {

7468

int M = SubMask[i];

7469

if (0 <= M) {

7470

int InputIdx = M / NumSubElts;

7471

M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);

7472

}

7473

Mask[i + InsertIdx] = M;

7474

}

7475

return true;

7476

}

7477

case X86ISD::PINSRB:

7478

case X86ISD::PINSRW:

7479

case ISD::SCALAR_TO_VECTOR:

7480

case ISD::INSERT_VECTOR_ELT: {

7481

// Match against a insert_vector_elt/scalar_to_vector of an extract from a

7482

// vector, for matching src/dst vector types.

7483

SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

7484

7485

unsigned DstIdx = 0;

7486

if (Opcode != ISD::SCALAR_TO_VECTOR) {

7487

// Check we have an in-range constant insertion index.

7488

if (!isa<ConstantSDNode>(N.getOperand(2)) ||

7489

N.getConstantOperandAPInt(2).uge(NumElts))

7490

return false;

7491

DstIdx = N.getConstantOperandVal(2);

7492

7493

// Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.

7494

if (X86::isZeroNode(Scl)) {

7495

Ops.push_back(N.getOperand(0));

7496

for (unsigned i = 0; i != NumElts; ++i)

7497

Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);

7498

return true;

7499

}

7500

}

7501

7502

// Peek through trunc/aext/zext.

7503

// TODO: aext shouldn't require SM_SentinelZero padding.

7504

// TODO: handle shift of scalars.

7505

unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();

7506

while (Scl.getOpcode() == ISD::TRUNCATE ||

7507

Scl.getOpcode() == ISD::ANY_EXTEND ||

7508

Scl.getOpcode() == ISD::ZERO_EXTEND) {

7509

Scl = Scl.getOperand(0);

7510

MinBitsPerElt =

7511

std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());

7512

}

7513

if ((MinBitsPerElt % 8) != 0)

7514

return false;

7515

7516

// Attempt to find the source vector the scalar was extracted from.

7517

SDValue SrcExtract;

7518

if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||

7519

Scl.getOpcode() == X86ISD::PEXTRW ||

7520

Scl.getOpcode() == X86ISD::PEXTRB) &&

7521

Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {

7522

SrcExtract = Scl;

7523

}

7524

if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))

7525

return false;

7526

7527

SDValue SrcVec = SrcExtract.getOperand(0);

7528

EVT SrcVT = SrcVec.getValueType();

7529

if (!SrcVT.getScalarType().isByteSized())

7530

return false;

7531

unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);

7532

unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);

7533

unsigned DstByte = DstIdx * NumBytesPerElt;

7534

MinBitsPerElt =

7535

std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());

7536

7537

// Create 'identity' byte level shuffle mask and then add inserted bytes.

7538

if (Opcode == ISD::SCALAR_TO_VECTOR) {

7539

Ops.push_back(SrcVec);

7540

Mask.append(NumSizeInBytes, SM_SentinelUndef);

7541

} else {

7542

Ops.push_back(SrcVec);

7543

Ops.push_back(N.getOperand(0));

7544

for (int i = 0; i != (int)NumSizeInBytes; ++i)

7545

Mask.push_back(NumSizeInBytes + i);

7546

}

7547

7548

unsigned MinBytesPerElts = MinBitsPerElt / 8;

7549

MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);

7550

for (unsigned i = 0; i != MinBytesPerElts; ++i)

7551

Mask[DstByte + i] = SrcByte + i;

7552

for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)

7553

Mask[DstByte + i] = SM_SentinelZero;

7554

return true;

7555

}

7556

case X86ISD::PACKSS:

7557

case X86ISD::PACKUS: {

7558

SDValue N0 = N.getOperand(0);

7559

SDValue N1 = N.getOperand(1);

7560

assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&((N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type") ? static_cast<void> (0)
: __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7562, __PRETTY_FUNCTION__))

7561

N1.getValueType().getVectorNumElements() == (NumElts / 2) &&((N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type") ? static_cast<void> (0)
: __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7562, __PRETTY_FUNCTION__))

7562

"Unexpected input value type")((N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type") ? static_cast<void> (0)
: __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7562, __PRETTY_FUNCTION__));

7563

7564

APInt EltsLHS, EltsRHS;

7565

getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

7566

7567

// If we know input saturation won't happen (or we don't care for particular

7568

// lanes), we can treat this as a truncation shuffle.

7569

if (Opcode == X86ISD::PACKSS) {

7570

if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&

7571

DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||

7572

(!(N1.isUndef() || EltsRHS.isNullValue()) &&

7573

DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))

7574

return false;

7575

} else {

7576

APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);

7577

if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&

7578

!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||

7579

(!(N1.isUndef() || EltsRHS.isNullValue()) &&

7580

!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))

7581

return false;

7582

}

7583

7584

bool IsUnary = (N0 == N1);

7585

7586

Ops.push_back(N0);

7587

if (!IsUnary)

7588

Ops.push_back(N1);

7589

7590

createPackShuffleMask(VT, Mask, IsUnary);

7591

return true;

7592

}

7593

case X86ISD::VTRUNC: {

7594

SDValue Src = N.getOperand(0);

7595

EVT SrcVT = Src.getValueType();

7596

// Truncated source must be a simple vector.

7597

if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||

7598

(SrcVT.getScalarSizeInBits() % 8) != 0)

7599

return false;

7600

unsigned NumSrcElts = SrcVT.getVectorNumElements();

7601

unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();

7602

unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;

7603

assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation")(((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation"
) ? static_cast<void> (0) : __assert_fail ("(NumBitsPerSrcElt % NumBitsPerElt) == 0 && \"Illegal truncation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7603, __PRETTY_FUNCTION__));

7604

for (unsigned i = 0; i != NumSrcElts; ++i)

7605

Mask.push_back(i * Scale);

7606

Mask.append(NumElts - NumSrcElts, SM_SentinelZero);

7607

Ops.push_back(Src);

7608

return true;

7609

}

7610

case X86ISD::VSHLI:

7611

case X86ISD::VSRLI: {

7612

uint64_t ShiftVal = N.getConstantOperandVal(1);

7613

// Out of range bit shifts are guaranteed to be zero.

7614

if (NumBitsPerElt <= ShiftVal) {

7615

Mask.append(NumElts, SM_SentinelZero);

7616

return true;

7617

}

7618

7619

// We can only decode 'whole byte' bit shifts as shuffles.

7620

if ((ShiftVal % 8) != 0)

7621

break;

7622

7623

uint64_t ByteShift = ShiftVal / 8;

7624

Ops.push_back(N.getOperand(0));

7625

7626

// Clear mask to all zeros and insert the shifted byte indices.

7627

Mask.append(NumSizeInBytes, SM_SentinelZero);

7628

7629

if (X86ISD::VSHLI == Opcode) {

7630

for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)

7631

for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)

7632

Mask[i + j] = i + j - ByteShift;

7633

} else {

7634

for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)

7635

for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)

7636

Mask[i + j - ByteShift] = i + j;

7637

}

7638

return true;

7639

}

7640

case X86ISD::VROTLI:

7641

case X86ISD::VROTRI: {

7642

// We can only decode 'whole byte' bit rotates as shuffles.

7643

uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);

7644

if ((RotateVal % 8) != 0)

7645

return false;

7646

Ops.push_back(N.getOperand(0));

7647

int Offset = RotateVal / 8;

7648

Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);

7649

for (int i = 0; i != (int)NumElts; ++i) {

7650

int BaseIdx = i * NumBytesPerElt;

7651

for (int j = 0; j != (int)NumBytesPerElt; ++j) {

7652

Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));

7653

}

7654

}

7655

return true;

7656

}

7657

case X86ISD::VBROADCAST: {

7658

SDValue Src = N.getOperand(0);

7659

if (!Src.getSimpleValueType().isVector())

7660

return false;

7661

Ops.push_back(Src);

7662

Mask.append(NumElts, 0);

7663

return true;

7664

}

7665

case ISD::ZERO_EXTEND:

7666

case ISD::ANY_EXTEND:

7667

case ISD::ZERO_EXTEND_VECTOR_INREG:

7668

case ISD::ANY_EXTEND_VECTOR_INREG: {

7669

SDValue Src = N.getOperand(0);

7670

EVT SrcVT = Src.getValueType();

7671

7672

// Extended source must be a simple vector.

7673

if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||

7674

(SrcVT.getScalarSizeInBits() % 8) != 0)

7675

return false;

7676

7677

bool IsAnyExtend =

7678

(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);

7679

DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,

7680

IsAnyExtend, Mask);

7681

Ops.push_back(Src);

7682

return true;

7683

}

7684

}

7685

7686

return false;

7687

}

7688

7689

/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.

7690

static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,

7691

SmallVectorImpl<int> &Mask) {

7692

int MaskWidth = Mask.size();

7693

SmallVector<SDValue, 16> UsedInputs;

7694

for (int i = 0, e = Inputs.size(); i < e; ++i) {

7695

int lo = UsedInputs.size() * MaskWidth;

7696

int hi = lo + MaskWidth;

7697

7698

// Strip UNDEF input usage.

7699

if (Inputs[i].isUndef())

7700

for (int &M : Mask)

7701

if ((lo <= M) && (M < hi))

7702

M = SM_SentinelUndef;

7703

7704

// Check for unused inputs.

7705

if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {

7706

for (int &M : Mask)

7707

if (lo <= M)

7708

M -= MaskWidth;

7709

continue;

7710

}

7711

7712

// Check for repeated inputs.

7713

bool IsRepeat = false;

7714

for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {

7715

if (UsedInputs[j] != Inputs[i])

7716

continue;

7717

for (int &M : Mask)

7718

if (lo <= M)

7719

M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);

7720

IsRepeat = true;

7721

break;

7722

}

7723

if (IsRepeat)

7724

continue;

7725

7726

UsedInputs.push_back(Inputs[i]);

7727

}

7728

Inputs = UsedInputs;

7729

}

7730

7731

/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs

7732

/// and then sets the SM_SentinelUndef and SM_SentinelZero values.

7733

/// Returns true if the target shuffle mask was decoded.

7734

static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,

7735

SmallVectorImpl<SDValue> &Inputs,

7736

SmallVectorImpl<int> &Mask,

7737

APInt &KnownUndef, APInt &KnownZero,

7738

const SelectionDAG &DAG, unsigned Depth,

7739

bool ResolveKnownElts) {

7740

EVT VT = Op.getValueType();

7741

if (!VT.isSimple() || !VT.isVector())

7742

return false;

7743

7744

if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {

7745

if (ResolveKnownElts)

7746

resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);

7747

return true;

7748

}

7749

if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,

7750

ResolveKnownElts)) {

7751

resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);

7752

return true;

7753

}

7754

return false;

7755

}

7756

7757

static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,

7758

SmallVectorImpl<int> &Mask,

7759

const SelectionDAG &DAG, unsigned Depth = 0,

7760

bool ResolveKnownElts = true) {

7761

EVT VT = Op.getValueType();

7762

if (!VT.isSimple() || !VT.isVector())

7763

return false;

7764

7765

APInt KnownUndef, KnownZero;

7766

unsigned NumElts = Op.getValueType().getVectorNumElements();

7767

APInt DemandedElts = APInt::getAllOnesValue(NumElts);

7768

return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,

7769

KnownZero, DAG, Depth, ResolveKnownElts);

7770

}

7771

7772

/// Returns the scalar element that will make up the i'th

7773

/// element of the result of the vector shuffle.

7774

static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,

7775

SelectionDAG &DAG, unsigned Depth) {

7776

if (Depth >= SelectionDAG::MaxRecursionDepth)

7777

return SDValue(); // Limit search depth.

7778

7779

EVT VT = Op.getValueType();

7780

unsigned Opcode = Op.getOpcode();

7781

unsigned NumElems = VT.getVectorNumElements();

7782

7783

// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.

7784

if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {

7785

int Elt = SV->getMaskElt(Index);

7786

7787

if (Elt < 0)

7788

return DAG.getUNDEF(VT.getVectorElementType());

7789

7790

SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);

7791

return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);

7792

}

7793

7794

// Recurse into target specific vector shuffles to find scalars.

7795

if (isTargetShuffle(Opcode)) {

7796

MVT ShufVT = VT.getSimpleVT();

7797

MVT ShufSVT = ShufVT.getVectorElementType();

7798

int NumElems = (int)ShufVT.getVectorNumElements();

7799

SmallVector<int, 16> ShuffleMask;

7800

SmallVector<SDValue, 16> ShuffleOps;

7801

bool IsUnary;

7802

7803

if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,

7804

ShuffleMask, IsUnary))

7805

return SDValue();

7806

7807

int Elt = ShuffleMask[Index];

7808

if (Elt == SM_SentinelZero)

7809

return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)

7810

: DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);

7811

if (Elt == SM_SentinelUndef)

7812

return DAG.getUNDEF(ShufSVT);

7813

7814

assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range")((0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range"
) ? static_cast<void> (0) : __assert_fail ("0 <= Elt && Elt < (2 * NumElems) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7814, __PRETTY_FUNCTION__));

7815

SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];

7816

return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);

7817

}

7818

7819

// Recurse into insert_subvector base/sub vector to find scalars.

7820

if (Opcode == ISD::INSERT_SUBVECTOR) {

7821

SDValue Vec = Op.getOperand(0);

7822

SDValue Sub = Op.getOperand(1);

7823

uint64_t SubIdx = Op.getConstantOperandVal(2);

7824

unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

7825

7826

if (SubIdx <= Index && Index < (SubIdx + NumSubElts))

7827

return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);

7828

return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);

7829

}

7830

7831

// Recurse into concat_vectors sub vector to find scalars.

7832

if (Opcode == ISD::CONCAT_VECTORS) {

7833

EVT SubVT = Op.getOperand(0).getValueType();

7834

unsigned NumSubElts = SubVT.getVectorNumElements();

7835

uint64_t SubIdx = Index / NumSubElts;

7836

uint64_t SubElt = Index % NumSubElts;

7837

return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);

7838

}

7839

7840

// Recurse into extract_subvector src vector to find scalars.

7841

if (Opcode == ISD::EXTRACT_SUBVECTOR) {

7842

SDValue Src = Op.getOperand(0);

7843

uint64_t SrcIdx = Op.getConstantOperandVal(1);

7844

return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);

7845

}

7846

7847

// We only peek through bitcasts of the same vector width.

7848

if (Opcode == ISD::BITCAST) {

7849

SDValue Src = Op.getOperand(0);

7850

EVT SrcVT = Src.getValueType();

7851

if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)

7852

return getShuffleScalarElt(Src, Index, DAG, Depth + 1);

7853

return SDValue();

7854

}

7855

7856

// Actual nodes that may contain scalar elements

7857

7858

// For insert_vector_elt - either return the index matching scalar or recurse

7859

// into the base vector.

7860

if (Opcode == ISD::INSERT_VECTOR_ELT &&

7861

isa<ConstantSDNode>(Op.getOperand(2))) {

7862

if (Op.getConstantOperandAPInt(2) == Index)

7863

return Op.getOperand(1);

7864

return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);

7865

}

7866

7867

if (Opcode == ISD::SCALAR_TO_VECTOR)

7868

return (Index == 0) ? Op.getOperand(0)

7869

: DAG.getUNDEF(VT.getVectorElementType());

7870

7871

if (Opcode == ISD::BUILD_VECTOR)

7872

return Op.getOperand(Index);

7873

7874

return SDValue();

7875

}

7876

7877

// Use PINSRB/PINSRW/PINSRD to create a build vector.

7878

static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,

7879

unsigned NumNonZero, unsigned NumZero,

7880

SelectionDAG &DAG,

7881

const X86Subtarget &Subtarget) {

7882

MVT VT = Op.getSimpleValueType();

7883

unsigned NumElts = VT.getVectorNumElements();

7884

assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||((((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT ==
MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41
())) && "Illegal vector insertion") ? static_cast<
void> (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7886, __PRETTY_FUNCTION__))

7885

((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&((((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT ==
MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41
())) && "Illegal vector insertion") ? static_cast<
void> (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7886, __PRETTY_FUNCTION__))

7886

"Illegal vector insertion")((((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT ==
MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41
())) && "Illegal vector insertion") ? static_cast<
void> (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7886, __PRETTY_FUNCTION__));

7887

7888

SDLoc dl(Op);

7889

SDValue V;

7890

bool First = true;

7891

7892

for (unsigned i = 0; i < NumElts; ++i) {

7893

bool IsNonZero = (NonZeros & (1 << i)) != 0;

7894

if (!IsNonZero)

7895

continue;

7896

7897

// If the build vector contains zeros or our first insertion is not the

7898

// first index then insert into zero vector to break any register

7899

// dependency else use SCALAR_TO_VECTOR.

7900

if (First) {

7901

First = false;

7902

if (NumZero || 0 != i)

7903

V = getZeroVector(VT, Subtarget, DAG, dl);

7904

else {

7905

assert(0 == i && "Expected insertion into zero-index")((0 == i && "Expected insertion into zero-index") ? static_cast
<void> (0) : __assert_fail ("0 == i && \"Expected insertion into zero-index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7905, __PRETTY_FUNCTION__));

7906

V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);

7907

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);

7908

V = DAG.getBitcast(VT, V);

7909

continue;

7910

}

7911

}

7912

V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),

7913

DAG.getIntPtrConstant(i, dl));

7914

}

7915

7916

return V;

7917

}

7918

7919

/// Custom lower build_vector of v16i8.

7920

static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,

7921

unsigned NumNonZero, unsigned NumZero,

7922

SelectionDAG &DAG,

7923

const X86Subtarget &Subtarget) {

7924

if (NumNonZero > 8 && !Subtarget.hasSSE41())

7925

return SDValue();

7926

7927

// SSE4.1 - use PINSRB to insert each byte directly.

7928

if (Subtarget.hasSSE41())

7929

return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,

7930

Subtarget);

7931

7932

SDLoc dl(Op);

7933

SDValue V;

7934

7935

// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.

7936

for (unsigned i = 0; i < 16; i += 2) {

7937

bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;

7938

bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;

7939

if (!ThisIsNonZero && !NextIsNonZero)

7940

continue;

7941

7942

// FIXME: Investigate combining the first 4 bytes as a i32 instead.

7943

SDValue Elt;

7944

if (ThisIsNonZero) {

7945

if (NumZero || NextIsNonZero)

7946

Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);

7947

else

7948

Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);

7949

}

7950

7951

if (NextIsNonZero) {

7952

SDValue NextElt = Op.getOperand(i + 1);

7953

if (i == 0 && NumZero)

7954

NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);

7955

else

7956

NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);

7957

NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,

7958

DAG.getConstant(8, dl, MVT::i8));

7959

if (ThisIsNonZero)

7960

Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);

7961

else

7962

Elt = NextElt;

7963

}

7964

7965

// If our first insertion is not the first index or zeros are needed, then

7966

// insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high

7967

// elements undefined).

7968

if (!V) {

7969

if (i != 0 || NumZero)

7970

V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);

7971

else {

7972

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);

7973

V = DAG.getBitcast(MVT::v8i16, V);

7974

continue;

7975

}

7976

}

7977

Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);

7978

V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,

7979

DAG.getIntPtrConstant(i / 2, dl));

7980

}

7981

7982

return DAG.getBitcast(MVT::v16i8, V);

7983

}

7984

7985

/// Custom lower build_vector of v8i16.

7986

static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,

7987

unsigned NumNonZero, unsigned NumZero,

7988

SelectionDAG &DAG,

7989

const X86Subtarget &Subtarget) {

7990

if (NumNonZero > 4 && !Subtarget.hasSSE41())

7991

return SDValue();

7992

7993

// Use PINSRW to insert each byte directly.

7994

return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,

7995

Subtarget);

7996

}

7997

7998

/// Custom lower build_vector of v4i32 or v4f32.

7999

static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,

8000

const X86Subtarget &Subtarget) {

8001

// If this is a splat of a pair of elements, use MOVDDUP (unless the target

8002

// has XOP; in that case defer lowering to potentially use VPERMIL2PS).

8003

// Because we're creating a less complicated build vector here, we may enable

8004

// further folding of the MOVDDUP via shuffle transforms.

8005

if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&

8006

Op.getOperand(0) == Op.getOperand(2) &&

8007

Op.getOperand(1) == Op.getOperand(3) &&

8008

Op.getOperand(0) != Op.getOperand(1)) {

8009

SDLoc DL(Op);

8010

MVT VT = Op.getSimpleValueType();

8011

MVT EltVT = VT.getVectorElementType();

8012

// Create a new build vector with the first 2 elements followed by undef

8013

// padding, bitcast to v2f64, duplicate, and bitcast back.

8014

SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),

8015

DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };

8016

SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));

8017

SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);

8018

return DAG.getBitcast(VT, Dup);

8019

}

8020

8021

// Find all zeroable elements.

8022

std::bitset<4> Zeroable, Undefs;

8023

for (int i = 0; i < 4; ++i) {

8024

SDValue Elt = Op.getOperand(i);

8025

Undefs[i] = Elt.isUndef();

8026

Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));

8027

}

8028

assert(Zeroable.size() - Zeroable.count() > 1 &&((Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"
) ? static_cast<void> (0) : __assert_fail ("Zeroable.size() - Zeroable.count() > 1 && \"We expect at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8029, __PRETTY_FUNCTION__))

8029

"We expect at least two non-zero elements!")((Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"
) ? static_cast<void> (0) : __assert_fail ("Zeroable.size() - Zeroable.count() > 1 && \"We expect at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8029, __PRETTY_FUNCTION__));

8030

8031

// We only know how to deal with build_vector nodes where elements are either

8032

// zeroable or extract_vector_elt with constant index.

8033

SDValue FirstNonZero;

8034

unsigned FirstNonZeroIdx;

8035

for (unsigned i = 0; i < 4; ++i) {

8036

if (Zeroable[i])

8037

continue;

8038

SDValue Elt = Op.getOperand(i);

8039

if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

8040

!isa<ConstantSDNode>(Elt.getOperand(1)))

8041

return SDValue();

8042

// Make sure that this node is extracting from a 128-bit vector.

8043

MVT VT = Elt.getOperand(0).getSimpleValueType();

8044

if (!VT.is128BitVector())

8045

return SDValue();

8046

if (!FirstNonZero.getNode()) {

8047

FirstNonZero = Elt;

8048

FirstNonZeroIdx = i;

8049

}

8050

}

8051

8052

assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!")((FirstNonZero.getNode() && "Unexpected build vector of all zeros!"
) ? static_cast<void> (0) : __assert_fail ("FirstNonZero.getNode() && \"Unexpected build vector of all zeros!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8052, __PRETTY_FUNCTION__));

8053

SDValue V1 = FirstNonZero.getOperand(0);

8054

MVT VT = V1.getSimpleValueType();

8055

8056

// See if this build_vector can be lowered as a blend with zero.

8057

SDValue Elt;

8058

unsigned EltMaskIdx, EltIdx;

8059

int Mask[4];

8060

for (EltIdx = 0; EltIdx < 4; ++EltIdx) {

8061

if (Zeroable[EltIdx]) {

8062

// The zero vector will be on the right hand side.

8063

Mask[EltIdx] = EltIdx+4;

8064

continue;

8065

}

8066

8067

Elt = Op->getOperand(EltIdx);

8068

// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.

8069

EltMaskIdx = Elt.getConstantOperandVal(1);

8070

if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)

8071

break;

8072

Mask[EltIdx] = EltIdx;

8073

}

8074

8075

if (EltIdx == 4) {

8076

// Let the shuffle legalizer deal with blend operations.

8077

SDValue VZeroOrUndef = (Zeroable == Undefs)

8078

? DAG.getUNDEF(VT)

8079

: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));

8080

if (V1.getSimpleValueType() != VT)

8081

V1 = DAG.getBitcast(VT, V1);

8082

return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);

8083

}

8084

8085

// See if we can lower this build_vector to a INSERTPS.

8086

if (!Subtarget.hasSSE41())

8087

return SDValue();

8088

8089

SDValue V2 = Elt.getOperand(0);

8090

if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)

8091

V1 = SDValue();

8092

8093

bool CanFold = true;

8094

for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {

8095

if (Zeroable[i])

8096

continue;

8097

8098

SDValue Current = Op->getOperand(i);

8099

SDValue SrcVector = Current->getOperand(0);

8100

if (!V1.getNode())

8101

V1 = SrcVector;

8102

CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);

8103

}

8104

8105

if (!CanFold)

8106

return SDValue();

8107

8108

assert(V1.getNode() && "Expected at least two non-zero elements!")((V1.getNode() && "Expected at least two non-zero elements!"
) ? static_cast<void> (0) : __assert_fail ("V1.getNode() && \"Expected at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8108, __PRETTY_FUNCTION__));

8109

if (V1.getSimpleValueType() != MVT::v4f32)

8110

V1 = DAG.getBitcast(MVT::v4f32, V1);

8111

if (V2.getSimpleValueType() != MVT::v4f32)

8112

V2 = DAG.getBitcast(MVT::v4f32, V2);

8113

8114

// Ok, we can emit an INSERTPS instruction.

8115

unsigned ZMask = Zeroable.to_ulong();

8116

8117

unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;

8118

assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")(((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"
) ? static_cast<void> (0) : __assert_fail ("(InsertPSMask & ~0xFFu) == 0 && \"Invalid mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8118, __PRETTY_FUNCTION__));

8119

SDLoc DL(Op);

8120

SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,

8121

DAG.getIntPtrConstant(InsertPSMask, DL, true));

8122

return DAG.getBitcast(VT, Result);

8123

}

8124

8125

/// Return a vector logical shift node.

8126

static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,

8127

SelectionDAG &DAG, const TargetLowering &TLI,

8128

const SDLoc &dl) {

8129

assert(VT.is128BitVector() && "Unknown type for VShift")((VT.is128BitVector() && "Unknown type for VShift") ?
static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"Unknown type for VShift\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8129, __PRETTY_FUNCTION__));

8130

MVT ShVT = MVT::v16i8;

8131

unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;

8132

SrcOp = DAG.getBitcast(ShVT, SrcOp);

8133

assert(NumBits % 8 == 0 && "Only support byte sized shifts")((NumBits % 8 == 0 && "Only support byte sized shifts"
) ? static_cast<void> (0) : __assert_fail ("NumBits % 8 == 0 && \"Only support byte sized shifts\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8133, __PRETTY_FUNCTION__));

8134

SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);

8135

return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));

8136

}

8137

8138

/// Try to replace a scalar i32/f32 load from a stack slot with a full vector
/// load followed by a splat shuffle of the loaded element.
///
/// The load must be a normal, simple load whose address is either a frame
/// index or "frame index + constant". The constant offset is split into an
/// aligned start offset (added to the pointer) and an element number (used as
/// the splat index of the shuffle mask).
///
/// \param SrcOp the candidate scalar load.
/// \param VT    the vector type whose size determines the required alignment
///              and whose element count determines the shuffle width.
/// \returns a vector shuffle of the widened load, or an empty SDValue if the
///          load cannot be widened.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {
  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp);
  if (!LD)
    return SDValue();

  SDValue Ptr = LD->getBasePtr();
  if (!ISD::isNormalLoad(LD) || !LD->isSimple())
    return SDValue();
  EVT PVT = LD->getValueType(0);
  if (PVT != MVT::i32 && PVT != MVT::f32)
    return SDValue();

  int FI = -1;
  int64_t Offset = 0;
  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
    FI = FINode->getIndex();
  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
    Offset = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  } else {
    return SDValue();
  }

  // FIXME: 256-bit vector instructions don't require a strict alignment,
  // improve this code to support it better.
  Align RequiredAlign(VT.getSizeInBits() / 8);

  // (Offset % 16 or 32) must be multiple of 4. Then address is then
  // Ptr + (Offset & ~15).
  // Validate the offset before touching the frame info so that a rejected
  // transform does not leave the stack object with a raised alignment.
  if (Offset < 0)
    return SDValue();
  if ((Offset % RequiredAlign.value()) & 3)
    return SDValue();

  // Make sure the stack object alignment is at least 16 or 32.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
  if (!InferredAlign || *InferredAlign < RequiredAlign) {
    // Can't change the alignment. FIXME: It's possible to compute
    // the exact stack offset and reference FI + adjust offset instead.
    // If someone *really* cares about this. That's the way to implement it.
    if (MFI.isFixedObjectIndex(FI))
      return SDValue();
    MFI.setObjectAlignment(FI, RequiredAlign);
  }

  SDValue Chain = LD->getChain();
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
  if (StartOffset) {
    SDLoc DL(Ptr);
    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
  }

  // Index of the requested 4-byte element within the widened load.
  int EltNo = (Offset - StartOffset) >> 2;
  unsigned NumElems = VT.getVectorNumElements();

  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(StartOffset));

  // Splat element EltNo across the whole vector.
  SmallVector<int, 8> Mask(NumElems, EltNo);

  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}

8211

8212

// Recurse to find a LoadSDNode source and the accumulated ByteOffest.

8213

static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {

8214

if (ISD::isNON_EXTLoad(Elt.getNode())) {

8215

auto *BaseLd = cast<LoadSDNode>(Elt);

8216

if (!BaseLd->isSimple())

8217

return false;

8218

Ld = BaseLd;

8219

ByteOffset = 0;

8220

return true;

8221

}

8222

8223

switch (Elt.getOpcode()) {

8224

case ISD::BITCAST:

8225

case ISD::TRUNCATE:

8226

case ISD::SCALAR_TO_VECTOR:

8227

return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);

8228

case ISD::SRL:

8229

if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {

8230

uint64_t Idx = IdxC->getZExtValue();

8231

if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {

8232

ByteOffset += Idx / 8;

8233

return true;

8234

}

8235

}

8236

break;

8237

case ISD::EXTRACT_VECTOR_ELT:

8238

if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {

8239

SDValue Src = Elt.getOperand(0);

8240

unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();

8241

unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();

8242

if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&

8243

findEltLoadSrc(Src, Ld, ByteOffset)) {

8244

uint64_t Idx = IdxC->getZExtValue();

8245

ByteOffset += Idx * (SrcSizeInBits / 8);

8246

return true;

8247

}

8248

}

8249

break;

8250

}

8251

8252

return false;

8253

}

8254

8255

/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the

8256

/// elements can be replaced by a single large load which has the same value as

8257

/// a build_vector or insert_subvector whose loaded operands are 'Elts'.

8258

///

8259

/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a

8260

static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,

8261

const SDLoc &DL, SelectionDAG &DAG,

8262

const X86Subtarget &Subtarget,

8263

bool isAfterLegalize) {

8264

if ((VT.getScalarSizeInBits() % 8) != 0)

8265

return SDValue();

8266

8267

unsigned NumElems = Elts.size();

8268

8269

int LastLoadedElt = -1;

8270

APInt LoadMask = APInt::getNullValue(NumElems);

8271

APInt ZeroMask = APInt::getNullValue(NumElems);

8272

APInt UndefMask = APInt::getNullValue(NumElems);

8273

8274

SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);

8275

SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

8276

8277

// For each element in the initializer, see if we've found a load, zero or an

8278

// undef.

8279

for (unsigned i = 0; i < NumElems; ++i) {

8280

SDValue Elt = peekThroughBitcasts(Elts[i]);

8281

if (!Elt.getNode())

8282

return SDValue();

8283

if (Elt.isUndef()) {

8284

UndefMask.setBit(i);

8285

continue;

8286

}

8287

if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {

8288

ZeroMask.setBit(i);

8289

continue;

8290

}

8291

8292

// Each loaded element must be the correct fractional portion of the

8293

// requested vector load.

8294

unsigned EltSizeInBits = Elt.getValueSizeInBits();

8295

if ((NumElems * EltSizeInBits) != VT.getSizeInBits())

8296

return SDValue();

8297

8298

if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)

8299

return SDValue();

8300

unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);

8301

if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)

8302

return SDValue();

8303

8304

LoadMask.setBit(i);

8305

LastLoadedElt = i;

8306

}

8307

assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +(((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems && "Incomplete element masks"
) ? static_cast<void> (0) : __assert_fail ("(ZeroMask.countPopulation() + UndefMask.countPopulation() + LoadMask.countPopulation()) == NumElems && \"Incomplete element masks\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8309, __PRETTY_FUNCTION__))

8308

LoadMask.countPopulation()) == NumElems &&(((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems && "Incomplete element masks"
) ? static_cast<void> (0) : __assert_fail ("(ZeroMask.countPopulation() + UndefMask.countPopulation() + LoadMask.countPopulation()) == NumElems && \"Incomplete element masks\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8309, __PRETTY_FUNCTION__))

8309

"Incomplete element masks")(((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems && "Incomplete element masks"
) ? static_cast<void> (0) : __assert_fail ("(ZeroMask.countPopulation() + UndefMask.countPopulation() + LoadMask.countPopulation()) == NumElems && \"Incomplete element masks\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8309, __PRETTY_FUNCTION__));

8310

8311

// Handle Special Cases - all undef or undef/zero.

8312

if (UndefMask.countPopulation() == NumElems)

8313

return DAG.getUNDEF(VT);

8314

8315

// FIXME: Should we return this as a BUILD_VECTOR instead?

8316

if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)

8317

return VT.isInteger() ? DAG.getConstant(0, DL, VT)

8318

: DAG.getConstantFP(0.0, DL, VT);

8319

8320

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

8321

int FirstLoadedElt = LoadMask.countTrailingZeros();

8322

SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);

8323

EVT EltBaseVT = EltBase.getValueType();

8324

assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&((EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits()
&& "Register/Memory size mismatch") ? static_cast<
void> (0) : __assert_fail ("EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && \"Register/Memory size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8325, __PRETTY_FUNCTION__))

8325

"Register/Memory size mismatch")((EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits()
&& "Register/Memory size mismatch") ? static_cast<
void> (0) : __assert_fail ("EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && \"Register/Memory size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8325, __PRETTY_FUNCTION__));

8326

LoadSDNode *LDBase = Loads[FirstLoadedElt];

8327

assert(LDBase && "Did not find base load for merging consecutive loads")((LDBase && "Did not find base load for merging consecutive loads"
) ? static_cast<void> (0) : __assert_fail ("LDBase && \"Did not find base load for merging consecutive loads\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8327, __PRETTY_FUNCTION__));

8328

unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();

8329

unsigned BaseSizeInBytes = BaseSizeInBits / 8;

8330

int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;

8331

assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected")(((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"
) ? static_cast<void> (0) : __assert_fail ("(BaseSizeInBits % 8) == 0 && \"Sub-byte element loads detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8331, __PRETTY_FUNCTION__));

8332

8333

// TODO: Support offsetting the base load.

8334

if (ByteOffsets[FirstLoadedElt] != 0)

8335

return SDValue();

8336

8337

// Check to see if the element's load is consecutive to the base load

8338

// or offset from a previous (already checked) load.

8339

auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {

8340

LoadSDNode *Ld = Loads[EltIdx];

8341

int64_t ByteOffset = ByteOffsets[EltIdx];

8342

if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {

8343

int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);

8344

return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&

8345

Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);

8346

}

8347

return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,

8348

EltIdx - FirstLoadedElt);

8349

};

8350

8351

// Consecutive loads can contain UNDEFS but not ZERO elements.

8352

// Consecutive loads with UNDEFs and ZEROs elements require a

8353

// an additional shuffle stage to clear the ZERO elements.

8354

bool IsConsecutiveLoad = true;

8355

bool IsConsecutiveLoadWithZeros = true;

8356

for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {

8357

if (LoadMask[i]) {

8358

if (!CheckConsecutiveLoad(LDBase, i)) {

8359

IsConsecutiveLoad = false;

8360

IsConsecutiveLoadWithZeros = false;

8361

break;

8362

}

8363

} else if (ZeroMask[i]) {

8364

IsConsecutiveLoad = false;

8365

}

8366

}

8367

8368

auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {

8369

auto MMOFlags = LDBase->getMemOperand()->getFlags();

8370

assert(LDBase->isSimple() &&((LDBase->isSimple() && "Cannot merge volatile or atomic loads."
) ? static_cast<void> (0) : __assert_fail ("LDBase->isSimple() && \"Cannot merge volatile or atomic loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8371, __PRETTY_FUNCTION__))

8371

"Cannot merge volatile or atomic loads.")((LDBase->isSimple() && "Cannot merge volatile or atomic loads."
) ? static_cast<void> (0) : __assert_fail ("LDBase->isSimple() && \"Cannot merge volatile or atomic loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8371, __PRETTY_FUNCTION__));

8372

SDValue NewLd =

8373

DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),

8374

LDBase->getPointerInfo(), LDBase->getOriginalAlign(),

8375

MMOFlags);

8376

for (auto *LD : Loads)

8377

if (LD)

8378

DAG.makeEquivalentMemoryOrdering(LD, NewLd);

8379

return NewLd;

8380

};

8381

8382

// Check if the base load is entirely dereferenceable.

8383

bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(

8384

VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

8385

8386

// LOAD - all consecutive load/undefs (must start/end with a load or be

8387

// entirely dereferenceable). If we have found an entire vector of loads and

8388

// undefs, then return a large load of the entire vector width starting at the

8389

// base pointer. If the vector contains zeros, then attempt to shuffle those

8390

// elements.

8391

if (FirstLoadedElt == 0 &&

8392

(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&

8393

(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {

8394

if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))

8395

return SDValue();

8396

8397

// Don't create 256-bit non-temporal aligned loads without AVX2 as these

8398

// will lower to regular temporal loads and use the cache.

8399

if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&

8400

VT.is256BitVector() && !Subtarget.hasInt256())

8401

return SDValue();

8402

8403

if (NumElems == 1)

8404

return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

8405

8406

if (!ZeroMask)

8407

return CreateLoad(VT, LDBase);

8408

8409

// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded

8410

// vector and a zero vector to clear out the zero elements.

8411

if (!isAfterLegalize && VT.isVector()) {

8412

unsigned NumMaskElts = VT.getVectorNumElements();

8413

if ((NumMaskElts % NumElems) == 0) {

8414

unsigned Scale = NumMaskElts / NumElems;

8415

SmallVector<int, 4> ClearMask(NumMaskElts, -1);

8416

for (unsigned i = 0; i < NumElems; ++i) {

8417

if (UndefMask[i])

8418

continue;

8419

int Offset = ZeroMask[i] ? NumMaskElts : 0;

8420

for (unsigned j = 0; j != Scale; ++j)

8421

ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;

8422

}

8423

SDValue V = CreateLoad(VT, LDBase);

8424

SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)

8425

: DAG.getConstantFP(0.0, DL, VT);

8426

return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);

8427

}

8428

}

8429

}

8430

8431

// If the upper half of a ymm/zmm load is undef then just load the lower half.

8432

if (VT.is256BitVector() || VT.is512BitVector()) {

8433

unsigned HalfNumElems = NumElems / 2;

8434

if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {

8435

EVT HalfVT =

8436

EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);

8437

SDValue HalfLD =

8438

EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,

8439

DAG, Subtarget, isAfterLegalize);

8440

if (HalfLD)

8441

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),

8442

HalfLD, DAG.getIntPtrConstant(0, DL));

8443

}

8444

}

8445

8446

// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.

8447

if (IsConsecutiveLoad && FirstLoadedElt == 0 &&

8448

(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&

8449

((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {

8450

MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)

8451

: MVT::getIntegerVT(LoadSizeInBits);

8452

MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);

8453

// Allow v4f32 on SSE1 only targets.

8454

// FIXME: Add more isel patterns so we can just use VT directly.

8455

if (!Subtarget.hasSSE2() && VT == MVT::v4f32)

8456

VecVT = MVT::v4f32;

8457

if (TLI.isTypeLegal(VecVT)) {

8458

SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);

8459

SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };

8460

SDValue ResNode = DAG.getMemIntrinsicNode(

8461

X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),

8462

LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);

8463

for (auto *LD : Loads)

8464

if (LD)

8465

DAG.makeEquivalentMemoryOrdering(LD, ResNode);

8466

return DAG.getBitcast(VT, ResNode);

8467

}

8468

}

8469

8470

// BROADCAST - match the smallest possible repetition pattern, load that

8471

// scalar/subvector element and then broadcast to the entire vector.

8472

if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&

8473

(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {

8474

for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {

8475

unsigned RepeatSize = SubElems * BaseSizeInBits;

8476

unsigned ScalarSize = std::min(RepeatSize, 64u);

8477

if (!Subtarget.hasAVX2() && ScalarSize < 32)

8478

continue;

8479

8480

bool Match = true;

8481

SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));

8482

for (unsigned i = 0; i != NumElems && Match; ++i) {

8483

if (!LoadMask[i])

8484

continue;

8485

SDValue Elt = peekThroughBitcasts(Elts[i]);

8486

if (RepeatedLoads[i % SubElems].isUndef())

8487

RepeatedLoads[i % SubElems] = Elt;

8488

else

8489

Match &= (RepeatedLoads[i % SubElems] == Elt);

8490

}

8491

8492

// We must have loads at both ends of the repetition.

8493

Match &= !RepeatedLoads.front().isUndef();

8494

Match &= !RepeatedLoads.back().isUndef();

8495

if (!Match)

8496

continue;

8497

8498

EVT RepeatVT =

8499

VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))

8500

? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)

8501

: EVT::getFloatingPointVT(ScalarSize);

8502

if (RepeatSize > ScalarSize)

8503

RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,

8504

RepeatSize / ScalarSize);

8505

EVT BroadcastVT =

8506

EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),

8507

VT.getSizeInBits() / ScalarSize);

8508

if (TLI.isTypeLegal(BroadcastVT)) {

8509

if (SDValue RepeatLoad = EltsFromConsecutiveLoads(

8510

RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {

8511

unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST

8512

: X86ISD::VBROADCAST;

8513

SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);

8514

return DAG.getBitcast(VT, Broadcast);

8515

}

8516

}

8517

}

8518

}

8519

8520

return SDValue();

8521

}

8522

8523

// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,

8524

// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses

8525

// are consecutive, non-overlapping, and in the right order.

8526

static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,

8527

SelectionDAG &DAG,

8528

const X86Subtarget &Subtarget,

8529

bool isAfterLegalize) {

8530

SmallVector<SDValue, 64> Elts;

8531

for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {

8532

if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {

8533

Elts.push_back(Elt);

8534

continue;

8535

}

8536

return SDValue();

8537

}

8538

assert(Elts.size() == VT.getVectorNumElements())((Elts.size() == VT.getVectorNumElements()) ? static_cast<
void> (0) : __assert_fail ("Elts.size() == VT.getVectorNumElements()"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8538, __PRETTY_FUNCTION__));

8539

return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,

8540

isAfterLegalize);

8541

}

8542

8543

/// Build an IR constant vector holding one period of a repeated constant
/// pattern.
///
/// \p SplatValue carries the \p SplatBitSize bits of the pattern; it is cut
/// into SplatBitSize / scalar-size pieces, lowest bits first, and each piece
/// becomes one element with the scalar type of \p VT (an integer, or a
/// float/double when \p VT is a floating point type).
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  const unsigned EltCount = SplatBitSize / EltBits;
  const bool IsFP = VT.isFloatingPoint();

  SmallVector<Constant *, 32> Elements;
  for (unsigned Idx = 0; Idx < EltCount; ++Idx) {
    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

    if (!IsFP) {
      Elements.push_back(
          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
      continue;
    }

    // Only f32 and f64 elements are handled; anything that is not 32 bits
    // wide is treated as a double.
    assert((EltBits == 32 || EltBits == 64) &&
           "Unsupported floating point scalar size");
    const fltSemantics &Semantics =
        EltBits == 32 ? APFloat::IEEEsingle() : APFloat::IEEEdouble();
    Elements.push_back(ConstantFP::get(C, APFloat(Semantics, Bits)));
  }
  return ConstantVector::get(ArrayRef<Constant *>(Elements));
}

8565

8566

/// Walk the users of \p N and report whether \p N is used in a way that a
/// shuffle could fold.
///
/// The decision is made on the first user that settles it:
///  - a VPERMV user taking \p N as operand 0, or a VPERMV3 user taking it as
///    operand 1, answers "no" (those are index operands);
///  - any other target shuffle user answers "yes";
///  - a BITCAST user is looked through recursively and its answer is returned;
///  - otherwise the answer is "yes" when \p N has exactly one use.
/// If no user settles it, the answer is "no".
static bool isFoldableUseOfShuffle(SDNode *N) {
  for (SDNode *User : N->uses()) {
    const unsigned UserOpc = User->getOpcode();

    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    const bool IsVPERMVIndex =
        UserOpc == X86ISD::VPERMV && User->getOperand(0).getNode() == N;
    const bool IsVPERMV3Index =
        UserOpc == X86ISD::VPERMV3 && User->getOperand(1).getNode() == N;
    if (IsVPERMVIndex || IsVPERMV3Index)
      return false;

    if (isTargetShuffle(UserOpc))
      return true;

    // Bitcasts are transparent: the answer is whatever their users say.
    if (UserOpc == ISD::BITCAST)
      return isFoldableUseOfShuffle(User);

    if (N->hasOneUse())
      return true;
  }
  return false;
}

8583

8584

// Check if the current node of build vector is a zero extended vector.

8585

// // If so, return the value extended.

8586

// // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.

8587

// // NumElt - return the number of zero extended identical values.

8588

// // EltType - return the type of the value include the zero extend.

8589

static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,

8590

unsigned &NumElt, MVT &EltType) {

8591

SDValue ExtValue = Op->getOperand(0);

8592

unsigned NumElts = Op->getNumOperands();

8593

unsigned Delta = NumElts;

8594

8595

for (unsigned i = 1; i < NumElts; i++) {

8596

if (Op->getOperand(i) == ExtValue) {

8597

Delta = i;

8598

break;

8599

}

8600

if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))

8601

return SDValue();

8602

}

8603

if (!isPowerOf2_32(Delta) || Delta == 1)

8604

return SDValue();

8605

8606

for (unsigned i = Delta; i < NumElts; i++) {

8607

if (i % Delta == 0) {

8608

if (Op->getOperand(i) != ExtValue)

8609

return SDValue();

8610

} else if (!(isNullConstant(Op->getOperand(i)) ||

8611

Op->getOperand(i).isUndef()))

8612

return SDValue();

8613

}

8614

unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();

8615

unsigned ExtVTSize = EltSize * Delta;

8616

EltType = MVT::getIntegerVT(ExtVTSize);

8617

NumElt = NumElts / Delta;

8618

return ExtValue;

8619

}

8620

8621

/// Attempt to use the vbroadcast instruction to generate a splat value

8622

/// from a splat BUILD_VECTOR which uses:

8623

/// a. A single scalar load, or a constant.

8624

/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).

8625

///

8626

/// The VBROADCAST node is returned when a pattern is found,

8627

/// or SDValue() otherwise.

8628

static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,

8629

const X86Subtarget &Subtarget,

8630

SelectionDAG &DAG) {

8631

// VBROADCAST requires AVX.

8632

// TODO: Splats could be generated for non-AVX CPUs using SSE

8633

// instructions, but there's less potential gain for only 128-bit vectors.

8634

if (!Subtarget.hasAVX())

8635

return SDValue();

8636

8637

MVT VT = BVOp->getSimpleValueType(0);

8638

SDLoc dl(BVOp);

8639

8640

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
()) && "Unsupported vector type for broadcast.") ? static_cast
<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported vector type for broadcast.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8641, __PRETTY_FUNCTION__))

8641

"Unsupported vector type for broadcast.")(((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector
()) && "Unsupported vector type for broadcast.") ? static_cast
<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported vector type for broadcast.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8641, __PRETTY_FUNCTION__));

8642

8643

BitVector UndefElements;

8644

SDValue Ld = BVOp->getSplatValue(&UndefElements);

8645

8646

// Attempt to use VBROADCASTM

8647

// From this pattern:

8648

// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))

8649

// b. t1 = (build_vector t0 t0)

8650

//

8651

// Create (VBROADCASTM v2i1 X)

8652

if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {

8653

MVT EltType = VT.getScalarType();

8654

unsigned NumElts = VT.getVectorNumElements();

8655

SDValue BOperand;

8656

SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);

8657

if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||

8658

(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&

8659

Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {

8660

if (ZeroExtended)

8661

BOperand = ZeroExtended.getOperand(0);

8662

else

8663

BOperand = Ld.getOperand(0).getOperand(0);

8664

MVT MaskVT = BOperand.getSimpleValueType();

8665

if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q

8666

(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d

8667

SDValue Brdcst =

8668

DAG.getNode(X86ISD::VBROADCASTM, dl,

8669

MVT::getVectorVT(EltType, NumElts), BOperand);

8670

return DAG.getBitcast(VT, Brdcst);

8671

}

8672

}

8673

}

8674

8675

unsigned NumElts = VT.getVectorNumElements();

8676

unsigned NumUndefElts = UndefElements.count();

8677

if (!Ld || (NumElts - NumUndefElts) <= 1) {

8678

APInt SplatValue, Undef;

8679

unsigned SplatBitSize;

8680

bool HasUndef;

8681

// Check if this is a repeated constant pattern suitable for broadcasting.

8682

if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&

8683

SplatBitSize > VT.getScalarSizeInBits() &&

8684

SplatBitSize < VT.getSizeInBits()) {

8685

// Avoid replacing with broadcast when it's a use of a shuffle

8686

// instruction to preserve the present custom lowering of shuffles.

8687

if (isFoldableUseOfShuffle(BVOp))

8688

return SDValue();

8689

// replace BUILD_VECTOR with broadcast of the repeated constants.

8690

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

8691

LLVMContext *Ctx = DAG.getContext();

8692

MVT PVT = TLI.getPointerTy(DAG.getDataLayout());

8693

if (Subtarget.hasAVX()) {

8694

if (SplatBitSize == 32 || SplatBitSize == 64 ||

8695

(SplatBitSize < 32 && Subtarget.hasAVX2())) {

8696

// Splatted value can fit in one INTEGER constant in constant pool.

8697

// Load the constant and broadcast it.

8698

MVT CVT = MVT::getIntegerVT(SplatBitSize);

8699

Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);

8700

Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);

8701

SDValue CP = DAG.getConstantPool(C, PVT);

8702

unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

8703

8704

Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

8705

SDVTList Tys =

8706

DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);

8707

SDValue Ops[] = {DAG.getEntryNode(), CP};

8708

MachinePointerInfo MPI =

8709

MachinePointerInfo::getConstantPool(DAG.getMachineFunction());

8710

SDValue Brdcst = DAG.getMemIntrinsicNode(

8711

X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,

8712

MachineMemOperand::MOLoad);

8713

return DAG.getBitcast(VT, Brdcst);

8714

}

8715

if (SplatBitSize > 64) {

8716

// Load the vector of constants and broadcast it.

8717

MVT CVT = VT.getScalarType();

8718

Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,

8719

*Ctx);

8720

SDValue VCP = DAG.getConstantPool(VecC, PVT);

8721

unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();

8722

Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();

8723

Ld = DAG.getLoad(

8724

MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,

8725

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

8726

Alignment);

8727

SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);

8728

return DAG.getBitcast(VT, Brdcst);

8729

}

8730

}

8731

}

8732

8733

// If we are moving a scalar into a vector (Ld must be set and all elements

8734

// but 1 are undef) and that operation is not obviously supported by

8735

// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.

8736

// That's better than general shuffling and may eliminate a load to GPR and

8737

// move from scalar to vector register.

8738

if (!Ld || NumElts - NumUndefElts != 1)

8739

return SDValue();

8740

unsigned ScalarSize = Ld.getValueSizeInBits();

8741

if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))

8742

return SDValue();

8743

}

8744

8745

bool ConstSplatVal =

8746

(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

8747

bool IsLoad = ISD::isNormalLoad(Ld.getNode());

8748

8749

// Make sure that all of the users of a non-constant load are from the

8750

// BUILD_VECTOR node.

8751

// FIXME: Is the use count needed for non-constant, non-load case?

8752

if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))

8753

return SDValue();

8754

8755

unsigned ScalarSize = Ld.getValueSizeInBits();

8756

bool IsGE256 = (VT.getSizeInBits() >= 256);

8757

8758

// When optimizing for size, generate up to 5 extra bytes for a broadcast

8759

// instruction to save 8 or more bytes of constant pool data.

8760

// TODO: If multiple splats are generated to load the same constant,

8761

// it may be detrimental to overall size. There needs to be a way to detect

8762

// that condition to know if this is truly a size win.

8763

bool OptForSize = DAG.shouldOptForSize();

8764

8765

// Handle broadcasting a single constant scalar from the constant pool

8766

// into a vector.

8767

// On Sandybridge (no AVX2), it is still better to load a constant vector

8768

// from the constant pool and not to broadcast it from a scalar.

8769

// But override that restriction when optimizing for size.

8770

// TODO: Check if splatting is recommended for other AVX-capable CPUs.

8771

if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {

8772

EVT CVT = Ld.getValueType();

8773

assert(!CVT.isVector() && "Must not broadcast a vector type")((!CVT.isVector() && "Must not broadcast a vector type"
) ? static_cast<void> (0) : __assert_fail ("!CVT.isVector() && \"Must not broadcast a vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8773, __PRETTY_FUNCTION__));

8774

8775

// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.

8776

// For size optimization, also splat v2f64 and v2i64, and for size opt

8777

// with AVX2, also splat i8 and i16.

8778

// With pattern matching, the VBROADCAST node may become a VMOVDDUP.

8779

if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||

8780

(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {

8781

const Constant *C = nullptr;

8782

if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))

8783

C = CI->getConstantIntValue();

8784

else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))

8785

C = CF->getConstantFPValue();

8786

8787

assert(C && "Invalid constant type")((C && "Invalid constant type") ? static_cast<void
> (0) : __assert_fail ("C && \"Invalid constant type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8787, __PRETTY_FUNCTION__));

8788

8789

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

8790

SDValue CP =

8791

DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));

8792

Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

8793

8794

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

8795

SDValue Ops[] = {DAG.getEntryNode(), CP};

8796

MachinePointerInfo MPI =

8797

MachinePointerInfo::getConstantPool(DAG.getMachineFunction());

8798

return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,

8799

MPI, Alignment, MachineMemOperand::MOLoad);

8800

}

8801

}

8802

8803

// Handle AVX2 in-register broadcasts.

8804

if (!IsLoad && Subtarget.hasInt256() &&

8805

(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))

8806

return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

8807

8808

// The scalar source must be a normal load.

8809

if (!IsLoad)

8810

return SDValue();

8811

8812

// Make sure the non-chain result is only used by this build vector.

8813

if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))

8814

return SDValue();

8815

8816

if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||

8817

(Subtarget.hasVLX() && ScalarSize == 64)) {

8818

auto *LN = cast<LoadSDNode>(Ld);

8819

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

8820

SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};

8821

SDValue BCast =

8822

DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,

8823

LN->getMemoryVT(), LN->getMemOperand());

8824

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));

8825

return BCast;

8826

}

8827

8828

// The integer check is needed for the 64-bit into 128-bit so it doesn't match

8829

// double since there is no vbroadcastsd xmm

8830

if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&

8831

(ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {

8832

auto *LN = cast<LoadSDNode>(Ld);

8833

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

8834

SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};

8835

SDValue BCast =

8836

DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,

8837

LN->getMemoryVT(), LN->getMemOperand());

8838

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));

8839

return BCast;

8840

}

8841

8842

// Unsupported broadcast.

8843

return SDValue();

8844

}

8845

8846

/// For an EXTRACT_VECTOR_ELT with a constant index return the real

8847

/// underlying vector and index.

8848

///

8849

/// Modifies \p ExtractedFromVec to the real vector and returns the real

8850

/// index.

8851

static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,

8852

SDValue ExtIdx) {

8853

int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

8854

if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))

8855

return Idx;

8856

8857

// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already

8858

// lowered this:

8859

// (extract_vector_elt (v8f32 %1), Constant<6>)

8860

// to:

8861

// (extract_vector_elt (vector_shuffle<2,u,u,u>

8862

// (extract_subvector (v8f32 %0), Constant<4>),

8863

// undef)

8864

// Constant<0>)

8865

// In this case the vector is the extract_subvector expression and the index

8866

// is 2, as specified by the shuffle.

8867

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);

8868

SDValue ShuffleVec = SVOp->getOperand(0);

8869

MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();

8870

assert(ShuffleVecVT.getVectorElementType() ==((ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType
().getVectorElementType()) ? static_cast<void> (0) : __assert_fail
("ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8871, __PRETTY_FUNCTION__))

8871

ExtractedFromVec.getSimpleValueType().getVectorElementType())((ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType
().getVectorElementType()) ? static_cast<void> (0) : __assert_fail
("ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8871, __PRETTY_FUNCTION__));

8872

8873

int ShuffleIdx = SVOp->getMaskElt(Idx);

8874

if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {

8875

ExtractedFromVec = ShuffleVec;

8876

return ShuffleIdx;

8877

}

8878

return Idx;

8879

}

8880

8881

/// Lower a build_vector whose operands are mostly EXTRACT_VECTOR_ELTs from at
/// most two source vectors of the same type as a vector shuffle of those
/// sources, followed by INSERT_VECTOR_ELT for the operands that are neither
/// undef nor extracts.
///
/// Returns an empty SDValue when INSERT_VECTOR_ELT is not legal or custom for
/// the type, when an extract has a non-constant index or a source of a
/// different type, when more than two sources are involved, when a third
/// operand would need inserting, or when no operand is an extract at all.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  const unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    const unsigned EltOpc = Elt.getOpcode();

    // Undef lanes keep their -1 mask entry.
    if (EltOpc == ISD::UNDEF)
      continue;

    if (EltOpc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit once two elements are already queued for insertion.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Elt.getOperand(0);
    SDValue ExtIdx = Elt.getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    const int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    // The first source seen becomes the first shuffle input.
    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    if (ExtractedFromVec == VecIn1) {
      Mask[i] = Idx;
      continue;
    }

    // The next distinct source becomes the second shuffle input.
    if (!VecIn2.getNode())
      VecIn2 = ExtractedFromVec;
    // Quit if more than 2 vectors to shuffle
    if (ExtractedFromVec != VecIn2)
      return SDValue();
    Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  if (!VecIn2.getNode())
    VecIn2 = DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned InsertIdx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV,
                     Op.getOperand(InsertIdx),
                     DAG.getIntPtrConstant(InsertIdx, DL));

  return NV;
}

8952

8953

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.

8954

static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,

8955

const X86Subtarget &Subtarget) {

8956

8957

MVT VT = Op.getSimpleValueType();

8958

assert((VT.getVectorElementType() == MVT::i1) &&(((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i1) && \"Unexpected type in LowerBUILD_VECTORvXi1!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8959, __PRETTY_FUNCTION__))

8959

"Unexpected type in LowerBUILD_VECTORvXi1!")(((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i1) && \"Unexpected type in LowerBUILD_VECTORvXi1!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8959, __PRETTY_FUNCTION__));

8960

8961

SDLoc dl(Op);

8962

if (ISD::isBuildVectorAllZeros(Op.getNode()) ||

8963

ISD::isBuildVectorAllOnes(Op.getNode()))

8964

return Op;

8965

8966

uint64_t Immediate = 0;

8967

SmallVector<unsigned, 16> NonConstIdx;

8968

bool IsSplat = true;

8969

bool HasConstElts = false;

8970

int SplatIdx = -1;

8971

for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {

8972

SDValue In = Op.getOperand(idx);

8973

if (In.isUndef())

8974

continue;

8975

if (auto *InC = dyn_cast<ConstantSDNode>(In)) {

8976

Immediate |= (InC->getZExtValue() & 0x1) << idx;

8977

HasConstElts = true;

8978

} else {

8979

NonConstIdx.push_back(idx);

8980

}

8981

if (SplatIdx < 0)

8982

SplatIdx = idx;

8983

else if (In != Op.getOperand(SplatIdx))

8984

IsSplat = false;

8985

}

8986

8987

// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"

8988

if (IsSplat) {

8989

// The build_vector allows the scalar element to be larger than the vector

8990

// element type. We need to mask it to use as a condition unless we know

8991

// the upper bits are zero.

8992

// FIXME: Use computeKnownBits instead of checking specific opcode?

8993

SDValue Cond = Op.getOperand(SplatIdx);

8994

assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!")((Cond.getValueType() == MVT::i8 && "Unexpected VT!")
? static_cast<void> (0) : __assert_fail ("Cond.getValueType() == MVT::i8 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 8994, __PRETTY_FUNCTION__));

8995

if (Cond.getOpcode() != ISD::SETCC)

8996

Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,

8997

DAG.getConstant(1, dl, MVT::i8));

8998

8999

// Perform the select in the scalar domain so we can use cmov.

9000

if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {

9001

SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,

9002

DAG.getAllOnesConstant(dl, MVT::i32),

9003

DAG.getConstant(0, dl, MVT::i32));

9004

Select = DAG.getBitcast(MVT::v32i1, Select);

9005

return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);

9006

} else {

9007

MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));

9008

SDValue Select = DAG.getSelect(dl, ImmVT, Cond,

9009

DAG.getAllOnesConstant(dl, ImmVT),

9010

DAG.getConstant(0, dl, ImmVT));

9011

MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;

9012

Select = DAG.getBitcast(VecVT, Select);

9013

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,

9014

DAG.getIntPtrConstant(0, dl));

9015

}

9016

}

9017

9018

// insert elements one by one

9019

SDValue DstVec;

9020

if (HasConstElts) {

9021

if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {

9022

SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);

9023

SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);

9024

ImmL = DAG.getBitcast(MVT::v32i1, ImmL);

9025

ImmH = DAG.getBitcast(MVT::v32i1, ImmH);

9026

DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);

9027

} else {

9028

MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));

9029

SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);

9030

MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;

9031

DstVec = DAG.getBitcast(VecVT, Imm);

9032

DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,

9033

DAG.getIntPtrConstant(0, dl));

9034

}

9035

} else

9036

DstVec = DAG.getUNDEF(VT);

9037

9038

for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {

9039

unsigned InsertIdx = NonConstIdx[i];

9040

DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,

9041

Op.getOperand(InsertIdx),

9042

DAG.getIntPtrConstant(InsertIdx, dl));

9043

}

9044

return DstVec;

9045

}

9046

9047

/// This is a helper function of LowerToHorizontalOp().

9048

/// This function checks that the build_vector \p N in input implements a

9049

/// 128-bit partial horizontal operation on a 256-bit vector, but that operation

9050

/// may not match the layout of an x86 256-bit horizontal instruction.

9051

/// In other words, if this returns true, then some extraction/insertion will

9052

/// be required to produce a valid horizontal instruction.

9053

///

9054

/// Parameter \p Opcode defines the kind of horizontal operation to match.

9055

/// For example, if \p Opcode is equal to ISD::ADD, then this function

9056

/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode

9057

/// is equal to ISD::SUB, then this function checks if this is a horizontal

9058

/// arithmetic sub.

9059

///

9060

/// This function only analyzes elements of \p N whose indices are

9061

/// in range [BaseIdx, LastIdx).

9062

///

9063

/// TODO: This function was originally used to match both real and fake partial

9064

/// horizontal operations, but the index-matching logic is incorrect for that.

9065

/// See the corrected implementation in isHopBuildVector(). Can we reduce this

9066

/// code because it is only used for partial h-op matching now?

9067

static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,

9068

SelectionDAG &DAG,

9069

unsigned BaseIdx, unsigned LastIdx,

9070

SDValue &V0, SDValue &V1) {

9071

EVT VT = N->getValueType(0);

9072

assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops")((VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"
) ? static_cast<void> (0) : __assert_fail ("VT.is256BitVector() && \"Only use for matching partial 256-bit h-ops\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9072, __PRETTY_FUNCTION__));

9073

assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!")((BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"
) ? static_cast<void> (0) : __assert_fail ("BaseIdx * 2 <= LastIdx && \"Invalid Indices in input!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9073, __PRETTY_FUNCTION__));

9074

assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&((VT.isVector() && VT.getVectorNumElements() >= LastIdx
&& "Invalid Vector in input!") ? static_cast<void
> (0) : __assert_fail ("VT.isVector() && VT.getVectorNumElements() >= LastIdx && \"Invalid Vector in input!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9075, __PRETTY_FUNCTION__))

9075

"Invalid Vector in input!")((VT.isVector() && VT.getVectorNumElements() >= LastIdx
&& "Invalid Vector in input!") ? static_cast<void
> (0) : __assert_fail ("VT.isVector() && VT.getVectorNumElements() >= LastIdx && \"Invalid Vector in input!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9075, __PRETTY_FUNCTION__));

9076

9077

bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);

9078

bool CanFold = true;

9079

unsigned ExpectedVExtractIdx = BaseIdx;

9080

unsigned NumElts = LastIdx - BaseIdx;

9081

V0 = DAG.getUNDEF(VT);

9082

V1 = DAG.getUNDEF(VT);

9083

9084

// Check if N implements a horizontal binop.

9085

for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {

9086

SDValue Op = N->getOperand(i + BaseIdx);

9087

9088

// Skip UNDEFs.

9089

if (Op->isUndef()) {

9090

// Update the expected vector extract index.

9091

if (i * 2 == NumElts)

9092

ExpectedVExtractIdx = BaseIdx;

9093

ExpectedVExtractIdx += 2;

9094

continue;

9095

}

9096

9097

CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

9098

9099

if (!CanFold)

9100

break;

9101

9102

SDValue Op0 = Op.getOperand(0);

9103

SDValue Op1 = Op.getOperand(1);

9104

9105

// Try to match the following pattern:

9106

// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))

9107

CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

9108

Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

9109

Op0.getOperand(0) == Op1.getOperand(0) &&

9110

isa<ConstantSDNode>(Op0.getOperand(1)) &&

9111

isa<ConstantSDNode>(Op1.getOperand(1)));

9112

if (!CanFold)

9113

break;

9114

9115

unsigned I0 = Op0.getConstantOperandVal(1);

9116

unsigned I1 = Op1.getConstantOperandVal(1);

9117

9118

if (i * 2 < NumElts) {

9119

if (V0.isUndef()) {

9120

V0 = Op0.getOperand(0);

9121

if (V0.getValueType() != VT)

9122

return false;

9123

}

9124

} else {

9125

if (V1.isUndef()) {

9126

V1 = Op0.getOperand(0);

9127

if (V1.getValueType() != VT)

9128

return false;

9129

}

9130

if (i * 2 == NumElts)

9131

ExpectedVExtractIdx = BaseIdx;

9132

}

9133

9134

SDValue Expected = (i * 2 < NumElts) ? V0 : V1;

9135

if (I0 == ExpectedVExtractIdx)

9136

CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;

9137

else if (IsCommutable && I1 == ExpectedVExtractIdx) {

9138

// Try to match the following dag sequence:

9139

// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))

9140

CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;

9141

} else

9142

CanFold = false;

9143

9144

ExpectedVExtractIdx += 2;

9145

}

9146

9147

return CanFold;

9148

}

9149

9150

/// Emit a sequence of two 128-bit horizontal add/sub followed by

9151

/// a concat_vector.

9152

///

9153

/// This is a helper function of LowerToHorizontalOp().

9154

/// This function expects two 256-bit vectors called V0 and V1.

9155

/// At first, each vector is split into two separate 128-bit vectors.

9156

/// Then, the resulting 128-bit vectors are used to implement two

9157

/// horizontal binary operations.

9158

///

9159

/// The kind of horizontal binary operation is defined by \p X86Opcode.

9160

///

9161

/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to

9162

/// the two new horizontal binop.

9163

/// When Mode is set, the first horizontal binop dag node would take as input

9164

/// the lower 128-bit of V0 and the upper 128-bit of V0. The second

9165

/// horizontal binop dag node would take as input the lower 128-bit of V1

9166

/// and the upper 128-bit of V1.

9167

/// Example:

9168

/// HADD V0_LO, V0_HI

9169

/// HADD V1_LO, V1_HI

9170

///

9171

/// Otherwise, the first horizontal binop dag node takes as input the lower

9172

/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop

9173

/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.

9174

/// Example:

9175

/// HADD V0_LO, V1_LO

9176

/// HADD V0_HI, V1_HI

9177

///

9178

/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower

9179

/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to

9180

/// the upper 128-bits of the result.

9181

static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  const MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  // Split both 256-bit inputs into their lower and upper 128-bit halves.
  const unsigned HalfNumElts = VT.getVectorNumElements() / 2;
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, HalfNumElts, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, HalfNumElts, DAG, DL);
  const MVT HalfVT = V0_LO.getSimpleValueType();

  // Both result halves start out as UNDEF and are only replaced by a real
  // horizontal binop when that half is actually needed.
  SDValue LO = DAG.getUNDEF(HalfVT);
  SDValue HI = DAG.getUNDEF(HalfVT);

  // Mode == true : LO = op(V0_LO, V0_HI), HI = op(V1_LO, V1_HI)
  // Mode == false: LO = op(V0_LO, V1_LO), HI = op(V0_HI, V1_HI)
  const SDValue &LoRHS = Mode ? V0_HI : V1_LO;
  const SDValue &HiLHS = Mode ? V1_LO : V0_HI;

  // Don't emit a horizontal binop if the result is expected to be UNDEF.
  const bool LoHasInput =
      Mode ? !V0->isUndef() : (!V0_LO->isUndef() || !V1_LO->isUndef());
  const bool HiHasInput =
      Mode ? !V1->isUndef() : (!V0_HI->isUndef() || !V1_HI->isUndef());

  if (!isUndefLO && LoHasInput)
    LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, LoRHS);
  if (!isUndefHI && HiHasInput)
    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, V1_HI);

  // Glue the two 128-bit results back into a single 256-bit vector.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

9216

9217

/// Returns true iff \p BV builds a vector with the result equivalent to

9218

/// the result of ADDSUB/SUBADD operation.

9219

/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1

9220

/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters

9221

/// \p Opnd0 and \p Opnd1.

9222

static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,

9223

const X86Subtarget &Subtarget, SelectionDAG &DAG,

9224

SDValue &Opnd0, SDValue &Opnd1,

9225

unsigned &NumExtracts,

9226

bool &IsSubAdd) {

9227

9228

MVT VT = BV->getSimpleValueType(0);

9229

if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())

9230

return false;

9231

9232

unsigned NumElts = VT.getVectorNumElements();

9233

SDValue InVec0 = DAG.getUNDEF(VT);

9234

SDValue InVec1 = DAG.getUNDEF(VT);

9235

9236

NumExtracts = 0;

9237

9238

// Odd-numbered elements in the input build vector are obtained from

9239

// adding/subtracting two integer/float elements.

9240

// Even-numbered elements in the input build vector are obtained from

9241

// subtracting/adding two integer/float elements.

9242

unsigned Opc[2] = {0, 0};

9243

for (unsigned i = 0, e = NumElts; i != e; ++i) {

9244

SDValue Op = BV->getOperand(i);

9245

9246

// Skip 'undef' values.

9247

unsigned Opcode = Op.getOpcode();

9248

if (Opcode == ISD::UNDEF)

9249

continue;

9250

9251

// Early exit if we found an unexpected opcode.

9252

if (Opcode != ISD::FADD && Opcode != ISD::FSUB)

9253

return false;

9254

9255

SDValue Op0 = Op.getOperand(0);

9256

SDValue Op1 = Op.getOperand(1);

9257

9258

// Try to match the following pattern:

9259

// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))

9260

// Early exit if we cannot match that sequence.

9261

if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

9262

Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

9263

!isa<ConstantSDNode>(Op0.getOperand(1)) ||

9264

Op0.getOperand(1) != Op1.getOperand(1))

9265

return false;

9266

9267

unsigned I0 = Op0.getConstantOperandVal(1);

9268

if (I0 != i)

9269

return false;

9270

9271

// We found a valid add/sub node, make sure its the same opcode as previous

9272

// elements for this parity.

9273

if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)

9274

return false;

9275

Opc[i % 2] = Opcode;

9276

9277

// Update InVec0 and InVec1.

9278

if (InVec0.isUndef()) {

9279

InVec0 = Op0.getOperand(0);

9280

if (InVec0.getSimpleValueType() != VT)

9281

return false;

9282

}

9283

if (InVec1.isUndef()) {

9284

InVec1 = Op1.getOperand(0);

9285

if (InVec1.getSimpleValueType() != VT)

9286

return false;

9287

}

9288

9289

// Make sure that operands in input to each add/sub node always

9290

// come from a same pair of vectors.

9291

if (InVec0 != Op0.getOperand(0)) {

9292

if (Opcode == ISD::FSUB)

9293

return false;

9294

9295

// FADD is commutable. Try to commute the operands

9296

// and then test again.

9297

std::swap(Op0, Op1);

9298

if (InVec0 != Op0.getOperand(0))

9299

return false;

9300

}

9301

9302

if (InVec1 != Op1.getOperand(0))

9303

return false;

9304

9305

// Increment the number of extractions done.

9306

++NumExtracts;

9307

}

9308

9309

// Ensure we have found an opcode for both parities and that they are

9310

// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the

9311

// inputs are undef.

9312

if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||

9313

InVec0.isUndef() || InVec1.isUndef())

9314

return false;

9315

9316

IsSubAdd = Opc[0] == ISD::FADD;

9317

9318

Opnd0 = InVec0;

9319

Opnd1 = InVec1;

9320

return true;

9321

}

9322

9323

/// Returns true if is possible to fold MUL and an idiom that has already been

9324

/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into

9325

/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the

9326

/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.

9327

///

9328

/// Prior to calling this function it should be known that there is some

9329

/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation

9330

/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called

9331

/// before replacement of such SDNode with ADDSUB operation. Thus the number

9332

/// of \p Opnd0 uses is expected to be equal to 2.

9333

/// For example, this function may be called for the following IR:

9334

/// %AB = fmul fast <2 x double> %A, %B

9335

/// %Sub = fsub fast <2 x double> %AB, %C

9336

/// %Add = fadd fast <2 x double> %AB, %C

9337

/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,

9338

/// <2 x i32> <i32 0, i32 3>

9339

/// There is a def for %Addsub here, which potentially can be replaced by

9340

/// X86ISD::ADDSUB operation:

9341

/// %Addsub = X86ISD::ADDSUB %AB, %C

9342

/// and such ADDSUB can further be replaced with FMADDSUB:

9343

/// %Addsub = FMADDSUB %A, %B, %C.

9344

///

9345

/// The main reason why this method is called before the replacement of the

9346

/// recognized ADDSUB idiom with ADDSUB operation is that such replacement

9347

/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit

9348

/// FMADDSUB is.

9349

static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,

9350

SelectionDAG &DAG,

9351

SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,

9352

unsigned ExpectedUses) {

9353

if (Opnd0.getOpcode() != ISD::FMUL ||

9354

!Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())

9355

return false;

9356

9357

// FIXME: These checks must match the similar ones in

9358

// DAGCombiner::visitFADDForFMACombine. It would be good to have one

9359

// function that would answer if it is Ok to fuse MUL + ADD to FMADD

9360

// or MUL + ADDSUB to FMADDSUB.

9361

const TargetOptions &Options = DAG.getTarget().Options;

9362

bool AllowFusion =

9363

(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);

9364

if (!AllowFusion)

9365

return false;

9366

9367

Opnd2 = Opnd1;

9368

Opnd1 = Opnd0.getOperand(1);

9369

Opnd0 = Opnd0.getOperand(0);

9370

9371

return true;

9372

}

9373

9374

/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or

9375

/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or

9376

/// X86ISD::FMSUBADD node.

9377

static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,

9378

const X86Subtarget &Subtarget,

9379

SelectionDAG &DAG) {

9380

SDValue Opnd0, Opnd1;

9381

unsigned NumExtracts;

9382

bool IsSubAdd;

9383

if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,

9384

IsSubAdd))

9385

return SDValue();

9386

9387

MVT VT = BV->getSimpleValueType(0);

9388

SDLoc DL(BV);

9389

9390

// Try to generate X86ISD::FMADDSUB node here.

9391

SDValue Opnd2;

9392

if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {

9393

unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;

9394

return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);

9395

}

9396

9397

// We only support ADDSUB.

9398

if (IsSubAdd)

9399

return SDValue();

9400

9401

// Do not generate X86ISD::ADDSUB node for 512-bit types even though

9402

// the ADDSUB idiom has been successfully recognized. There are no known

9403

// X86 targets with 512-bit ADDSUB instructions!

9404

// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom

9405

// recognition.

9406

if (VT.is512BitVector())

9407

return SDValue();

9408

9409

return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);

9410

}

9411

9412

static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,

9413

unsigned &HOpcode, SDValue &V0, SDValue &V1) {

9414

// Initialize outputs to known values.

9415

MVT VT = BV->getSimpleValueType(0);

9416

HOpcode = ISD::DELETED_NODE;

9417

V0 = DAG.getUNDEF(VT);

9418

V1 = DAG.getUNDEF(VT);

9419

9420

// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit

9421

// half of the result is calculated independently from the 128-bit halves of

9422

// the inputs, so that makes the index-checking logic below more complicated.

9423

unsigned NumElts = VT.getVectorNumElements();

9424

unsigned GenericOpcode = ISD::DELETED_NODE;

9425

unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;

9426

unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;

9427

unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;

9428

for (unsigned i = 0; i != Num128BitChunks; ++i) {

9429

for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {

9430

// Ignore undef elements.

9431

SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);

9432

if (Op.isUndef())

9433

continue;

9434

9435

// If there's an opcode mismatch, we're done.

9436

if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)

9437

return false;

9438

9439

// Initialize horizontal opcode.

9440

if (HOpcode == ISD::DELETED_NODE) {

9441

GenericOpcode = Op.getOpcode();

9442

switch (GenericOpcode) {

9443

case ISD::ADD: HOpcode = X86ISD::HADD; break;

9444

case ISD::SUB: HOpcode = X86ISD::HSUB; break;

9445

case ISD::FADD: HOpcode = X86ISD::FHADD; break;

9446

case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;

9447

default: return false;

9448

}

9449

}

9450

9451

SDValue Op0 = Op.getOperand(0);

9452

SDValue Op1 = Op.getOperand(1);

9453

if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

9454

Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

9455

Op0.getOperand(0) != Op1.getOperand(0) ||

9456

!isa<ConstantSDNode>(Op0.getOperand(1)) ||

9457

!isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())

9458

return false;

9459

9460

// The source vector is chosen based on which 64-bit half of the

9461

// destination vector is being calculated.

9462

if (j < NumEltsIn64Bits) {

9463

if (V0.isUndef())

9464

V0 = Op0.getOperand(0);

9465

} else {

9466

if (V1.isUndef())

9467

V1 = Op0.getOperand(0);

9468

}

9469

9470

SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;

9471

if (SourceVec != Op0.getOperand(0))

9472

return false;

9473

9474

// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)

9475

unsigned ExtIndex0 = Op0.getConstantOperandVal(1);

9476

unsigned ExtIndex1 = Op1.getConstantOperandVal(1);

9477

unsigned ExpectedIndex = i * NumEltsIn128Bits +

9478

(j % NumEltsIn64Bits) * 2;

9479

if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)

9480

continue;

9481

9482

// If this is not a commutative op, this does not match.

9483

if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)

9484

return false;

9485

9486

// Addition is commutative, so try swapping the extract indexes.

9487

// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)

9488

if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)

9489

continue;

9490

9491

// Extract indexes do not match horizontal requirement.

9492

return false;

9493

}

9494

}

9495

// We matched. Opcode and operands are returned by reference as arguments.

9496

return true;

9497

}

9498

9499

/// Build the horizontal-op node \p HOpcode for \p BV from the source vectors
/// \p V0 and \p V1, resizing the sources to the build vector's width and
/// narrowing a 256-bit op to 128 bits when the upper half is entirely undef.
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    SelectionDAG &DAG, unsigned HOpcode,
                                    SDValue V0, SDValue V1) {
  const SDLoc DL(BV);
  MVT VT = BV->getSimpleValueType(0);
  const unsigned Width = VT.getSizeInBits();

  // If either input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  auto FitToWidth = [&](SDValue V) -> SDValue {
    if (V.getValueSizeInBits() > Width)
      return extractSubVector(V, 0, DAG, DL, Width);
    if (V.getValueSizeInBits() < Width)
      return insertSubVector(DAG.getUNDEF(VT), V, 0, DAG, DL, Width);
    return V;
  };
  V0 = FitToWidth(V0);
  V1 = FitToWidth(V1);

  // The upper half is demanded if any of its build_vector operands is
  // not undef.
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned HalfNumElts = NumElts / 2;
  bool UpperHalfDemanded = false;
  for (unsigned Idx = HalfNumElts; Idx != NumElts; ++Idx)
    if (!BV->getOperand(Idx).isUndef())
      UpperHalfDemanded = true;

  // If we don't need the upper xmm, then perform as a xmm hop.
  if (VT.is256BitVector() && !UpperHalfDemanded) {
    MVT HalfVT = VT.getHalfNumVectorElementsVT();
    V0 = extractSubVector(V0, 0, DAG, DL, 128);
    V1 = extractSubVector(V1, 0, DAG, DL, 128);
    SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
  }

  return DAG.getNode(HOpcode, DL, VT, V0, V1);
}

9535

9536

/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.

9537

static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,

9538

const X86Subtarget &Subtarget,

9539

SelectionDAG &DAG) {

9540

// We need at least 2 non-undef elements to make this worthwhile by default.

9541

unsigned NumNonUndefs =

9542

count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });

9543

if (NumNonUndefs < 2)

9544

return SDValue();

9545

9546

// There are 4 sets of horizontal math operations distinguished by type:

9547

// int/FP at 128-bit/256-bit. Each type was introduced with a different

9548

// subtarget feature. Try to match those "native" patterns first.

9549

MVT VT = BV->getSimpleValueType(0);

9550

if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||

9551

((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||

9552

((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||

9553

((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {

9554

unsigned HOpcode;

9555

SDValue V0, V1;

9556

if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))

9557

return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);

9558

}

9559

9560

// Try harder to match 256-bit ops by using extract/concat.

9561

if (!Subtarget.hasAVX() || !VT.is256BitVector())

9562

return SDValue();

9563

9564

// Count the number of UNDEF operands in the build_vector in input.

9565

unsigned NumElts = VT.getVectorNumElements();

9566

unsigned Half = NumElts / 2;

9567

unsigned NumUndefsLO = 0;

9568

unsigned NumUndefsHI = 0;

9569

for (unsigned i = 0, e = Half; i != e; ++i)

9570

if (BV->getOperand(i)->isUndef())

9571

NumUndefsLO++;

9572

9573

for (unsigned i = Half, e = NumElts; i != e; ++i)

9574

if (BV->getOperand(i)->isUndef())

9575

NumUndefsHI++;

9576

9577

SDLoc DL(BV);

9578

SDValue InVec0, InVec1;

9579

if (VT == MVT::v8i32 || VT == MVT::v16i16) {

9580

SDValue InVec2, InVec3;

9581

unsigned X86Opcode;

9582

bool CanFold = true;

9583

9584

if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&

9585

isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,

9586

InVec3) &&

9587

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

9588

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

9589

X86Opcode = X86ISD::HADD;

9590

else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,

9591

InVec1) &&

9592

isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,

9593

InVec3) &&

9594

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

9595

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

9596

X86Opcode = X86ISD::HSUB;

9597

else

9598

CanFold = false;

9599

9600

if (CanFold) {

9601

// Do not try to expand this build_vector into a pair of horizontal

9602

// add/sub if we can emit a pair of scalar add/sub.

9603

if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)

9604

return SDValue();

9605

9606

// Convert this build_vector into a pair of horizontal binops followed by

9607

// a concat vector. We must adjust the outputs from the partial horizontal

9608

// matching calls above to account for undefined vector halves.

9609

SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;

9610

SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;

9611

assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?")(((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"
) ? static_cast<void> (0) : __assert_fail ("(!V0.isUndef() || !V1.isUndef()) && \"Horizontal-op of undefs?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9611, __PRETTY_FUNCTION__));

9612

bool isUndefLO = NumUndefsLO == Half;

9613

bool isUndefHI = NumUndefsHI == Half;

9614

return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,

9615

isUndefHI);

9616

}

9617

}

9618

9619

if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||

9620

VT == MVT::v16i16) {

9621

unsigned X86Opcode;

9622

if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))

9623

X86Opcode = X86ISD::HADD;

9624

else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,

9625

InVec1))

9626

X86Opcode = X86ISD::HSUB;

9627

else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,

9628

InVec1))

9629

X86Opcode = X86ISD::FHADD;

9630

else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,

9631

InVec1))

9632

X86Opcode = X86ISD::FHSUB;

9633

else

9634

return SDValue();

9635

9636

// Don't try to expand this build_vector into a pair of horizontal add/sub

9637

// if we can simply emit a pair of scalar add/sub.

9638

if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)

9639

return SDValue();

9640

9641

// Convert this build_vector into two horizontal add/sub followed by

9642

// a concat vector.

9643

bool isUndefLO = NumUndefsLO == Half;

9644

bool isUndefHI = NumUndefsHI == Half;

9645

return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,

9646

isUndefLO, isUndefHI);

9647

}

9648

9649

return SDValue();

9650

}

9651

9652

// Forward declaration; called by lowerBuildVectorToBitOp below.
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG);

/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  const unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned Idx = 1, NumElems = VT.getVectorNumElements(); Idx < NumElems;
       ++Idx)
    if (Op->getOperand(Idx).getOpcode() != Opcode)
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  const bool IsShift =
      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
  const bool IsLogicOp =
      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
  if (!IsShift && !IsLogicOp)
    return SDValue();

  if (IsLogicOp) {
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
  }

  // Gather the per-element left and right operands.
  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue EltLHS = Elt.getOperand(0);
    SDValue EltRHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(EltRHS))
      return SDValue();

    // A constant narrower/wider than the element type is only tolerated for
    // shift amounts, which are then extended/truncated to the element type.
    if (EltRHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
      if (!IsShift)
        return SDValue();
      EltRHS = DAG.getZExtOrTrunc(EltRHS, DL, VT.getScalarType());
    }

    LHSElts.push_back(EltLHS);
    RHSElts.push_back(EltRHS);
  }

  // Limit to shifts by uniform immediates.
  // TODO: Only accept vXi8/vXi64 special cases?
  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
  if (IsShift) {
    const bool NonUniform =
        any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; });
    if (NonUniform)
      return SDValue();
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);

  // Immediately lower the shift to ensure the constant build vector doesn't
  // get converted to a constant pool before the shift is lowered.
  return IsShift ? LowerShift(Res, Subtarget, DAG) : Res;
}

9735

9736

/// Create a vector constant without a load. SSE/AVX provide the bare minimum

9737

/// functionality to do this, so it's all zeros, all ones, or some derivation

9738

/// that is cheap to calculate.

9739

static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,

9740

const X86Subtarget &Subtarget) {

9741

SDLoc DL(Op);

9742

MVT VT = Op.getSimpleValueType();

9743

9744

// Vectors containing all zeros can be matched by pxor and xorps.

9745

if (ISD::isBuildVectorAllZeros(Op.getNode()))

9746

return Op;

9747

9748

// Vectors containing all ones can be matched by pcmpeqd on 128-bit width

9749

// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use

9750

// vpcmpeqd on 256-bit vectors.

9751

if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {

9752

if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)

9753

return Op;

9754

9755

return getOnesVector(VT, DAG, DL);

9756

}

9757

9758

return SDValue();

9759

}

9760

9761

/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute

9762

/// from a vector of source values and a vector of extraction indices.

9763

/// The vectors might be manipulated to match the type of the permute op.

9764

static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,

9765

SDLoc &DL, SelectionDAG &DAG,

9766

const X86Subtarget &Subtarget) {

9767

MVT ShuffleVT = VT;

9768

EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();

9769

unsigned NumElts = VT.getVectorNumElements();

9770

unsigned SizeInBits = VT.getSizeInBits();

9771

9772

// Adjust IndicesVec to match VT size.

9773

assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&((IndicesVec.getValueType().getVectorNumElements() >= NumElts
&& "Illegal variable permute mask size") ? static_cast
<void> (0) : __assert_fail ("IndicesVec.getValueType().getVectorNumElements() >= NumElts && \"Illegal variable permute mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9774, __PRETTY_FUNCTION__))

9774

"Illegal variable permute mask size")((IndicesVec.getValueType().getVectorNumElements() >= NumElts
&& "Illegal variable permute mask size") ? static_cast
<void> (0) : __assert_fail ("IndicesVec.getValueType().getVectorNumElements() >= NumElts && \"Illegal variable permute mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9774, __PRETTY_FUNCTION__));

9775

if (IndicesVec.getValueType().getVectorNumElements() > NumElts)

9776

IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),

9777

NumElts * VT.getScalarSizeInBits());

9778

IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

9779

9780

// Handle SrcVec that don't match VT type.

9781

if (SrcVec.getValueSizeInBits() != SizeInBits) {

9782

if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {

9783

// Handle larger SrcVec by treating it as a larger permute.

9784

unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;

9785

VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);

9786

IndicesVT = EVT(VT).changeVectorElementTypeToInteger();

9787

IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,

9788

Subtarget, DAG, SDLoc(IndicesVec));

9789

SDValue NewSrcVec =

9790

createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);

9791

if (NewSrcVec)

9792

return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);

9793

return SDValue();

9794

} else if (SrcVec.getValueSizeInBits() < SizeInBits) {

9795

// Widen smaller SrcVec to match VT.

9796

SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));

9797

} else

9798

return SDValue();

9799

}

9800

9801

auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {

9802

assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale")((isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_64(Scale) && \"Illegal variable permute shuffle scale\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9802, __PRETTY_FUNCTION__));

9803

EVT SrcVT = Idx.getValueType();

9804

unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;

9805

uint64_t IndexScale = 0;

9806

uint64_t IndexOffset = 0;

9807

9808

// If we're scaling a smaller permute op, then we need to repeat the

9809

// indices, scaling and offsetting them as well.

9810

// e.g. v4i32 -> v16i8 (Scale = 4)

9811

// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)

9812

// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)

9813

for (uint64_t i = 0; i != Scale; ++i) {

9814

IndexScale |= Scale << (i * NumDstBits);

9815

IndexOffset |= i << (i * NumDstBits);

9816

}

9817

9818

Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,

9819

DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));

9820

Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,

9821

DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));

9822

return Idx;

9823

};

9824

9825

unsigned Opcode = 0;

9826

switch (VT.SimpleTy) {

9827

default:

9828

break;

9829

case MVT::v16i8:

9830

if (Subtarget.hasSSSE3())

9831

Opcode = X86ISD::PSHUFB;

9832

break;

9833

case MVT::v8i16:

9834

if (Subtarget.hasVLX() && Subtarget.hasBWI())

9835

Opcode = X86ISD::VPERMV;

9836

else if (Subtarget.hasSSSE3()) {

9837

Opcode = X86ISD::PSHUFB;

9838

ShuffleVT = MVT::v16i8;

9839

}

9840

break;

9841

case MVT::v4f32:

9842

case MVT::v4i32:

9843

if (Subtarget.hasAVX()) {

9844

Opcode = X86ISD::VPERMILPV;

9845

ShuffleVT = MVT::v4f32;

9846

} else if (Subtarget.hasSSSE3()) {

9847

Opcode = X86ISD::PSHUFB;

9848

ShuffleVT = MVT::v16i8;

9849

}

9850

break;

9851

case MVT::v2f64:

9852

case MVT::v2i64:

9853

if (Subtarget.hasAVX()) {

9854

// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.

9855

IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);

9856

Opcode = X86ISD::VPERMILPV;

9857

ShuffleVT = MVT::v2f64;

9858

} else if (Subtarget.hasSSE41()) {

9859

// SSE41 can compare v2i64 - select between indices 0 and 1.

9860

return DAG.getSelectCC(

9861

DL, IndicesVec,

9862

getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),

9863

DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),

9864

DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),

9865

ISD::CondCode::SETEQ);

9866

}

9867

break;

9868

case MVT::v32i8:

9869

if (Subtarget.hasVLX() && Subtarget.hasVBMI())

9870

Opcode = X86ISD::VPERMV;

9871

else if (Subtarget.hasXOP()) {

9872

SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);

9873

SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);

9874

SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);

9875

SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);

9876

return DAG.getNode(

9877

ISD::CONCAT_VECTORS, DL, VT,

9878

DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),

9879

DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));

9880

} else if (Subtarget.hasAVX()) {

9881

SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);

9882

SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);

9883

SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);

9884

SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);

9885

auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

9886

ArrayRef<SDValue> Ops) {

9887

// Permute Lo and Hi and then select based on index range.

9888

// This works as SHUFB uses bits[3:0] to permute elements and we don't

9889

// care about the bit[7] as its just an index vector.

9890

SDValue Idx = Ops[2];

9891

EVT VT = Idx.getValueType();

9892

return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),

9893

DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),

9894

DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),

9895

ISD::CondCode::SETGT);

9896

};

9897

SDValue Ops[] = {LoLo, HiHi, IndicesVec};

9898

return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,

9899

PSHUFBBuilder);

9900

}

9901

break;

9902

case MVT::v16i16:

9903

if (Subtarget.hasVLX() && Subtarget.hasBWI())

9904

Opcode = X86ISD::VPERMV;

9905

else if (Subtarget.hasAVX()) {

9906

// Scale to v32i8 and perform as v32i8.

9907

IndicesVec = ScaleIndices(IndicesVec, 2);

9908

return DAG.getBitcast(

9909

VT, createVariablePermute(

9910

MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),

9911

DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));

9912

}

9913

break;

9914

case MVT::v8f32:

9915

case MVT::v8i32:

9916

if (Subtarget.hasAVX2())

9917

Opcode = X86ISD::VPERMV;

9918

else if (Subtarget.hasAVX()) {

9919

SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);

9920

SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,

9921

{0, 1, 2, 3, 0, 1, 2, 3});

9922

SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,

9923

{4, 5, 6, 7, 4, 5, 6, 7});

9924

if (Subtarget.hasXOP())

9925

return DAG.getBitcast(

9926

VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,

9927

IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));

9928

// Permute Lo and Hi and then select based on index range.

9929

// This works as VPERMILPS only uses index bits[0:1] to permute elements.

9930

SDValue Res = DAG.getSelectCC(

9931

DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),

9932

DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),

9933

DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),

9934

ISD::CondCode::SETGT);

9935

return DAG.getBitcast(VT, Res);

9936

}

9937

break;

9938

case MVT::v4i64:

9939

case MVT::v4f64:

9940

if (Subtarget.hasAVX512()) {

9941

if (!Subtarget.hasVLX()) {

9942

MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);

9943

SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,

9944

SDLoc(SrcVec));

9945

IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,

9946

DAG, SDLoc(IndicesVec));

9947

SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,

9948

DAG, Subtarget);

9949

return extract256BitVector(Res, 0, DAG, DL);

9950

}

9951

Opcode = X86ISD::VPERMV;

9952

} else if (Subtarget.hasAVX()) {

9953

SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);

9954

SDValue LoLo =

9955

DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});

9956

SDValue HiHi =

9957

DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});

9958

// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.

9959

IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);

9960

if (Subtarget.hasXOP())

9961

return DAG.getBitcast(

9962

VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,

9963

IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));

9964

// Permute Lo and Hi and then select based on index range.

9965

// This works as VPERMILPD only uses index bit[1] to permute elements.

9966

SDValue Res = DAG.getSelectCC(

9967

DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),

9968

DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),

9969

DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),

9970

ISD::CondCode::SETGT);

9971

return DAG.getBitcast(VT, Res);

9972

}

9973

break;

9974

case MVT::v64i8:

9975

if (Subtarget.hasVBMI())

9976

Opcode = X86ISD::VPERMV;

9977

break;

9978

case MVT::v32i16:

9979

if (Subtarget.hasBWI())

9980

Opcode = X86ISD::VPERMV;

9981

break;

9982

case MVT::v16f32:

9983

case MVT::v16i32:

9984

case MVT::v8f64:

9985

case MVT::v8i64:

9986

if (Subtarget.hasAVX512())

9987

Opcode = X86ISD::VPERMV;

9988

break;

9989

}

9990

if (!Opcode)

9991

return SDValue();

9992

9993

assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&(((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits())
== 0 && "Illegal variable permute shuffle type") ? static_cast
<void> (0) : __assert_fail ("(VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && \"Illegal variable permute shuffle type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9995, __PRETTY_FUNCTION__))

9994

(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&(((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits())
== 0 && "Illegal variable permute shuffle type") ? static_cast
<void> (0) : __assert_fail ("(VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && \"Illegal variable permute shuffle type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9995, __PRETTY_FUNCTION__))

9995

"Illegal variable permute shuffle type")(((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits())
== 0 && "Illegal variable permute shuffle type") ? static_cast
<void> (0) : __assert_fail ("(VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && \"Illegal variable permute shuffle type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 9995, __PRETTY_FUNCTION__));

9996

9997

uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();

9998

if (Scale > 1)

9999

IndicesVec = ScaleIndices(IndicesVec, Scale);

10000

10001

EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();

10002

IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

10003

10004

SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);

10005

SDValue Res = Opcode == X86ISD::VPERMV

10006

? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)

10007

: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);

10008

return DAG.getBitcast(VT, Res);

10009

}

10010

10011

// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be

10012

// reasoned to be a permutation of a vector by indices in a non-constant vector.

10013

// (build_vector (extract_elt V, (extract_elt I, 0)),

10014

// (extract_elt V, (extract_elt I, 1)),

10015

// ...

10016

// ->

10017

// (vpermv I, V)

10018

//

10019

// TODO: Handle undefs

10020

// TODO: Utilize pshufb and zero mask blending to support more efficient

10021

// construction of vectors with constant-0 elements.

10022

static SDValue

10023

LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,

10024

const X86Subtarget &Subtarget) {

10025

SDValue SrcVec, IndicesVec;

10026

// Check for a match of the permute source vector and permute index elements.

10027

// This is done by checking that the i-th build_vector operand is of the form:

10028

// (extract_elt SrcVec, (extract_elt IndicesVec, i)).

10029

for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {

10030

SDValue Op = V.getOperand(Idx);

10031

if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

10032

return SDValue();

10033

10034

// If this is the first extract encountered in V, set the source vector,

10035

// otherwise verify the extract is from the previously defined source

10036

// vector.

10037

if (!SrcVec)

10038

SrcVec = Op.getOperand(0);

10039

else if (SrcVec != Op.getOperand(0))

10040

return SDValue();

10041

SDValue ExtractedIndex = Op->getOperand(1);

10042

// Peek through extends.

10043

if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||

10044

ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)

10045

ExtractedIndex = ExtractedIndex.getOperand(0);

10046

if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

10047

return SDValue();

10048

10049

// If this is the first extract from the index vector candidate, set the

10050

// indices vector, otherwise verify the extract is from the previously

10051

// defined indices vector.

10052

if (!IndicesVec)

10053

IndicesVec = ExtractedIndex.getOperand(0);

10054

else if (IndicesVec != ExtractedIndex.getOperand(0))

10055

return SDValue();

10056

10057

auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));

10058

if (!PermIdx || PermIdx->getAPIntValue() != Idx)

10059

return SDValue();

10060

}

10061

10062

SDLoc DL(V);

10063

MVT VT = V.getSimpleValueType();

10064

return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);

10065

}

10066

10067

// Custom lowering for BUILD_VECTOR. A series of strategies is attempted in a
// fixed order and the first one that yields a value is returned. An empty
// SDValue is returned on the paths where this routine declines to produce a
// lowering itself (e.g. all-constant vectors and non-32-bit splats).
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (EltVT == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  // Pattern-specific lowerings that work on the BuildVectorSDNode directly.
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
    return BitOp;

  unsigned EVTBits = EltVT.getSizeInBits();

  // Classify the operands. Undef operands are skipped entirely: they are
  // neither counted as zero nor as non-zero and do not reduce NumConstants.
  // Bit #i of NonZeros is set when operand #i is defined and not a zero node.
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);

    const bool IsConstElt =
        isa<ConstantSDNode>(Elt) || isa<ConstantFPSDNode>(Elt);
    if (!IsConstElt) {
      IsAllConstants = false;
      --NumConstants;
    }

    if (X86::isZeroNode(Elt)) {
      ++NumZero;
      continue;
    }
    assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
    NonZeros |= (uint64_t(1) << i);
    ++NumNonZero;
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as a
  // vector and then insert the variable scalar element. If insertion is not
  // supported, fall back to a shuffle to get the scalar blended with the
  // constants. Insertion into a zero vector is handled as a special-case
  // somewhere below here.
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
    LLVMContext &Context = *DAG.getContext();
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *IntC = dyn_cast<ConstantSDNode>(Elt)) {
        ConstVecOps[i] = ConstantInt::get(Context, IntC->getAPIntValue());
        continue;
      }
      if (auto *FPC = dyn_cast<ConstantFPSDNode>(Elt)) {
        ConstVecOps[i] = ConstantFP::get(Context, FPC->getValueAPF());
        continue;
      }
      if (Elt.isUndef())
        continue;

      assert(!VarElt.getNode() && !InsIndex.getNode() &&
             "Expected one variable element in this vector");
      VarElt = Elt;
      InsIndex = DAG.getVectorIdxConstant(i, dl);
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (eg, floating point). We
    // must lower the vector right here because we can not guarantee that we'll
    // legalize it before loading it. This is also why we could not just create
    // a new build vector here. If the build vector contains illegal constants,
    // it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);

    // A plain insert is used when the variable lands in the low 128 bits.
    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
    if (InsertC < NumEltsInLow128Bits)
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

    // There's no good way to insert into the high elements of a >128-bit
    // vector, so use shuffles to avoid an extract/insert sequence.
    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");

    // Identity mask, except that lane InsertC takes element 0 of the second
    // shuffle operand (the scalar_to_vector of the variable element).
    SmallVector<int, 8> ShuffleMask;
    unsigned NumElts = VT.getVectorNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      if (i == InsertC)
        ShuffleMask.push_back(NumElts);
      else
        ShuffleMask.push_back(i);
    }
    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned NonZeroIdx = countTrailingZeros(NonZeros);
    SDValue Scalar = Op.getOperand(NonZeroIdx);

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (NonZeroIdx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Scalar);

      const bool IsDirectlyMovable =
          EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
          (EltVT == MVT::i64 && Subtarget.is64Bit());
      if (IsDirectlyMovable) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Scalar = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Scalar);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Scalar, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
        Scalar = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Scalar);
        MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
        Scalar = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Scalar);
        Scalar = getShuffleVectorZeroOrUndef(Scalar, 0, true, Subtarget, DAG);
        return DAG.getBitcast(VT, Scalar);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && NonZeroIdx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      SDValue HighElt =
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1));
      return getVShift(true, VT, HighElt, NumBits / 2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Scalar = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Scalar);
      return getShuffleVectorZeroOrUndef(Scalar, NonZeroIdx, NumZero > 0,
                                         Subtarget, DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned SplatIdx = countTrailingZeros(NonZeros);
      SDValue SplatElt = Op.getOperand(SplatIdx);
      if (Op.getNode()->isOnlyUserOf(SplatElt.getNode()))
        return LowerAsSplatVectorLoad(SplatElt, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;

  // See if we can use a vector load to get all of the elements.
  {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }

  // If this is a splat of pairs of 32-bit elements, we can use a narrower
  // build_vector and broadcast it.
  // TODO: We could probably generalize this more.
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };

    // Make sure all the even/odd operands match the first pair.
    bool IsPairSplat = true;
    for (unsigned i = 2; i != NumElems; ++i) {
      if (Ops[i % 2] != Op.getOperand(i)) {
        IsPairSplat = false;
        break;
      }
    }

    if (IsPairSplat) {
      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
      // Create a new build vector and cast to v2i64/v2f64.
      SDValue NarrowBV = DAG.getBuildVector(NarrowVT, dl, Ops);
      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), NarrowBV);
      // Broadcast from v2i64/v2f64 and cast to final VT.
      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
      SDValue Bcast = DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV);
      return DAG.getBitcast(VT, Bcast);
    }
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() > 128) {
    const unsigned HalfNumElems = NumElems / 2;
    MVT HVT = MVT::getVectorVT(EltVT, HalfNumElems);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, HalfNumElems));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, Op->ops().slice(HalfNumElems, HalfNumElems));

    // Recreate the wider vector with the lower and upper part.
    return concatSubVectors(Lower, Upper, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero != 1)
      return SDValue();

    // One half is zero or undef.
    unsigned NonZeroIdx = countTrailingZeros(NonZeros);
    SDValue V2 =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(NonZeroIdx));
    return getShuffleVectorZeroOrUndef(V2, NonZeroIdx, true, Subtarget, DAG);
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;
  }

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4) {
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    // Start with one full-width vector per element: a zero vector for the
    // elements whose NonZeros bit is clear, a scalar_to_vector otherwise.
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      const bool IsZeroElt = (NonZeros & (1ULL << i)) == 0;
      Ops[i] = IsZeroElt
                   ? getZeroVector(VT, Subtarget, DAG, dl)
                   : DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                 Op.getOperand(i));
    }

    // Combine each pair (2*i, 2*i+1) into Ops[i], driven by the two NonZeros
    // bits that belong to the pair.
    for (unsigned i = 0; i < 2; ++i) {
      const unsigned Even = i * 2;
      const unsigned Odd = i * 2 + 1;
      switch ((NonZeros >> (i * 2)) & 0x3) {
      case 0:
        Ops[i] = Ops[Even]; // Must be a zero vector.
        break;
      case 1:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[Odd], Ops[Even]);
        break;
      case 2:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[Even], Ops[Odd]);
        break;
      case 3:
        Ops[i] = getUnpackl(DAG, dl, VT, Ops[Even], Ops[Odd]);
        break;
      default:
        llvm_unreachable("Unexpected NonZero count");
      }
    }

    // A pair whose only non-zero element is the odd one has its two lanes
    // swapped by the final shuffle mask.
    const bool SwapLoPair = (NonZeros & 0x3) == 2;
    const bool SwapHiPair = ((NonZeros >> 2) & 0x3) == 2;
    int MaskVec[] = {
      SwapLoPair ? 1 : 0,
      SwapLoPair ? 0 : 1,
      static_cast<int>(SwapHiPair ? NumElems + 1 : NumElems),
      static_cast<int>(SwapHiPair ? NumElems : NumElems + 1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }

  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

  // Check for a build vector from mostly shuffle plus few inserting.
  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
    return Sh;

  // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget.hasSSE41()) {
    SDValue Result =
        Op.getOperand(0).isUndef()
            ? DAG.getUNDEF(VT)
            : DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));

    for (unsigned i = 1; i < NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (Elt.isUndef())
        continue;
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Elt,
                           DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }

  // Otherwise, expand into a number of unpckl*, start by extending each of
  // our (non-undef) elements to the full vector width with the element in the
  // bottom slot of the vector (which generates no code for SSE).
  SmallVector<SDValue, 8> Ops(NumElems);
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    Ops[i] = Elt.isUndef()
                 ? DAG.getUNDEF(VT)
                 : DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Elt);
  }

  // Next, we iteratively mix elements, e.g. for v4f32:
  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // Generate scaled UNPCKL shuffle mask.
    SmallVector<int, 16> Mask;
    for (unsigned Lane = 0; Lane != Scale; ++Lane)
      Mask.push_back(Lane);
    for (unsigned Lane = 0; Lane != Scale; ++Lane)
      Mask.push_back(NumElems + Lane);
    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

    const unsigned NumPairs = NumElems / (2 * Scale);
    for (unsigned Pair = 0; Pair != NumPairs; ++Pair)
      Ops[Pair] = DAG.getVectorShuffle(VT, dl, Ops[2 * Pair],
                                       Ops[(2 * Pair) + 1], Mask);
  }
  return Ops[0];
}

10432

10433

// 256-bit AVX can use the vinsertf128 instruction

10434

// to create 256-bit vectors from two other 128-bit ones.

10435

// TODO: Detect subvector broadcast here instead of DAG combine?

10436

static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,

10437

const X86Subtarget &Subtarget) {

10438

SDLoc dl(Op);

10439

MVT ResVT = Op.getSimpleValueType();

10440

10441

assert((ResVT.is256BitVector() ||(((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
"Value type must be 256-/512-bit wide") ? static_cast<void
> (0) : __assert_fail ("(ResVT.is256BitVector() || ResVT.is512BitVector()) && \"Value type must be 256-/512-bit wide\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10442, __PRETTY_FUNCTION__))

10442

ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide")(((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
"Value type must be 256-/512-bit wide") ? static_cast<void
> (0) : __assert_fail ("(ResVT.is256BitVector() || ResVT.is512BitVector()) && \"Value type must be 256-/512-bit wide\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10442, __PRETTY_FUNCTION__));

10443

10444

unsigned NumOperands = Op.getNumOperands();

10445

unsigned NumZero = 0;

10446

unsigned NumNonZero = 0;

10447

unsigned NonZeros = 0;

10448

for (unsigned i = 0; i != NumOperands; ++i) {

10449

SDValue SubVec = Op.getOperand(i);

10450

if (SubVec.isUndef())

10451

continue;

10452

if (ISD::isBuildVectorAllZeros(SubVec.getNode()))

10453

++NumZero;

10454

else {

10455

assert(i < sizeof(NonZeros) * CHAR_BIT)((i < sizeof(NonZeros) * 8) ? static_cast<void> (0) :
__assert_fail ("i < sizeof(NonZeros) * CHAR_BIT", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10455, __PRETTY_FUNCTION__)); // Ensure the shift is in range.

10456

NonZeros |= 1 << i;

10457

++NumNonZero;

10458

}

10459

}

10460

10461

// If we have more than 2 non-zeros, build each half separately.

10462

if (NumNonZero > 2) {

10463

MVT HalfVT = ResVT.getHalfNumVectorElementsVT();

10464

ArrayRef<SDUse> Ops = Op->ops();

10465

SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,

10466

Ops.slice(0, NumOperands/2));

10467

SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,

10468

Ops.slice(NumOperands/2));

10469

return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);

10470

}

10471

10472

// Otherwise, build it up through insert_subvectors.

10473

SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)

10474

: DAG.getUNDEF(ResVT);

10475

10476

MVT SubVT = Op.getOperand(0).getSimpleValueType();

10477

unsigned NumSubElems = SubVT.getVectorNumElements();

10478

for (unsigned i = 0; i != NumOperands; ++i) {

10479

if ((NonZeros & (1 << i)) == 0)

10480

continue;

10481

10482

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,

10483

Op.getOperand(i),

10484

DAG.getIntPtrConstant(i * NumSubElems, dl));

10485

}

10486

10487

return Vec;

10488

}

10489

10490

// Returns true if the given node is a type promotion (by concatenating i1

10491

// zeros) of the result of a node that already zeros all upper bits of

10492

// k-register.

10493

// TODO: Merge this with LowerAVXCONCAT_VECTORS?

10494

static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,

10495

const X86Subtarget &Subtarget,

10496

SelectionDAG & DAG) {

10497

SDLoc dl(Op);

10498

MVT ResVT = Op.getSimpleValueType();

10499

unsigned NumOperands = Op.getNumOperands();

10500

10501

assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&((NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS") ? static_cast
<void> (0) : __assert_fail ("NumOperands > 1 && isPowerOf2_32(NumOperands) && \"Unexpected number of operands in CONCAT_VECTORS\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10502, __PRETTY_FUNCTION__))

10502

"Unexpected number of operands in CONCAT_VECTORS")((NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS") ? static_cast
<void> (0) : __assert_fail ("NumOperands > 1 && isPowerOf2_32(NumOperands) && \"Unexpected number of operands in CONCAT_VECTORS\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10502, __PRETTY_FUNCTION__));

10503

10504

uint64_t Zeros = 0;

10505

uint64_t NonZeros = 0;

10506

for (unsigned i = 0; i != NumOperands; ++i) {

10507

SDValue SubVec = Op.getOperand(i);

10508

if (SubVec.isUndef())

10509

continue;

10510

assert(i < sizeof(NonZeros) * CHAR_BIT)((i < sizeof(NonZeros) * 8) ? static_cast<void> (0) :
__assert_fail ("i < sizeof(NonZeros) * CHAR_BIT", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10510, __PRETTY_FUNCTION__)); // Ensure the shift is in range.

10511

if (ISD::isBuildVectorAllZeros(SubVec.getNode()))

10512

Zeros |= (uint64_t)1 << i;

10513

else

10514

NonZeros |= (uint64_t)1 << i;

10515

}

10516

10517

unsigned NumElems = ResVT.getVectorNumElements();

10518

10519

// If we are inserting non-zero vector and there are zeros in LSBs and undef

10520

// in the MSBs we need to emit a KSHIFTL. The generic lowering to

10521

// insert_subvector will give us two kshifts.

10522

if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&

10523

Log2_64(NonZeros) != NumOperands - 1) {

10524

MVT ShiftVT = ResVT;

10525

if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)

10526

ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

10527

unsigned Idx = Log2_64(NonZeros);

10528

SDValue SubVec = Op.getOperand(Idx);

10529

unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();

10530

SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,

10531

DAG.getUNDEF(ShiftVT), SubVec,

10532

DAG.getIntPtrConstant(0, dl));

10533

Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,

10534

DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));

10535

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,

10536

DAG.getIntPtrConstant(0, dl));

10537

}

10538

10539

// If there are zero or one non-zeros we can handle this very simply.

10540

if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {

10541

SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);

10542

if (!NonZeros)

10543

return Vec;

10544

unsigned Idx = Log2_64(NonZeros);

10545

SDValue SubVec = Op.getOperand(Idx);

10546

unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();

10547

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,

10548

DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));

10549

}

10550

10551

if (NumOperands > 2) {

10552

MVT HalfVT = ResVT.getHalfNumVectorElementsVT();

10553

ArrayRef<SDUse> Ops = Op->ops();

10554

SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,

10555

Ops.slice(0, NumOperands/2));

10556

SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,

10557

Ops.slice(NumOperands/2));

10558

return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);

10559

}

10560

10561

assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?")((countPopulation(NonZeros) == 2 && "Simple cases not handled?"
) ? static_cast<void> (0) : __assert_fail ("countPopulation(NonZeros) == 2 && \"Simple cases not handled?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10561, __PRETTY_FUNCTION__));

10562

10563

if (ResVT.getVectorNumElements() >= 16)

10564

return Op; // The operation is legal with KUNPCK

10565

10566

SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,

10567

DAG.getUNDEF(ResVT), Op.getOperand(0),

10568

DAG.getIntPtrConstant(0, dl));

10569

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),

10570

DAG.getIntPtrConstant(NumElems/2, dl));

10571

}

10572

10573

/// Lower a CONCAT_VECTORS node: vXi1 results go to LowerCONCAT_VECTORSvXi1,
/// every other result type goes to LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  const bool IsMaskVector = VT.getVectorElementType() == MVT::i1;
  if (!IsMaskVector) {
    assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
           (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
            Op.getNumOperands() == 4)));

    // AVX can use the vinsertf128 instruction to create 256-bit vectors
    // from two other 128-bit ones.
    //
    // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
    return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
  }

  return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
}

10590

10591

//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while still
// providing a framework that allows reasonably efficient handling of all
// vector shuffle patterns.
//===----------------------------------------------------------------------===//

10601

10602

/// Tiny helper function to identify a no-op mask.

10603

///

10604

/// This is a somewhat boring predicate function. It checks whether the mask

10605

/// array input, which is assumed to be a single-input shuffle mask of the kind

10606

/// used by the X86 shuffle instructions (not a fully general

10607

/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an

10608

/// in-place shuffle are 'no-op's.

10609

static bool isNoopShuffleMask(ArrayRef<int> Mask) {

10610

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

10611

assert(Mask[i] >= -1 && "Out of bound mask element!")((Mask[i] >= -1 && "Out of bound mask element!") ?
static_cast<void> (0) : __assert_fail ("Mask[i] >= -1 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10611, __PRETTY_FUNCTION__));

10612

if (Mask[i] >= 0 && Mask[i] != i)

10613

return false;

10614

}

10615

return true;

10616

}

10617

10618

/// Test whether there are elements crossing LaneSizeInBits lanes in this

10619

/// shuffle mask.

10620

///

10621

/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations

10622

/// and we routinely test for these.

10623

static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,

10624

unsigned ScalarSizeInBits,

10625

ArrayRef<int> Mask) {

10626

assert(LaneSizeInBits && ScalarSizeInBits &&((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10628, __PRETTY_FUNCTION__))

10627

(LaneSizeInBits % ScalarSizeInBits) == 0 &&((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10628, __PRETTY_FUNCTION__))

10628

"Illegal shuffle lane size")((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10628, __PRETTY_FUNCTION__));

10629

int LaneSize = LaneSizeInBits / ScalarSizeInBits;

10630

int Size = Mask.size();

10631

for (int i = 0; i < Size; ++i)

10632

if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)

10633

return true;

10634

return false;

10635

}

10636

10637

/// Test whether there are elements crossing 128-bit lanes in this

10638

/// shuffle mask.

10639

static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {

10640

return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);

10641

}

10642

10643

/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come

10644

/// from multiple lanes - this is different to isLaneCrossingShuffleMask to

10645

/// better support 'repeated mask + lane permute' style shuffles.

10646

static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,

10647

unsigned ScalarSizeInBits,

10648

ArrayRef<int> Mask) {

10649

assert(LaneSizeInBits && ScalarSizeInBits &&((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10651, __PRETTY_FUNCTION__))

10650

(LaneSizeInBits % ScalarSizeInBits) == 0 &&((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10651, __PRETTY_FUNCTION__))

10651

"Illegal shuffle lane size")((LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits
% ScalarSizeInBits) == 0 && "Illegal shuffle lane size"
) ? static_cast<void> (0) : __assert_fail ("LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && \"Illegal shuffle lane size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10651, __PRETTY_FUNCTION__));

10652

int NumElts = Mask.size();

10653

int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;

10654

int NumLanes = NumElts / NumEltsPerLane;

10655

if (NumLanes > 1) {

10656

for (int i = 0; i != NumLanes; ++i) {

10657

int SrcLane = -1;

10658

for (int j = 0; j != NumEltsPerLane; ++j) {

10659

int M = Mask[(i * NumEltsPerLane) + j];

10660

if (M < 0)

10661

continue;

10662

int Lane = (M % NumElts) / NumEltsPerLane;

10663

if (SrcLane >= 0 && SrcLane != Lane)

10664

return true;

10665

SrcLane = Lane;

10666

}

10667

}

10668

}

10669

return false;

10670

}

10671

10672

/// Test whether a shuffle mask is equivalent within each sub-lane.

10673

///

10674

/// This checks a shuffle mask to see if it is performing the same

10675

/// lane-relative shuffle in each sub-lane. This trivially implies

10676

/// that it is also not lane-crossing. It may however involve a blend from the

10677

/// same lane of a second vector.

10678

///

10679

/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is

10680

/// non-trivial to compute in the face of undef lanes. The representation is

10681

/// suitable for use with existing 128-bit shuffles as entries from the second

10682

/// vector have been remapped to [LaneSize, 2*LaneSize).

10683

static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,

10684

ArrayRef<int> Mask,

10685

SmallVectorImpl<int> &RepeatedMask) {

10686

auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();

10687

RepeatedMask.assign(LaneSize, -1);

10688

int Size = Mask.size();

10689

for (int i = 0; i < Size; ++i) {

10690

assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0)((Mask[i] == SM_SentinelUndef || Mask[i] >= 0) ? static_cast
<void> (0) : __assert_fail ("Mask[i] == SM_SentinelUndef || Mask[i] >= 0"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10690, __PRETTY_FUNCTION__));

10691

if (Mask[i] < 0)

10692

continue;

10693

if ((Mask[i] % Size) / LaneSize != i / LaneSize)

10694

// This entry crosses lanes, so there is no way to model this shuffle.

10695

return false;

10696

10697

// Ok, handle the in-lane shuffles by detecting if and when they repeat.

10698

// Adjust second vector indices to start at LaneSize instead of Size.

10699

int LocalM = Mask[i] < Size ? Mask[i] % LaneSize

10700

: Mask[i] % LaneSize + LaneSize;

10701

if (RepeatedMask[i % LaneSize] < 0)

10702

// This is the first non-undef entry in this slot of a 128-bit lane.

10703

RepeatedMask[i % LaneSize] = LocalM;

10704

else if (RepeatedMask[i % LaneSize] != LocalM)

10705

// Found a mismatch with the repeated mask.

10706

return false;

10707

}

10708

return true;

10709

}

10710

10711

/// Test whether a shuffle mask is equivalent within each 128-bit lane.

10712

static bool

10713

is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,

10714

SmallVectorImpl<int> &RepeatedMask) {

10715

return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);

10716

}

10717

10718

static bool

10719

is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {

10720

SmallVector<int, 32> RepeatedMask;

10721

return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);

10722

}

10723

10724

/// Test whether a shuffle mask is equivalent within each 256-bit lane.

10725

static bool

10726

is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,

10727

SmallVectorImpl<int> &RepeatedMask) {

10728

return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);

10729

}

10730

10731

/// Test whether a target shuffle mask is equivalent within each sub-lane.

10732

/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.

10733

static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,

10734

unsigned EltSizeInBits,

10735

ArrayRef<int> Mask,

10736

SmallVectorImpl<int> &RepeatedMask) {

10737

int LaneSize = LaneSizeInBits / EltSizeInBits;

10738

RepeatedMask.assign(LaneSize, SM_SentinelUndef);

10739

int Size = Mask.size();

10740

for (int i = 0; i < Size; ++i) {

10741

assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0))((isUndefOrZero(Mask[i]) || (Mask[i] >= 0)) ? static_cast<
void> (0) : __assert_fail ("isUndefOrZero(Mask[i]) || (Mask[i] >= 0)"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10741, __PRETTY_FUNCTION__));

10742

if (Mask[i] == SM_SentinelUndef)

10743

continue;

10744

if (Mask[i] == SM_SentinelZero) {

10745

if (!isUndefOrZero(RepeatedMask[i % LaneSize]))

10746

return false;

10747

RepeatedMask[i % LaneSize] = SM_SentinelZero;

10748

continue;

10749

}

10750

if ((Mask[i] % Size) / LaneSize != i / LaneSize)

10751

// This entry crosses lanes, so there is no way to model this shuffle.

10752

return false;

10753

10754

// Ok, handle the in-lane shuffles by detecting if and when they repeat.

10755

// Adjust second vector indices to start at LaneSize instead of Size.

10756

int LocalM =

10757

Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;

10758

if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)

10759

// This is the first non-undef entry in this slot of a 128-bit lane.

10760

RepeatedMask[i % LaneSize] = LocalM;

10761

else if (RepeatedMask[i % LaneSize] != LocalM)

10762

// Found a mismatch with the repeated mask.

10763

return false;

10764

}

10765

return true;

10766

}

10767

10768

/// Test whether a target shuffle mask is equivalent within each sub-lane.

10769

/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.

10770

static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,

10771

ArrayRef<int> Mask,

10772

SmallVectorImpl<int> &RepeatedMask) {

10773

return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),

10774

Mask, RepeatedMask);

10775

}

10776

10777

/// Checks whether the vector elements referenced by two shuffle masks are

10778

/// equivalent.

10779

static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,

10780

int Idx, int ExpectedIdx) {

10781

assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&((0 <= Idx && Idx < MaskSize && 0 <=
ExpectedIdx && ExpectedIdx < MaskSize && "Out of range element index"
) ? static_cast<void> (0) : __assert_fail ("0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && ExpectedIdx < MaskSize && \"Out of range element index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10782, __PRETTY_FUNCTION__))

10782

ExpectedIdx < MaskSize && "Out of range element index")((0 <= Idx && Idx < MaskSize && 0 <=
ExpectedIdx && ExpectedIdx < MaskSize && "Out of range element index"
) ? static_cast<void> (0) : __assert_fail ("0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && ExpectedIdx < MaskSize && \"Out of range element index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10782, __PRETTY_FUNCTION__));

10783

if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())

10784

return false;

10785

10786

switch (Op.getOpcode()) {

10787

case ISD::BUILD_VECTOR:

10788

// If the values are build vectors, we can look through them to find

10789

// equivalent inputs that make the shuffles equivalent.

10790

// TODO: Handle MaskSize != Op.getNumOperands()?

10791

if (MaskSize == (int)Op.getNumOperands() &&

10792

MaskSize == (int)ExpectedOp.getNumOperands())

10793

return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);

10794

break;

10795

case X86ISD::HADD:

10796

case X86ISD::HSUB:

10797

case X86ISD::FHADD:

10798

case X86ISD::FHSUB:

10799

case X86ISD::PACKSS:

10800

case X86ISD::PACKUS:

10801

// HOP(X,X) can refer to the elt from the lower/upper half of a lane.

10802

// TODO: Handle MaskSize != NumElts?

10803

// TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.

10804

if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {

10805

MVT VT = Op.getSimpleValueType();

10806

int NumElts = VT.getVectorNumElements();

10807

if (MaskSize == NumElts) {

10808

int NumLanes = VT.getSizeInBits() / 128;

10809

int NumEltsPerLane = NumElts / NumLanes;

10810

int NumHalfEltsPerLane = NumEltsPerLane / 2;

10811

bool SameLane =

10812

(Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);

10813

bool SameElt =

10814

(Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);

10815

return SameLane && SameElt;

10816

}

10817

}

10818

break;

10819

}

10820

10821

return false;

10822

}

10823

10824

/// Checks whether a shuffle mask is equivalent to an explicit list of

10825

/// arguments.

10826

///

10827

/// This is a fast way to test a shuffle mask against a fixed pattern:

10828

///

10829

/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }

10830

///

10831

/// It returns true if the mask is exactly as wide as the argument list, and

10832

/// each element of the mask is either -1 (signifying undef) or the value given

10833

/// in the argument.

10834

static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,

10835

ArrayRef<int> ExpectedMask) {

10836

int Size = Mask.size();

10837

if (Size != (int)ExpectedMask.size())

10838

return false;

10839

10840

for (int i = 0; i < Size; ++i) {

10841

assert(Mask[i] >= -1 && "Out of bound mask element!")((Mask[i] >= -1 && "Out of bound mask element!") ?
static_cast<void> (0) : __assert_fail ("Mask[i] >= -1 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10841, __PRETTY_FUNCTION__));

10842

int MaskIdx = Mask[i];

10843

int ExpectedIdx = ExpectedMask[i];

10844

if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {

10845

SDValue MaskV = MaskIdx < Size ? V1 : V2;

10846

SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;

10847

MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);

10848

ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);

10849

if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))

10850

return false;

10851

}

10852

}

10853

return true;

10854

}

10855

10856

/// Checks whether a target shuffle mask is equivalent to an explicit pattern.

10857

///

10858

/// The masks must be exactly the same width.

10859

///

10860

/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding

10861

/// value in ExpectedMask is always accepted. Otherwise the indices must match.

10862

///

10863

/// SM_SentinelZero is accepted as a valid negative index but must match in

10864

/// both.

10865

static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,

10866

ArrayRef<int> ExpectedMask,

10867

SDValue V1 = SDValue(),

10868

SDValue V2 = SDValue()) {

10869

int Size = Mask.size();

10870

if (Size != (int)ExpectedMask.size())

10871

return false;

10872

assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&((isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask") ? static_cast<void> (0)
: __assert_fail ("isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && \"Illegal target shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10873, __PRETTY_FUNCTION__))

10873

"Illegal target shuffle mask")((isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask") ? static_cast<void> (0)
: __assert_fail ("isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && \"Illegal target shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10873, __PRETTY_FUNCTION__));

10874

10875

// Check for out-of-range target shuffle mask indices.

10876

if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))

10877

return false;

10878

10879

for (int i = 0; i < Size; ++i) {

10880

int MaskIdx = Mask[i];

10881

int ExpectedIdx = ExpectedMask[i];

10882

if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)

10883

continue;

10884

if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {

10885

SDValue MaskV = MaskIdx < Size ? V1 : V2;

10886

SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;

10887

MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);

10888

ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);

10889

if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))

10890

continue;

10891

}

10892

// TODO - handle SM_Sentinel equivalences.

10893

return false;

10894

}

10895

return true;

10896

}

10897

10898

// Attempt to create a shuffle mask from a VSELECT condition mask.

10899

static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,

10900

SDValue Cond) {

10901

EVT CondVT = Cond.getValueType();

10902

unsigned EltSizeInBits = CondVT.getScalarSizeInBits();

10903

unsigned NumElts = CondVT.getVectorNumElements();

10904

10905

APInt UndefElts;

10906

SmallVector<APInt, 32> EltBits;

10907

if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,

10908

true, false))

10909

return false;

10910

10911

Mask.resize(NumElts, SM_SentinelUndef);

10912

10913

for (int i = 0; i != (int)NumElts; ++i) {

10914

Mask[i] = i;

10915

// Arbitrarily choose from the 2nd operand if the select condition element

10916

// is undef.

10917

// TODO: Can we do better by matching patterns such as even/odd?

10918

if (UndefElts[i] || EltBits[i].isNullValue())

10919

Mask[i] += NumElts;

10920

}

10921

10922

return true;

10923

}

10924

10925

// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd

10926

// instructions.

10927

static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {

10928

if (VT != MVT::v8i32 && VT != MVT::v8f32)

10929

return false;

10930

10931

SmallVector<int, 8> Unpcklwd;

10932

createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,

10933

/* Unary = */ false);

10934

SmallVector<int, 8> Unpckhwd;

10935

createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,

10936

/* Unary = */ false);

10937

bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||

10938

isTargetShuffleEquivalent(Mask, Unpckhwd));

10939

return IsUnpackwdMask;

10940

}

10941

10942

/// Return true if \p Mask, or its commuted form, matches any 128-bit unpack
/// pattern (low or high, unary or binary) for a vector with Mask.size()
/// integer elements.
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
  // Create 128-bit vector type based on mask size.
  const size_t NumElts = Mask.size();
  MVT VT = MVT::getVectorVT(MVT::getIntegerVT(128 / NumElts), NumElts);

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (unsigned Lo = 0; Lo != 2; ++Lo) {
    for (unsigned Unary = 0; Unary != 2; ++Unary) {
      SmallVector<int, 16> UnpackMask;
      createUnpackShuffleMask(VT, UnpackMask, Lo, Unary);
      if (isTargetShuffleEquivalent(Mask, UnpackMask))
        return true;
      if (isTargetShuffleEquivalent(CommutedMask, UnpackMask))
        return true;
    }
  }
  return false;
}

10961

10962

/// Return true if a shuffle mask chooses elements identically in its top and

10963

/// bottom halves. For example, any splat mask has the same top and bottom

10964

/// halves. If an element is undefined in only one half of the mask, the halves

10965

/// are not considered identical.

10966

static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {

10967

assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask")((Mask.size() % 2 == 0 && "Expecting even number of elements in mask"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() % 2 == 0 && \"Expecting even number of elements in mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10967, __PRETTY_FUNCTION__));

10968

unsigned HalfSize = Mask.size() / 2;

10969

for (unsigned i = 0; i != HalfSize; ++i) {

10970

if (Mask[i] != Mask[i + HalfSize])

10971

return false;

10972

}

10973

return true;

10974

}

10975

10976

/// Get a 4-lane 8-bit shuffle immediate for a mask.

10977

///

10978

/// This helper function produces an 8-bit shuffle immediate corresponding to

10979

/// the ubiquitous shuffle encoding scheme used in x86 instructions for

10980

/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for

10981

/// example.

10982

///

10983

/// NB: We rely heavily on "undef" masks preserving the input lane.

10984

static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {

10985

assert(Mask.size() == 4 && "Only 4-lane shuffle masks")((Mask.size() == 4 && "Only 4-lane shuffle masks") ? static_cast
<void> (0) : __assert_fail ("Mask.size() == 4 && \"Only 4-lane shuffle masks\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10985, __PRETTY_FUNCTION__));

10986

assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!")((Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[0] >= -1 && Mask[0] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10986, __PRETTY_FUNCTION__));

10987

assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!")((Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[1] >= -1 && Mask[1] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10987, __PRETTY_FUNCTION__));

10988

assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!")((Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[2] >= -1 && Mask[2] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10988, __PRETTY_FUNCTION__));

10989

assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!")((Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[3] >= -1 && Mask[3] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10989, __PRETTY_FUNCTION__));

10990

10991

// If the mask only uses one non-undef element, then fully 'splat' it to

10992

// improve later broadcast matching.

10993

int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();

10994

assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask")((0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"
) ? static_cast<void> (0) : __assert_fail ("0 <= FirstIndex && FirstIndex < 4 && \"All undef shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10994, __PRETTY_FUNCTION__));

10995

10996

int FirstElt = Mask[FirstIndex];

10997

if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))

10998

return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;

10999

11000

unsigned Imm = 0;

11001

Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;

11002

Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;

11003

Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;

11004

Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;

11005

return Imm;

11006

}

11007

11008

/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
/// target constant node.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  const unsigned Imm = getV4X86ShuffleImm(Mask);
  return DAG.getTargetConstant(Imm, DL, MVT::i8);
}

11012

11013

// The Shuffle result is as follow:

11014

// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.

11015

// Each Zeroable's element correspond to a particular Mask's element.

11016

// As described in computeZeroableShuffleElements function.

11017

//

11018

// The function looks for a sub-mask that the nonzero elements are in

11019

// increasing order. If such sub-mask exist. The function returns true.

11020

static bool isNonZeroElementsInOrder(const APInt &Zeroable,

11021

ArrayRef<int> Mask, const EVT &VectorType,

11022

bool &IsZeroSideLeft) {

11023

int NextElement = -1;

11024

// Check if the Mask's nonzero elements are in increasing order.

11025

for (int i = 0, e = Mask.size(); i < e; i++) {

11026

// Checks if the mask's zeros elements are built from only zeros.

11027

assert(Mask[i] >= -1 && "Out of bound mask element!")((Mask[i] >= -1 && "Out of bound mask element!") ?
static_cast<void> (0) : __assert_fail ("Mask[i] >= -1 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11027, __PRETTY_FUNCTION__));

11028

if (Mask[i] < 0)

11029

return false;

11030

if (Zeroable[i])

11031

continue;

11032

// Find the lowest non zero element

11033

if (NextElement < 0) {

11034

NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;

11035

IsZeroSideLeft = NextElement != 0;

11036

}

11037

// Exit if the mask's non zero elements are not in increasing order.

11038

if (NextElement != Mask[i])

11039

return false;

11040

NextElement++;

11041

}

11042

return true;

11043

}

11044

11045

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.

11046

static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,

11047

ArrayRef<int> Mask, SDValue V1,

11048

SDValue V2, const APInt &Zeroable,

11049

const X86Subtarget &Subtarget,

11050

SelectionDAG &DAG) {

11051

int Size = Mask.size();

11052

int LaneSize = 128 / VT.getScalarSizeInBits();

11053

const int NumBytes = VT.getSizeInBits() / 8;

11054

const int NumEltBytes = VT.getScalarSizeInBits() / 8;

11055

11056

assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||(((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget
.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI
() && VT.is512BitVector())) ? static_cast<void>
(0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11058, __PRETTY_FUNCTION__))

11057

(Subtarget.hasAVX2() && VT.is256BitVector()) ||(((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget
.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI
() && VT.is512BitVector())) ? static_cast<void>
(0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11058, __PRETTY_FUNCTION__))

11058

(Subtarget.hasBWI() && VT.is512BitVector()))(((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget
.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI
() && VT.is512BitVector())) ? static_cast<void>
(0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11058, __PRETTY_FUNCTION__));

11059

11060

SmallVector<SDValue, 64> PSHUFBMask(NumBytes);

11061

// Sign bit set in i8 mask means zero element.

11062

SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

11063

11064

SDValue V;

11065

for (int i = 0; i < NumBytes; ++i) {

11066

int M = Mask[i / NumEltBytes];

11067

if (M < 0) {

11068

PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);

11069

continue;

11070

}

11071

if (Zeroable[i / NumEltBytes]) {

11072

PSHUFBMask[i] = ZeroMask;

11073

continue;

11074

}

11075

11076

// We can only use a single input of V1 or V2.

11077

SDValue SrcV = (M >= Size ? V2 : V1);

11078

if (V && V != SrcV)

11079

return SDValue();

11080

V = SrcV;

11081

M %= Size;

11082

11083

// PSHUFB can't cross lanes, ensure this doesn't happen.

11084

if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))

11085

return SDValue();

11086

11087

M = M % LaneSize;

11088

M = M * NumEltBytes + (i % NumEltBytes);

11089

PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);

11090

}

11091

assert(V && "Failed to find a source input")((V && "Failed to find a source input") ? static_cast
<void> (0) : __assert_fail ("V && \"Failed to find a source input\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11091, __PRETTY_FUNCTION__));

11092

11093

MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);

11094

return DAG.getBitcast(

11095

VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),

11096

DAG.getBuildVector(I8VT, DL, PSHUFBMask)));

11097

}

11098

11099

// Forward declaration; lowerShuffleToEXPAND below needs it.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
//
// Applies when isNonZeroElementsInOrder accepts the mask. Emits an
// X86ISD::EXPAND of V1 or V2 over a zero vector, with a vXi1 mask that has a
// bit set for every non-zeroable element. Returns an empty SDValue otherwise.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  const bool InOrder = isNonZeroElementsInOrder(
      Zeroable, Mask, V1.getValueType(), IsLeftZeroSide);
  if (!InOrder)
    return SDValue();

  // Set bits mark the elements that are not zeroable.
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  unsigned NumElts = VT.getVectorNumElements();
  // The scalar mask constant is at least 8 bits wide.
  MVT IntegerType = MVT::getIntegerVT(std::max(static_cast<int>(NumElts), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");

  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  SDValue VMask = getMaskNode(MaskNode, MaskVT, Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);

  // The in-order sequence counts from the element count of V1's type when the
  // flag is set (V2 is expanded) and from 0 otherwise (V1 is expanded).
  SDValue ExpandedVector = V1;
  if (IsLeftZeroSide)
    ExpandedVector = V2;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}

11126

11127

static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,

11128

unsigned &UnpackOpcode, bool IsUnary,

11129

ArrayRef<int> TargetMask, const SDLoc &DL,

11130

SelectionDAG &DAG,

11131

const X86Subtarget &Subtarget) {

11132

int NumElts = VT.getVectorNumElements();

11133

11134

bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;

11135

for (int i = 0; i != NumElts; i += 2) {

11136

int M1 = TargetMask[i + 0];

11137

int M2 = TargetMask[i + 1];

11138

Undef1 &= (SM_SentinelUndef == M1);

11139

Undef2 &= (SM_SentinelUndef == M2);

11140

Zero1 &= isUndefOrZero(M1);

11141

Zero2 &= isUndefOrZero(M2);

11142

}

11143

assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&((!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected") ? static_cast<void> (0) :
__assert_fail ("!((Undef1 || Zero1) && (Undef2 || Zero2)) && \"Zeroable shuffle detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11144, __PRETTY_FUNCTION__))

11144

"Zeroable shuffle detected")((!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected") ? static_cast<void> (0) :
__assert_fail ("!((Undef1 || Zero1) && (Undef2 || Zero2)) && \"Zeroable shuffle detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11144, __PRETTY_FUNCTION__));

11145

11146

// Attempt to match the target mask against the unpack lo/hi mask patterns.

11147

SmallVector<int, 64> Unpckl, Unpckh;

11148

createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);

11149

if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {

11150

UnpackOpcode = X86ISD::UNPCKL;

11151

V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));

11152

V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);

11153

return true;

11154

}

11155

11156

createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);

11157

if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {

11158

UnpackOpcode = X86ISD::UNPCKH;

11159

V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));

11160

V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);

11161

return true;

11162

}

11163

11164

// If an unary shuffle, attempt to match as an unpack lo/hi with zero.

11165

if (IsUnary && (Zero1 || Zero2)) {

11166

// Don't bother if we can blend instead.

11167

if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&

11168

isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))

11169

return false;

11170

11171

bool MatchLo = true, MatchHi = true;

11172

for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {

11173

int M = TargetMask[i];

11174

11175

// Ignore if the input is known to be zero or the index is undef.

11176

if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||

11177

(M == SM_SentinelUndef))

11178

continue;

11179

11180

MatchLo &= (M == Unpckl[i]);

11181

MatchHi &= (M == Unpckh[i]);

11182

}

11183

11184

if (MatchLo || MatchHi) {

11185

UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;

11186

V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;

11187

V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;

11188

return true;

11189

}

11190

}

11191

11192

// If a binary shuffle, commute and try again.

11193

if (!IsUnary) {

11194

ShuffleVectorSDNode::commuteMask(Unpckl);

11195

if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {

11196

UnpackOpcode = X86ISD::UNPCKL;

11197

std::swap(V1, V2);

11198

return true;

11199

}

11200

11201

ShuffleVectorSDNode::commuteMask(Unpckh);

11202

if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {

11203

UnpackOpcode = X86ISD::UNPCKH;

11204

std::swap(V1, V2);

11205

return true;

11206

}

11207

}

11208

11209

return false;

11210

}

11211

11212

// X86 has dedicated unpack instructions that can handle specific blend

11213

// operations: UNPCKH and UNPCKL.

11214

static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,

11215

ArrayRef<int> Mask, SDValue V1, SDValue V2,

11216

SelectionDAG &DAG) {

11217

SmallVector<int, 8> Unpckl;

11218

createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);

11219

if (isShuffleEquivalent(V1, V2, Mask, Unpckl))

11220

return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

11221

11222

SmallVector<int, 8> Unpckh;

11223

createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);

11224

if (isShuffleEquivalent(V1, V2, Mask, Unpckh))

11225

return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

11226

11227

// Commute and try again.

11228

ShuffleVectorSDNode::commuteMask(Unpckl);

11229

if (isShuffleEquivalent(V1, V2, Mask, Unpckl))

11230

return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

11231

11232

ShuffleVectorSDNode::commuteMask(Unpckh);

11233

if (isShuffleEquivalent(V1, V2, Mask, Unpckh))

11234

return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

11235

11236

return SDValue();

11237

}

11238

11239

/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)

11240

/// followed by unpack 256-bit.

11241

static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,

11242

ArrayRef<int> Mask, SDValue V1,

11243

SDValue V2, SelectionDAG &DAG) {

11244

SmallVector<int, 32> Unpckl, Unpckh;

11245

createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);

11246

createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

11247

11248

unsigned UnpackOpcode;

11249

if (isShuffleEquivalent(V1, V2, Mask, Unpckl))

11250

UnpackOpcode = X86ISD::UNPCKL;

11251

else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))

11252

UnpackOpcode = X86ISD::UNPCKH;

11253

else

11254

return SDValue();

11255

11256

// This is a "natural" unpack operation (rather than the 128-bit sectored

11257

// operation implemented by AVX). We need to rearrange 64-bit chunks of the

11258

// input in order to use the x86 instruction.

11259

V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),

11260

DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});

11261

V1 = DAG.getBitcast(VT, V1);

11262

return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);

11263

}

11264

11265

// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the

11266

// source into the lower elements and zeroing the upper elements.

11267

static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,

11268

ArrayRef<int> Mask, const APInt &Zeroable,

11269

const X86Subtarget &Subtarget) {

11270

if (!VT.is512BitVector() && !Subtarget.hasVLX())

11271

return false;

11272

11273

unsigned NumElts = Mask.size();

11274

unsigned EltSizeInBits = VT.getScalarSizeInBits();

11275

unsigned MaxScale = 64 / EltSizeInBits;

11276

11277

for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {

11278

unsigned SrcEltBits = EltSizeInBits * Scale;

11279

if (SrcEltBits < 32 && !Subtarget.hasBWI())

11280

continue;

11281

unsigned NumSrcElts = NumElts / Scale;

11282

if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))

11283

continue;

11284

unsigned UpperElts = NumElts - NumSrcElts;

11285

if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())

11286

continue;

11287

SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);

11288

SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);

11289

DstVT = MVT::getIntegerVT(EltSizeInBits);

11290

if ((NumSrcElts * EltSizeInBits) >= 128) {

11291

// ISD::TRUNCATE

11292

DstVT = MVT::getVectorVT(DstVT, NumSrcElts);

11293

} else {

11294

// X86ISD::VTRUNC

11295

DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);

11296

}

11297

return true;

11298

}

11299

11300

return false;

11301

}

11302

11303

// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper

11304

// element padding to the final DstVT.

11305

static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,

11306

const X86Subtarget &Subtarget,

11307

SelectionDAG &DAG, bool ZeroUppers) {

11308

MVT SrcVT = Src.getSimpleValueType();

11309

MVT DstSVT = DstVT.getScalarType();

11310

unsigned NumDstElts = DstVT.getVectorNumElements();

11311

unsigned NumSrcElts = SrcVT.getVectorNumElements();

11312

unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();

11313

11314

if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))

11315

return SDValue();

11316

11317

// Perform a direct ISD::TRUNCATE if possible.

11318

if (NumSrcElts == NumDstElts)

11319

return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);

11320

11321

if (NumSrcElts > NumDstElts) {

11322

MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);

11323

SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);

11324

return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());

11325

}

11326

11327

if ((NumSrcElts * DstEltSizeInBits) >= 128) {

11328

MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);

11329

SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);

11330

return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,

11331

DstVT.getSizeInBits());

11332

}

11333

11334

// Non-VLX targets must truncate from a 512-bit type, so we need to

11335

// widen, truncate and then possibly extract the original subvector.

11336

if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {

11337

SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);

11338

return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);

11339

}

11340

11341

// Fallback to a X86ISD::VTRUNC, padding if necessary.

11342

MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);

11343

SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);

11344

if (DstVT != TruncVT)

11345

Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,

11346

DstVT.getSizeInBits());

11347

return Trunc;

11348

}

11349

11350

// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.

11351

//

11352

// An example is the following:

11353

//

11354

// t0: ch = EntryToken

11355

// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0

11356

// t25: v4i32 = truncate t2

11357

// t41: v8i16 = bitcast t25

11358

// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,

11359

// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>

11360

// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21

11361

// t18: v2i64 = bitcast t51

11362

//

11363

// One can just use a single vpmovdw instruction, without avx512vl we need to

11364

// use the zmm variant and extract the lower subvector, padding with zeroes.

11365

// TODO: Merge with lowerShuffleAsVTRUNC.

11366

static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,

11367

SDValue V2, ArrayRef<int> Mask,

11368

const APInt &Zeroable,

11369

const X86Subtarget &Subtarget,

11370

SelectionDAG &DAG) {

11371

assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type")(((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::v16i8 || VT == MVT::v8i16) && \"Unexpected VTRUNC type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11371, __PRETTY_FUNCTION__));

11372

if (!Subtarget.hasAVX512())

11373

return SDValue();

11374

11375

unsigned NumElts = VT.getVectorNumElements();

11376

unsigned EltSizeInBits = VT.getScalarSizeInBits();

11377

unsigned MaxScale = 64 / EltSizeInBits;

11378

for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {

11379

unsigned NumSrcElts = NumElts / Scale;

11380

unsigned UpperElts = NumElts - NumSrcElts;

11381

if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||

11382

!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())

11383

continue;

11384

11385

SDValue Src = V1;

11386

if (!Src.hasOneUse())

11387

return SDValue();

11388

11389

Src = peekThroughOneUseBitcasts(Src);

11390

if (Src.getOpcode() != ISD::TRUNCATE ||

11391

Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))

11392

return SDValue();

11393

Src = Src.getOperand(0);

11394

11395

// VPMOVWB is only available with avx512bw.

11396

MVT SrcVT = Src.getSimpleValueType();

11397

if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&

11398

!Subtarget.hasBWI())

11399

return SDValue();

11400

11401

bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);

11402

return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);

11403

}

11404

11405

return SDValue();

11406

}

11407

11408

// Attempt to match binary shuffle patterns as a truncate.

11409

static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,

11410

SDValue V2, ArrayRef<int> Mask,

11411

const APInt &Zeroable,

11412

const X86Subtarget &Subtarget,

11413

SelectionDAG &DAG) {

11414

assert((VT.is128BitVector() || VT.is256BitVector()) &&(((VT.is128BitVector() || VT.is256BitVector()) && "Unexpected VTRUNC type"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unexpected VTRUNC type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11415, __PRETTY_FUNCTION__))

11415

"Unexpected VTRUNC type")(((VT.is128BitVector() || VT.is256BitVector()) && "Unexpected VTRUNC type"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unexpected VTRUNC type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11415, __PRETTY_FUNCTION__));

11416

if (!Subtarget.hasAVX512())

11417

return SDValue();

11418

11419

unsigned NumElts = VT.getVectorNumElements();

11420

unsigned EltSizeInBits = VT.getScalarSizeInBits();

11421

unsigned MaxScale = 64 / EltSizeInBits;

11422

for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {

11423

// TODO: Support non-BWI VPMOVWB truncations?

11424

unsigned SrcEltBits = EltSizeInBits * Scale;

11425

if (SrcEltBits < 32 && !Subtarget.hasBWI())

11426

continue;

11427

11428

// Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>

11429

// Bail if the V2 elements are undef.

11430

unsigned NumHalfSrcElts = NumElts / Scale;

11431

unsigned NumSrcElts = 2 * NumHalfSrcElts;

11432

if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||

11433

isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))

11434

continue;

11435

11436

// The elements beyond the truncation must be undef/zero.

11437

unsigned UpperElts = NumElts - NumSrcElts;

11438

if (UpperElts > 0 &&

11439

!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())

11440

continue;

11441

bool UndefUppers =

11442

UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);

11443

11444

// As we're using both sources then we need to concat them together

11445

// and truncate from the double-sized src.

11446

MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);

11447

SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);

11448

11449

MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);

11450

MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);

11451

Src = DAG.getBitcast(SrcVT, Src);

11452

return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);

11453

}

11454

11455

return SDValue();

11456

}

11457

11458

/// Check whether a compaction lowering can be done by dropping even

11459

/// elements and compute how many times even elements must be dropped.

11460

///

11461

/// This handles shuffles which take every Nth element where N is a power of

11462

/// two. Example shuffle masks:

11463

///

11464

/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14

11465

/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30

11466

/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12

11467

/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28

11468

/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8

11469

/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24

11470

///

11471

/// Any of these lanes can of course be undef.

11472

///

11473

/// This routine only supports N <= 3.

11474

/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here

11475

/// for larger N.

11476

///

11477

/// \returns N above, or the number of times even elements must be dropped if

11478

/// there is such a number. Otherwise returns zero.

11479

static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,

11480

bool IsSingleInput) {

11481

// The modulus for the shuffle vector entries is based on whether this is

11482

// a single input or not.

11483

int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);

11484

assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&((isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32((uint32_t)ShuffleModulus) && \"We should only be called with masks with a power-of-2 size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11485, __PRETTY_FUNCTION__))

11485

"We should only be called with masks with a power-of-2 size!")((isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32((uint32_t)ShuffleModulus) && \"We should only be called with masks with a power-of-2 size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11485, __PRETTY_FUNCTION__));

11486

11487

uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

11488

11489

// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,

11490

// and 2^3 simultaneously. This is because we may have ambiguity with

11491

// partially undef inputs.

11492

bool ViableForN[3] = {true, true, true};

11493

11494

for (int i = 0, e = Mask.size(); i < e; ++i) {

11495

// Ignore undef lanes, we'll optimistically collapse them to the pattern we

11496

// want.

11497

if (Mask[i] < 0)

11498

continue;

11499

11500

bool IsAnyViable = false;

11501

for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)

11502

if (ViableForN[j]) {

11503

uint64_t N = j + 1;

11504

11505

// The shuffle mask must be equal to (i * 2^N) % M.

11506

if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))

11507

IsAnyViable = true;

11508

else

11509

ViableForN[j] = false;

11510

}

11511

// Early exit if we exhaust the possible powers of two.

11512

if (!IsAnyViable)

11513

break;

11514

}

11515

11516

for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)

11517

if (ViableForN[j])

11518

return j + 1;

11519

11520

// Return 0 as there is no viable power of two.

11521

return 0;

11522

}

11523

11524

// X86 has dedicated pack instructions that can handle specific truncation

11525

// operations: PACKSS and PACKUS.

11526

// Checks for compaction shuffle masks if MaxStages > 1.

11527

// TODO: Add support for matching multiple PACKSS/PACKUS stages.

11528

static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,

11529

unsigned &PackOpcode, ArrayRef<int> TargetMask,

11530

SelectionDAG &DAG,

11531

const X86Subtarget &Subtarget,

11532

unsigned MaxStages = 1) {

11533

unsigned NumElts = VT.getVectorNumElements();

11534

unsigned BitSize = VT.getScalarSizeInBits();

11535

assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&((0 < MaxStages && MaxStages <= 3 && (BitSize
<< MaxStages) <= 64 && "Illegal maximum compaction"
) ? static_cast<void> (0) : __assert_fail ("0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && \"Illegal maximum compaction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11536, __PRETTY_FUNCTION__))

11536

"Illegal maximum compaction")((0 < MaxStages && MaxStages <= 3 && (BitSize
<< MaxStages) <= 64 && "Illegal maximum compaction"
) ? static_cast<void> (0) : __assert_fail ("0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && \"Illegal maximum compaction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11536, __PRETTY_FUNCTION__));

11537

11538

auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {

11539

unsigned NumSrcBits = PackVT.getScalarSizeInBits();

11540

unsigned NumPackedBits = NumSrcBits - BitSize;

11541

SDValue VV1 = DAG.getBitcast(PackVT, N1);

11542

SDValue VV2 = DAG.getBitcast(PackVT, N2);

11543

if (Subtarget.hasSSE41() || BitSize == 8) {

11544

APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);

11545

if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&

11546

(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {

11547

V1 = VV1;

11548

V2 = VV2;

11549

SrcVT = PackVT;

11550

PackOpcode = X86ISD::PACKUS;

11551

return true;

11552

}

11553

}

11554

if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&

11555

(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {

11556

V1 = VV1;

11557

V2 = VV2;

11558

SrcVT = PackVT;

11559

PackOpcode = X86ISD::PACKSS;

11560

return true;

11561

}

11562

return false;

11563

};

11564

11565

// Attempt to match against wider and wider compaction patterns.

11566

for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {

11567

MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);

11568

MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

11569

11570

// Try binary shuffle.

11571

SmallVector<int, 32> BinaryMask;

11572

createPackShuffleMask(VT, BinaryMask, false, NumStages);

11573

if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))

11574

if (MatchPACK(V1, V2, PackVT))

11575

return true;

11576

11577

// Try unary shuffle.

11578

SmallVector<int, 32> UnaryMask;

11579

createPackShuffleMask(VT, UnaryMask, true, NumStages);

11580

if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))

11581

if (MatchPACK(V1, V1, PackVT))

11582

return true;

11583

}

11584

11585

return false;

11586

}

11587

11588

/// Lower a shuffle as one or more PACKSS/PACKUS stages.
///
/// Uses matchShuffleWithPACK to find the pack opcode and source type, then
/// packs repeatedly until the element width of \p VT is reached. Returns an
/// empty SDValue if nothing matches, or for a multi-stage pack of a 128-bit
/// vector on a VLX target.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  const unsigned SizeBits = VT.getSizeInBits();
  const unsigned EltBits = VT.getScalarSizeInBits();
  const unsigned MaxStages = Log2_32(64 / EltBits);

  MVT PackVT;
  unsigned PackOpcode;
  const bool Matched = matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode,
                                            Mask, DAG, Subtarget, MaxStages);
  if (!Matched)
    return SDValue();

  unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
  const unsigned NumStages = Log2_32(CurrentEltBits / EltBits);

  // Don't lower multi-stage packs on AVX512, truncation is better.
  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
    return SDValue();

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  const bool CanPackFrom32 =
      CurrentEltBits > 16 &&
      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41());
  const unsigned MaxPackBits = CanPackFrom32 ? 32 : 16;

  // Repeatedly pack down to the target size.
  SDValue Res;
  for (unsigned StagesLeft = NumStages; StagesLeft != 0; --StagesLeft) {
    const unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    const unsigned NumSrcElts = SizeBits / SrcEltBits;
    const MVT SrcVT =
        MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), NumSrcElts);
    const MVT DstVT =
        MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits / 2), NumSrcElts * 2);
    Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
                      DAG.getBitcast(SrcVT, V2));
    // Later stages pack the previous result with itself.
    V1 = Res;
    V2 = Res;
    CurrentEltBits /= 2;
  }
  assert(Res && Res.getValueType() == VT &&
         "Failed to lower compaction shuffle");
  return Res;
}

11632

11633

/// Try to emit a bitmask instruction for a shuffle.

11634

///

11635

/// This handles cases where we can model a blend exactly as a bitmask due to

11636

/// one of the inputs being zeroable.

11637

static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,

11638

SDValue V2, ArrayRef<int> Mask,

11639

const APInt &Zeroable,

11640

const X86Subtarget &Subtarget,

11641

SelectionDAG &DAG) {

11642

MVT MaskVT = VT;

11643

MVT EltVT = VT.getVectorElementType();

11644

SDValue Zero, AllOnes;

11645

// Use f64 if i64 isn't legal.

11646

if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {

11647

EltVT = MVT::f64;

11648

MaskVT = MVT::getVectorVT(EltVT, Mask.size());

11649

}

11650

11651

MVT LogicVT = VT;

11652

if (EltVT == MVT::f32 || EltVT == MVT::f64) {

11653

Zero = DAG.getConstantFP(0.0, DL, EltVT);

11654

APFloat AllOnesValue = APFloat::getAllOnesValue(

11655

SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());

11656

AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);

11657

LogicVT =

11658

MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());

11659

} else {

11660

Zero = DAG.getConstant(0, DL, EltVT);

11661

AllOnes = DAG.getAllOnesConstant(DL, EltVT);

11662

}

11663

11664

SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);

11665

SDValue V;

11666

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

11667

if (Zeroable[i])

11668

continue;

11669

if (Mask[i] % Size != i)

11670

return SDValue(); // Not a blend.

11671

if (!V)

11672

V = Mask[i] < Size ? V1 : V2;

11673

else if (V != (Mask[i] < Size ? V1 : V2))

11674

return SDValue(); // Can only let one input through the mask.

11675

11676

VMaskOps[i] = AllOnes;

11677

}

11678

if (!V)

11679

return SDValue(); // No non-zeroable elements!

11680

11681

SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);

11682

VMask = DAG.getBitcast(LogicVT, VMask);

11683

V = DAG.getBitcast(LogicVT, V);

11684

SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);

11685

return DAG.getBitcast(VT, And);

11686

}

11687

11688

/// Try to emit a blend instruction for a shuffle using bit math.

11689

///

11690

/// This is used as a fallback approach when first class blend instructions are

11691

/// unavailable. Currently it is only suitable for integer vectors, but could

11692

/// be generalized for floating point vectors if desirable.

11693

static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,

11694

SDValue V2, ArrayRef<int> Mask,

11695

SelectionDAG &DAG) {

11696

assert(VT.isInteger() && "Only supports integer vector types!")((VT.isInteger() && "Only supports integer vector types!"
) ? static_cast<void> (0) : __assert_fail ("VT.isInteger() && \"Only supports integer vector types!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11696, __PRETTY_FUNCTION__));

11697

MVT EltVT = VT.getVectorElementType();

11698

SDValue Zero = DAG.getConstant(0, DL, EltVT);

11699

SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

11700

SmallVector<SDValue, 16> MaskOps;

11701

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

11702

if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)

11703

return SDValue(); // Shuffled input!

11704

MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);

11705

}

11706

11707

SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);

11708

V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);

11709

V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);

11710

return DAG.getNode(ISD::OR, DL, VT, V1, V2);

11711

}

11712

11713

// Forward declaration; being `static`, the definition lives elsewhere in this
// translation unit. It is declared here because lowerShuffleAsBlend calls it.
static SDValue getVectorMaskingNode(
    SDValue Op,
    SDValue Mask,
    SDValue PreservedSrc,
    const X86Subtarget &Subtarget,
    SelectionDAG &DAG);

11717

11718

static bool matchShuffleAsBlend(SDValue V1, SDValue V2,

11719

MutableArrayRef<int> Mask,

11720

const APInt &Zeroable, bool &ForceV1Zero,

11721

bool &ForceV2Zero, uint64_t &BlendMask) {

11722

bool V1IsZeroOrUndef =

11723

V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());

11724

bool V2IsZeroOrUndef =

11725

V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

11726

11727

BlendMask = 0;

11728

ForceV1Zero = false, ForceV2Zero = false;

11729

assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask")((Mask.size() <= 64 && "Shuffle mask too big for blend mask"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() <= 64 && \"Shuffle mask too big for blend mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11729, __PRETTY_FUNCTION__));

11730

11731

// Attempt to generate the binary blend mask. If an input is zero then

11732

// we can use any lane.

11733

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

11734

int M = Mask[i];

11735

if (M == SM_SentinelUndef)

11736

continue;

11737

if (M == i)

11738

continue;

11739

if (M == i + Size) {

11740

BlendMask |= 1ull << i;

11741

continue;

11742

}

11743

if (Zeroable[i]) {

11744

if (V1IsZeroOrUndef) {

11745

ForceV1Zero = true;

11746

Mask[i] = i;

11747

continue;

11748

}

11749

if (V2IsZeroOrUndef) {

11750

ForceV2Zero = true;

11751

BlendMask |= 1ull << i;

11752

Mask[i] = i + Size;

11753

continue;

11754

}

11755

}

11756

return false;

11757

}

11758

return true;

11759

}

11760

11761

/// Widen a blend mask so that each source bit covers \p Scale result bits.
///
/// For every bit i < \p Size that is set in \p BlendMask, bits
/// [i * Scale, (i + 1) * Scale) are set in the returned mask.
///
/// \param BlendMask One bit per source element.
/// \param Size      Number of source elements (bits of BlendMask) to examine.
/// \param Scale     Number of result bits produced per source bit.
/// \returns The scaled mask.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  // Build the run of Scale one-bits once. Shifting a 64-bit value by 64 is
  // undefined behaviour, so a full-width run is spelled out explicitly rather
  // than computed as (1ull << 64) - 1.
  uint64_t LaneBits = 0;
  if (Scale >= 64)
    LaneBits = ~0ull;
  else if (Scale > 0)
    LaneBits = (1ull << Scale) - 1;

  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= LaneBits << (i * Scale);
  return ScaledMask;
}

11769

11770

/// Try to emit a blend instruction for a shuffle.

11771

///

11772

/// This doesn't do any checks for the availability of instructions for blending

11773

/// these values. It relies on the availability of the X86ISD::BLENDI pattern to

11774

/// be matched in the backend with the type given. What it does check for is

11775

/// that the shuffle mask is a blend, or convertible into a blend with zero.

11776

static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,

11777

SDValue V2, ArrayRef<int> Original,

11778

const APInt &Zeroable,

11779

const X86Subtarget &Subtarget,

11780

SelectionDAG &DAG) {

11781

uint64_t BlendMask = 0;

11782

bool ForceV1Zero = false, ForceV2Zero = false;

11783

SmallVector<int, 64> Mask(Original.begin(), Original.end());

11784

if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,

11785

BlendMask))

11786

return SDValue();

11787

11788

// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.

11789

if (ForceV1Zero)

11790

V1 = getZeroVector(VT, Subtarget, DAG, DL);

11791

if (ForceV2Zero)

11792

V2 = getZeroVector(VT, Subtarget, DAG, DL);

11793

11794

switch (VT.SimpleTy) {

11795

case MVT::v4i64:

11796

case MVT::v8i32:

11797

assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!")((Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"256-bit integer blends require AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11797, __PRETTY_FUNCTION__));

11798

LLVM_FALLTHROUGH[[gnu::fallthrough]];

11799

case MVT::v4f64:

11800

case MVT::v8f32:

11801

assert(Subtarget.hasAVX() && "256-bit float blends require AVX!")((Subtarget.hasAVX() && "256-bit float blends require AVX!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX() && \"256-bit float blends require AVX!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11801, __PRETTY_FUNCTION__));

11802

LLVM_FALLTHROUGH[[gnu::fallthrough]];

11803

case MVT::v2f64:

11804

case MVT::v2i64:

11805

case MVT::v4f32:

11806

case MVT::v4i32:

11807

case MVT::v8i16:

11808

assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!")((Subtarget.hasSSE41() && "128-bit blends require SSE41!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE41() && \"128-bit blends require SSE41!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11808, __PRETTY_FUNCTION__));

11809

return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,

11810

DAG.getTargetConstant(BlendMask, DL, MVT::i8));

11811

case MVT::v16i16: {

11812

assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!")((Subtarget.hasAVX2() && "v16i16 blends require AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"v16i16 blends require AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11812, __PRETTY_FUNCTION__));

11813

SmallVector<int, 8> RepeatedMask;

11814

if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {

11815

// We can lower these with PBLENDW which is mirrored across 128-bit lanes.

11816

assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!")((RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 8 && \"Repeated mask size doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11816, __PRETTY_FUNCTION__));

11817

BlendMask = 0;

11818

for (int i = 0; i < 8; ++i)

11819

if (RepeatedMask[i] >= 8)

11820

BlendMask |= 1ull << i;

11821

return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,

11822

DAG.getTargetConstant(BlendMask, DL, MVT::i8));

11823

}

11824

// Use PBLENDW for lower/upper lanes and then blend lanes.

11825

// TODO - we should allow 2 PBLENDW here and leave shuffle combine to

11826

// merge to VSELECT where useful.

11827

uint64_t LoMask = BlendMask & 0xFF;

11828

uint64_t HiMask = (BlendMask >> 8) & 0xFF;

11829

if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {

11830

SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,

11831

DAG.getTargetConstant(LoMask, DL, MVT::i8));

11832

SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,

11833

DAG.getTargetConstant(HiMask, DL, MVT::i8));

11834

return DAG.getVectorShuffle(

11835

MVT::v16i16, DL, Lo, Hi,

11836

{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});

11837

}

11838

LLVM_FALLTHROUGH[[gnu::fallthrough]];

11839

}

11840

case MVT::v32i8:

11841

assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!")((Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"256-bit byte-blends require AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11841, __PRETTY_FUNCTION__));

11842

LLVM_FALLTHROUGH[[gnu::fallthrough]];

11843

case MVT::v16i8: {

11844

assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!")((Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE41() && \"128-bit byte-blends require SSE41!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11844, __PRETTY_FUNCTION__));

11845

11846

// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.

11847

if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,

11848

Subtarget, DAG))

11849

return Masked;

11850

11851

if (Subtarget.hasBWI() && Subtarget.hasVLX()) {

11852

MVT IntegerType =

11853

MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));

11854

SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);

11855

return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);

11856

}

11857

11858

// If we have VPTERNLOG, we can use that as a bit blend.

11859

if (Subtarget.hasVLX())

11860

if (SDValue BitBlend =

11861

lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))

11862

return BitBlend;

11863

11864

// Scale the blend by the number of bytes per element.

11865

int Scale = VT.getScalarSizeInBits() / 8;

11866

11867

// This form of blend is always done on bytes. Compute the byte vector

11868

// type.

11869

MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

11870

11871

// x86 allows load folding with blendvb from the 2nd source operand. But

11872

// we are still using LLVM select here (see comment below), so that's V1.

11873

// If V2 can be load-folded and V1 cannot be load-folded, then commute to

11874

// allow that load-folding possibility.

11875

if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {

11876

ShuffleVectorSDNode::commuteMask(Mask);

11877

std::swap(V1, V2);

11878

}

11879

11880

// Compute the VSELECT mask. Note that VSELECT is really confusing in the

11881

// mix of LLVM's code generator and the x86 backend. We tell the code

11882

// generator that boolean values in the elements of an x86 vector register

11883

// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'

11884

// mapping a select to operand #1, and 'false' mapping to operand #2. The

11885

// reality in x86 is that vector masks (pre-AVX-512) use only the high bit

11886

// of the element (the remaining are ignored) and 0 in that high bit would

11887

// mean operand #1 while 1 in the high bit would mean operand #2. So while

11888

// the LLVM model for boolean values in vector elements gets the relevant

11889

// bit set, it is set backwards and over constrained relative to x86's

11890

// actual model.

11891

SmallVector<SDValue, 32> VSELECTMask;

11892

for (int i = 0, Size = Mask.size(); i < Size; ++i)

11893

for (int j = 0; j < Scale; ++j)

11894

VSELECTMask.push_back(

11895

Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)

11896

: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,

11897

MVT::i8));

11898

11899

V1 = DAG.getBitcast(BlendVT, V1);

11900

V2 = DAG.getBitcast(BlendVT, V2);

11901

return DAG.getBitcast(

11902

VT,

11903

DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),

11904

V1, V2));

11905

}

11906

case MVT::v16f32:

11907

case MVT::v8f64:

11908

case MVT::v8i64:

11909

case MVT::v16i32:

11910

case MVT::v32i16:

11911

case MVT::v64i8: {

11912

// Attempt to lower to a bitmask if we can. Only if not optimizing for size.

11913

bool OptForSize = DAG.shouldOptForSize();

11914

if (!OptForSize) {

11915

if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,

11916

Subtarget, DAG))

11917

return Masked;

11918

}

11919

11920

// Otherwise load an immediate into a GPR, cast to k-register, and use a

11921

// masked move.

11922

MVT IntegerType =

11923

MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));

11924

SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);

11925

return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);

11926

}

11927

default:

11928

llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11928);

11929

}

11930

}

11931

11932

/// Try to lower as a blend of elements from two inputs followed by

11933

/// a single-input permutation.

11934

///

11935

/// This matches the pattern where we can blend elements from two inputs and

11936

/// then reduce the shuffle to a single-input permutation.

11937

static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,

11938

SDValue V1, SDValue V2,

11939

ArrayRef<int> Mask,

11940

SelectionDAG &DAG,

11941

bool ImmBlends = false) {

11942

// We build up the blend mask while checking whether a blend is a viable way

11943

// to reduce the shuffle.

11944

SmallVector<int, 32> BlendMask(Mask.size(), -1);

11945

SmallVector<int, 32> PermuteMask(Mask.size(), -1);

11946

11947

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

11948

if (Mask[i] < 0)

11949

continue;

11950

11951

assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.")((Mask[i] < Size * 2 && "Shuffle input is out of bounds."
) ? static_cast<void> (0) : __assert_fail ("Mask[i] < Size * 2 && \"Shuffle input is out of bounds.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 11951, __PRETTY_FUNCTION__));

11952

11953

if (BlendMask[Mask[i] % Size] < 0)

11954

BlendMask[Mask[i] % Size] = Mask[i];

11955

else if (BlendMask[Mask[i] % Size] != Mask[i])

11956

return SDValue(); // Can't blend in the needed input!

11957

11958

PermuteMask[i] = Mask[i] % Size;

11959

}

11960

11961

// If only immediate blends, then bail if the blend mask can't be widened to

11962

// i16.

11963

unsigned EltSize = VT.getScalarSizeInBits();

11964

if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))

11965

return SDValue();

11966

11967

SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);

11968

return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);

11969

}

11970

11971

/// Try to lower as an unpack of elements from two inputs followed by

11972

/// a single-input permutation.

11973

///

11974

/// This matches the pattern where we can unpack elements from two inputs and

11975

/// then reduce the shuffle to a single-input (wider) permutation.

11976

static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,

11977

SDValue V1, SDValue V2,

11978

ArrayRef<int> Mask,

11979

SelectionDAG &DAG) {

11980

int NumElts = Mask.size();

11981

int NumLanes = VT.getSizeInBits() / 128;

11982

int NumLaneElts = NumElts / NumLanes;

11983

int NumHalfLaneElts = NumLaneElts / 2;

11984

11985

bool MatchLo = true, MatchHi = true;

11986

SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

11987

11988

// Determine UNPCKL/UNPCKH type and operand order.

11989

for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {

11990

for (int Elt = 0; Elt != NumLaneElts; ++Elt) {

11991

int M = Mask[Lane + Elt];

11992

if (M < 0)

11993

continue;

11994

11995

SDValue &Op = Ops[Elt & 1];

11996

if (M < NumElts && (Op.isUndef() || Op == V1))

11997

Op = V1;

11998

else if (NumElts <= M && (Op.isUndef() || Op == V2))

11999

Op = V2;

12000

else

12001

return SDValue();

12002

12003

int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;

12004

MatchLo &= isUndefOrInRange(M, Lo, Mid) ||

12005

isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);

12006

MatchHi &= isUndefOrInRange(M, Mid, Hi) ||

12007

isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);

12008

if (!MatchLo && !MatchHi)

12009

return SDValue();

12010

}

12011

}

12012

assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI")(((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"
) ? static_cast<void> (0) : __assert_fail ("(MatchLo ^ MatchHi) && \"Failed to match UNPCKLO/UNPCKHI\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12012, __PRETTY_FUNCTION__));

12013

12014

// Now check that each pair of elts come from the same unpack pair

12015

// and set the permute mask based on each pair.

12016

// TODO - Investigate cases where we permute individual elements.

12017

SmallVector<int, 32> PermuteMask(NumElts, -1);

12018

for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {

12019

for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {

12020

int M0 = Mask[Lane + Elt + 0];

12021

int M1 = Mask[Lane + Elt + 1];

12022

if (0 <= M0 && 0 <= M1 &&

12023

(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))

12024

return SDValue();

12025

if (0 <= M0)

12026

PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));

12027

if (0 <= M1)

12028

PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;

12029

}

12030

}

12031

12032

unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;

12033

SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);

12034

return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);

12035

}

12036

12037

/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then

12038

/// permuting the elements of the result in place.

12039

static SDValue lowerShuffleAsByteRotateAndPermute(

12040

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

12041

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

12042

if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||

12043

(VT.is256BitVector() && !Subtarget.hasAVX2()) ||

12044

(VT.is512BitVector() && !Subtarget.hasBWI()))

12045

return SDValue();

12046

12047

// We don't currently support lane crossing permutes.

12048

if (is128BitLaneCrossingShuffleMask(VT, Mask))

12049

return SDValue();

12050

12051

int Scale = VT.getScalarSizeInBits() / 8;

12052

int NumLanes = VT.getSizeInBits() / 128;

12053

int NumElts = VT.getVectorNumElements();

12054

int NumEltsPerLane = NumElts / NumLanes;

12055

12056

// Determine range of mask elts.

12057

bool Blend1 = true;

12058

bool Blend2 = true;

12059

std::pair<int, int> Range1 = std::make_pair(INT_MAX2147483647, INT_MIN(-2147483647 -1));

12060

std::pair<int, int> Range2 = std::make_pair(INT_MAX2147483647, INT_MIN(-2147483647 -1));

12061

for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {

12062

for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {

12063

int M = Mask[Lane + Elt];

12064

if (M < 0)

12065

continue;

12066

if (M < NumElts) {

12067

Blend1 &= (M == (Lane + Elt));

12068

assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask")((Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask") ? static_cast<void> (0) : __assert_fail
("Lane <= M && M < (Lane + NumEltsPerLane) && \"Out of range mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12068, __PRETTY_FUNCTION__));

12069

M = M % NumEltsPerLane;

12070

Range1.first = std::min(Range1.first, M);

12071

Range1.second = std::max(Range1.second, M);

12072

} else {

12073

M -= NumElts;

12074

Blend2 &= (M == (Lane + Elt));

12075

assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask")((Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask") ? static_cast<void> (0) : __assert_fail
("Lane <= M && M < (Lane + NumEltsPerLane) && \"Out of range mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12075, __PRETTY_FUNCTION__));

12076

M = M % NumEltsPerLane;

12077

Range2.first = std::min(Range2.first, M);

12078

Range2.second = std::max(Range2.second, M);

12079

}

12080

}

12081

}

12082

12083

// Bail if we don't need both elements.

12084

// TODO - it might be worth doing this for unary shuffles if the permute

12085

// can be widened.

12086

if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||

12087

!(0 <= Range2.first && Range2.second < NumEltsPerLane))

12088

return SDValue();

12089

12090

if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))

12091

return SDValue();

12092

12093

// Rotate the 2 ops so we can access both ranges, then permute the result.

12094

auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {

12095

MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

12096

SDValue Rotate = DAG.getBitcast(

12097

VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),

12098

DAG.getBitcast(ByteVT, Lo),

12099

DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));

12100

SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);

12101

for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {

12102

for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {

12103

int M = Mask[Lane + Elt];

12104

if (M < 0)

12105

continue;

12106

if (M < NumElts)

12107

PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);

12108

else

12109

PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);

12110

}

12111

}

12112

return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);

12113

};

12114

12115

// Check if the ranges are small enough to rotate from either direction.

12116

if (Range2.second < Range1.first)

12117

return RotateAndPermute(V1, V2, Range1.first, 0);

12118

if (Range1.second < Range2.first)

12119

return RotateAndPermute(V2, V1, Range2.first, NumElts);

12120

return SDValue();

12121

}

12122

12123

/// Generic routine to decompose a shuffle and blend into independent

12124

/// blends and permutes.

12125

///

12126

/// This matches the extremely common pattern for handling combined

12127

/// shuffle+blend operations on newer X86 ISAs where we have very fast blend

12128

/// operations. It will try to pick the best arrangement of shuffles and

12129

/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.

12130

static SDValue lowerShuffleAsDecomposedShuffleMerge(

12131

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

12132

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

12133

int NumElts = Mask.size();

12134

int NumLanes = VT.getSizeInBits() / 128;

12135

int NumEltsPerLane = NumElts / NumLanes;

12136

12137

// Shuffle the input elements into the desired positions in V1 and V2 and

12138

// unpack/blend them together.

12139

bool IsAlternating = true;

12140

SmallVector<int, 32> V1Mask(NumElts, -1);

12141

SmallVector<int, 32> V2Mask(NumElts, -1);

12142

SmallVector<int, 32> FinalMask(NumElts, -1);

12143

for (int i = 0; i < NumElts; ++i) {

12144

int M = Mask[i];

12145

if (M >= 0 && M < NumElts) {

12146

V1Mask[i] = M;

12147

FinalMask[i] = i;

12148

IsAlternating &= (i & 1) == 0;

12149

} else if (M >= NumElts) {

12150

V2Mask[i] = M - NumElts;

12151

FinalMask[i] = i + NumElts;

12152

IsAlternating &= (i & 1) == 1;

12153

}

12154

}

12155

12156

// Try to lower with the simpler initial blend/unpack/rotate strategies unless

12157

// one of the input shuffles would be a no-op. We prefer to shuffle inputs as

12158

// the shuffle may be able to fold with a load or other benefit. However, when

12159

// we'll have to do 2x as many shuffles in order to achieve this, a 2-input

12160

// pre-shuffle first is a better strategy.

12161

if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {

12162

// Only prefer immediate blends to unpack/rotate.

12163

if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,

12164

DAG, true))

12165

return BlendPerm;

12166

if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,

12167

DAG))

12168

return UnpackPerm;

12169

if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(

12170

DL, VT, V1, V2, Mask, Subtarget, DAG))

12171

return RotatePerm;

12172

// Unpack/rotate failed - try again with variable blends.

12173

if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,

12174

DAG))

12175

return BlendPerm;

12176

}

12177

12178

// If the final mask is an alternating blend of vXi8/vXi16, convert to an

12179

// UNPCKL(SHUFFLE, SHUFFLE) pattern.

12180

// TODO: It doesn't have to be alternating - but each lane mustn't have more

12181

// than half the elements coming from each source.

12182

if (IsAlternating && VT.getScalarSizeInBits() < 32) {

12183

V1Mask.assign(NumElts, -1);

12184

V2Mask.assign(NumElts, -1);

12185

FinalMask.assign(NumElts, -1);

12186

for (int i = 0; i != NumElts; i += NumEltsPerLane)

12187

for (int j = 0; j != NumEltsPerLane; ++j) {

12188

int M = Mask[i + j];

12189

if (M >= 0 && M < NumElts) {

12190

V1Mask[i + (j / 2)] = M;

12191

FinalMask[i + j] = i + (j / 2);

12192

} else if (M >= NumElts) {

12193

V2Mask[i + (j / 2)] = M - NumElts;

12194

FinalMask[i + j] = i + (j / 2) + NumElts;

12195

}

12196

}

12197

}

12198

12199

V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);

12200

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

12201

return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);

12202

}

12203

12204

/// Try to lower a vector shuffle as a bit rotation.

12205

///

12206

/// Look for a repeated rotation pattern in each sub group.

12207

/// Returns a ISD::ROTL element rotation amount or -1 if failed.

12208

static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {

12209

int NumElts = Mask.size();

12210

assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask")(((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"
) ? static_cast<void> (0) : __assert_fail ("(NumElts % NumSubElts) == 0 && \"Illegal shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12210, __PRETTY_FUNCTION__));

12211

12212

int RotateAmt = -1;

12213

for (int i = 0; i != NumElts; i += NumSubElts) {

12214

for (int j = 0; j != NumSubElts; ++j) {

12215

int M = Mask[i + j];

12216

if (M < 0)

12217

continue;

12218

if (!isInRange(M, i, i + NumSubElts))

12219

return -1;

12220

int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;

12221

if (0 <= RotateAmt && Offset != RotateAmt)

12222

return -1;

12223

RotateAmt = Offset;

12224

}

12225

}

12226

return RotateAmt;

12227

}

12228

12229

static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,

12230

const X86Subtarget &Subtarget,

12231

ArrayRef<int> Mask) {

12232

assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!")((!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"
) ? static_cast<void> (0) : __assert_fail ("!isNoopShuffleMask(Mask) && \"We shouldn't lower no-op shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12232, __PRETTY_FUNCTION__));

12233

assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers")((EltSizeInBits < 64 && "Can't rotate 64-bit integers"
) ? static_cast<void> (0) : __assert_fail ("EltSizeInBits < 64 && \"Can't rotate 64-bit integers\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12233, __PRETTY_FUNCTION__));

12234

12235

// AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.

12236

int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;

12237

int MaxSubElts = 64 / EltSizeInBits;

12238

for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {

12239

int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);

12240

if (RotateAmt < 0)

12241

continue;

12242

12243

int NumElts = Mask.size();

12244

MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);

12245

RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);

12246

return RotateAmt * EltSizeInBits;

12247

}

12248

12249

return -1;

12250

}

12251

12252

/// Lower shuffle using X86ISD::VROTLI rotations.

12253

static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,

12254

ArrayRef<int> Mask,

12255

const X86Subtarget &Subtarget,

12256

SelectionDAG &DAG) {

12257

// Only XOP + AVX512 targets have bit rotation instructions.

12258

// If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.

12259

bool IsLegal =

12260

(VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();

12261

if (!IsLegal && Subtarget.hasSSE3())

12262

return SDValue();

12263

12264

MVT RotateVT;

12265

int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),

12266

Subtarget, Mask);

12267

if (RotateAmt < 0)

12268

return SDValue();

12269

12270

// For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,

12271

// expanded to OR(SRL,SHL), will be more efficient, but if they can

12272

// widen to vXi16 or more then existing lowering should will be better.

12273

if (!IsLegal) {

12274

if ((RotateAmt % 16) == 0)

12275

return SDValue();

12276

// TODO: Use getTargetVShiftByConstNode.

12277

unsigned ShlAmt = RotateAmt;

12278

unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;

12279

V1 = DAG.getBitcast(RotateVT, V1);

12280

SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,

12281

DAG.getTargetConstant(ShlAmt, DL, MVT::i8));

12282

SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,

12283

DAG.getTargetConstant(SrlAmt, DL, MVT::i8));

12284

SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);

12285

return DAG.getBitcast(VT, Rot);

12286

}

12287

12288

SDValue Rot =

12289

DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),

12290

DAG.getTargetConstant(RotateAmt, DL, MVT::i8));

12291

return DAG.getBitcast(VT, Rot);

12292

}

12293

12294

/// Try to match a vector shuffle as an element rotation.

12295

///

12296

/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.

12297

static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,

12298

ArrayRef<int> Mask) {

12299

int NumElts = Mask.size();

12300

12301

// We need to detect various ways of spelling a rotation:

12302

// [11, 12, 13, 14, 15, 0, 1, 2]

12303

// [-1, 12, 13, 14, -1, -1, 1, -1]

12304

// [-1, -1, -1, -1, -1, -1, 1, 2]

12305

// [ 3, 4, 5, 6, 7, 8, 9, 10]

12306

// [-1, 4, 5, 6, -1, -1, 9, -1]

12307

// [-1, 4, 5, 6, -1, -1, -1, -1]

12308

int Rotation = 0;

12309

SDValue Lo, Hi;

12310

for (int i = 0; i < NumElts; ++i) {

12311

int M = Mask[i];

12312

assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&(((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts
))) && "Unexpected mask index.") ? static_cast<void
> (0) : __assert_fail ("(M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && \"Unexpected mask index.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12313, __PRETTY_FUNCTION__))

12313

"Unexpected mask index.")(((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts
))) && "Unexpected mask index.") ? static_cast<void
> (0) : __assert_fail ("(M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && \"Unexpected mask index.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12313, __PRETTY_FUNCTION__));

12314

if (M < 0)

12315

continue;

12316

12317

// Determine where a rotated vector would have started.

12318

int StartIdx = i - (M % NumElts);

12319

if (StartIdx == 0)

12320

// The identity rotation isn't interesting, stop.

12321

return -1;

12322

12323

// If we found the tail of a vector the rotation must be the missing

12324

// front. If we found the head of a vector, it must be how much of the

12325

// head.

12326

int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

12327

12328

if (Rotation == 0)

12329

Rotation = CandidateRotation;

12330

else if (Rotation != CandidateRotation)

12331

// The rotations don't match, so we can't match this mask.

12332

return -1;

12333

12334

// Compute which value this mask is pointing at.

12335

SDValue MaskV = M < NumElts ? V1 : V2;

12336

12337

// Compute which of the two target values this index should be assigned

12338

// to. This reflects whether the high elements are remaining or the low

12339

// elements are remaining.

12340

SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

12341

12342

// Either set up this value if we've not encountered it before, or check

12343

// that it remains consistent.

12344

if (!TargetV)

12345

TargetV = MaskV;

12346

else if (TargetV != MaskV)

12347

// This may be a rotation, but it pulls from the inputs in some

12348

// unsupported interleaving.

12349

return -1;

12350

}

12351

12352

// Check that we successfully analyzed the mask, and normalize the results.

12353

assert(Rotation != 0 && "Failed to locate a viable rotation!")((Rotation != 0 && "Failed to locate a viable rotation!"
) ? static_cast<void> (0) : __assert_fail ("Rotation != 0 && \"Failed to locate a viable rotation!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12353, __PRETTY_FUNCTION__));

12354

assert((Lo || Hi) && "Failed to find a rotated input vector!")(((Lo || Hi) && "Failed to find a rotated input vector!"
) ? static_cast<void> (0) : __assert_fail ("(Lo || Hi) && \"Failed to find a rotated input vector!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12354, __PRETTY_FUNCTION__));

12355

if (!Lo)

12356

Lo = Hi;

12357

else if (!Hi)

12358

Hi = Lo;

12359

12360

V1 = Lo;

12361

V2 = Hi;

12362

12363

return Rotation;

12364

}

12365

12366

/// Try to lower a vector shuffle as a byte rotation.

12367

///

12368

/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary

12369

/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use

12370

/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will

12371

/// try to generically lower a vector shuffle through such an pattern. It

12372

/// does not check for the profitability of lowering either as PALIGNR or

12373

/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.

12374

/// This matches shuffle vectors that look like:

12375

///

12376

/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]

12377

///

12378

/// Essentially it concatenates V1 and V2, shifts right by some number of

12379

/// elements, and takes the low elements as the result. Note that while this is

12380

/// specified as a *right shift* because x86 is little-endian, it is a *left

12381

/// rotate* of the vector lanes.

12382

static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,

12383

ArrayRef<int> Mask) {

12384

// Don't accept any shuffles with zero elements.

12385

if (isAnyZero(Mask))

12386

return -1;

12387

12388

// PALIGNR works on 128-bit lanes.

12389

SmallVector<int, 16> RepeatedMask;

12390

if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))

12391

return -1;

12392

12393

int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);

12394

if (Rotation <= 0)

12395

return -1;

12396

12397

// PALIGNR rotates bytes, so we need to scale the

12398

// rotation based on how many bytes are in the vector lane.

12399

int NumElts = RepeatedMask.size();

12400

int Scale = 16 / NumElts;

12401

return Rotation * Scale;

12402

}

12403

12404

/// Lower a vector shuffle as a byte rotation of its two inputs.
///
/// The rotation is found with matchShuffleAsByteRotate. With SSSE3 a single
/// X86ISD::PALIGNR is emitted; otherwise the result is built on v16i8 as
/// OR(VSHLDQ(Lo), VSRLDQ(Hi)). Returns an empty SDValue if no positive byte
/// rotation is matched.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  // Work on copies so the matcher can reassign which input is low/high.
  SDValue LoSrc = V1;
  SDValue HiSrc = V2;
  const int RotBytes = matchShuffleAsByteRotate(VT, LoSrc, HiSrc, Mask);
  if (RotBytes <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  LoSrc = DAG.getBitcast(ByteVT, LoSrc);
  HiSrc = DAG.getBitcast(ByteVT, HiSrc);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    SDValue Aligned =
        DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, LoSrc, HiSrc,
                    DAG.getTargetConstant(RotBytes, DL, MVT::i8));
    return DAG.getBitcast(VT, Aligned);
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation: shift the two halves into place and OR them.
  const int ShlBytes = 16 - RotBytes;
  const int SrlBytes = RotBytes;

  SDValue LoPart =
      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, LoSrc,
                  DAG.getTargetConstant(ShlBytes, DL, MVT::i8));
  SDValue HiPart =
      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, HiSrc,
                  DAG.getTargetConstant(SrlBytes, DL, MVT::i8));
  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoPart, HiPart);
  return DAG.getBitcast(VT, Merged);
}

12450

12451

/// Try to lower a vector shuffle as a dword/qword rotation.

12452

///

12453

/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary

12454

/// rotation of the concatenation of two vectors; This routine will

12455

/// try to generically lower a vector shuffle through such an pattern.

12456

///

12457

/// Essentially it concatenates V1 and V2, shifts right by some number of

12458

/// elements, and takes the low elements as the result. Note that while this is

12459

/// specified as a *right shift* because x86 is little-endian, it is a *left

12460

/// rotate* of the vector lanes.

12461

static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,

12462

SDValue V2, ArrayRef<int> Mask,

12463

const X86Subtarget &Subtarget,

12464

SelectionDAG &DAG) {

12465

assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&(((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT
::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12466, __PRETTY_FUNCTION__))

12466

"Only 32-bit and 64-bit elements are supported!")(((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT
::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? static_cast<void> (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12466, __PRETTY_FUNCTION__));

12467

12468

// 128/256-bit vectors are only supported with VLX.

12469

assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))(((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT
.is256BitVector())) && "VLX required for 128/256-bit vectors"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && \"VLX required for 128/256-bit vectors\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12470, __PRETTY_FUNCTION__))

12470

&& "VLX required for 128/256-bit vectors")(((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT
.is256BitVector())) && "VLX required for 128/256-bit vectors"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && \"VLX required for 128/256-bit vectors\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12470, __PRETTY_FUNCTION__));

12471

12472

SDValue Lo = V1, Hi = V2;

12473

int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);

12474

if (Rotation <= 0)

12475

return SDValue();

12476

12477

return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,

12478

DAG.getTargetConstant(Rotation, DL, MVT::i8));

12479

}

12480

12481

/// Try to lower a vector shuffle as a byte shift sequence.

12482

static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,

12483

SDValue V2, ArrayRef<int> Mask,

12484

const APInt &Zeroable,

12485

const X86Subtarget &Subtarget,

12486

SelectionDAG &DAG) {

12487

assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!")((!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"
) ? static_cast<void> (0) : __assert_fail ("!isNoopShuffleMask(Mask) && \"We shouldn't lower no-op shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12487, __PRETTY_FUNCTION__));

12488

assert(VT.is128BitVector() && "Only 128-bit vectors supported")((VT.is128BitVector() && "Only 128-bit vectors supported"
) ? static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"Only 128-bit vectors supported\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12488, __PRETTY_FUNCTION__));

12489

12490

// We need a shuffle that has zeros at one/both ends and a sequential

12491

// shuffle from one source within.

12492

unsigned ZeroLo = Zeroable.countTrailingOnes();

12493

unsigned ZeroHi = Zeroable.countLeadingOnes();

12494

if (!ZeroLo && !ZeroHi)

12495

return SDValue();

12496

12497

unsigned NumElts = Mask.size();

12498

unsigned Len = NumElts - (ZeroLo + ZeroHi);

12499

if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))

12500

return SDValue();

12501

12502

unsigned Scale = VT.getScalarSizeInBits() / 8;

12503

ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);

12504

if (!isUndefOrInRange(StubMask, 0, NumElts) &&

12505

!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))

12506

return SDValue();

12507

12508

SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;

12509

Res = DAG.getBitcast(MVT::v16i8, Res);

12510

12511

// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an

12512

// inner sequential set of elements, possibly offset:

12513

// 01234567 --> zzzzzz01 --> 1zzzzzzz

12514

// 01234567 --> 4567zzzz --> zzzzz456

12515

// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz

12516

if (ZeroLo == 0) {

12517

unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);

12518

Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,

12519

DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));

12520

Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,

12521

DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));

12522

} else if (ZeroHi == 0) {

12523

unsigned Shift = Mask[ZeroLo] % NumElts;

12524

Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,

12525

DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));

12526

Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,

12527

DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));

12528

} else if (!Subtarget.hasSSSE3()) {

12529

// If we don't have PSHUFB then its worth avoiding an AND constant mask

12530

// by performing 3 byte shifts. Shuffle combining can kick in above that.

12531

// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.

12532

unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);

12533

Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,

12534

DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));

12535

Shift += Mask[ZeroLo] % NumElts;

12536

Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,

12537

DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));

12538

Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,

12539

DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));

12540

} else

12541

return SDValue();

12542

12543

return DAG.getBitcast(VT, Res);

12544

}

12545

12546

/// Try to lower a vector shuffle as a bit shift (shifts in zeros).

12547

///

12548

/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and

12549

/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function

12550

/// matches elements from one of the input vectors shuffled to the left or

12551

/// right with zeroable elements 'shifted in'. It handles both the strictly

12552

/// bit-wise element shifts and the byte shift across an entire 128-bit double

12553

/// quad word lane.

12554

///

12555

/// PSHL : (little-endian) left bit shift.

12556

/// [ zz, 0, zz, 2 ]

12557

/// [ -1, 4, zz, -1 ]

12558

/// PSRL : (little-endian) right bit shift.

12559

/// [ 1, zz, 3, zz]

12560

/// [ -1, -1, 7, zz]

12561

/// PSLLDQ : (little-endian) left byte shift

12562

/// [ zz, 0, 1, 2, 3, 4, 5, 6]

12563

/// [ zz, zz, -1, -1, 2, 3, 4, -1]

12564

/// [ zz, zz, zz, zz, zz, zz, -1, 1]

12565

/// PSRLDQ : (little-endian) right byte shift

12566

/// [ 5, 6, 7, zz, zz, zz, zz, zz]

12567

/// [ -1, 5, 6, 7, zz, zz, zz, zz]

12568

/// [ 1, 2, -1, -1, -1, -1, zz, zz]

12569

static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,

12570

unsigned ScalarSizeInBits, ArrayRef<int> Mask,

12571

int MaskOffset, const APInt &Zeroable,

12572

const X86Subtarget &Subtarget) {

12573

int Size = Mask.size();

12574

unsigned SizeInBits = Size * ScalarSizeInBits;

12575

12576

auto CheckZeros = [&](int Shift, int Scale, bool Left) {

12577

for (int i = 0; i < Size; i += Scale)

12578

for (int j = 0; j < Shift; ++j)

12579

if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])

12580

return false;

12581

12582

return true;

12583

};

12584

12585

auto MatchShift = [&](int Shift, int Scale, bool Left) {

12586

for (int i = 0; i != Size; i += Scale) {

12587

unsigned Pos = Left ? i + Shift : i;

12588

unsigned Low = Left ? i : i + Shift;

12589

unsigned Len = Scale - Shift;

12590

if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))

12591

return -1;

12592

}

12593

12594

int ShiftEltBits = ScalarSizeInBits * Scale;

12595

bool ByteShift = ShiftEltBits > 64;

12596

Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)

12597

: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);

12598

int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

12599

12600

// Normalize the scale for byte shifts to still produce an i64 element

12601

// type.

12602

Scale = ByteShift ? Scale / 2 : Scale;

12603

12604

// We need to round trip through the appropriate type for the shift.

12605

MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);

12606

ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)

12607

: MVT::getVectorVT(ShiftSVT, Size / Scale);

12608

return (int)ShiftAmt;

12609

};

12610

12611

// SSE/AVX supports logical shifts up to 64-bit integers - so we can just

12612

// keep doubling the size of the integer elements up to that. We can

12613

// then shift the elements of the integer vector by whole multiples of

12614

// their width within the elements of the larger integer vector. Test each

12615

// multiple to see if we can find a match with the moved element indices

12616

// and that the shifted in elements are all zeroable.

12617

unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);

12618

for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)

12619

for (int Shift = 1; Shift != Scale; ++Shift)

12620

for (bool Left : {true, false})

12621

if (CheckZeros(Shift, Scale, Left)) {

12622

int ShiftAmt = MatchShift(Shift, Scale, Left);

12623

if (0 < ShiftAmt)

12624

return ShiftAmt;

12625

}

12626

12627

// no match

12628

return -1;

12629

}

12630

12631

/// Lower a vector shuffle as a single logical shift of one input.
///
/// The mask is matched with matchShuffleAsShift, first against \p V1 (mask
/// offset 0) and then against \p V2 (mask offset equal to the mask size). On
/// a match the chosen input is bitcast to the shift type, shifted, and bitcast
/// back to \p VT; otherwise an empty SDValue is returned.
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  unsigned ShiftOpc;
  const unsigned EltBits = VT.getScalarSizeInBits();

  // Try to match shuffle against V1 shift.
  SDValue Src = V1;
  int Amount =
      matchShuffleAsShift(ShiftVT, ShiftOpc, EltBits, Mask, 0, Zeroable,
                          Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (Amount < 0) {
    Amount = matchShuffleAsShift(ShiftVT, ShiftOpc, EltBits, Mask, Size,
                                 Zeroable, Subtarget);
    Src = V2;
  }

  if (Amount < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
  SDValue Shifted = DAG.getNode(ShiftOpc, DL, ShiftVT, Casted,
                                DAG.getTargetConstant(Amount, DL, MVT::i8));
  return DAG.getBitcast(VT, Shifted);
}

12664

12665

// EXTRQ: Extract Len elements from lower half of source, starting at Idx.

12666

// Remainder of lower half result is zero and upper half is all undef.

12667

static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,

12668

ArrayRef<int> Mask, uint64_t &BitLen,

12669

uint64_t &BitIdx, const APInt &Zeroable) {

12670

int Size = Mask.size();

12671

int HalfSize = Size / 2;

12672

assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size")((Size == (int)VT.getVectorNumElements() && "Unexpected mask size"
) ? static_cast<void> (0) : __assert_fail ("Size == (int)VT.getVectorNumElements() && \"Unexpected mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12672, __PRETTY_FUNCTION__));

12673

assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask")((!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"
) ? static_cast<void> (0) : __assert_fail ("!Zeroable.isAllOnesValue() && \"Fully zeroable shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12673, __PRETTY_FUNCTION__));

12674

12675

// Upper half must be undefined.

12676

if (!isUndefUpperHalf(Mask))

12677

return false;

12678

12679

// Determine the extraction length from the part of the

12680

// lower half that isn't zeroable.

12681

int Len = HalfSize;

12682

for (; Len > 0; --Len)

12683

if (!Zeroable[Len - 1])

12684

break;

12685

assert(Len > 0 && "Zeroable shuffle mask")((Len > 0 && "Zeroable shuffle mask") ? static_cast
<void> (0) : __assert_fail ("Len > 0 && \"Zeroable shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12685, __PRETTY_FUNCTION__));

12686

12687

// Attempt to match first Len sequential elements from the lower half.

12688

SDValue Src;

12689

int Idx = -1;

12690

for (int i = 0; i != Len; ++i) {

12691

int M = Mask[i];

12692

if (M == SM_SentinelUndef)

12693

continue;

12694

SDValue &V = (M < Size ? V1 : V2);

12695

M = M % Size;

12696

12697

// The extracted elements must start at a valid index and all mask

12698

// elements must be in the lower half.

12699

if (i > M || M >= HalfSize)

12700

return false;

12701

12702

if (Idx < 0 || (Src == V && Idx == (M - i))) {

12703

Src = V;

12704

Idx = M - i;

12705

continue;

12706

}

12707

return false;

12708

}

12709

12710

if (!Src || Idx < 0)

12711

return false;

12712

12713

assert((Idx + Len) <= HalfSize && "Illegal extraction mask")(((Idx + Len) <= HalfSize && "Illegal extraction mask"
) ? static_cast<void> (0) : __assert_fail ("(Idx + Len) <= HalfSize && \"Illegal extraction mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12713, __PRETTY_FUNCTION__));

12714

BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;

12715

BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;

12716

V1 = Src;

12717

return true;

12718

}

12719

12720

// INSERTQ: Extract lowest Len elements from lower half of second source and

12721

// insert over first source, starting at Idx.

12722

// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }

12723

static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,

12724

ArrayRef<int> Mask, uint64_t &BitLen,

12725

uint64_t &BitIdx) {

12726

int Size = Mask.size();

12727

int HalfSize = Size / 2;

12728

assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size")((Size == (int)VT.getVectorNumElements() && "Unexpected mask size"
) ? static_cast<void> (0) : __assert_fail ("Size == (int)VT.getVectorNumElements() && \"Unexpected mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12728, __PRETTY_FUNCTION__));

12729

12730

// Upper half must be undefined.

12731

if (!isUndefUpperHalf(Mask))

12732

return false;

12733

12734

for (int Idx = 0; Idx != HalfSize; ++Idx) {

12735

SDValue Base;

12736

12737

// Attempt to match first source from mask before insertion point.

12738

if (isUndefInRange(Mask, 0, Idx)) {

12739

/* EMPTY */

12740

} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {

12741

Base = V1;

12742

} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {

12743

Base = V2;

12744

} else {

12745

continue;

12746

}

12747

12748

// Extend the extraction length looking to match both the insertion of

12749

// the second source and the remaining elements of the first.

12750

for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {

12751

SDValue Insert;

12752

int Len = Hi - Idx;

12753

12754

// Match insertion.

12755

if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {

12756

Insert = V1;

12757

} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {

12758

Insert = V2;

12759

} else {

12760

continue;

12761

}

12762

12763

// Match the remaining elements of the lower half.

12764

if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {

12765

/* EMPTY */

12766

} else if ((!Base || (Base == V1)) &&

12767

isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {

12768

Base = V1;

12769

} else if ((!Base || (Base == V2)) &&

12770

isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,

12771

Size + Hi)) {

12772

Base = V2;

12773

} else {

12774

continue;

12775

}

12776

12777

BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;

12778

BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;

12779

V1 = Base;

12780

V2 = Insert;

12781

return true;

12782

}

12783

}

12784

12785

return false;

12786

}

12787

12788

/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.

12789

static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,

12790

SDValue V2, ArrayRef<int> Mask,

12791

const APInt &Zeroable, SelectionDAG &DAG) {

12792

uint64_t BitLen, BitIdx;

12793

if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))

12794

return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,

12795

DAG.getTargetConstant(BitLen, DL, MVT::i8),

12796

DAG.getTargetConstant(BitIdx, DL, MVT::i8));

12797

12798

if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))

12799

return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),

12800

V2 ? V2 : DAG.getUNDEF(VT),

12801

DAG.getTargetConstant(BitLen, DL, MVT::i8),

12802

DAG.getTargetConstant(BitIdx, DL, MVT::i8));

12803

12804

return SDValue();

12805

}

12806

12807

/// Lower a vector shuffle as a zero or any extension.

12808

///

12809

/// Given a specific number of elements, element bit width, and extension

12810

/// stride, produce either a zero or any extension based on the available

12811

/// features of the subtarget. The extended elements are consecutive and

12812

/// begin and can start from an offsetted element index in the input; to

12813

/// avoid excess shuffling the offset must either being in the bottom lane

12814

/// or at the start of a higher lane. All extended elements must be from

12815

/// the same lane.

12816

static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(

12817

const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,

12818

ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {

12819

assert(Scale > 1 && "Need a scale to extend.")((Scale > 1 && "Need a scale to extend.") ? static_cast
<void> (0) : __assert_fail ("Scale > 1 && \"Need a scale to extend.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12819, __PRETTY_FUNCTION__));

12820

int EltBits = VT.getScalarSizeInBits();

12821

int NumElements = VT.getVectorNumElements();

12822

int NumEltsPerLane = 128 / EltBits;

12823

int OffsetLane = Offset / NumEltsPerLane;

12824

assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&(((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.") ? static_cast
<void> (0) : __assert_fail ("(EltBits == 8 || EltBits == 16 || EltBits == 32) && \"Only 8, 16, and 32 bit elements can be extended.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12825, __PRETTY_FUNCTION__))

12825

"Only 8, 16, and 32 bit elements can be extended.")(((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.") ? static_cast
<void> (0) : __assert_fail ("(EltBits == 8 || EltBits == 16 || EltBits == 32) && \"Only 8, 16, and 32 bit elements can be extended.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12825, __PRETTY_FUNCTION__));

12826

assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.")((Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."
) ? static_cast<void> (0) : __assert_fail ("Scale * EltBits <= 64 && \"Cannot zero extend past 64 bits.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12826, __PRETTY_FUNCTION__));

12827

assert(0 <= Offset && "Extension offset must be positive.")((0 <= Offset && "Extension offset must be positive."
) ? static_cast<void> (0) : __assert_fail ("0 <= Offset && \"Extension offset must be positive.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12827, __PRETTY_FUNCTION__));

12828

assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&(((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0
) && "Extension offset must be in the first lane or start an upper lane."
) ? static_cast<void> (0) : __assert_fail ("(Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && \"Extension offset must be in the first lane or start an upper lane.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12829, __PRETTY_FUNCTION__))

12829

"Extension offset must be in the first lane or start an upper lane.")(((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0
) && "Extension offset must be in the first lane or start an upper lane."
) ? static_cast<void> (0) : __assert_fail ("(Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && \"Extension offset must be in the first lane or start an upper lane.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12829, __PRETTY_FUNCTION__));

12830

12831

// Check that an index is in same lane as the base offset.

12832

auto SafeOffset = [&](int Idx) {

12833

return OffsetLane == (Idx / NumEltsPerLane);

12834

};

12835

12836

// Shift along an input so that the offset base moves to the first element.

12837

auto ShuffleOffset = [&](SDValue V) {

12838

if (!Offset)

12839

return V;

12840

12841

SmallVector<int, 8> ShMask((unsigned)NumElements, -1);

12842

for (int i = 0; i * Scale < NumElements; ++i) {

12843

int SrcIdx = i + Offset;

12844

ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;

12845

}

12846

return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);

12847

};

12848

12849

// Found a valid a/zext mask! Try various lowering strategies based on the

12850

// input type and available ISA extensions.

12851

if (Subtarget.hasSSE41()) {

12852

// Not worth offsetting 128-bit vectors if scale == 2, a pattern using

12853

// PUNPCK will catch this in a later shuffle match.

12854

if (Offset && Scale == 2 && VT.is128BitVector())

12855

return SDValue();

12856

MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),

12857

NumElements / Scale);

12858

InputV = ShuffleOffset(InputV);

12859

InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,

12860

ExtVT, InputV, DAG);

12861

return DAG.getBitcast(VT, InputV);

12862

}

12863

12864

assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.")((VT.is128BitVector() && "Only 128-bit vectors can be extended."
) ? static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"Only 128-bit vectors can be extended.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12864, __PRETTY_FUNCTION__));

12865

12866

// For any extends we can cheat for larger element sizes and use shuffle

12867

// instructions that can fold with a load and/or copy.

12868

if (AnyExt && EltBits == 32) {

12869

int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,

12870

-1};

12871

return DAG.getBitcast(

12872

VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,

12873

DAG.getBitcast(MVT::v4i32, InputV),

12874

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

12875

}

12876

if (AnyExt && EltBits == 16 && Scale > 2) {

12877

int PSHUFDMask[4] = {Offset / 2, -1,

12878

SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};

12879

InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,

12880

DAG.getBitcast(MVT::v4i32, InputV),

12881

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));

12882

int PSHUFWMask[4] = {1, -1, -1, -1};

12883

unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;

12884

return DAG.getBitcast(

12885

VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,

12886

DAG.getBitcast(MVT::v8i16, InputV),

12887

getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));

12888

}

12889

12890

// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes

12891

// to 64-bits.

12892

if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {

12893

assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!")((NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"
) ? static_cast<void> (0) : __assert_fail ("NumElements == (int)Mask.size() && \"Unexpected shuffle mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12893, __PRETTY_FUNCTION__));

12894

assert(VT.is128BitVector() && "Unexpected vector width!")((VT.is128BitVector() && "Unexpected vector width!") ?
static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"Unexpected vector width!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12894, __PRETTY_FUNCTION__));

12895

12896

int LoIdx = Offset * EltBits;

12897

SDValue Lo = DAG.getBitcast(

12898

MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,

12899

DAG.getTargetConstant(EltBits, DL, MVT::i8),

12900

DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

12901

12902

if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))

12903

return DAG.getBitcast(VT, Lo);

12904

12905

int HiIdx = (Offset + 1) * EltBits;

12906

SDValue Hi = DAG.getBitcast(

12907

MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,

12908

DAG.getTargetConstant(EltBits, DL, MVT::i8),

12909

DAG.getTargetConstant(HiIdx, DL, MVT::i8)));

12910

return DAG.getBitcast(VT,

12911

DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));

12912

}

12913

12914

// If this would require more than 2 unpack instructions to expand, use

12915

// pshufb when available. We can only use more than 2 unpack instructions

12916

// when zero extending i8 elements which also makes it easier to use pshufb.

12917

if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {

12918

assert(NumElements == 16 && "Unexpected byte vector width!")((NumElements == 16 && "Unexpected byte vector width!"
) ? static_cast<void> (0) : __assert_fail ("NumElements == 16 && \"Unexpected byte vector width!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12918, __PRETTY_FUNCTION__));

12919

SDValue PSHUFBMask[16];

12920

for (int i = 0; i < 16; ++i) {

12921

int Idx = Offset + (i / Scale);

12922

if ((i % Scale == 0 && SafeOffset(Idx))) {

12923

PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);

12924

continue;

12925

}

12926

PSHUFBMask[i] =

12927

AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);

12928

}

12929

InputV = DAG.getBitcast(MVT::v16i8, InputV);

12930

return DAG.getBitcast(

12931

VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,

12932

DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));

12933

}

12934

12935

// If we are extending from an offset, ensure we start on a boundary that

12936

// we can unpack from.

12937

int AlignToUnpack = Offset % (NumElements / Scale);

12938

if (AlignToUnpack) {

12939

SmallVector<int, 8> ShMask((unsigned)NumElements, -1);

12940

for (int i = AlignToUnpack; i < NumElements; ++i)

12941

ShMask[i - AlignToUnpack] = i;

12942

InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);

12943

Offset -= AlignToUnpack;

12944

}

12945

12946

// Otherwise emit a sequence of unpacks.

12947

do {

12948

unsigned UnpackLoHi = X86ISD::UNPCKL;

12949

if (Offset >= (NumElements / 2)) {

12950

UnpackLoHi = X86ISD::UNPCKH;

12951

Offset -= (NumElements / 2);

12952

}

12953

12954

MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);

12955

SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)

12956

: getZeroVector(InputVT, Subtarget, DAG, DL);

12957

InputV = DAG.getBitcast(InputVT, InputV);

12958

InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);

12959

Scale /= 2;

12960

EltBits *= 2;

12961

NumElements /= 2;

12962

} while (Scale > 1);

12963

return DAG.getBitcast(VT, InputV);

12964

}

12965

12966

/// Try to lower a vector shuffle as a zero extension on any microarch.

12967

///

12968

/// This routine will try to do everything in its power to cleverly lower

12969

/// a shuffle which happens to match the pattern of a zero extend. It doesn't

12970

/// check for the profitability of this lowering, it tries to aggressively

12971

/// match this pattern. It will use all of the micro-architectural details it

12972

/// can to emit an efficient lowering. It handles both blends with all-zero

12973

/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to

12974

/// masking out later).

12975

///

12976

/// The reason we have dedicated lowering for zext-style shuffles is that they

12977

/// are both incredibly common and often quite performance sensitive.

12978

static SDValue lowerShuffleAsZeroOrAnyExtend(

12979

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

12980

const APInt &Zeroable, const X86Subtarget &Subtarget,

12981

SelectionDAG &DAG) {

12982

int Bits = VT.getSizeInBits();

12983

int NumLanes = Bits / 128;

12984

int NumElements = VT.getVectorNumElements();

12985

int NumEltsPerLane = NumElements / NumLanes;

12986

assert(VT.getScalarSizeInBits() <= 32 &&((VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() <= 32 && \"Exceeds 32-bit integer zero extension limit\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12987, __PRETTY_FUNCTION__))

12987

"Exceeds 32-bit integer zero extension limit")((VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() <= 32 && \"Exceeds 32-bit integer zero extension limit\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12987, __PRETTY_FUNCTION__));

12988

assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size")(((int)Mask.size() == NumElements && "Unexpected shuffle mask size"
) ? static_cast<void> (0) : __assert_fail ("(int)Mask.size() == NumElements && \"Unexpected shuffle mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12988, __PRETTY_FUNCTION__));

12989

12990

// Define a helper function to check a particular ext-scale and lower to it if

12991

// valid.

12992

auto Lower = [&](int Scale) -> SDValue {

12993

SDValue InputV;

12994

bool AnyExt = true;

12995

int Offset = 0;

12996

int Matches = 0;

12997

for (int i = 0; i < NumElements; ++i) {

12998

int M = Mask[i];

12999

if (M < 0)

13000

continue; // Valid anywhere but doesn't tell us anything.

13001

if (i % Scale != 0) {

13002

// Each of the extended elements need to be zeroable.

13003

if (!Zeroable[i])

13004

return SDValue();

13005

13006

// We no longer are in the anyext case.

13007

AnyExt = false;

13008

continue;

13009

}

13010

13011

// Each of the base elements needs to be consecutive indices into the

13012

// same input vector.

13013

SDValue V = M < NumElements ? V1 : V2;

13014

M = M % NumElements;

13015

if (!InputV) {

13016

InputV = V;

13017

Offset = M - (i / Scale);

13018

} else if (InputV != V)

13019

return SDValue(); // Flip-flopping inputs.

13020

13021

// Offset must start in the lowest 128-bit lane or at the start of an

13022

// upper lane.

13023

// FIXME: Is it ever worth allowing a negative base offset?

13024

if (!((0 <= Offset && Offset < NumEltsPerLane) ||

13025

(Offset % NumEltsPerLane) == 0))

13026

return SDValue();

13027

13028

// If we are offsetting, all referenced entries must come from the same

13029

// lane.

13030

if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))

13031

return SDValue();

13032

13033

if ((M % NumElements) != (Offset + (i / Scale)))

13034

return SDValue(); // Non-consecutive strided elements.

13035

Matches++;

13036

}

13037

13038

// If we fail to find an input, we have a zero-shuffle which should always

13039

// have already been handled.

13040

// FIXME: Maybe handle this here in case during blending we end up with one?

13041

if (!InputV)

13042

return SDValue();

13043

13044

// If we are offsetting, don't extend if we only match a single input, we

13045

// can always do better by using a basic PSHUF or PUNPCK.

13046

if (Offset != 0 && Matches < 2)

13047

return SDValue();

13048

13049

return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,

13050

InputV, Mask, Subtarget, DAG);

13051

};

13052

13053

// The widest scale possible for extending is to a 64-bit integer.

13054

assert(Bits % 64 == 0 &&((Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"
) ? static_cast<void> (0) : __assert_fail ("Bits % 64 == 0 && \"The number of bits in a vector must be divisible by 64 on x86!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13055, __PRETTY_FUNCTION__))

13055

"The number of bits in a vector must be divisible by 64 on x86!")((Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"
) ? static_cast<void> (0) : __assert_fail ("Bits % 64 == 0 && \"The number of bits in a vector must be divisible by 64 on x86!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13055, __PRETTY_FUNCTION__));

13056

int NumExtElements = Bits / 64;

13057

13058

// Each iteration, try extending the elements half as much, but into twice as

13059

// many elements.

13060

for (; NumExtElements < NumElements; NumExtElements *= 2) {

13061

assert(NumElements % NumExtElements == 0 &&((NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."
) ? static_cast<void> (0) : __assert_fail ("NumElements % NumExtElements == 0 && \"The input vector size must be divisible by the extended size.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13062, __PRETTY_FUNCTION__))

13062

"The input vector size must be divisible by the extended size.")((NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."
) ? static_cast<void> (0) : __assert_fail ("NumElements % NumExtElements == 0 && \"The input vector size must be divisible by the extended size.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13062, __PRETTY_FUNCTION__));

13063

if (SDValue V = Lower(NumElements / NumExtElements))

13064

return V;

13065

}

13066

13067

// General extends failed, but 128-bit vectors may be able to use MOVQ.

13068

if (Bits != 128)

13069

return SDValue();

13070

13071

// Returns one of the source operands if the shuffle can be reduced to a

13072

// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.

13073

auto CanZExtLowHalf = [&]() {

13074

for (int i = NumElements / 2; i != NumElements; ++i)

13075

if (!Zeroable[i])

13076

return SDValue();

13077

if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))

13078

return V1;

13079

if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))

13080

return V2;

13081

return SDValue();

13082

};

13083

13084

if (SDValue V = CanZExtLowHalf()) {

13085

V = DAG.getBitcast(MVT::v2i64, V);

13086

V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);

13087

return DAG.getBitcast(VT, V);

13088

}

13089

13090

// No viable ext lowering found.

13091

return SDValue();

13092

}

13093

13094

/// Try to get a scalar value for a specific element of a vector.

13095

///

13096

/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.

13097

static SDValue getScalarValueForVectorElement(SDValue V, int Idx,

13098

SelectionDAG &DAG) {

13099

MVT VT = V.getSimpleValueType();

13100

MVT EltVT = VT.getVectorElementType();

13101

V = peekThroughBitcasts(V);

13102

13103

// If the bitcasts shift the element size, we can't extract an equivalent

13104

// element from it.

13105

MVT NewVT = V.getSimpleValueType();

13106

if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())

13107

return SDValue();

13108

13109

if (V.getOpcode() == ISD::BUILD_VECTOR ||

13110

(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {

13111

// Ensure the scalar operand is the same size as the destination.

13112

// FIXME: Add support for scalar truncation where possible.

13113

SDValue S = V.getOperand(Idx);

13114

if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())

13115

return DAG.getBitcast(EltVT, S);

13116

}

13117

13118

return SDValue();

13119

}

13120

13121

/// Helper to test for a load that can be folded with x86 shuffles.

13122

///

13123

/// This is particularly important because the set of instructions varies

13124

/// significantly based on whether the operand is a load or not.

13125

static bool isShuffleFoldableLoad(SDValue V) {

13126

V = peekThroughBitcasts(V);

13127

return ISD::isNON_EXTLoad(V.getNode());

13128

}

13129

13130

/// Try to lower insertion of a single element into a zero vector.

13131

///

13132

/// This is a common pattern that we have especially efficient patterns to lower

13133

/// across all subtarget feature sets.

13134

static SDValue lowerShuffleAsElementInsertion(

13135

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

13136

const APInt &Zeroable, const X86Subtarget &Subtarget,

13137

SelectionDAG &DAG) {

13138

MVT ExtVT = VT;

13139

MVT EltVT = VT.getVectorElementType();

13140

13141

int V2Index =

13142

find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -

13143

Mask.begin();

13144

bool IsV1Zeroable = true;

13145

for (int i = 0, Size = Mask.size(); i < Size; ++i)

13146

if (i != V2Index && !Zeroable[i]) {

13147

IsV1Zeroable = false;

13148

break;

13149

}

13150

13151

// Check for a single input from a SCALAR_TO_VECTOR node.

13152

// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and

13153

// all the smarts here sunk into that routine. However, the current

13154

// lowering of BUILD_VECTOR makes that nearly impossible until the old

13155

// vector shuffle lowering is dead.

13156

SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),

13157

DAG);

13158

if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {

13159

// We need to zext the scalar if it is smaller than an i32.

13160

V2S = DAG.getBitcast(EltVT, V2S);

13161

if (EltVT == MVT::i8 || EltVT == MVT::i16) {

13162

// Using zext to expand a narrow element won't work for non-zero

13163

// insertions.

13164

if (!IsV1Zeroable)

13165

return SDValue();

13166

13167

// Zero-extend directly to i32.

13168

ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);

13169

V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

13170

}

13171

V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);

13172

} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||

13173

EltVT == MVT::i16) {

13174

// Either not inserting from the low element of the input or the input

13175

// element size is too small to use VZEXT_MOVL to clear the high bits.

13176

return SDValue();

13177

}

13178

13179

if (!IsV1Zeroable) {

13180

// If V1 can't be treated as a zero vector we have fewer options to lower

13181

// this. We can't support integer vectors or non-zero targets cheaply, and

13182

// the V1 elements can't be permuted in any way.

13183

assert(VT == ExtVT && "Cannot change extended type when non-zeroable!")((VT == ExtVT && "Cannot change extended type when non-zeroable!"
) ? static_cast<void> (0) : __assert_fail ("VT == ExtVT && \"Cannot change extended type when non-zeroable!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13183, __PRETTY_FUNCTION__));

13184

if (!VT.isFloatingPoint() || V2Index != 0)

13185

return SDValue();

13186

SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());

13187

V1Mask[V2Index] = -1;

13188

if (!isNoopShuffleMask(V1Mask))

13189

return SDValue();

13190

if (!VT.is128BitVector())

13191

return SDValue();

13192

13193

// Otherwise, use MOVSD or MOVSS.

13194

assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&(((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::f32 || EltVT == MVT::f64) && \"Only two types of floating point element types to handle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13195, __PRETTY_FUNCTION__))

13195

"Only two types of floating point element types to handle!")(((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::f32 || EltVT == MVT::f64) && \"Only two types of floating point element types to handle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13195, __PRETTY_FUNCTION__));

13196

return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,

13197

ExtVT, V1, V2);

13198

}

13199

13200

// This lowering only works for the low element with floating point vectors.

13201

if (VT.isFloatingPoint() && V2Index != 0)

13202

return SDValue();

13203

13204

V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);

13205

if (ExtVT != VT)

13206

V2 = DAG.getBitcast(VT, V2);

13207

13208

if (V2Index != 0) {

13209

// If we have 4 or fewer lanes we can cheaply shuffle the element into

13210

// the desired position. Otherwise it is more efficient to do a vector

13211

// shift left. We know that we can do a vector shift left because all

13212

// the inputs are zero.

13213

if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {

13214

SmallVector<int, 4> V2Shuffle(Mask.size(), 1);

13215

V2Shuffle[V2Index] = 0;

13216

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);

13217

} else {

13218

V2 = DAG.getBitcast(MVT::v16i8, V2);

13219

V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,

13220

DAG.getTargetConstant(

13221

V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));

13222

V2 = DAG.getBitcast(VT, V2);

13223

}

13224

}

13225

return V2;

13226

}

13227

13228

/// Try to lower broadcast of a single - truncated - integer element,

13229

/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.

13230

///

13231

/// This assumes we have AVX2.

13232

static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,

13233

int BroadcastIdx,

13234

const X86Subtarget &Subtarget,

13235

SelectionDAG &DAG) {

13236

assert(Subtarget.hasAVX2() &&((Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower integer broadcasts with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13237, __PRETTY_FUNCTION__))

13237

"We can only lower integer broadcasts with AVX2!")((Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower integer broadcasts with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13237, __PRETTY_FUNCTION__));

13238

13239

MVT EltVT = VT.getVectorElementType();

13240

MVT V0VT = V0.getSimpleValueType();

13241

13242

assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!")((VT.isInteger() && "Unexpected non-integer trunc broadcast!"
) ? static_cast<void> (0) : __assert_fail ("VT.isInteger() && \"Unexpected non-integer trunc broadcast!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13242, __PRETTY_FUNCTION__));

13243

assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!")((V0VT.isVector() && "Unexpected non-vector vector-sized value!"
) ? static_cast<void> (0) : __assert_fail ("V0VT.isVector() && \"Unexpected non-vector vector-sized value!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13243, __PRETTY_FUNCTION__));

13244

13245

MVT V0EltVT = V0VT.getVectorElementType();

13246

if (!V0EltVT.isInteger())

13247

return SDValue();

13248

13249

const unsigned EltSize = EltVT.getSizeInBits();

13250

const unsigned V0EltSize = V0EltVT.getSizeInBits();

13251

13252

// This is only a truncation if the original element type is larger.

13253

if (V0EltSize <= EltSize)

13254

return SDValue();

13255

13256

assert(((V0EltSize % EltSize) == 0) &&((((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"
) ? static_cast<void> (0) : __assert_fail ("((V0EltSize % EltSize) == 0) && \"Scalar type sizes must all be powers of 2 on x86!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13257, __PRETTY_FUNCTION__))

13257

"Scalar type sizes must all be powers of 2 on x86!")((((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"
) ? static_cast<void> (0) : __assert_fail ("((V0EltSize % EltSize) == 0) && \"Scalar type sizes must all be powers of 2 on x86!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13257, __PRETTY_FUNCTION__));

13258

13259

const unsigned V0Opc = V0.getOpcode();

13260

const unsigned Scale = V0EltSize / EltSize;

13261

const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

13262

13263

if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&

13264

V0Opc != ISD::BUILD_VECTOR)

13265

return SDValue();

13266

13267

SDValue Scalar = V0.getOperand(V0BroadcastIdx);

13268

13269

// If we're extracting non-least-significant bits, shift so we can truncate.

13270

// Hopefully, we can fold away the trunc/srl/load into the broadcast.

13271

// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer

13272

// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.

13273

if (const int OffsetIdx = BroadcastIdx % Scale)

13274

Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,

13275

DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

13276

13277

return DAG.getNode(X86ISD::VBROADCAST, DL, VT,

13278

DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));

13279

}

13280

13281

/// Test whether this can be lowered with a single SHUFPS instruction.

13282

///

13283

/// This is used to disable more specialized lowerings when the shufps lowering

13284

/// will happen to be efficient.

13285

static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {

13286

// This routine only handles 128-bit shufps.

13287

assert(Mask.size() == 4 && "Unsupported mask size!")((Mask.size() == 4 && "Unsupported mask size!") ? static_cast
<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unsupported mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13287, __PRETTY_FUNCTION__));

13288

assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!")((Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[0] >= -1 && Mask[0] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13288, __PRETTY_FUNCTION__));

13289

assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!")((Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[1] >= -1 && Mask[1] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13289, __PRETTY_FUNCTION__));

13290

assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!")((Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[2] >= -1 && Mask[2] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13290, __PRETTY_FUNCTION__));

13291

assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!")((Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"
) ? static_cast<void> (0) : __assert_fail ("Mask[3] >= -1 && Mask[3] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13291, __PRETTY_FUNCTION__));

13292

13293

// To lower with a single SHUFPS we need to have the low half and high half

13294

// each requiring a single input.

13295

if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))

13296

return false;

13297

if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))

13298

return false;

13299

13300

return true;

13301

}

13302

13303

/// If we are extracting two 128-bit halves of a vector and shuffling the

13304

/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a

13305

/// multi-shuffle lowering.

13306

static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,

13307

SDValue N1, ArrayRef<int> Mask,

13308

SelectionDAG &DAG) {

13309

MVT VT = N0.getSimpleValueType();

13310

assert((VT.is128BitVector() &&(((VT.is128BitVector() && (VT.getScalarSizeInBits() ==
32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && \"VPERM* family of shuffles requires 32-bit or 64-bit elements\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13312, __PRETTY_FUNCTION__))

13311

(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&(((VT.is128BitVector() && (VT.getScalarSizeInBits() ==
32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && \"VPERM* family of shuffles requires 32-bit or 64-bit elements\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13312, __PRETTY_FUNCTION__))

13312

"VPERM* family of shuffles requires 32-bit or 64-bit elements")(((VT.is128BitVector() && (VT.getScalarSizeInBits() ==
32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && \"VPERM* family of shuffles requires 32-bit or 64-bit elements\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13312, __PRETTY_FUNCTION__));

13313

13314

// Check that both sources are extracts of the same source vector.

13315

if (!N0.hasOneUse() || !N1.hasOneUse() ||

13316

N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||

13317

N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||

13318

N0.getOperand(0) != N1.getOperand(0))

13319

return SDValue();

13320

13321

SDValue WideVec = N0.getOperand(0);

13322

MVT WideVT = WideVec.getSimpleValueType();

13323

if (!WideVT.is256BitVector())

13324

return SDValue();

13325

13326

// Match extracts of each half of the wide source vector. Commute the shuffle

13327

// if the extract of the low half is N1.

13328

unsigned NumElts = VT.getVectorNumElements();

13329

SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());

13330

const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);

13331

const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);

13332

if (ExtIndex1 == 0 && ExtIndex0 == NumElts)

13333

ShuffleVectorSDNode::commuteMask(NewMask);

13334

else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)

13335

return SDValue();

13336

13337

// Final bailout: if the mask is simple, we are better off using an extract

13338

// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps

13339

// because that avoids a constant load from memory.

13340

if (NumElts == 4 &&

13341

(isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))

13342

return SDValue();

13343

13344

// Extend the shuffle mask with undef elements.

13345

NewMask.append(NumElts, -1);

13346

13347

// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0

13348

SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),

13349

NewMask);

13350

// This is free: ymm -> xmm.

13351

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,

13352

DAG.getIntPtrConstant(0, DL));

13353

}

13354

13355

/// Try to lower broadcast of a single element.

13356

///

13357

/// For convenience, this code also bundles all of the subtarget feature set

13358

/// filtering. While a little annoying to re-dispatch on type here, there isn't

13359

/// a convenient way to factor it out.

13360

static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,

13361

SDValue V2, ArrayRef<int> Mask,

13362

const X86Subtarget &Subtarget,

13363

SelectionDAG &DAG) {

13364

if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||

13365

(Subtarget.hasAVX() && VT.isFloatingPoint()) ||

13366

(Subtarget.hasAVX2() && VT.isInteger())))

13367

return SDValue();

13368

13369

// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise

13370

// we can only broadcast from a register with AVX2.

13371

unsigned NumEltBits = VT.getScalarSizeInBits();

13372

unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())

13373

? X86ISD::MOVDDUP

13374

: X86ISD::VBROADCAST;

13375

bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

13376

13377

// Check that the mask is a broadcast.

13378

int BroadcastIdx = getSplatIndex(Mask);

13379

if (BroadcastIdx < 0)

13380

return SDValue();

13381

assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "((BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast " "comes from V1.") ? static_cast
<void> (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13383, __PRETTY_FUNCTION__))

13382

"a sorted mask where the broadcast "((BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast " "comes from V1.") ? static_cast
<void> (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13383, __PRETTY_FUNCTION__))

13383

"comes from V1.")((BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast " "comes from V1.") ? static_cast
<void> (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13383, __PRETTY_FUNCTION__));

13384

13385

// Go up the chain of (vector) values to find a scalar load that we can

13386

// combine with the broadcast.

13387

// TODO: Combine this logic with findEltLoadSrc() used by

13388

// EltsFromConsecutiveLoads().

13389

int BitOffset = BroadcastIdx * NumEltBits;

13390

SDValue V = V1;

13391

for (;;) {

13392

switch (V.getOpcode()) {

13393

case ISD::BITCAST: {

13394

V = V.getOperand(0);

13395

continue;

13396

}

13397

case ISD::CONCAT_VECTORS: {

13398

int OpBitWidth = V.getOperand(0).getValueSizeInBits();

13399

int OpIdx = BitOffset / OpBitWidth;

13400

V = V.getOperand(OpIdx);

13401

BitOffset %= OpBitWidth;

13402

continue;

13403

}

13404

case ISD::EXTRACT_SUBVECTOR: {

13405

// The extraction index adds to the existing offset.

13406

unsigned EltBitWidth = V.getScalarValueSizeInBits();

13407

unsigned Idx = V.getConstantOperandVal(1);

13408

unsigned BeginOffset = Idx * EltBitWidth;

13409

BitOffset += BeginOffset;

13410

V = V.getOperand(0);

13411

continue;

13412

}

13413

case ISD::INSERT_SUBVECTOR: {

13414

SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);

13415

int EltBitWidth = VOuter.getScalarValueSizeInBits();

13416

int Idx = (int)V.getConstantOperandVal(2);

13417

int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();

13418

int BeginOffset = Idx * EltBitWidth;

13419

int EndOffset = BeginOffset + NumSubElts * EltBitWidth;

13420

if (BeginOffset <= BitOffset && BitOffset < EndOffset) {

13421

BitOffset -= BeginOffset;

13422

V = VInner;

13423

} else {

13424

V = VOuter;

13425

}

13426

continue;

13427

}

13428

}

13429

break;

13430

}

13431

assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset")(((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"
) ? static_cast<void> (0) : __assert_fail ("(BitOffset % NumEltBits) == 0 && \"Illegal bit-offset\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13431, __PRETTY_FUNCTION__));

13432

BroadcastIdx = BitOffset / NumEltBits;

13433

13434

// Do we need to bitcast the source to retrieve the original broadcast index?

13435

bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

13436

13437

// Check if this is a broadcast of a scalar. We special case lowering

13438

// for scalars so that we can more effectively fold with loads.

13439

// If the original value has a larger element type than the shuffle, the

13440

// broadcast element is in essence truncated. Make that explicit to ease

13441

// folding.

13442

if (BitCastSrc && VT.isInteger())

13443

if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(

13444

DL, VT, V, BroadcastIdx, Subtarget, DAG))

13445

return TruncBroadcast;

13446

13447

// Also check the simpler case, where we can directly reuse the scalar.

13448

if (!BitCastSrc &&

13449

((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||

13450

(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {

13451

V = V.getOperand(BroadcastIdx);

13452

13453

// If we can't broadcast from a register, check that the input is a load.

13454

if (!BroadcastFromReg && !isShuffleFoldableLoad(V))

13455

return SDValue();

13456

} else if (ISD::isNormalLoad(V.getNode()) &&

13457

cast<LoadSDNode>(V)->isSimple()) {

13458

// We do not check for one-use of the vector load because a broadcast load

13459

// is expected to be a win for code size, register pressure, and possibly

13460

// uops even if the original vector load is not eliminated.

13461

13462

// Reduce the vector load and shuffle to a broadcasted scalar load.

13463

LoadSDNode *Ld = cast<LoadSDNode>(V);

13464

SDValue BaseAddr = Ld->getOperand(1);

13465

MVT SVT = VT.getScalarType();

13466

unsigned Offset = BroadcastIdx * SVT.getStoreSize();

13467

assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset")(((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"
) ? static_cast<void> (0) : __assert_fail ("(int)(Offset * 8) == BitOffset && \"Unexpected bit-offset\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13467, __PRETTY_FUNCTION__));

13468

SDValue NewAddr =

13469

DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);

13470

13471

// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather

13472

// than MOVDDUP.

13473

// FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?

13474

if (Opcode == X86ISD::VBROADCAST) {

13475

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

13476

SDValue Ops[] = {Ld->getChain(), NewAddr};

13477

V = DAG.getMemIntrinsicNode(

13478

X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,

13479

DAG.getMachineFunction().getMachineMemOperand(

13480

Ld->getMemOperand(), Offset, SVT.getStoreSize()));

13481

DAG.makeEquivalentMemoryOrdering(Ld, V);

13482

return DAG.getBitcast(VT, V);

13483

}

13484

assert(SVT == MVT::f64 && "Unexpected VT!")((SVT == MVT::f64 && "Unexpected VT!") ? static_cast<
void> (0) : __assert_fail ("SVT == MVT::f64 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13484, __PRETTY_FUNCTION__));

13485

V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,

13486

DAG.getMachineFunction().getMachineMemOperand(

13487

Ld->getMemOperand(), Offset, SVT.getStoreSize()));

13488

DAG.makeEquivalentMemoryOrdering(Ld, V);

13489

} else if (!BroadcastFromReg) {

13490

// We can't broadcast from a vector register.

13491

return SDValue();

13492

} else if (BitOffset != 0) {

13493

// We can only broadcast from the zero-element of a vector register,

13494

// but it can be advantageous to broadcast from the zero-element of a

13495

// subvector.

13496

if (!VT.is256BitVector() && !VT.is512BitVector())

13497

return SDValue();

13498

13499

// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.

13500

if (VT == MVT::v4f64 || VT == MVT::v4i64)

13501

return SDValue();

13502

13503

// Only broadcast the zero-element of a 128-bit subvector.

13504

if ((BitOffset % 128) != 0)

13505

return SDValue();

13506

13507

assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&(((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"
) ? static_cast<void> (0) : __assert_fail ("(BitOffset % V.getScalarValueSizeInBits()) == 0 && \"Unexpected bit-offset\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13508, __PRETTY_FUNCTION__))

13508

"Unexpected bit-offset")(((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"
) ? static_cast<void> (0) : __assert_fail ("(BitOffset % V.getScalarValueSizeInBits()) == 0 && \"Unexpected bit-offset\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13508, __PRETTY_FUNCTION__));

13509

assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&(((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() ==
512) && "Unexpected vector size") ? static_cast<void
> (0) : __assert_fail ("(V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13510, __PRETTY_FUNCTION__))

13510

"Unexpected vector size")(((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() ==
512) && "Unexpected vector size") ? static_cast<void
> (0) : __assert_fail ("(V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13510, __PRETTY_FUNCTION__));

13511

unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();

13512

V = extract128BitVector(V, ExtractIdx, DAG, DL);

13513

}

13514

13515

if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())

13516

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,

13517

DAG.getBitcast(MVT::f64, V));

13518

13519

// If this is a scalar, do the broadcast on this type and bitcast.

13520

if (!V.getValueType().isVector()) {

13521

assert(V.getScalarValueSizeInBits() == NumEltBits &&((V.getScalarValueSizeInBits() == NumEltBits && "Unexpected scalar size"
) ? static_cast<void> (0) : __assert_fail ("V.getScalarValueSizeInBits() == NumEltBits && \"Unexpected scalar size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13522, __PRETTY_FUNCTION__))

13522

"Unexpected scalar size")((V.getScalarValueSizeInBits() == NumEltBits && "Unexpected scalar size"
) ? static_cast<void> (0) : __assert_fail ("V.getScalarValueSizeInBits() == NumEltBits && \"Unexpected scalar size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13522, __PRETTY_FUNCTION__));

13523

MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),

13524

VT.getVectorNumElements());

13525

return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));

13526

}

13527

13528

// We only support broadcasting from 128-bit vectors to minimize the

13529

// number of patterns we need to deal with in isel. So extract down to

13530

// 128-bits, removing as many bitcasts as possible.

13531

if (V.getValueSizeInBits() > 128)

13532

V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);

13533

13534

// Otherwise cast V to a vector with the same element type as VT, but

13535

// possibly narrower than VT. Then perform the broadcast.

13536

unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;

13537

MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);

13538

return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));

13539

}

13540

13541

// Check for whether we can use INSERTPS to perform the shuffle. We only use

13542

// INSERTPS when the V1 elements are already in the correct locations

13543

// because otherwise we can just always use two SHUFPS instructions which

13544

// are much smaller to encode than a SHUFPS and an INSERTPS. We can also

13545

// perform INSERTPS if a single V1 element is out of place and all V2

13546

// elements are zeroable.

13547

static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,

13548

unsigned &InsertPSMask,

13549

const APInt &Zeroable,

13550

ArrayRef<int> Mask, SelectionDAG &DAG) {

13551

assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!")((V1.getSimpleValueType().is128BitVector() && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType().is128BitVector() && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13551, __PRETTY_FUNCTION__));

13552

assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!")((V2.getSimpleValueType().is128BitVector() && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType().is128BitVector() && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13552, __PRETTY_FUNCTION__));

13553

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13553, __PRETTY_FUNCTION__));

13554

13555

// Attempt to match INSERTPS with one element from VA or VB being

13556

// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask

13557

// are updated.

13558

auto matchAsInsertPS = [&](SDValue VA, SDValue VB,

13559

ArrayRef<int> CandidateMask) {

13560

unsigned ZMask = 0;

13561

int VADstIndex = -1;

13562

int VBDstIndex = -1;

13563

bool VAUsedInPlace = false;

13564

13565

for (int i = 0; i < 4; ++i) {

13566

// Synthesize a zero mask from the zeroable elements (includes undefs).

13567

if (Zeroable[i]) {

13568

ZMask |= 1 << i;

13569

continue;

13570

}

13571

13572

// Flag if we use any VA inputs in place.

13573

if (i == CandidateMask[i]) {

13574

VAUsedInPlace = true;

13575

continue;

13576

}

13577

13578

// We can only insert a single non-zeroable element.

13579

if (VADstIndex >= 0 || VBDstIndex >= 0)

13580

return false;

13581

13582

if (CandidateMask[i] < 4) {

13583

// VA input out of place for insertion.

13584

VADstIndex = i;

13585

} else {

13586

// VB input for insertion.

13587

VBDstIndex = i;

13588

}

13589

}

13590

13591

// Don't bother if we have no (non-zeroable) element for insertion.

13592

if (VADstIndex < 0 && VBDstIndex < 0)

13593

return false;

13594

13595

// Determine element insertion src/dst indices. The src index is from the

13596

// start of the inserted vector, not the start of the concatenated vector.

13597

unsigned VBSrcIndex = 0;

13598

if (VADstIndex >= 0) {

13599

// If we have a VA input out of place, we use VA as the V2 element

13600

// insertion and don't use the original V2 at all.

13601

VBSrcIndex = CandidateMask[VADstIndex];

13602

VBDstIndex = VADstIndex;

13603

VB = VA;

13604

} else {

13605

VBSrcIndex = CandidateMask[VBDstIndex] - 4;

13606

}

13607

13608

// If no V1 inputs are used in place, then the result is created only from

13609

// the zero mask and the V2 insertion - so remove V1 dependency.

13610

if (!VAUsedInPlace)

13611

VA = DAG.getUNDEF(MVT::v4f32);

13612

13613

// Update V1, V2 and InsertPSMask accordingly.

13614

V1 = VA;

13615

V2 = VB;

13616

13617

// Insert the V2 element into the desired position.

13618

InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;

13619

assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")(((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"
) ? static_cast<void> (0) : __assert_fail ("(InsertPSMask & ~0xFFu) == 0 && \"Invalid mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13619, __PRETTY_FUNCTION__));

13620

return true;

13621

};

13622

13623

if (matchAsInsertPS(V1, V2, Mask))

13624

return true;

13625

13626

// Commute and try again.

13627

SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());

13628

ShuffleVectorSDNode::commuteMask(CommutedMask);

13629

if (matchAsInsertPS(V2, V1, CommutedMask))

13630

return true;

13631

13632

return false;

13633

}

13634

13635

/// Try to lower a v4f32 shuffle of \p V1 and \p V2 as a single INSERTPS.
///
/// \p Mask is the shuffle mask and \p Zeroable flags the result elements that
/// may be zeroed. Returns the X86ISD::INSERTPS node, or an empty SDValue when
/// matchShuffleAsInsertPS() cannot express the shuffle.
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> Mask, const APInt &Zeroable,
                                      SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern. V1 and V2 are local copies, so the
  // matcher is free to rewrite them; InsertPSMask is only read after a
  // successful match has assigned it.
  unsigned InsertPSMask;
  if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}

/// Try to lower a shuffle as a permute of the inputs followed by an

13652

/// UNPCK instruction.

13653

///

13654

/// This specifically targets cases where we end up with alternating between

13655

/// the two inputs, and so can permute them into something that feeds a single

13656

/// UNPCK instruction. Note that this routine only targets integer vectors

13657

/// because for floating point vectors we have a generalized SHUFPS lowering

13658

/// strategy that handles everything that doesn't *exactly* match an unpack,

13659

/// making this clever lowering unnecessary.

13660

static SDValue lowerShuffleAsPermuteAndUnpack(

13661

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

13662

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

13663

assert(!VT.isFloatingPoint() &&((!VT.isFloatingPoint() && "This routine only supports integer vectors."
) ? static_cast<void> (0) : __assert_fail ("!VT.isFloatingPoint() && \"This routine only supports integer vectors.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13664, __PRETTY_FUNCTION__))

13664

"This routine only supports integer vectors.")((!VT.isFloatingPoint() && "This routine only supports integer vectors."
) ? static_cast<void> (0) : __assert_fail ("!VT.isFloatingPoint() && \"This routine only supports integer vectors.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13664, __PRETTY_FUNCTION__));

13665

assert(VT.is128BitVector() &&((VT.is128BitVector() && "This routine only works on 128-bit vectors."
) ? static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"This routine only works on 128-bit vectors.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13666, __PRETTY_FUNCTION__))

13666

"This routine only works on 128-bit vectors.")((VT.is128BitVector() && "This routine only works on 128-bit vectors."
) ? static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"This routine only works on 128-bit vectors.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13666, __PRETTY_FUNCTION__));

13667

assert(!V2.isUndef() &&((!V2.isUndef() && "This routine should only be used when blending two inputs."
) ? static_cast<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine should only be used when blending two inputs.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13668, __PRETTY_FUNCTION__))

13668

"This routine should only be used when blending two inputs.")((!V2.isUndef() && "This routine should only be used when blending two inputs."
) ? static_cast<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine should only be used when blending two inputs.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13668, __PRETTY_FUNCTION__));

13669

assert(Mask.size() >= 2 && "Single element masks are invalid.")((Mask.size() >= 2 && "Single element masks are invalid."
) ? static_cast<void> (0) : __assert_fail ("Mask.size() >= 2 && \"Single element masks are invalid.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13669, __PRETTY_FUNCTION__));

13670

13671

int Size = Mask.size();

13672

13673

int NumLoInputs =

13674

count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });

13675

int NumHiInputs =

13676

count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

13677

13678

bool UnpackLo = NumLoInputs >= NumHiInputs;

13679

13680

auto TryUnpack = [&](int ScalarSize, int Scale) {

13681

SmallVector<int, 16> V1Mask((unsigned)Size, -1);

13682

SmallVector<int, 16> V2Mask((unsigned)Size, -1);

13683

13684

for (int i = 0; i < Size; ++i) {

13685

if (Mask[i] < 0)

13686

continue;

13687

13688

// Each element of the unpack contains Scale elements from this mask.

13689

int UnpackIdx = i / Scale;

13690

13691

// We only handle the case where V1 feeds the first slots of the unpack.

13692

// We rely on canonicalization to ensure this is the case.

13693

if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))

13694

return SDValue();

13695

13696

// Setup the mask for this input. The indexing is tricky as we have to

13697

// handle the unpack stride.

13698

SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;

13699

VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =

13700

Mask[i] % Size;

13701

}

13702

13703

// If we will have to shuffle both inputs to use the unpack, check whether

13704

// we can just unpack first and shuffle the result. If so, skip this unpack.

13705

if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&

13706

!isNoopShuffleMask(V2Mask))

13707

return SDValue();

13708

13709

// Shuffle the inputs into place.

13710

V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);

13711

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

13712

13713

// Cast the inputs to the type we will use to unpack them.

13714

MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);

13715

V1 = DAG.getBitcast(UnpackVT, V1);

13716

V2 = DAG.getBitcast(UnpackVT, V2);

13717

13718

// Unpack the inputs and cast the result back to the desired type.

13719

return DAG.getBitcast(

13720

VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,

13721

UnpackVT, V1, V2));

13722

};

13723

13724

// We try each unpack from the largest to the smallest to try and find one

13725

// that fits this mask.

13726

int OrigScalarSize = VT.getScalarSizeInBits();

13727

for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)

13728

if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))

13729

return Unpack;

13730

13731

// If we're shuffling with a zero vector then we're better off not doing

13732

// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.

13733

if (ISD::isBuildVectorAllZeros(V1.getNode()) ||

13734

ISD::isBuildVectorAllZeros(V2.getNode()))

13735

return SDValue();

13736

13737

// If none of the unpack-rooted lowerings worked (or were profitable) try an

13738

// initial unpack.

13739

if (NumLoInputs == 0 || NumHiInputs == 0) {

13740

assert((NumLoInputs > 0 || NumHiInputs > 0) &&(((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"
) ? static_cast<void> (0) : __assert_fail ("(NumLoInputs > 0 || NumHiInputs > 0) && \"We have to have *some* inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13741, __PRETTY_FUNCTION__))

13741

"We have to have *some* inputs!")(((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"
) ? static_cast<void> (0) : __assert_fail ("(NumLoInputs > 0 || NumHiInputs > 0) && \"We have to have *some* inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13741, __PRETTY_FUNCTION__));

13742

int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

13743

13744

// FIXME: We could consider the total complexity of the permute of each

13745

// possible unpacking. Or at the least we should consider how many

13746

// half-crossings are created.

13747

// FIXME: We could consider commuting the unpacks.

13748

13749

SmallVector<int, 32> PermMask((unsigned)Size, -1);

13750

for (int i = 0; i < Size; ++i) {

13751

if (Mask[i] < 0)

13752

continue;

13753

13754

assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!")((Mask[i] % Size >= HalfOffset && "Found input from wrong half!"
) ? static_cast<void> (0) : __assert_fail ("Mask[i] % Size >= HalfOffset && \"Found input from wrong half!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13754, __PRETTY_FUNCTION__));

13755

13756

PermMask[i] =

13757

2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);

13758

}

13759

return DAG.getVectorShuffle(

13760

VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,

13761

DL, VT, V1, V2),

13762

DAG.getUNDEF(VT), PermMask);

13763

}

13764

13765

return SDValue();

13766

}

13767

13768

/// Handle lowering of 2-lane 64-bit floating point shuffles.

13769

///

13770

/// This is the basis function for the 2-lane 64-bit shuffles as we have full

13771

/// support for floating point shuffles but not integer shuffles. These

13772

/// instructions will incur a domain crossing penalty on some chips though so

13773

/// it is better to avoid lowering through this for integer vectors where

13774

/// possible.

13775

static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

13776

const APInt &Zeroable, SDValue V1, SDValue V2,

13777

const X86Subtarget &Subtarget,

13778

SelectionDAG &DAG) {

13779

assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v2f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13779, __PRETTY_FUNCTION__));

13780

assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v2f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13780, __PRETTY_FUNCTION__));

13781

assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 2 && \"Unexpected mask size for v2 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13781, __PRETTY_FUNCTION__));

13782

13783

if (V2.isUndef()) {

13784

// Check for being able to broadcast a single element.

13785

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,

13786

Mask, Subtarget, DAG))

13787

return Broadcast;

13788

13789

// Straight shuffle of a single input vector. Simulate this by using the

13790

// single input as both of the "inputs" to this instruction..

13791

unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

13792

13793

if (Subtarget.hasAVX()) {

13794

// If we have AVX, we can use VPERMILPS which will allow folding a load

13795

// into the shuffle.

13796

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,

13797

DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));

13798

}

13799

13800

return DAG.getNode(

13801

X86ISD::SHUFP, DL, MVT::v2f64,

13802

Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,

13803

Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,

13804

DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));

13805

}

13806

assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!")((Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"
) ? static_cast<void> (0) : __assert_fail ("Mask[0] >= 0 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13806, __PRETTY_FUNCTION__));

13807

assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!")((Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"
) ? static_cast<void> (0) : __assert_fail ("Mask[1] >= 0 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13807, __PRETTY_FUNCTION__));

13808

assert(Mask[0] < 2 && "We sort V1 to be the first input.")((Mask[0] < 2 && "We sort V1 to be the first input."
) ? static_cast<void> (0) : __assert_fail ("Mask[0] < 2 && \"We sort V1 to be the first input.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13808, __PRETTY_FUNCTION__));

13809

assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((Mask[1] >= 2 && "We sort V2 to be the second input."
) ? static_cast<void> (0) : __assert_fail ("Mask[1] >= 2 && \"We sort V2 to be the second input.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13809, __PRETTY_FUNCTION__));

13810

13811

if (Subtarget.hasAVX2())

13812

if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))

13813

return Extract;

13814

13815

// When loading a scalar and then shuffling it into a vector we can often do

13816

// the insertion cheaply.

13817

if (SDValue Insertion = lowerShuffleAsElementInsertion(

13818

DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))

13819

return Insertion;

13820

// Try inverting the insertion since for v2 masks it is easy to do and we

13821

// can't reliably sort the mask one way or the other.

13822

int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),

13823

Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};

13824

if (SDValue Insertion = lowerShuffleAsElementInsertion(

13825

DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))

13826

return Insertion;

13827

13828

// Try to use one of the special instruction patterns to handle two common

13829

// blend patterns if a zero-blend above didn't work.

13830

if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||

13831

isShuffleEquivalent(V1, V2, Mask, {1, 3}))

13832

if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))

13833

// We can either use a special instruction to load over the low double or

13834

// to move just the low double.

13835

return DAG.getNode(

13836

X86ISD::MOVSD, DL, MVT::v2f64, V2,

13837

DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

13838

13839

if (Subtarget.hasSSE41())

13840

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,

13841

Zeroable, Subtarget, DAG))

13842

return Blend;

13843

13844

// Use dedicated unpack instructions for masks that match their pattern.

13845

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))

13846

return V;

13847

13848

unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);

13849

return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,

13850

DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));

13851

}

13852

13853

/// Handle lowering of 2-lane 64-bit integer shuffles.

13854

///

13855

/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by

13856

/// the integer unit to minimize domain crossing penalties. However, for blends

13857

/// it falls back to the floating point shuffle operation with appropriate bit

13858

/// casting.

13859

static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

13860

const APInt &Zeroable, SDValue V1, SDValue V2,

13861

const X86Subtarget &Subtarget,

13862

SelectionDAG &DAG) {

13863

assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v2i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13863, __PRETTY_FUNCTION__));

13864

assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v2i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13864, __PRETTY_FUNCTION__));

13865

assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 2 && \"Unexpected mask size for v2 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13865, __PRETTY_FUNCTION__));

13866

13867

if (V2.isUndef()) {

13868

// Check for being able to broadcast a single element.

13869

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,

13870

Mask, Subtarget, DAG))

13871

return Broadcast;

13872

13873

// Straight shuffle of a single input vector. For everything from SSE2

13874

// onward this has a single fast instruction with no scary immediates.

13875

// We have to map the mask as it is actually a v4i32 shuffle instruction.

13876

V1 = DAG.getBitcast(MVT::v4i32, V1);

13877

int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),

13878

Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),

13879

Mask[1] < 0 ? -1 : (Mask[1] * 2),

13880

Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};

13881

return DAG.getBitcast(

13882

MVT::v2i64,

13883

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

13884

getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));

13885

}

13886

assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!")((Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"
) ? static_cast<void> (0) : __assert_fail ("Mask[0] != -1 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13886, __PRETTY_FUNCTION__));

13887

assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!")((Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"
) ? static_cast<void> (0) : __assert_fail ("Mask[1] != -1 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13887, __PRETTY_FUNCTION__));

13888

assert(Mask[0] < 2 && "We sort V1 to be the first input.")((Mask[0] < 2 && "We sort V1 to be the first input."
) ? static_cast<void> (0) : __assert_fail ("Mask[0] < 2 && \"We sort V1 to be the first input.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13888, __PRETTY_FUNCTION__));

13889

assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((Mask[1] >= 2 && "We sort V2 to be the second input."
) ? static_cast<void> (0) : __assert_fail ("Mask[1] >= 2 && \"We sort V2 to be the second input.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 13889, __PRETTY_FUNCTION__));

13890

13891

if (Subtarget.hasAVX2())

13892

if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))

13893

return Extract;

13894

13895

// Try to use shift instructions.

13896

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,

13897

Zeroable, Subtarget, DAG))

13898

return Shift;

13899

13900

// When loading a scalar and then shuffling it into a vector we can often do

13901

// the insertion cheaply.

13902

if (SDValue Insertion = lowerShuffleAsElementInsertion(

13903

DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))

13904

return Insertion;

13905

// Try inverting the insertion since for v2 masks it is easy to do and we

13906

// can't reliably sort the mask one way or the other.

13907

int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};

13908

if (SDValue Insertion = lowerShuffleAsElementInsertion(

13909

DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))

13910

return Insertion;

13911

13912

// We have different paths for blend lowering, but they all must use the

13913

// *exact* same predicate.

13914

bool IsBlendSupported = Subtarget.hasSSE41();

13915

if (IsBlendSupported)

13916

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,

13917

Zeroable, Subtarget, DAG))

13918

return Blend;

13919

13920

// Use dedicated unpack instructions for masks that match their pattern.

13921

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))

13922

return V;

13923

13924

// Try to use byte rotation instructions.

13925

// Its more profitable for pre-SSSE3 to use shuffles/unpacks.

13926

if (Subtarget.hasSSSE3()) {

13927

if (Subtarget.hasVLX())

13928

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,

13929

Subtarget, DAG))

13930

return Rotate;

13931

13932

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,

13933

Subtarget, DAG))

13934

return Rotate;

13935

}

13936

13937

// If we have direct support for blends, we should lower by decomposing into

13938

// a permute. That will be faster than the domain cross.

13939

if (IsBlendSupported)

13940

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,

13941

Subtarget, DAG);

13942

13943

// We implement this with SHUFPD which is pretty lame because it will likely

13944

// incur 2 cycles of stall for integer vectors on Nehalem and older chips.

13945

// However, all the alternatives are still more cycles and newer chips don't

13946

// have this problem. It would be really nice if x86 had better shuffles here.

13947

V1 = DAG.getBitcast(MVT::v2f64, V1);

13948

V2 = DAG.getBitcast(MVT::v2f64, V2);

13949

return DAG.getBitcast(MVT::v2i64,

13950

DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));

13951

}

13952

13953

/// Lower a vector shuffle using the SHUFPS instruction.

13954

///

13955

/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.

13956

/// It makes no assumptions about whether this is the *best* lowering, it simply

13957

/// uses it.

13958

static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,

13959

ArrayRef<int> Mask, SDValue V1,

13960

SDValue V2, SelectionDAG &DAG) {

13961

SDValue LowV = V1, HighV = V2;

13962

SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());

13963

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

13964

13965

if (NumV2Elements == 1) {

13966

int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

13967

13968

// Compute the index adjacent to V2Index and in the same half by toggling

13969

// the low bit.

13970

int V2AdjIndex = V2Index ^ 1;

13971

13972

if (Mask[V2AdjIndex] < 0) {

13973

// Handles all the cases where we have a single V2 element and an undef.

13974

// This will only ever happen in the high lanes because we commute the

13975

// vector otherwise.

13976

if (V2Index < 2)

13977

std::swap(LowV, HighV);

13978

NewMask[V2Index] -= 4;

13979

} else {

13980

// Handle the case where the V2 element ends up adjacent to a V1 element.

13981

// To make this work, blend them together as the first step.

13982

int V1Index = V2AdjIndex;

13983

int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};

13984

V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,

13985

getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

13986

13987

// Now proceed to reconstruct the final blend as we have the necessary

13988

// high or low half formed.

13989

if (V2Index < 2) {

13990

LowV = V2;

13991

HighV = V1;

13992

} else {

13993

HighV = V2;

13994

}

13995

NewMask[V1Index] = 2; // We put the V1 element in V2[2].

13996

NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].

13997

}

13998

} else if (NumV2Elements == 2) {

13999

if (Mask[0] < 4 && Mask[1] < 4) {

14000

// Handle the easy case where we have V1 in the low lanes and V2 in the

14001

// high lanes.

14002

NewMask[2] -= 4;

14003

NewMask[3] -= 4;

14004

} else if (Mask[2] < 4 && Mask[3] < 4) {

14005

// We also handle the reversed case because this utility may get called

14006

// when we detect a SHUFPS pattern but can't easily commute the shuffle to

14007

// arrange things in the right direction.

14008

NewMask[0] -= 4;

14009

NewMask[1] -= 4;

14010

HighV = V1;

14011

LowV = V2;

14012

} else {

14013

// We have a mixture of V1 and V2 in both low and high lanes. Rather than

14014

// trying to place elements directly, just blend them and set up the final

14015

// shuffle to place them.

14016

14017

// The first two blend mask elements are for V1, the second two are for

14018

// V2.

14019

int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],

14020

Mask[2] < 4 ? Mask[2] : Mask[3],

14021

(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,

14022

(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};

14023

V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,

14024

getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

14025

14026

// Now we do a normal shuffle of V1 by giving V1 as both operands to

14027

// a blend.

14028

LowV = HighV = V1;

14029

NewMask[0] = Mask[0] < 4 ? 0 : 2;

14030

NewMask[1] = Mask[0] < 4 ? 2 : 0;

14031

NewMask[2] = Mask[2] < 4 ? 1 : 3;

14032

NewMask[3] = Mask[2] < 4 ? 3 : 1;

14033

}

14034

} else if (NumV2Elements == 3) {

14035

// Ideally canonicalizeShuffleMaskWithCommute should have caught this, but

14036

// we can get here due to other paths (e.g repeated mask matching) that we

14037

// don't want to do another round of lowerVECTOR_SHUFFLE.

14038

ShuffleVectorSDNode::commuteMask(NewMask);

14039

return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);

14040

}

14041

return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,

14042

getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));

14043

}

14044

14045

/// Lower 4-lane 32-bit floating point shuffles.

14046

///

14047

/// Uses instructions exclusively from the floating point unit to minimize

14048

/// domain crossing penalties, as these are sufficient to implement all v4f32

14049

/// shuffles.

14050

static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

14051

const APInt &Zeroable, SDValue V1, SDValue V2,

14052

const X86Subtarget &Subtarget,

14053

SelectionDAG &DAG) {

14054

assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v4f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14054, __PRETTY_FUNCTION__));

14055

assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v4f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14055, __PRETTY_FUNCTION__));

14056

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14056, __PRETTY_FUNCTION__));

14057

14058

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

14059

14060

if (NumV2Elements == 0) {

14061

// Check for being able to broadcast a single element.

14062

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,

14063

Mask, Subtarget, DAG))

14064

return Broadcast;

14065

14066

// Use even/odd duplicate instructions for masks that match their pattern.

14067

if (Subtarget.hasSSE3()) {

14068

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))

14069

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);

14070

if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))

14071

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);

14072

}

14073

14074

if (Subtarget.hasAVX()) {

14075

// If we have AVX, we can use VPERMILPS which will allow folding a load

14076

// into the shuffle.

14077

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,

14078

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

14079

}

14080

14081

// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid

14082

// in SSE1 because otherwise they are widened to v2f64 and never get here.

14083

if (!Subtarget.hasSSE2()) {

14084

if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))

14085

return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);

14086

if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))

14087

return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);

14088

}

14089

14090

// Otherwise, use a straight shuffle of a single input vector. We pass the

14091

// input vector to both operands to simulate this with a SHUFPS.

14092

return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,

14093

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

14094

}

14095

14096

if (Subtarget.hasAVX2())

14097

if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))

14098

return Extract;

14099

14100

// There are special ways we can lower some single-element blends. However, we

14101

// have custom ways we can lower more complex single-element blends below that

14102

// we defer to if both this and BLENDPS fail to match, so restrict this to

14103

// when the V2 input is targeting element 0 of the mask -- that is the fast

14104

// case here.

14105

if (NumV2Elements == 1 && Mask[0] >= 4)

14106

if (SDValue V = lowerShuffleAsElementInsertion(

14107

DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))

14108

return V;

14109

14110

if (Subtarget.hasSSE41()) {

14111

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,

14112

Zeroable, Subtarget, DAG))

14113

return Blend;

14114

14115

// Use INSERTPS if we can complete the shuffle efficiently.

14116

if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))

14117

return V;

14118

14119

if (!isSingleSHUFPSMask(Mask))

14120

if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,

14121

V2, Mask, DAG))

14122

return BlendPerm;

14123

}

14124

14125

// Use low/high mov instructions. These are only valid in SSE1 because

14126

// otherwise they are widened to v2f64 and never get here.

14127

if (!Subtarget.hasSSE2()) {

14128

if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))

14129

return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);

14130

if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))

14131

return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

14132

}

14133

14134

// Use dedicated unpack instructions for masks that match their pattern.

14135

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))

14136

return V;

14137

14138

// Otherwise fall back to a SHUFPS lowering strategy.

14139

return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);

14140

}

14141

14142

/// Lower 4-lane i32 vector shuffles.

14143

///

14144

/// We try to handle these with integer-domain shuffles where we can, but for

14145

/// blends we use the floating point domain blend instructions.

14146

static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

14147

const APInt &Zeroable, SDValue V1, SDValue V2,

14148

const X86Subtarget &Subtarget,

14149

SelectionDAG &DAG) {

14150

assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v4i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14150, __PRETTY_FUNCTION__));

14151

assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v4i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14151, __PRETTY_FUNCTION__));

14152

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14152, __PRETTY_FUNCTION__));

14153

14154

// Whenever we can lower this as a zext, that instruction is strictly faster

14155

// than any alternative. It also allows us to fold memory operands into the

14156

// shuffle in many cases.

14157

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,

14158

Zeroable, Subtarget, DAG))

14159

return ZExt;

14160

14161

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

14162

14163

if (NumV2Elements == 0) {

14164

// Try to use broadcast unless the mask only has one non-undef element.

14165

if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {

14166

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,

14167

Mask, Subtarget, DAG))

14168

return Broadcast;

14169

}

14170

14171

// Straight shuffle of a single input vector. For everything from SSE2

14172

// onward this has a single fast instruction with no scary immediates.

14173

// We coerce the shuffle pattern to be compatible with UNPCK instructions

14174

// but we aren't actually going to use the UNPCK instruction because doing

14175

// so prevents folding a load into this instruction or making a copy.

14176

const int UnpackLoMask[] = {0, 0, 1, 1};

14177

const int UnpackHiMask[] = {2, 2, 3, 3};

14178

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))

14179

Mask = UnpackLoMask;

14180

else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))

14181

Mask = UnpackHiMask;

14182

14183

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

14184

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

14185

}

14186

14187

if (Subtarget.hasAVX2())

14188

if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))

14189

return Extract;

14190

14191

// Try to use shift instructions.

14192

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,

14193

Zeroable, Subtarget, DAG))

14194

return Shift;

14195

14196

// There are special ways we can lower some single-element blends.

14197

if (NumV2Elements == 1)

14198

if (SDValue V = lowerShuffleAsElementInsertion(

14199

DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

14200

return V;

14201

14202

// We have different paths for blend lowering, but they all must use the

14203

// *exact* same predicate.

14204

bool IsBlendSupported = Subtarget.hasSSE41();

14205

if (IsBlendSupported)

14206

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,

14207

Zeroable, Subtarget, DAG))

14208

return Blend;

14209

14210

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,

14211

Zeroable, Subtarget, DAG))

14212

return Masked;

14213

14214

// Use dedicated unpack instructions for masks that match their pattern.

14215

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))

14216

return V;

14217

14218

// Try to use byte rotation instructions.

14219

// Its more profitable for pre-SSSE3 to use shuffles/unpacks.

14220

if (Subtarget.hasSSSE3()) {

14221

if (Subtarget.hasVLX())

14222

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,

14223

Subtarget, DAG))

14224

return Rotate;

14225

14226

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,

14227

Subtarget, DAG))

14228

return Rotate;

14229

}

14230

14231

// Assume that a single SHUFPS is faster than an alternative sequence of

14232

// multiple instructions (even if the CPU has a domain penalty).

14233

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

14234

if (!isSingleSHUFPSMask(Mask)) {

14235

// If we have direct support for blends, we should lower by decomposing into

14236

// a permute. That will be faster than the domain cross.

14237

if (IsBlendSupported)

14238

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,

14239

Subtarget, DAG);

14240

14241

// Try to lower by permuting the inputs into an unpack instruction.

14242

if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,

14243

Mask, Subtarget, DAG))

14244

return Unpack;

14245

}

14246

14247

// We implement this with SHUFPS because it can blend from two vectors.

14248

// Because we're going to eventually use SHUFPS, we use SHUFPS even to build

14249

// up the inputs, bypassing domain shift penalties that we would incur if we

14250

// directly used PSHUFD on Nehalem and older. For newer chips, this isn't

14251

// relevant.

14252

SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);

14253

SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);

14254

SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);

14255

return DAG.getBitcast(MVT::v4i32, ShufPS);

14256

}

14257

14258

/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2

14259

/// shuffle lowering, and the most complex part.

14260

///

14261

/// The lowering strategy is to try to form pairs of input lanes which are

14262

/// targeted at the same half of the final vector, and then use a dword shuffle

14263

/// to place them onto the right half, and finally unpack the paired lanes into

14264

/// their final position.

14265

///

14266

/// The exact breakdown of how to form these dword pairs and align them on the

14267

/// correct sides is really tricky. See the comments within the function for

14268

/// more of the details.

14269

///

14270

/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each

14271

/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to

14272

/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16

14273

/// vector, form the analogous 128-bit 8-element Mask.

14274

static SDValue lowerV8I16GeneralSingleInputShuffle(

14275

const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,

14276

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

14277

assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!")((VT.getVectorElementType() == MVT::i16 && "Bad input type!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorElementType() == MVT::i16 && \"Bad input type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14277, __PRETTY_FUNCTION__));

14278

MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

14279

14280

assert(Mask.size() == 8 && "Shuffle mask length doesn't match!")((Mask.size() == 8 && "Shuffle mask length doesn't match!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Shuffle mask length doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14280, __PRETTY_FUNCTION__));

14281

MutableArrayRef<int> LoMask = Mask.slice(0, 4);

14282

MutableArrayRef<int> HiMask = Mask.slice(4, 4);

14283

14284

// Attempt to directly match PSHUFLW or PSHUFHW.

14285

if (isUndefOrInRange(LoMask, 0, 4) &&

14286

isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {

14287

return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,

14288

getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

14289

}

14290

if (isUndefOrInRange(HiMask, 4, 8) &&

14291

isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {

14292

for (int i = 0; i != 4; ++i)

14293

HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));

14294

return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

14295

getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

14296

}

14297

14298

SmallVector<int, 4> LoInputs;

14299

copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });

14300

array_pod_sort(LoInputs.begin(), LoInputs.end());

14301

LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());

14302

SmallVector<int, 4> HiInputs;

14303

copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });

14304

array_pod_sort(HiInputs.begin(), HiInputs.end());

14305

HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());

14306

int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();

14307

int NumHToL = LoInputs.size() - NumLToL;

14308

int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();

14309

int NumHToH = HiInputs.size() - NumLToH;

14310

MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);

14311

MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);

14312

MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);

14313

MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

14314

14315

// If we are shuffling values from one half - check how many different DWORD

14316

// pairs we need to create. If only 1 or 2 then we can perform this as a

14317

// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.

14318

auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,

14319

ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {

14320

V = DAG.getNode(ShufWOp, DL, VT, V,

14321

getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

14322

V = DAG.getBitcast(PSHUFDVT, V);

14323

V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,

14324

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));

14325

return DAG.getBitcast(VT, V);

14326

};

14327

14328

if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {

14329

int PSHUFDMask[4] = { -1, -1, -1, -1 };

14330

SmallVector<std::pair<int, int>, 4> DWordPairs;

14331

int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

14332

14333

// Collect the different DWORD pairs.

14334

for (int DWord = 0; DWord != 4; ++DWord) {

14335

int M0 = Mask[2 * DWord + 0];

14336

int M1 = Mask[2 * DWord + 1];

14337

M0 = (M0 >= 0 ? M0 % 4 : M0);

14338

M1 = (M1 >= 0 ? M1 % 4 : M1);

14339

if (M0 < 0 && M1 < 0)

14340

continue;

14341

14342

bool Match = false;

14343

for (int j = 0, e = DWordPairs.size(); j < e; ++j) {

14344

auto &DWordPair = DWordPairs[j];

14345

if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&

14346

(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {

14347

DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);

14348

DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);

14349

PSHUFDMask[DWord] = DOffset + j;

14350

Match = true;

14351

break;

14352

}

14353

}

14354

if (!Match) {

14355

PSHUFDMask[DWord] = DOffset + DWordPairs.size();

14356

DWordPairs.push_back(std::make_pair(M0, M1));

14357

}

14358

}

14359

14360

if (DWordPairs.size() <= 2) {

14361

DWordPairs.resize(2, std::make_pair(-1, -1));

14362

int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,

14363

DWordPairs[1].first, DWordPairs[1].second};

14364

if ((NumHToL + NumHToH) == 0)

14365

return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);

14366

if ((NumLToL + NumLToH) == 0)

14367

return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);

14368

}

14369

}

14370

14371

// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all

14372

// such inputs we can swap two of the dwords across the half mark and end up

14373

// with <=2 inputs to each half from each half. Once there, we can fall through

14374

// to the generic code below. For example:

14375

//

14376

// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]

14377

// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]

14378

//

14379

// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half

14380

// and an existing 2-into-2 on the other half. In this case we may have to

14381

// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or

14382

// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.

14383

// Fortunately, we don't have to handle anything but a 2-into-2 pattern

14384

// because any other situation (including a 3-into-1 or 1-into-3 in the other

14385

// half than the one we target for fixing) will be fixed when we re-enter this

14386

// path. We will also combine away any sequence of PSHUFD instructions that

14387

// result into a single instruction. Here is an example of the tricky case:

14388

//

14389

// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]

14390

// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]

14391

//

14392

// This now has a 1-into-3 in the high half! Instead, we do two shuffles:

14393

//

14394

// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]

14395

// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]

14396

//

14397

// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]

14398

// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]

14399

//

14400

// The result is fine to be handled by the generic logic.

14401

auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,

14402

ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,

14403

int AOffset, int BOffset) {

14404

assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&(((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half."
) ? static_cast<void> (0) : __assert_fail ("(AToAInputs.size() == 3 || AToAInputs.size() == 1) && \"Must call this with A having 3 or 1 inputs from the A half.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14405, __PRETTY_FUNCTION__))

14405

"Must call this with A having 3 or 1 inputs from the A half.")(((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half."
) ? static_cast<void> (0) : __assert_fail ("(AToAInputs.size() == 3 || AToAInputs.size() == 1) && \"Must call this with A having 3 or 1 inputs from the A half.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14405, __PRETTY_FUNCTION__));

14406

assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&(((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half."
) ? static_cast<void> (0) : __assert_fail ("(BToAInputs.size() == 1 || BToAInputs.size() == 3) && \"Must call this with B having 1 or 3 inputs from the B half.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14407, __PRETTY_FUNCTION__))

14407

"Must call this with B having 1 or 3 inputs from the B half.")(((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half."
) ? static_cast<void> (0) : __assert_fail ("(BToAInputs.size() == 1 || BToAInputs.size() == 3) && \"Must call this with B having 1 or 3 inputs from the B half.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14407, __PRETTY_FUNCTION__));

14408

assert(AToAInputs.size() + BToAInputs.size() == 4 &&((AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."
) ? static_cast<void> (0) : __assert_fail ("AToAInputs.size() + BToAInputs.size() == 4 && \"Must call this with either 3:1 or 1:3 inputs (summing to 4).\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14409, __PRETTY_FUNCTION__))

14409

"Must call this with either 3:1 or 1:3 inputs (summing to 4).")((AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."
) ? static_cast<void> (0) : __assert_fail ("AToAInputs.size() + BToAInputs.size() == 4 && \"Must call this with either 3:1 or 1:3 inputs (summing to 4).\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14409, __PRETTY_FUNCTION__));

14410

14411

bool ThreeAInputs = AToAInputs.size() == 3;

14412

14413

// Compute the index of dword with only one word among the three inputs in

14414

// a half by taking the sum of the half with three inputs and subtracting

14415

// the sum of the actual three inputs. The difference is the remaining

14416

// slot.

14417

int ADWord = 0, BDWord = 0;

14418

int &TripleDWord = ThreeAInputs ? ADWord : BDWord;

14419

int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;

14420

int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;

14421

ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;

14422

int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];

14423

int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);

14424

int TripleNonInputIdx =

14425

TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);

14426

TripleDWord = TripleNonInputIdx / 2;

14427

14428

// We use xor with one to compute the adjacent DWord to whichever one the

14429

// OneInput is in.

14430

OneInputDWord = (OneInput / 2) ^ 1;

14431

14432

// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA

14433

// and BToA inputs. If there is also such a problem with the BToB and AToB

14434

// inputs, we don't try to fix it necessarily -- we'll recurse and see it in

14435

// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it

14436

// is essential that we don't *create* a 3<-1 as then we might oscillate.

14437

if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {

14438

// Compute how many inputs will be flipped by swapping these DWords. We

14439

// need

14440

// to balance this to ensure we don't form a 3-1 shuffle in the other

14441

// half.

14442

int NumFlippedAToBInputs =

14443

std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +

14444

std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);

14445

int NumFlippedBToBInputs =

14446

std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +

14447

std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);

14448

if ((NumFlippedAToBInputs == 1 &&

14449

(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||

14450

(NumFlippedBToBInputs == 1 &&

14451

(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {

14452

// We choose whether to fix the A half or B half based on whether that

14453

// half has zero flipped inputs. At zero, we may not be able to fix it

14454

// with that half. We also bias towards fixing the B half because that

14455

// will more commonly be the high half, and we have to bias one way.

14456

auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,

14457

ArrayRef<int> Inputs) {

14458

int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.

14459

bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);

14460

// Determine whether the free index is in the flipped dword or the

14461

// unflipped dword based on where the pinned index is. We use this bit

14462

// in an xor to conditionally select the adjacent dword.

14463

int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));

14464

bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);

14465

if (IsFixIdxInput == IsFixFreeIdxInput)

14466

FixFreeIdx += 1;

14467

IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);

14468

assert(IsFixIdxInput != IsFixFreeIdxInput &&((IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"
) ? static_cast<void> (0) : __assert_fail ("IsFixIdxInput != IsFixFreeIdxInput && \"We need to be changing the number of flipped inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14469, __PRETTY_FUNCTION__))

14469

"We need to be changing the number of flipped inputs!")((IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"
) ? static_cast<void> (0) : __assert_fail ("IsFixIdxInput != IsFixFreeIdxInput && \"We need to be changing the number of flipped inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14469, __PRETTY_FUNCTION__));

14470

int PSHUFHalfMask[] = {0, 1, 2, 3};

14471

std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);

14472

V = DAG.getNode(

14473

FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,

14474

MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,

14475

getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

14476

14477

for (int &M : Mask)

14478

if (M >= 0 && M == FixIdx)

14479

M = FixFreeIdx;

14480

else if (M >= 0 && M == FixFreeIdx)

14481

M = FixIdx;

14482

};

14483

if (NumFlippedBToBInputs != 0) {

14484

int BPinnedIdx =

14485

BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;

14486

FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);

14487

} else {

14488

assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!")((NumFlippedAToBInputs != 0 && "Impossible given predicates!"
) ? static_cast<void> (0) : __assert_fail ("NumFlippedAToBInputs != 0 && \"Impossible given predicates!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14488, __PRETTY_FUNCTION__));

14489

int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;

14490

FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);

14491

}

14492

}

14493

}

14494

14495

int PSHUFDMask[] = {0, 1, 2, 3};

14496

PSHUFDMask[ADWord] = BDWord;

14497

PSHUFDMask[BDWord] = ADWord;

14498

V = DAG.getBitcast(

14499

VT,

14500

DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),

14501

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

14502

14503

// Adjust the mask to match the new locations of A and B.

14504

for (int &M : Mask)

14505

if (M >= 0 && M/2 == ADWord)

14506

M = 2 * BDWord + M % 2;

14507

else if (M >= 0 && M/2 == BDWord)

14508

M = 2 * ADWord + M % 2;

14509

14510

// Recurse back into this routine to re-compute state now that this isn't

14511

// a 3 and 1 problem.

14512

return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);

14513

};

14514

if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))

14515

return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);

14516

if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))

14517

return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

14518

14519

// At this point there are at most two inputs to the low and high halves from

14520

// each half. That means the inputs can always be grouped into dwords and

14521

// those dwords can then be moved to the correct half with a dword shuffle.

14522

// We use at most one low and one high word shuffle to collect these paired

14523

// inputs into dwords, and finally a dword shuffle to place them.

14524

int PSHUFLMask[4] = {-1, -1, -1, -1};

14525

int PSHUFHMask[4] = {-1, -1, -1, -1};

14526

int PSHUFDMask[4] = {-1, -1, -1, -1};

14527

14528

// First fix the masks for all the inputs that are staying in their

14529

// original halves. This will then dictate the targets of the cross-half

14530

// shuffles.

14531

auto fixInPlaceInputs =

14532

[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,

14533

MutableArrayRef<int> SourceHalfMask,

14534

MutableArrayRef<int> HalfMask, int HalfOffset) {

14535

if (InPlaceInputs.empty())

14536

return;

14537

if (InPlaceInputs.size() == 1) {

14538

SourceHalfMask[InPlaceInputs[0] - HalfOffset] =

14539

InPlaceInputs[0] - HalfOffset;

14540

PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;

14541

return;

14542

}

14543

if (IncomingInputs.empty()) {

14544

// Just fix all of the in place inputs.

14545

for (int Input : InPlaceInputs) {

14546

SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;

14547

PSHUFDMask[Input / 2] = Input / 2;

14548

}

14549

return;

14550

}

14551

14552

assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!")((InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"
) ? static_cast<void> (0) : __assert_fail ("InPlaceInputs.size() == 2 && \"Cannot handle 3 or 4 inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14552, __PRETTY_FUNCTION__));

14553

SourceHalfMask[InPlaceInputs[0] - HalfOffset] =

14554

InPlaceInputs[0] - HalfOffset;

14555

// Put the second input next to the first so that they are packed into

14556

// a dword. We find the adjacent index by toggling the low bit.

14557

int AdjIndex = InPlaceInputs[0] ^ 1;

14558

SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;

14559

std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);

14560

PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;

14561

};

14562

fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);

14563

fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

14564

14565

// Now gather the cross-half inputs and place them into a free dword of

14566

// their target half.

14567

// FIXME: This operation could almost certainly be simplified dramatically to

14568

// look more like the 3-1 fixing operation.

14569

auto moveInputsToRightHalf = [&PSHUFDMask](

14570

MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,

14571

MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,

14572

MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,

14573

int DestOffset) {

14574

auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {

14575

return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;

14576

};

14577

auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,

14578

int Word) {

14579

int LowWord = Word & ~1;

14580

int HighWord = Word | 1;

14581

return isWordClobbered(SourceHalfMask, LowWord) ||

14582

isWordClobbered(SourceHalfMask, HighWord);

14583

};

14584

14585

if (IncomingInputs.empty())

14586

return;

14587

14588

if (ExistingInputs.empty()) {

14589

// Map any dwords with inputs from them into the right half.

14590

for (int Input : IncomingInputs) {

14591

// If the source half mask maps over the inputs, turn those into

14592

// swaps and use the swapped lane.

14593

if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {

14594

if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {

14595

SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =

14596

Input - SourceOffset;

14597

// We have to swap the uses in our half mask in one sweep.

14598

for (int &M : HalfMask)

14599

if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)

14600

M = Input;

14601

else if (M == Input)

14602

M = SourceHalfMask[Input - SourceOffset] + SourceOffset;

14603

} else {

14604

assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==((SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input
- SourceOffset && "Previous placement doesn't match!"
) ? static_cast<void> (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14606, __PRETTY_FUNCTION__))

14605

Input - SourceOffset &&((SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input
- SourceOffset && "Previous placement doesn't match!"
) ? static_cast<void> (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14606, __PRETTY_FUNCTION__))

14606

"Previous placement doesn't match!")((SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input
- SourceOffset && "Previous placement doesn't match!"
) ? static_cast<void> (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14606, __PRETTY_FUNCTION__));

14607

}

14608

// Note that this correctly re-maps both when we do a swap and when

14609

// we observe the other side of the swap above. We rely on that to

14610

// avoid swapping the members of the input list directly.

14611

Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;

14612

}

14613

14614

// Map the input's dword into the correct half.

14615

if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)

14616

PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;

14617

else

14618

assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==((PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input
/ 2 && "Previous placement doesn't match!") ? static_cast
<void> (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14620, __PRETTY_FUNCTION__))

14619

Input / 2 &&((PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input
/ 2 && "Previous placement doesn't match!") ? static_cast
<void> (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14620, __PRETTY_FUNCTION__))

14620

"Previous placement doesn't match!")((PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input
/ 2 && "Previous placement doesn't match!") ? static_cast
<void> (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14620, __PRETTY_FUNCTION__));

14621

}

14622

14623

// And just directly shift any other-half mask elements to be same-half

14624

// as we will have mirrored the dword containing the element into the

14625

// same position within that half.

14626

for (int &M : HalfMask)

14627

if (M >= SourceOffset && M < SourceOffset + 4) {

14628

M = M - SourceOffset + DestOffset;

14629

assert(M >= 0 && "This should never wrap below zero!")((M >= 0 && "This should never wrap below zero!") ?
static_cast<void> (0) : __assert_fail ("M >= 0 && \"This should never wrap below zero!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14629, __PRETTY_FUNCTION__));

14630

}

14631

return;

14632

}

14633

14634

// Ensure we have the input in a viable dword of its current half. This

14635

// is particularly tricky because the original position may be clobbered

14636

// by inputs being moved and *staying* in that half.

14637

if (IncomingInputs.size() == 1) {

14638

if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {

14639

int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +

14640

SourceOffset;

14641

SourceHalfMask[InputFixed - SourceOffset] =

14642

IncomingInputs[0] - SourceOffset;

14643

std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],

14644

InputFixed);

14645

IncomingInputs[0] = InputFixed;

14646

}

14647

} else if (IncomingInputs.size() == 2) {

14648

if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||

14649

isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {

14650

// We have two non-adjacent or clobbered inputs we need to extract from

14651

// the source half. To do this, we need to map them into some adjacent

14652

// dword slot in the source mask.

14653

int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,

14654

IncomingInputs[1] - SourceOffset};

14655

14656

// If there is a free slot in the source half mask adjacent to one of

14657

// the inputs, place the other input in it. We use (Index XOR 1) to

14658

// compute an adjacent index.

14659

if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&

14660

SourceHalfMask[InputsFixed[0] ^ 1] < 0) {

14661

SourceHalfMask[InputsFixed[0]] = InputsFixed[0];

14662

SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];

14663

InputsFixed[1] = InputsFixed[0] ^ 1;

14664

} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&

14665

SourceHalfMask[InputsFixed[1] ^ 1] < 0) {

14666

SourceHalfMask[InputsFixed[1]] = InputsFixed[1];

14667

SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];

14668

InputsFixed[0] = InputsFixed[1] ^ 1;

14669

} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&

14670

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {

14671

// The two inputs are in the same DWord but it is clobbered and the

14672

// adjacent DWord isn't used at all. Move both inputs to the free

14673

// slot.

14674

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];

14675

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];

14676

InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);

14677

InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;

14678

} else {

14679

// The only way we hit this point is if there is no clobbering

14680

// (because there are no off-half inputs to this half) and there is no

14681

// free slot adjacent to one of the inputs. In this case, we have to

14682

// swap an input with a non-input.

14683

for (int i = 0; i < 4; ++i)

14684

assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&(((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!") ? static_cast<void>
(0) : __assert_fail ("(SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && \"We can't handle any clobbers here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14685, __PRETTY_FUNCTION__))

14685

"We can't handle any clobbers here!")(((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!") ? static_cast<void>
(0) : __assert_fail ("(SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && \"We can't handle any clobbers here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14685, __PRETTY_FUNCTION__));

14686

assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&((InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"
) ? static_cast<void> (0) : __assert_fail ("InputsFixed[1] != (InputsFixed[0] ^ 1) && \"Cannot have adjacent inputs here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14687, __PRETTY_FUNCTION__))

14687

"Cannot have adjacent inputs here!")((InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"
) ? static_cast<void> (0) : __assert_fail ("InputsFixed[1] != (InputsFixed[0] ^ 1) && \"Cannot have adjacent inputs here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14687, __PRETTY_FUNCTION__));

14688

14689

SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];

14690

SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

14691

14692

// We also have to update the final source mask in this case because

14693

// it may need to undo the above swap.

14694

for (int &M : FinalSourceHalfMask)

14695

if (M == (InputsFixed[0] ^ 1) + SourceOffset)

14696

M = InputsFixed[1] + SourceOffset;

14697

else if (M == InputsFixed[1] + SourceOffset)

14698

M = (InputsFixed[0] ^ 1) + SourceOffset;

14699

14700

InputsFixed[1] = InputsFixed[0] ^ 1;

14701

}

14702

14703

// Point everything at the fixed inputs.

14704

for (int &M : HalfMask)

14705

if (M == IncomingInputs[0])

14706

M = InputsFixed[0] + SourceOffset;

14707

else if (M == IncomingInputs[1])

14708

M = InputsFixed[1] + SourceOffset;

14709

14710

IncomingInputs[0] = InputsFixed[0] + SourceOffset;

14711

IncomingInputs[1] = InputsFixed[1] + SourceOffset;

14712

}

14713

} else {

14714

llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14714);

14715

}

14716

14717

// Now hoist the DWord down to the right half.

14718

int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;

14719

assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free")((PSHUFDMask[FreeDWord] < 0 && "DWord not free") ?
static_cast<void> (0) : __assert_fail ("PSHUFDMask[FreeDWord] < 0 && \"DWord not free\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14719, __PRETTY_FUNCTION__));

14720

PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;

14721

for (int &M : HalfMask)

14722

for (int Input : IncomingInputs)

14723

if (M == Input)

14724

M = FreeDWord * 2 + Input % 2;

14725

};

14726

moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,

14727

/*SourceOffset*/ 4, /*DestOffset*/ 0);

14728

moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,

14729

/*SourceOffset*/ 0, /*DestOffset*/ 4);

14730

14731

// Now enact all the shuffles we've computed to move the inputs into their

14732

// target half.

14733

if (!isNoopShuffleMask(PSHUFLMask))

14734

V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,

14735

getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));

14736

if (!isNoopShuffleMask(PSHUFHMask))

14737

V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

14738

getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));

14739

if (!isNoopShuffleMask(PSHUFDMask))

14740

V = DAG.getBitcast(

14741

VT,

14742

DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),

14743

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

14744

14745

// At this point, each half should contain all its inputs, and we can then

14746

// just shuffle them into their final position.

14747

assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&((count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!") ?
static_cast<void> (0) : __assert_fail ("count_if(LoMask, [](int M) { return M >= 4; }) == 0 && \"Failed to lift all the high half inputs to the low mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14748, __PRETTY_FUNCTION__))

14748

"Failed to lift all the high half inputs to the low mask!")((count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!") ?
static_cast<void> (0) : __assert_fail ("count_if(LoMask, [](int M) { return M >= 4; }) == 0 && \"Failed to lift all the high half inputs to the low mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14748, __PRETTY_FUNCTION__));

14749

assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&((count_if(HiMask, [](int M) { return M >= 0 && M <
4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"
) ? static_cast<void> (0) : __assert_fail ("count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && \"Failed to lift all the low half inputs to the high mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14750, __PRETTY_FUNCTION__))

14750

"Failed to lift all the low half inputs to the high mask!")((count_if(HiMask, [](int M) { return M >= 0 && M <
4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"
) ? static_cast<void> (0) : __assert_fail ("count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && \"Failed to lift all the low half inputs to the high mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14750, __PRETTY_FUNCTION__));

14751

14752

// Do a half shuffle for the low mask.

14753

if (!isNoopShuffleMask(LoMask))

14754

V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,

14755

getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

14756

14757

// Do a half shuffle with the high mask after shifting its values down.

14758

for (int &M : HiMask)

14759

if (M >= 0)

14760

M -= 4;

14761

if (!isNoopShuffleMask(HiMask))

14762

V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

14763

getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

14764

14765

return V;

14766

}

14767

14768

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the

14769

/// blend if only one input is used.

14770

static SDValue lowerShuffleAsBlendOfPSHUFBs(

14771

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

14772

const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {

14773

assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&((!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"
) ? static_cast<void> (0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, Mask) && \"Lane crossing shuffle masks not supported\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14774, __PRETTY_FUNCTION__))

14774

"Lane crossing shuffle masks not supported")((!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"
) ? static_cast<void> (0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, Mask) && \"Lane crossing shuffle masks not supported\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14774, __PRETTY_FUNCTION__));

14775

14776

int NumBytes = VT.getSizeInBits() / 8;

14777

int Size = Mask.size();

14778

int Scale = NumBytes / Size;

14779

14780

SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));

14781

SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));

14782

V1InUse = false;

14783

V2InUse = false;

14784

14785

for (int i = 0; i < NumBytes; ++i) {

14786

int M = Mask[i / Scale];

14787

if (M < 0)

14788

continue;

14789

14790

const int ZeroMask = 0x80;

14791

int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;

14792

int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;

14793

if (Zeroable[i / Scale])

14794

V1Idx = V2Idx = ZeroMask;

14795

14796

V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);

14797

V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);

14798

V1InUse |= (ZeroMask != V1Idx);

14799

V2InUse |= (ZeroMask != V2Idx);

14800

}

14801

14802

MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);

14803

if (V1InUse)

14804

V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),

14805

DAG.getBuildVector(ShufVT, DL, V1Mask));

14806

if (V2InUse)

14807

V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),

14808

DAG.getBuildVector(ShufVT, DL, V2Mask));

14809

14810

// If we need shuffled inputs from both, blend the two.

14811

SDValue V;

14812

if (V1InUse && V2InUse)

14813

V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);

14814

else

14815

V = V1InUse ? V1 : V2;

14816

14817

// Cast the result back to the correct type.

14818

return DAG.getBitcast(VT, V);

14819

}

14820

14821

/// Generic lowering of 8-lane i16 shuffles.

14822

///

14823

/// This handles both single-input shuffles and combined shuffle/blends with

14824

/// two inputs. The single input shuffles are immediately delegated to

14825

/// a dedicated lowering routine.

14826

///

14827

/// The blends are lowered in one of three fundamental ways. If there are few

14828

/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle

14829

/// of the input is significantly cheaper when lowered as an interleaving of

14830

/// the two inputs, try to interleave them. Otherwise, blend the low and high

14831

/// halves of the inputs separately (making them have relatively few inputs)

14832

/// and then concatenate them.

14833

static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

14834

const APInt &Zeroable, SDValue V1, SDValue V2,

14835

const X86Subtarget &Subtarget,

14836

SelectionDAG &DAG) {

14837

assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v8i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14837, __PRETTY_FUNCTION__));

14838

assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v8i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14838, __PRETTY_FUNCTION__));

14839

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14839, __PRETTY_FUNCTION__));

14840

14841

// Whenever we can lower this as a zext, that instruction is strictly faster

14842

// than any alternative.

14843

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,

14844

Zeroable, Subtarget, DAG))

14845

return ZExt;

14846

14847

// Try to use lower using a truncation.

14848

if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,

14849

Subtarget, DAG))

14850

return V;

14851

14852

int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

14853

14854

if (NumV2Inputs == 0) {

14855

// Try to use shift instructions.

14856

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,

14857

Zeroable, Subtarget, DAG))

14858

return Shift;

14859

14860

// Check for being able to broadcast a single element.

14861

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,

14862

Mask, Subtarget, DAG))

14863

return Broadcast;

14864

14865

// Try to use bit rotation instructions.

14866

if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,

14867

Subtarget, DAG))

14868

return Rotate;

14869

14870

// Use dedicated unpack instructions for masks that match their pattern.

14871

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))

14872

return V;

14873

14874

// Use dedicated pack instructions for masks that match their pattern.

14875

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,

14876

Subtarget))

14877

return V;

14878

14879

// Try to use byte rotation instructions.

14880

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,

14881

Subtarget, DAG))

14882

return Rotate;

14883

14884

// Make a copy of the mask so it can be modified.

14885

SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());

14886

return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,

14887

Subtarget, DAG);

14888

}

14889

14890

assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&((llvm::any_of(Mask, [](int M) { return M >= 0 && M
< 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? static_cast<void> (0) : __assert_fail (
"llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14892, __PRETTY_FUNCTION__))

14891

"All single-input shuffles should be canonicalized to be V1-input "((llvm::any_of(Mask, [](int M) { return M >= 0 && M
< 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? static_cast<void> (0) : __assert_fail (
"llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14892, __PRETTY_FUNCTION__))

14892

"shuffles.")((llvm::any_of(Mask, [](int M) { return M >= 0 && M
< 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? static_cast<void> (0) : __assert_fail (
"llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14892, __PRETTY_FUNCTION__));

14893

14894

// Try to use shift instructions.

14895

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,

14896

Zeroable, Subtarget, DAG))

14897

return Shift;

14898

14899

// See if we can use SSE4A Extraction / Insertion.

14900

if (Subtarget.hasSSE4A())

14901

if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,

14902

Zeroable, DAG))

14903

return V;

14904

14905

// There are special ways we can lower some single-element blends.

14906

if (NumV2Inputs == 1)

14907

if (SDValue V = lowerShuffleAsElementInsertion(

14908

DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

14909

return V;

14910

14911

// We have different paths for blend lowering, but they all must use the

14912

// *exact* same predicate.

14913

bool IsBlendSupported = Subtarget.hasSSE41();

14914

if (IsBlendSupported)

14915

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,

14916

Zeroable, Subtarget, DAG))

14917

return Blend;

14918

14919

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,

14920

Zeroable, Subtarget, DAG))

14921

return Masked;

14922

14923

// Use dedicated unpack instructions for masks that match their pattern.

14924

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))

14925

return V;

14926

14927

// Use dedicated pack instructions for masks that match their pattern.

14928

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,

14929

Subtarget))

14930

return V;

14931

14932

// Try to use lower using a truncation.

14933

if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,

14934

Subtarget, DAG))

14935

return V;

14936

14937

// Try to use byte rotation instructions.

14938

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,

14939

Subtarget, DAG))

14940

return Rotate;

14941

14942

if (SDValue BitBlend =

14943

lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))

14944

return BitBlend;

14945

14946

// Try to use byte shift instructions to mask.

14947

if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,

14948

Zeroable, Subtarget, DAG))

14949

return V;

14950

14951

// Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.

14952

// We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to

14953

// be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.

14954

int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);

14955

if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&

14956

!Subtarget.hasVLX()) {

14957

SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));

14958

for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))

14959

DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);

14960

SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);

14961

V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),

14962

DWordClearMask);

14963

V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),

14964

DWordClearMask);

14965

// Now pack things back together.

14966

SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);

14967

if (NumEvenDrops == 2) {

14968

Result = DAG.getBitcast(MVT::v4i32, Result);

14969

Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);

14970

}

14971

return Result;

14972

}

14973

14974

// Try to lower by permuting the inputs into an unpack instruction.

14975

if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,

14976

Mask, Subtarget, DAG))

14977

return Unpack;

14978

14979

// If we can't directly blend but can use PSHUFB, that will be better as it

14980

// can both shuffle and set up the inefficient blend.

14981

if (!IsBlendSupported && Subtarget.hasSSSE3()) {

14982

bool V1InUse, V2InUse;

14983

return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,

14984

Zeroable, DAG, V1InUse, V2InUse);

14985

}

14986

14987

// We can always bit-blend if we have to so the fallback strategy is to

14988

// decompose into single-input permutes and blends/unpacks.

14989

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,

14990

Mask, Subtarget, DAG);

14991

}

14992

14993

// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,

14994

// sub-512-bit shuffles are padded to 512-bits for the shuffle and then

14995

// the active subvector is extracted.

14996

static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,

14997

ArrayRef<int> Mask, SDValue V1, SDValue V2,

14998

const X86Subtarget &Subtarget,

14999

SelectionDAG &DAG) {

15000

int NumElts = VT.getVectorNumElements();

15001

MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());

15002

MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, NumElts);

15003

15004

SDValue MaskNode;

15005

MVT ShuffleVT = VT;

15006

if (!VT.is512BitVector() && !Subtarget.hasVLX()) {

15007

V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);

15008

V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);

15009

ShuffleVT = V1.getSimpleValueType();

15010

15011

// Adjust mask to correct indices for the second input.

15012

unsigned Scale = 512 / VT.getSizeInBits();

15013

SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());

15014

for (int &M : AdjustedMask)

15015

if (NumElts <= M)

15016

M += (Scale - 1) * NumElts;

15017

MaskNode = getConstVector(AdjustedMask, MaskVecVT, DAG, DL, true);

15018

MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);

15019

} else {

15020

MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);

15021

}

15022

15023

SDValue Result;

15024

if (V2.isUndef())

15025

Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);

15026

else

15027

Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);

15028

15029

if (VT != ShuffleVT)

15030

Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());

15031

15032

return Result;

15033

}

15034

15035

/// Generic lowering of v16i8 shuffles.

15036

///

15037

/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to

15038

/// detect any complexity reducing interleaving. If that doesn't help, it uses

15039

/// UNPCK to spread the i8 elements across two i16-element vectors, and uses

15040

/// the existing lowering for v8i16 blends on each half, finally PACK-ing them

15041

/// back together.

15042

static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

15043

const APInt &Zeroable, SDValue V1, SDValue V2,

15044

const X86Subtarget &Subtarget,

15045

SelectionDAG &DAG) {

15046

assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v16i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15046, __PRETTY_FUNCTION__));

15047

assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v16i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15047, __PRETTY_FUNCTION__));

15048

assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 16 && \"Unexpected mask size for v16 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15048, __PRETTY_FUNCTION__));

15049

15050

// Try to use shift instructions.

15051

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,

15052

Zeroable, Subtarget, DAG))

15053

return Shift;

15054

15055

// Try to use byte rotation instructions.

15056

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,

15057

Subtarget, DAG))

15058

return Rotate;

15059

15060

// Use dedicated pack instructions for masks that match their pattern.

15061

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,

15062

Subtarget))

15063

return V;

15064

15065

// Try to use a zext lowering.

15066

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,

15067

Zeroable, Subtarget, DAG))

15068

return ZExt;

15069

15070

// Try to use lower using a truncation.

15071

if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,

15072

Subtarget, DAG))

15073

return V;

15074

15075

if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,

15076

Subtarget, DAG))

15077

return V;

15078

15079

// See if we can use SSE4A Extraction / Insertion.

15080

if (Subtarget.hasSSE4A())

15081

if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,

15082

Zeroable, DAG))

15083

return V;

15084

15085

int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

15086

15087

// For single-input shuffles, there are some nicer lowering tricks we can use.

15088

if (NumV2Elements == 0) {

15089

// Check for being able to broadcast a single element.

15090

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,

15091

Mask, Subtarget, DAG))

15092

return Broadcast;

15093

15094

// Try to use bit rotation instructions.

15095

if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,

15096

Subtarget, DAG))

15097

return Rotate;

15098

15099

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))

15100

return V;

15101

15102

// Check whether we can widen this to an i16 shuffle by duplicating bytes.

15103

// Notably, this handles splat and partial-splat shuffles more efficiently.

15104

// However, it only makes sense if the pre-duplication shuffle simplifies

15105

// things significantly. Currently, this means we need to be able to

15106

// express the pre-duplication shuffle as an i16 shuffle.

15107

//

15108

// FIXME: We should check for other patterns which can be widened into an

15109

// i16 shuffle as well.

15110

auto canWidenViaDuplication = [](ArrayRef<int> Mask) {

15111

for (int i = 0; i < 16; i += 2)

15112

if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])

15113

return false;

15114

15115

return true;

15116

};

15117

auto tryToWidenViaDuplication = [&]() -> SDValue {

15118

if (!canWidenViaDuplication(Mask))

15119

return SDValue();

15120

SmallVector<int, 4> LoInputs;

15121

copy_if(Mask, std::back_inserter(LoInputs),

15122

[](int M) { return M >= 0 && M < 8; });

15123

array_pod_sort(LoInputs.begin(), LoInputs.end());

15124

LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),

15125

LoInputs.end());

15126

SmallVector<int, 4> HiInputs;

15127

copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });

15128

array_pod_sort(HiInputs.begin(), HiInputs.end());

15129

HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),

15130

HiInputs.end());

15131

15132

bool TargetLo = LoInputs.size() >= HiInputs.size();

15133

ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;

15134

ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

15135

15136

int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};

15137

SmallDenseMap<int, int, 8> LaneMap;

15138

for (int I : InPlaceInputs) {

15139

PreDupI16Shuffle[I/2] = I/2;

15140

LaneMap[I] = I;

15141

}

15142

int j = TargetLo ? 0 : 4, je = j + 4;

15143

for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {

15144

// Check if j is already a shuffle of this input. This happens when

15145

// there are two adjacent bytes after we move the low one.

15146

if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {

15147

// If we haven't yet mapped the input, search for a slot into which

15148

// we can map it.

15149

while (j < je && PreDupI16Shuffle[j] >= 0)

15150

++j;

15151

15152

if (j == je)

15153

// We can't place the inputs into a single half with a simple i16 shuffle, so bail.

15154

return SDValue();

15155

15156

// Map this input with the i16 shuffle.

15157

PreDupI16Shuffle[j] = MovingInputs[i] / 2;

15158

}

15159

15160

// Update the lane map based on the mapping we ended up with.

15161

LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;

15162

}

15163

V1 = DAG.getBitcast(

15164

MVT::v16i8,

15165

DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),

15166

DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

15167

15168

// Unpack the bytes to form the i16s that will be shuffled into place.

15169

bool EvenInUse = false, OddInUse = false;

15170

for (int i = 0; i < 16; i += 2) {

15171

EvenInUse |= (Mask[i + 0] >= 0);

15172

OddInUse |= (Mask[i + 1] >= 0);

15173

if (EvenInUse && OddInUse)

15174

break;

15175

}

15176

V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,

15177

MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),

15178

OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));

15179

15180

int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};

15181

for (int i = 0; i < 16; ++i)

15182

if (Mask[i] >= 0) {

15183

int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);

15184

assert(MappedMask < 8 && "Invalid v8 shuffle mask!")((MappedMask < 8 && "Invalid v8 shuffle mask!") ? static_cast
<void> (0) : __assert_fail ("MappedMask < 8 && \"Invalid v8 shuffle mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15184, __PRETTY_FUNCTION__));

15185

if (PostDupI16Shuffle[i / 2] < 0)

15186

PostDupI16Shuffle[i / 2] = MappedMask;

15187

else

15188

assert(PostDupI16Shuffle[i / 2] == MappedMask &&((PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"
) ? static_cast<void> (0) : __assert_fail ("PostDupI16Shuffle[i / 2] == MappedMask && \"Conflicting entries in the original shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15189, __PRETTY_FUNCTION__))

15189

"Conflicting entries in the original shuffle!")((PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"
) ? static_cast<void> (0) : __assert_fail ("PostDupI16Shuffle[i / 2] == MappedMask && \"Conflicting entries in the original shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15189, __PRETTY_FUNCTION__));

15190

}

15191

return DAG.getBitcast(

15192

MVT::v16i8,

15193

DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),

15194

DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));

15195

};

15196

if (SDValue V = tryToWidenViaDuplication())

15197

return V;

15198

}

15199

15200

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,

15201

Zeroable, Subtarget, DAG))

15202

return Masked;

15203

15204

// Use dedicated unpack instructions for masks that match their pattern.

15205

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))

15206

return V;

15207

15208

// Try to use byte shift instructions to mask.

15209

if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,

15210

Zeroable, Subtarget, DAG))

15211

return V;

15212

15213

// Check for compaction patterns.

15214

bool IsSingleInput = V2.isUndef();

15215

int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);

15216

15217

// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly

15218

// with PSHUFB. It is important to do this before we attempt to generate any

15219

// blends but after all of the single-input lowerings. If the single input

15220

// lowerings can find an instruction sequence that is faster than a PSHUFB, we

15221

// want to preserve that and we can DAG combine any longer sequences into

15222

// a PSHUFB in the end. But once we start blending from multiple inputs,

15223

// the complexity of DAG combining bad patterns back into PSHUFB is too high,

15224

// and there are *very* few patterns that would actually be faster than the

15225

// PSHUFB approach because of its ability to zero lanes.

15226

//

15227

// If the mask is a binary compaction, we can more efficiently perform this

15228

// as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).

15229

//

15230

// FIXME: The only exceptions to the above are blends which are exact

15231

// interleavings with direct instructions supporting them. We currently don't

15232

// handle those well here.

15233

if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {

15234

bool V1InUse = false;

15235

bool V2InUse = false;

15236

15237

SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(

15238

DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

15239

15240

// If both V1 and V2 are in use and we can use a direct blend or an unpack,

15241

// do so. This avoids using them to handle blends-with-zero which is

15242

// important as a single pshufb is significantly faster for that.

15243

if (V1InUse && V2InUse) {

15244

if (Subtarget.hasSSE41())

15245

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,

15246

Zeroable, Subtarget, DAG))

15247

return Blend;

15248

15249

// We can use an unpack to do the blending rather than an or in some

15250

// cases. Even though the or may be (very minorly) more efficient, we

15251

// preference this lowering because there are common cases where part of

15252

// the complexity of the shuffles goes away when we do the final blend as

15253

// an unpack.

15254

// FIXME: It might be worth trying to detect if the unpack-feeding

15255

// shuffles will both be pshufb, in which case we shouldn't bother with

15256

// this.

15257

if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(

15258

DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

15259

return Unpack;

15260

15261

// AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).

15262

if (Subtarget.hasVBMI())

15263

return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,

15264

DAG);

15265

15266

// If we have XOP we can use one VPPERM instead of multiple PSHUFBs.

15267

if (Subtarget.hasXOP()) {

15268

SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);

15269

return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);

15270

}

15271

15272

// Use PALIGNR+Permute if possible - permute might become PSHUFB but the

15273

// PALIGNR will be cheaper than the second PSHUFB+OR.

15274

if (SDValue V = lowerShuffleAsByteRotateAndPermute(

15275

DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

15276

return V;

15277

}

15278

15279

return PSHUFB;

15280

}

15281

15282

// There are special ways we can lower some single-element blends.

15283

if (NumV2Elements == 1)

15284

if (SDValue V = lowerShuffleAsElementInsertion(

15285

DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

15286

return V;

15287

15288

if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))

15289

return Blend;

15290

15291

// Check whether a compaction lowering can be done. This handles shuffles

15292

// which take every Nth element for some even N. See the helper function for

15293

// details.

15294

//

15295

// We special case these as they can be particularly efficiently handled with

15296

// the PACKUSB instruction on x86 and they show up in common patterns of

15297

// rearranging bytes to truncate wide elements.

15298

if (NumEvenDrops) {

15299

// NumEvenDrops is the power of two stride of the elements. Another way of

15300

// thinking about it is that we need to drop the even elements this many

15301

// times to get the original input.

15302

15303

// First we need to zero all the dropped bytes.

15304

assert(NumEvenDrops <= 3 &&((NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."
) ? static_cast<void> (0) : __assert_fail ("NumEvenDrops <= 3 && \"No support for dropping even elements more than 3 times.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15305, __PRETTY_FUNCTION__))

15305

"No support for dropping even elements more than 3 times.")((NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."
) ? static_cast<void> (0) : __assert_fail ("NumEvenDrops <= 3 && \"No support for dropping even elements more than 3 times.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15305, __PRETTY_FUNCTION__));

15306

SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));

15307

for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))

15308

WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);

15309

SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);

15310

V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),

15311

WordClearMask);

15312

if (!IsSingleInput)

15313

V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),

15314

WordClearMask);

15315

15316

// Now pack things back together.

15317

SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,

15318

IsSingleInput ? V1 : V2);

15319

for (int i = 1; i < NumEvenDrops; ++i) {

15320

Result = DAG.getBitcast(MVT::v8i16, Result);

15321

Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);

15322

}

15323

return Result;

15324

}

15325

15326

// Handle multi-input cases by blending/unpacking single-input shuffles.

15327

if (NumV2Elements > 0)

15328

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,

15329

Subtarget, DAG);

15330

15331

// The fallback path for single-input shuffles widens this into two v8i16

15332

// vectors with unpacks, shuffles those, and then pulls them back together

15333

// with a pack.

15334

SDValue V = V1;

15335

15336

std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};

15337

std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};

15338

for (int i = 0; i < 16; ++i)

15339

if (Mask[i] >= 0)

15340

(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

15341

15342

SDValue VLoHalf, VHiHalf;

15343

// Check if any of the odd lanes in the v16i8 are used. If not, we can mask

15344

// them out and avoid using UNPCK{L,H} to extract the elements of V as

15345

// i16s.

15346

if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&

15347

none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {

15348

// Use a mask to drop the high bytes.

15349

VLoHalf = DAG.getBitcast(MVT::v8i16, V);

15350

VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,

15351

DAG.getConstant(0x00FF, DL, MVT::v8i16));

15352

15353

// This will be a single vector shuffle instead of a blend so nuke VHiHalf.

15354

VHiHalf = DAG.getUNDEF(MVT::v8i16);

15355

15356

// Squash the masks to point directly into VLoHalf.

15357

for (int &M : LoBlendMask)

15358

if (M >= 0)

15359

M /= 2;

15360

for (int &M : HiBlendMask)

15361

if (M >= 0)

15362

M /= 2;

15363

} else {

15364

// Otherwise just unpack the low half of V into VLoHalf and the high half into

15365

// VHiHalf so that we can blend them as i16s.

15366

SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

15367

15368

VLoHalf = DAG.getBitcast(

15369

MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));

15370

VHiHalf = DAG.getBitcast(

15371

MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));

15372

}

15373

15374

SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);

15375

SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

15376

15377

return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);

15378

}

15379

15380

/// Dispatching routine to lower various 128-bit x86 vector shuffles.

15381

///

15382

/// This routine breaks down the specific type of 128-bit shuffle and

15383

/// dispatches to the lowering routines accordingly.

15384

static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,

15385

MVT VT, SDValue V1, SDValue V2,

15386

const APInt &Zeroable,

15387

const X86Subtarget &Subtarget,

15388

SelectionDAG &DAG) {

15389

switch (VT.SimpleTy) {

15390

case MVT::v2i64:

15391

return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15392

case MVT::v2f64:

15393

return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15394

case MVT::v4i32:

15395

return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15396

case MVT::v4f32:

15397

return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15398

case MVT::v8i16:

15399

return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15400

case MVT::v16i8:

15401

return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

15402

15403

default:

15404

llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15404);

15405

}

15406

}

15407

15408

/// Generic routine to split vector shuffle into half-sized shuffles.

15409

///

15410

/// This routine just extracts two subvectors, shuffles them independently, and

15411

/// then concatenates them back together. This should work effectively with all

15412

/// AVX vector shuffle types.

15413

static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,

15414

SDValue V2, ArrayRef<int> Mask,

15415

SelectionDAG &DAG) {

15416

assert(VT.getSizeInBits() >= 256 &&((VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() >= 256 && \"Only for 256-bit or wider vector shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15417, __PRETTY_FUNCTION__))

15417

"Only for 256-bit or wider vector shuffles!")((VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() >= 256 && \"Only for 256-bit or wider vector shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15417, __PRETTY_FUNCTION__));

15418

assert(V1.getSimpleValueType() == VT && "Bad operand type!")((V1.getSimpleValueType() == VT && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == VT && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15418, __PRETTY_FUNCTION__));

15419

assert(V2.getSimpleValueType() == VT && "Bad operand type!")((V2.getSimpleValueType() == VT && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == VT && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15419, __PRETTY_FUNCTION__));

15420

15421

ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);

15422

ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

15423

15424

int NumElements = VT.getVectorNumElements();

15425

int SplitNumElements = NumElements / 2;

15426

MVT ScalarVT = VT.getVectorElementType();

15427

MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

15428

15429

// Use splitVector/extractSubVector so that split build-vectors just build two

15430

// narrower build vectors. This helps shuffling with splats and zeros.

15431

auto SplitVector = [&](SDValue V) {

15432

SDValue LoV, HiV;

15433

std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);

15434

return std::make_pair(DAG.getBitcast(SplitVT, LoV),

15435

DAG.getBitcast(SplitVT, HiV));

15436

};

15437

15438

SDValue LoV1, HiV1, LoV2, HiV2;

15439

std::tie(LoV1, HiV1) = SplitVector(V1);

15440

std::tie(LoV2, HiV2) = SplitVector(V2);

15441

15442

// Now create two 4-way blends of these half-width vectors.

15443

auto HalfBlend = [&](ArrayRef<int> HalfMask) {

15444

bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;

15445

SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);

15446

SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);

15447

SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);

15448

for (int i = 0; i < SplitNumElements; ++i) {

15449

int M = HalfMask[i];

15450

if (M >= NumElements) {

15451

if (M >= NumElements + SplitNumElements)

15452

UseHiV2 = true;

15453

else

15454

UseLoV2 = true;

15455

V2BlendMask[i] = M - NumElements;

15456

BlendMask[i] = SplitNumElements + i;

15457

} else if (M >= 0) {

15458

if (M >= SplitNumElements)

15459

UseHiV1 = true;

15460

else

15461

UseLoV1 = true;

15462

V1BlendMask[i] = M;

15463

BlendMask[i] = i;

15464

}

15465

}

15466

15467

// Because the lowering happens after all combining takes place, we need to

15468

// manually combine these blend masks as much as possible so that we create

15469

// a minimal number of high-level vector shuffle nodes.

15470

15471

// First try just blending the halves of V1 or V2.

15472

if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)

15473

return DAG.getUNDEF(SplitVT);

15474

if (!UseLoV2 && !UseHiV2)

15475

return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);

15476

if (!UseLoV1 && !UseHiV1)

15477

return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

15478

15479

SDValue V1Blend, V2Blend;

15480

if (UseLoV1 && UseHiV1) {

15481

V1Blend =

15482

DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);

15483

} else {

15484

// We only use half of V1 so map the usage down into the final blend mask.

15485

V1Blend = UseLoV1 ? LoV1 : HiV1;

15486

for (int i = 0; i < SplitNumElements; ++i)

15487

if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)

15488

BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);

15489

}

15490

if (UseLoV2 && UseHiV2) {

15491

V2Blend =

15492

DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

15493

} else {

15494

// We only use half of V2 so map the usage down into the final blend mask.

15495

V2Blend = UseLoV2 ? LoV2 : HiV2;

15496

for (int i = 0; i < SplitNumElements; ++i)

15497

if (BlendMask[i] >= SplitNumElements)

15498

BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);

15499

}

15500

return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);

15501

};

15502

SDValue Lo = HalfBlend(LoMask);

15503

SDValue Hi = HalfBlend(HiMask);

15504

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

15505

}

15506

15507

/// Either split a vector in halves or decompose the shuffles and the

15508

/// blend/unpack.

15509

///

15510

/// This is provided as a good fallback for many lowerings of non-single-input

15511

/// shuffles with more than one 128-bit lane. In those cases, we want to select

15512

/// between splitting the shuffle into 128-bit components and stitching those

15513

/// back together vs. extracting the single-input shuffles and blending those

15514

/// results.

15515

static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,

15516

SDValue V2, ArrayRef<int> Mask,

15517

const X86Subtarget &Subtarget,

15518

SelectionDAG &DAG) {

15519

assert(!V2.isUndef() && "This routine must not be used to lower single-input "((!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? static_cast
<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15520, __PRETTY_FUNCTION__))

15520

"shuffles as it could then recurse on itself.")((!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? static_cast
<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15520, __PRETTY_FUNCTION__));

15521

int Size = Mask.size();

15522

15523

// If this can be modeled as a broadcast of two elements followed by a blend,

15524

// prefer that lowering. This is especially important because broadcasts can

15525

// often fold with memory operands.

15526

auto DoBothBroadcast = [&] {

15527

int V1BroadcastIdx = -1, V2BroadcastIdx = -1;

15528

for (int M : Mask)

15529

if (M >= Size) {

15530

if (V2BroadcastIdx < 0)

15531

V2BroadcastIdx = M - Size;

15532

else if (M - Size != V2BroadcastIdx)

15533

return false;

15534

} else if (M >= 0) {

15535

if (V1BroadcastIdx < 0)

15536

V1BroadcastIdx = M;

15537

else if (M != V1BroadcastIdx)

15538

return false;

15539

}

15540

return true;

15541

};

15542

if (DoBothBroadcast())

15543

return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,

15544

DAG);

15545

15546

// If the inputs all stem from a single 128-bit lane of each input, then we

15547

// split them rather than blending because the split will decompose to

15548

// unusually few instructions.

15549

int LaneCount = VT.getSizeInBits() / 128;

15550

int LaneSize = Size / LaneCount;

15551

SmallBitVector LaneInputs[2];

15552

LaneInputs[0].resize(LaneCount, false);

15553

LaneInputs[1].resize(LaneCount, false);

15554

for (int i = 0; i < Size; ++i)

15555

if (Mask[i] >= 0)

15556

LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;

15557

if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)

15558

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

15559

15560

// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This

15561

// requires that the decomposed single-input shuffles don't end up here.

15562

return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,

15563

DAG);

15564

}

15565

15566

// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).

15567

// TODO: Extend to support v8f32 (+ 512-bit shuffles).

15568

static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,

15569

SDValue V1, SDValue V2,

15570

ArrayRef<int> Mask,

15571

SelectionDAG &DAG) {

15572

assert(VT == MVT::v4f64 && "Only for v4f64 shuffles")((VT == MVT::v4f64 && "Only for v4f64 shuffles") ? static_cast
<void> (0) : __assert_fail ("VT == MVT::v4f64 && \"Only for v4f64 shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15572, __PRETTY_FUNCTION__));

15573

15574

int LHSMask[4] = {-1, -1, -1, -1};

15575

int RHSMask[4] = {-1, -1, -1, -1};

15576

unsigned SHUFPMask = 0;

15577

15578

// As SHUFPD uses a single LHS/RHS element per lane, we can always

15579

// perform the shuffle once the lanes have been shuffled in place.

15580

for (int i = 0; i != 4; ++i) {

15581

int M = Mask[i];

15582

if (M < 0)

15583

continue;

15584

int LaneBase = i & ~1;

15585

auto &LaneMask = (i & 1) ? RHSMask : LHSMask;

15586

LaneMask[LaneBase + (M & 1)] = M;

15587

SHUFPMask |= (M & 1) << i;

15588

}

15589

15590

SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);

15591

SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);

15592

return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,

15593

DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));

15594

}

15595

15596

/// Lower a vector shuffle crossing multiple 128-bit lanes as

15597

/// a lane permutation followed by a per-lane permutation.

15598

///

15599

/// This is mainly for cases where we can have non-repeating permutes

15600

/// in each lane.

15601

///

15602

/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,

15603

/// we should investigate merging them.

15604

static SDValue lowerShuffleAsLanePermuteAndPermute(

15605

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

15606

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

15607

int NumElts = VT.getVectorNumElements();

15608

int NumLanes = VT.getSizeInBits() / 128;

15609

int NumEltsPerLane = NumElts / NumLanes;

15610

bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();

15611

15612

/// Attempts to find a sublane permute with the given size

15613

/// that gets all elements into their target lanes.

15614

///

15615

/// If successful, fills CrossLaneMask and InLaneMask and returns true.

15616

/// If unsuccessful, returns false and may overwrite InLaneMask.

15617

auto getSublanePermute = [&](int NumSublanes) -> SDValue {

15618

int NumSublanesPerLane = NumSublanes / NumLanes;

15619

int NumEltsPerSublane = NumElts / NumSublanes;

15620

15621

SmallVector<int, 16> CrossLaneMask;

15622

SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);

15623

// CrossLaneMask but one entry == one sublane.

15624

SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);

15625

15626

for (int i = 0; i != NumElts; ++i) {

15627

int M = Mask[i];

15628

if (M < 0)

15629

continue;

15630

15631

int SrcSublane = M / NumEltsPerSublane;

15632

int DstLane = i / NumEltsPerLane;

15633

15634

// We only need to get the elements into the right lane, not sublane.

15635

// So search all sublanes that make up the destination lane.

15636

bool Found = false;

15637

int DstSubStart = DstLane * NumSublanesPerLane;

15638

int DstSubEnd = DstSubStart + NumSublanesPerLane;

15639

for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {

15640

if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))

15641

continue;

15642

15643

Found = true;

15644

CrossLaneMaskLarge[DstSublane] = SrcSublane;

15645

int DstSublaneOffset = DstSublane * NumEltsPerSublane;

15646

InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;

15647

break;

15648

}

15649

if (!Found)

15650

return SDValue();

15651

}

15652

15653

// Fill CrossLaneMask using CrossLaneMaskLarge.

15654

narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);

15655

15656

if (!CanUseSublanes) {

15657

// If we're only shuffling a single lowest lane and the rest are identity

15658

// then don't bother.

15659

// TODO - isShuffleMaskInputInPlace could be extended to something like

15660

// this.

15661

int NumIdentityLanes = 0;

15662

bool OnlyShuffleLowestLane = true;

15663

for (int i = 0; i != NumLanes; ++i) {

15664

int LaneOffset = i * NumEltsPerLane;

15665

if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,

15666

i * NumEltsPerLane))

15667

NumIdentityLanes++;

15668

else if (CrossLaneMask[LaneOffset] != 0)

15669

OnlyShuffleLowestLane = false;

15670

}

15671

if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))

15672

return SDValue();

15673

}

15674

15675

SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);

15676

return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),

15677

InLaneMask);

15678

};

15679

15680

// First attempt a solution with full lanes.

15681

if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))

15682

return V;

15683

15684

// The rest of the solutions use sublanes.

15685

if (!CanUseSublanes)

15686

return SDValue();

15687

15688

// Then attempt a solution with 64-bit sublanes (vpermq).

15689

if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))

15690

return V;

15691

15692

// If that doesn't work and we have fast variable shuffle,

15693

// attempt 32-bit sublanes (vpermd).

15694

if (!Subtarget.hasFastVariableShuffle())

15695

return SDValue();

15696

15697

return getSublanePermute(/*NumSublanes=*/NumLanes * 4);

15698

}

15699

15700

/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one

15701

/// source with a lane permutation.

15702

///

15703

/// This lowering strategy results in four instructions in the worst case for a

15704

/// single-input cross lane shuffle which is lower than any other fully general

15705

/// cross-lane shuffle strategy I'm aware of. Special cases for each particular

15706

/// shuffle pattern should be handled prior to trying this lowering.

15707

static SDValue lowerShuffleAsLanePermuteAndShuffle(

15708

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

15709

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

15710

// FIXME: This should probably be generalized for 512-bit vectors as well.

15711

assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!")((VT.is256BitVector() && "Only for 256-bit vector shuffles!"
) ? static_cast<void> (0) : __assert_fail ("VT.is256BitVector() && \"Only for 256-bit vector shuffles!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15711, __PRETTY_FUNCTION__));

15712

int Size = Mask.size();

15713

int LaneSize = Size / 2;

15714

15715

// Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).

15716

// Only do this if the elements aren't all from the lower lane,

15717

// otherwise we're (probably) better off doing a split.

15718

if (VT == MVT::v4f64 &&

15719

!all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))

15720

if (SDValue V =

15721

lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))

15722

return V;

15723

15724

// If there are only inputs from one 128-bit lane, splitting will in fact be

15725

// less expensive. The flags track whether the given lane contains an element

15726

// that crosses to another lane.

15727

if (!Subtarget.hasAVX2()) {

15728

bool LaneCrossing[2] = {false, false};

15729

for (int i = 0; i < Size; ++i)

15730

if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))

15731

LaneCrossing[(Mask[i] % Size) / LaneSize] = true;

15732

if (!LaneCrossing[0] || !LaneCrossing[1])

15733

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

15734

} else {

15735

bool LaneUsed[2] = {false, false};

15736

for (int i = 0; i < Size; ++i)

15737

if (Mask[i] >= 0)

15738

LaneUsed[(Mask[i] % Size) / LaneSize] = true;

15739

if (!LaneUsed[0] || !LaneUsed[1])

15740

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

15741

}

15742

15743

// TODO - we could support shuffling V2 in the Flipped input.

15744

assert(V2.isUndef() &&((V2.isUndef() && "This last part of this routine only works on single input shuffles"
) ? static_cast<void> (0) : __assert_fail ("V2.isUndef() && \"This last part of this routine only works on single input shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15745, __PRETTY_FUNCTION__))

15745

"This last part of this routine only works on single input shuffles")((V2.isUndef() && "This last part of this routine only works on single input shuffles"
) ? static_cast<void> (0) : __assert_fail ("V2.isUndef() && \"This last part of this routine only works on single input shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15745, __PRETTY_FUNCTION__));

15746

15747

SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());

15748

for (int i = 0; i < Size; ++i) {

15749

int &M = InLaneMask[i];

15750

if (M < 0)

15751

continue;

15752

if (((M % Size) / LaneSize) != (i / LaneSize))

15753

M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;

15754

}

15755

assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&((!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected") ? static_cast<void> (
0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && \"In-lane shuffle mask expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15756, __PRETTY_FUNCTION__))

15756

"In-lane shuffle mask expected")((!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected") ? static_cast<void> (
0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && \"In-lane shuffle mask expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15756, __PRETTY_FUNCTION__));

15757

15758

// Flip the lanes, and shuffle the results which should now be in-lane.

15759

MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;

15760

SDValue Flipped = DAG.getBitcast(PVT, V1);

15761

Flipped =

15762

DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});

15763

Flipped = DAG.getBitcast(VT, Flipped);

15764

return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);

15765

}

15766

15767

/// Handle lowering 2-lane 128-bit shuffles.

15768

static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,

15769

SDValue V2, ArrayRef<int> Mask,

15770

const APInt &Zeroable,

15771

const X86Subtarget &Subtarget,

15772

SelectionDAG &DAG) {

15773

// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.

15774

if (Subtarget.hasAVX2() && V2.isUndef())

15775

return SDValue();

15776

15777

bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

15778

15779

SmallVector<int, 4> WidenedMask;

15780

if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))

15781

return SDValue();

15782

15783

bool IsLowZero = (Zeroable & 0x3) == 0x3;

15784

bool IsHighZero = (Zeroable & 0xc) == 0xc;

15785

15786

// Try to use an insert into a zero vector.

15787

if (WidenedMask[0] == 0 && IsHighZero) {

15788

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);

15789

SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,

15790

DAG.getIntPtrConstant(0, DL));

15791

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,

15792

getZeroVector(VT, Subtarget, DAG, DL), LoV,

15793

DAG.getIntPtrConstant(0, DL));

15794

}

15795

15796

// TODO: If minimizing size and one of the inputs is a zero vector and the

15797

// the zero vector has only one use, we could use a VPERM2X128 to save the

15798

// instruction bytes needed to explicitly generate the zero vector.

15799

15800

// Blends are faster and handle all the non-lane-crossing cases.

15801

if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,

15802

Subtarget, DAG))

15803

return Blend;

15804

15805

// If either input operand is a zero vector, use VPERM2X128 because its mask

15806

// allows us to replace the zero input with an implicit zero.

15807

if (!IsLowZero && !IsHighZero) {

15808

// Check for patterns which can be matched with a single insert of a 128-bit

15809

// subvector.

15810

bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});

15811

if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

15812

15813

// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,

15814

// this will likely become vinsertf128 which can't fold a 256-bit memop.

15815

if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {

15816

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);

15817

SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,

15818

OnlyUsesV1 ? V1 : V2,

15819

DAG.getIntPtrConstant(0, DL));

15820

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,

15821

DAG.getIntPtrConstant(2, DL));

15822

}

15823

}

15824

15825

// Try to use SHUF128 if possible.

15826

if (Subtarget.hasVLX()) {

15827

if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {

15828

unsigned PermMask = ((WidenedMask[0] % 2) << 0) |

15829

((WidenedMask[1] % 2) << 1);

15830

return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,

15831

DAG.getTargetConstant(PermMask, DL, MVT::i8));

15832

}

15833

}

15834

}

15835

15836

// Otherwise form a 128-bit permutation. After accounting for undefs,

15837

// convert the 64-bit shuffle mask selection values into 128-bit

15838

// selection bits by dividing the indexes by 2 and shifting into positions

15839

// defined by a vperm2*128 instruction's immediate control byte.

15840

15841

// The immediate permute control byte looks like this:

15842

// [1:0] - select 128 bits from sources for low half of destination

15843

// [2] - ignore

15844

// [3] - zero low half of destination

15845

// [5:4] - select 128 bits from sources for high half of destination

15846

// [6] - ignore

15847

// [7] - zero high half of destination

15848

15849

assert((WidenedMask[0] >= 0 || IsLowZero) &&(((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask
[1] >= 0 || IsHighZero) && "Undef half?") ? static_cast
<void> (0) : __assert_fail ("(WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && \"Undef half?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15850, __PRETTY_FUNCTION__))

15850

(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?")(((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask
[1] >= 0 || IsHighZero) && "Undef half?") ? static_cast
<void> (0) : __assert_fail ("(WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && \"Undef half?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15850, __PRETTY_FUNCTION__));

15851

15852

unsigned PermMask = 0;

15853

PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);

15854

PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

15855

15856

// Check the immediate mask and replace unused sources with undef.

15857

if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)

15858

V1 = DAG.getUNDEF(VT);

15859

if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)

15860

V2 = DAG.getUNDEF(VT);

15861

15862

return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,

15863

DAG.getTargetConstant(PermMask, DL, MVT::i8));

15864

}

15865

15866

/// Lower a vector shuffle by first fixing the 128-bit lanes and then

15867

/// shuffling each lane.

15868

///

15869

/// This attempts to create a repeated lane shuffle where each lane uses one

15870

/// or two of the lanes of the inputs. The lanes of the input vectors are

15871

/// shuffled in one or two independent shuffles to get the lanes into the

15872

/// position needed by the final shuffle.

15873

static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(

15874

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

15875

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

15876

assert(!V2.isUndef() && "This is only useful with multiple inputs.")((!V2.isUndef() && "This is only useful with multiple inputs."
) ? static_cast<void> (0) : __assert_fail ("!V2.isUndef() && \"This is only useful with multiple inputs.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15876, __PRETTY_FUNCTION__));

15877

15878

if (is128BitLaneRepeatedShuffleMask(VT, Mask))

15879

return SDValue();

15880

15881

int NumElts = Mask.size();

15882

int NumLanes = VT.getSizeInBits() / 128;

15883

int NumLaneElts = 128 / VT.getScalarSizeInBits();

15884

SmallVector<int, 16> RepeatMask(NumLaneElts, -1);

15885

SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

15886

15887

// First pass will try to fill in the RepeatMask from lanes that need two

15888

// sources.

15889

for (int Lane = 0; Lane != NumLanes; ++Lane) {

15890

int Srcs[2] = {-1, -1};

15891

SmallVector<int, 16> InLaneMask(NumLaneElts, -1);

15892

for (int i = 0; i != NumLaneElts; ++i) {

15893

int M = Mask[(Lane * NumLaneElts) + i];

15894

if (M < 0)

15895

continue;

15896

// Determine which of the possible input lanes (NumLanes from each source)

15897

// this element comes from. Assign that as one of the sources for this

15898

// lane. We can assign up to 2 sources for this lane. If we run out

15899

// sources we can't do anything.

15900

int LaneSrc = M / NumLaneElts;

15901

int Src;

15902

if (Srcs[0] < 0 || Srcs[0] == LaneSrc)

15903

Src = 0;

15904

else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)

15905

Src = 1;

15906

else

15907

return SDValue();

15908

15909

Srcs[Src] = LaneSrc;

15910

InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;

15911

}

15912

15913

// If this lane has two sources, see if it fits with the repeat mask so far.

15914

if (Srcs[1] < 0)

15915

continue;

15916

15917

LaneSrcs[Lane][0] = Srcs[0];

15918

LaneSrcs[Lane][1] = Srcs[1];

15919

15920

auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {

15921

assert(M1.size() == M2.size() && "Unexpected mask size")((M1.size() == M2.size() && "Unexpected mask size") ?
static_cast<void> (0) : __assert_fail ("M1.size() == M2.size() && \"Unexpected mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15921, __PRETTY_FUNCTION__));

15922

for (int i = 0, e = M1.size(); i != e; ++i)

15923

if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])

15924

return false;

15925

return true;

15926

};

15927

15928

auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {

15929

assert(Mask.size() == MergedMask.size() && "Unexpected mask size")((Mask.size() == MergedMask.size() && "Unexpected mask size"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == MergedMask.size() && \"Unexpected mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15929, __PRETTY_FUNCTION__));

15930

for (int i = 0, e = MergedMask.size(); i != e; ++i) {

15931

int M = Mask[i];

15932

if (M < 0)

15933

continue;

15934

assert((MergedMask[i] < 0 || MergedMask[i] == M) &&(((MergedMask[i] < 0 || MergedMask[i] == M) && "Unexpected mask element"
) ? static_cast<void> (0) : __assert_fail ("(MergedMask[i] < 0 || MergedMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15935, __PRETTY_FUNCTION__))

15935

"Unexpected mask element")(((MergedMask[i] < 0 || MergedMask[i] == M) && "Unexpected mask element"
) ? static_cast<void> (0) : __assert_fail ("(MergedMask[i] < 0 || MergedMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15935, __PRETTY_FUNCTION__));

15936

MergedMask[i] = M;

15937

}

15938

};

15939

15940

if (MatchMasks(InLaneMask, RepeatMask)) {

15941

// Merge this lane mask into the final repeat mask.

15942

MergeMasks(InLaneMask, RepeatMask);

15943

continue;

15944

}

15945

15946

// Didn't find a match. Swap the operands and try again.

15947

std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);

15948

ShuffleVectorSDNode::commuteMask(InLaneMask);

15949

15950

if (MatchMasks(InLaneMask, RepeatMask)) {

15951

// Merge this lane mask into the final repeat mask.

15952

MergeMasks(InLaneMask, RepeatMask);

15953

continue;

15954

}

15955

15956

// Couldn't find a match with the operands in either order.

15957

return SDValue();

15958

}

15959

15960

// Now handle any lanes with only one source.

15961

for (int Lane = 0; Lane != NumLanes; ++Lane) {

15962

// If this lane has already been processed, skip it.

15963

if (LaneSrcs[Lane][0] >= 0)

15964

continue;

15965

15966

for (int i = 0; i != NumLaneElts; ++i) {

15967

int M = Mask[(Lane * NumLaneElts) + i];

15968

if (M < 0)

15969

continue;

15970

15971

// If RepeatMask isn't defined yet we can define it ourself.

15972

if (RepeatMask[i] < 0)

15973

RepeatMask[i] = M % NumLaneElts;

15974

15975

if (RepeatMask[i] < NumElts) {

15976

if (RepeatMask[i] != M % NumLaneElts)

15977

return SDValue();

15978

LaneSrcs[Lane][0] = M / NumLaneElts;

15979

} else {

15980

if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))

15981

return SDValue();

15982

LaneSrcs[Lane][1] = M / NumLaneElts;

15983

}

15984

}

15985

15986

if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)

15987

return SDValue();

15988

}

15989

15990

SmallVector<int, 16> NewMask(NumElts, -1);

15991

for (int Lane = 0; Lane != NumLanes; ++Lane) {

15992

int Src = LaneSrcs[Lane][0];

15993

for (int i = 0; i != NumLaneElts; ++i) {

15994

int M = -1;

15995

if (Src >= 0)

15996

M = Src * NumLaneElts + i;

15997

NewMask[Lane * NumLaneElts + i] = M;

15998

}

15999

}

16000

SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);

16001

// Ensure we didn't get back the shuffle we started with.

16002

// FIXME: This is a hack to make up for some splat handling code in

16003

// getVectorShuffle.

16004

if (isa<ShuffleVectorSDNode>(NewV1) &&

16005

cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)

16006

return SDValue();

16007

16008

for (int Lane = 0; Lane != NumLanes; ++Lane) {

16009

int Src = LaneSrcs[Lane][1];

16010

for (int i = 0; i != NumLaneElts; ++i) {

16011

int M = -1;

16012

if (Src >= 0)

16013

M = Src * NumLaneElts + i;

16014

NewMask[Lane * NumLaneElts + i] = M;

16015

}

16016

}

16017

SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);

16018

// Ensure we didn't get back the shuffle we started with.

16019

// FIXME: This is a hack to make up for some splat handling code in

16020

// getVectorShuffle.

16021

if (isa<ShuffleVectorSDNode>(NewV2) &&

16022

cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)

16023

return SDValue();

16024

16025

for (int i = 0; i != NumElts; ++i) {

16026

NewMask[i] = RepeatMask[i % NumLaneElts];

16027

if (NewMask[i] < 0)

16028

continue;

16029

16030

NewMask[i] += (i / NumLaneElts) * NumLaneElts;

16031

}

16032

return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);

16033

}

16034

16035

/// If the input shuffle mask results in a vector that is undefined in all upper

16036

/// or lower half elements and that mask accesses only 2 halves of the

16037

/// shuffle's operands, return true. A mask of half the width with mask indexes

16038

/// adjusted to access the extracted halves of the original shuffle operands is

16039

/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or

16040

/// lower half of each input operand is accessed.

16041

static bool

16042

getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,

16043

int &HalfIdx1, int &HalfIdx2) {

16044

assert((Mask.size() == HalfMask.size() * 2) &&(((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"
) ? static_cast<void> (0) : __assert_fail ("(Mask.size() == HalfMask.size() * 2) && \"Expected input mask to be twice as long as output\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16045, __PRETTY_FUNCTION__))

16045

"Expected input mask to be twice as long as output")(((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"
) ? static_cast<void> (0) : __assert_fail ("(Mask.size() == HalfMask.size() * 2) && \"Expected input mask to be twice as long as output\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16045, __PRETTY_FUNCTION__));

16046

16047

// Exactly one half of the result must be undef to allow narrowing.

16048

bool UndefLower = isUndefLowerHalf(Mask);

16049

bool UndefUpper = isUndefUpperHalf(Mask);

16050

if (UndefLower == UndefUpper)

16051

return false;

16052

16053

unsigned HalfNumElts = HalfMask.size();

16054

unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;

16055

HalfIdx1 = -1;

16056

HalfIdx2 = -1;

16057

for (unsigned i = 0; i != HalfNumElts; ++i) {

16058

int M = Mask[i + MaskIndexOffset];

16059

if (M < 0) {

16060

HalfMask[i] = M;

16061

continue;

16062

}

16063

16064

// Determine which of the 4 half vectors this element is from.

16065

// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.

16066

int HalfIdx = M / HalfNumElts;

16067

16068

// Determine the element index into its half vector source.

16069

int HalfElt = M % HalfNumElts;

16070

16071

// We can shuffle with up to 2 half vectors, set the new 'half'

16072

// shuffle mask accordingly.

16073

if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {

16074

HalfMask[i] = HalfElt;

16075

HalfIdx1 = HalfIdx;

16076

continue;

16077

}

16078

if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {

16079

HalfMask[i] = HalfElt + HalfNumElts;

16080

HalfIdx2 = HalfIdx;

16081

continue;

16082

}

16083

16084

// Too many half vectors referenced.

16085

return false;

16086

}

16087

16088

return true;

16089

}

16090

16091

/// Given the output values from getHalfShuffleMask(), create a half width

16092

/// shuffle of extracted vectors followed by an insert back to full width.

16093

static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,

16094

ArrayRef<int> HalfMask, int HalfIdx1,

16095

int HalfIdx2, bool UndefLower,

16096

SelectionDAG &DAG, bool UseConcat = false) {

16097

assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?")((V1.getValueType() == V2.getValueType() && "Different sized vectors?"
) ? static_cast<void> (0) : __assert_fail ("V1.getValueType() == V2.getValueType() && \"Different sized vectors?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16097, __PRETTY_FUNCTION__));

16098

assert(V1.getValueType().isSimple() && "Expecting only simple types")((V1.getValueType().isSimple() && "Expecting only simple types"
) ? static_cast<void> (0) : __assert_fail ("V1.getValueType().isSimple() && \"Expecting only simple types\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16098, __PRETTY_FUNCTION__));

16099

16100

MVT VT = V1.getSimpleValueType();

16101

MVT HalfVT = VT.getHalfNumVectorElementsVT();

16102

unsigned HalfNumElts = HalfVT.getVectorNumElements();

16103

16104

auto getHalfVector = [&](int HalfIdx) {

16105

if (HalfIdx < 0)

16106

return DAG.getUNDEF(HalfVT);

16107

SDValue V = (HalfIdx < 2 ? V1 : V2);

16108

HalfIdx = (HalfIdx % 2) * HalfNumElts;

16109

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,

16110

DAG.getIntPtrConstant(HalfIdx, DL));

16111

};

16112

16113

// ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset

16114

SDValue Half1 = getHalfVector(HalfIdx1);

16115

SDValue Half2 = getHalfVector(HalfIdx2);

16116

SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);

16117

if (UseConcat) {

16118

SDValue Op0 = V;

16119

SDValue Op1 = DAG.getUNDEF(HalfVT);

16120

if (UndefLower)

16121

std::swap(Op0, Op1);

16122

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);

16123

}

16124

16125

unsigned Offset = UndefLower ? HalfNumElts : 0;

16126

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,

16127

DAG.getIntPtrConstant(Offset, DL));

16128

}

16129

16130

/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.

16131

/// This allows for fast cases such as subvector extraction/insertion

16132

/// or shuffling smaller vector types which can lower more efficiently.

16133

static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,

16134

SDValue V2, ArrayRef<int> Mask,

16135

const X86Subtarget &Subtarget,

16136

SelectionDAG &DAG) {

16137

assert((VT.is256BitVector() || VT.is512BitVector()) &&(((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"
) ? static_cast<void> (0) : __assert_fail ("(VT.is256BitVector() || VT.is512BitVector()) && \"Expected 256-bit or 512-bit vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16138, __PRETTY_FUNCTION__))

16138

"Expected 256-bit or 512-bit vector")(((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"
) ? static_cast<void> (0) : __assert_fail ("(VT.is256BitVector() || VT.is512BitVector()) && \"Expected 256-bit or 512-bit vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16138, __PRETTY_FUNCTION__));

16139

16140

bool UndefLower = isUndefLowerHalf(Mask);

16141

if (!UndefLower && !isUndefUpperHalf(Mask))

16142

return SDValue();

16143

16144

assert((!UndefLower || !isUndefUpperHalf(Mask)) &&(((!UndefLower || !isUndefUpperHalf(Mask)) && "Completely undef shuffle mask should have been simplified already"
) ? static_cast<void> (0) : __assert_fail ("(!UndefLower || !isUndefUpperHalf(Mask)) && \"Completely undef shuffle mask should have been simplified already\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16145, __PRETTY_FUNCTION__))

16145

"Completely undef shuffle mask should have been simplified already")(((!UndefLower || !isUndefUpperHalf(Mask)) && "Completely undef shuffle mask should have been simplified already"
) ? static_cast<void> (0) : __assert_fail ("(!UndefLower || !isUndefUpperHalf(Mask)) && \"Completely undef shuffle mask should have been simplified already\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16145, __PRETTY_FUNCTION__));

16146

16147

// Upper half is undef and lower half is whole upper subvector.

16148

// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>

16149

MVT HalfVT = VT.getHalfNumVectorElementsVT();

16150

unsigned HalfNumElts = HalfVT.getVectorNumElements();

16151

if (!UndefLower &&

16152

isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {

16153

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,

16154

DAG.getIntPtrConstant(HalfNumElts, DL));

16155

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,

16156

DAG.getIntPtrConstant(0, DL));

16157

}

16158

16159

// Lower half is undef and upper half is whole lower subvector.

16160

// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>

16161

if (UndefLower &&

16162

isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {

16163

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,

16164

DAG.getIntPtrConstant(0, DL));

16165

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,

16166

DAG.getIntPtrConstant(HalfNumElts, DL));

16167

}

16168

16169

int HalfIdx1, HalfIdx2;

16170

SmallVector<int, 8> HalfMask(HalfNumElts);

16171

if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))

16172

return SDValue();

16173

16174

assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length")((HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"
) ? static_cast<void> (0) : __assert_fail ("HalfMask.size() == HalfNumElts && \"Unexpected shuffle mask length\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16174, __PRETTY_FUNCTION__));

16175

16176

// Only shuffle the halves of the inputs when useful.

16177

unsigned NumLowerHalves =

16178

(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);

16179

unsigned NumUpperHalves =

16180

(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

16181

assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed")((NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"
) ? static_cast<void> (0) : __assert_fail ("NumLowerHalves + NumUpperHalves <= 2 && \"Only 1 or 2 halves allowed\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16181, __PRETTY_FUNCTION__));

16182

16183

// Determine the larger pattern of undef/halves, then decide if it's worth

16184

// splitting the shuffle based on subtarget capabilities and types.

16185

unsigned EltWidth = VT.getVectorElementType().getSizeInBits();

16186

if (!UndefLower) {

16187

// XXXXuuuu: no insert is needed.

16188

// Always extract lowers when setting lower - these are all free subreg ops.

16189

if (NumUpperHalves == 0)

16190

return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,

16191

UndefLower, DAG);

16192

16193

if (NumUpperHalves == 1) {

16194

// AVX2 has efficient 32/64-bit element cross-lane shuffles.

16195

if (Subtarget.hasAVX2()) {

16196

// extract128 + vunpckhps/vshufps, is better than vblend + vpermps.

16197

if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&

16198

!is128BitUnpackShuffleMask(HalfMask) &&

16199

(!isSingleSHUFPSMask(HalfMask) ||

16200

Subtarget.hasFastVariableShuffle()))

16201

return SDValue();

16202

// If this is a unary shuffle (assume that the 2nd operand is

16203

// canonicalized to undef), then we can use vpermpd. Otherwise, we

16204

// are better off extracting the upper half of 1 operand and using a

16205

// narrow shuffle.

16206

if (EltWidth == 64 && V2.isUndef())

16207

return SDValue();

16208

}

16209

// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.

16210

if (Subtarget.hasAVX512() && VT.is512BitVector())

16211

return SDValue();

16212

// Extract + narrow shuffle is better than the wide alternative.

16213

return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,

16214

UndefLower, DAG);

16215

}

16216

16217

// Don't extract both uppers, instead shuffle and then extract.

16218

assert(NumUpperHalves == 2 && "Half vector count went wrong")((NumUpperHalves == 2 && "Half vector count went wrong"
) ? static_cast<void> (0) : __assert_fail ("NumUpperHalves == 2 && \"Half vector count went wrong\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16218, __PRETTY_FUNCTION__));

16219

return SDValue();

16220

}

16221

16222

// UndefLower - uuuuXXXX: an insert to high half is required if we split this.

16223

if (NumUpperHalves == 0) {

16224

// AVX2 has efficient 64-bit element cross-lane shuffles.

16225

// TODO: Refine to account for unary shuffle, splat, and other masks?

16226

if (Subtarget.hasAVX2() && EltWidth == 64)

16227

return SDValue();

16228

// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.

16229

if (Subtarget.hasAVX512() && VT.is512BitVector())

16230

return SDValue();

16231

// Narrow shuffle + insert is better than the wide alternative.

16232

return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,

16233

UndefLower, DAG);

16234

}

16235

16236

// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.

16237

return SDValue();

16238

}

16239

16240

/// Test whether the specified input (0 or 1) is in-place blended by the

16241

/// given mask.

16242

///

16243

/// This returns true if the elements from a particular input are already in the

16244

/// slot required by the given mask and require no permutation.

16245

static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {

16246

assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")(((Input == 0 || Input == 1) && "Only two inputs to shuffles."
) ? static_cast<void> (0) : __assert_fail ("(Input == 0 || Input == 1) && \"Only two inputs to shuffles.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16246, __PRETTY_FUNCTION__));

16247

int Size = Mask.size();

16248

for (int i = 0; i < Size; ++i)

16249

if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)

16250

return false;

16251

16252

return true;

16253

}

16254

16255

/// Handle case where shuffle sources are coming from the same 128-bit lane and

16256

/// every lane can be represented as the same repeating mask - allowing us to

16257

/// shuffle the sources with the repeating shuffle and then permute the result

16258

/// to the destination lanes.

16259

static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(

16260

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

16261

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

16262

int NumElts = VT.getVectorNumElements();

16263

int NumLanes = VT.getSizeInBits() / 128;

16264

int NumLaneElts = NumElts / NumLanes;

16265

16266

// On AVX2 we may be able to just shuffle the lowest elements and then

16267

// broadcast the result.

16268

if (Subtarget.hasAVX2()) {

16269

for (unsigned BroadcastSize : {16, 32, 64}) {

16270

if (BroadcastSize <= VT.getScalarSizeInBits())

16271

continue;

16272

int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

16273

16274

// Attempt to match a repeating pattern every NumBroadcastElts,

16275

// accounting for UNDEFs but only references the lowest 128-bit

16276

// lane of the inputs.

16277

auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {

16278

for (int i = 0; i != NumElts; i += NumBroadcastElts)

16279

for (int j = 0; j != NumBroadcastElts; ++j) {

16280

int M = Mask[i + j];

16281

if (M < 0)

16282

continue;

16283

int &R = RepeatMask[j];

16284

if (0 != ((M % NumElts) / NumLaneElts))

16285

return false;

16286

if (0 <= R && R != M)

16287

return false;

16288

R = M;

16289

}

16290

return true;

16291

};

16292

16293

SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);

16294

if (!FindRepeatingBroadcastMask(RepeatMask))

16295

continue;

16296

16297

// Shuffle the (lowest) repeated elements in place for broadcast.

16298

SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

16299

16300

// Shuffle the actual broadcast.

16301

SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);

16302

for (int i = 0; i != NumElts; i += NumBroadcastElts)

16303

for (int j = 0; j != NumBroadcastElts; ++j)

16304

BroadcastMask[i + j] = j;

16305

return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),

16306

BroadcastMask);

16307

}

16308

}

16309

16310

// Bail if the shuffle mask doesn't cross 128-bit lanes.

16311

if (!is128BitLaneCrossingShuffleMask(VT, Mask))

16312

return SDValue();

16313

16314

// Bail if we already have a repeated lane shuffle mask.

16315

SmallVector<int, 8> RepeatedShuffleMask;

16316

if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))

16317

return SDValue();

16318

16319

// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes

16320

// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.

16321

int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;

16322

int NumSubLanes = NumLanes * SubLaneScale;

16323

int NumSubLaneElts = NumLaneElts / SubLaneScale;

16324

16325

// Check that all the sources are coming from the same lane and see if we can

16326

// form a repeating shuffle mask (local to each sub-lane). At the same time,

16327

// determine the source sub-lane for each destination sub-lane.

16328

int TopSrcSubLane = -1;

16329

SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);

16330

SmallVector<int, 8> RepeatedSubLaneMasks[2] = {

16331

SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),

16332

SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

16333

16334

for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {

16335

// Extract the sub-lane mask, check that it all comes from the same lane

16336

// and normalize the mask entries to come from the first lane.

16337

int SrcLane = -1;

16338

SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);

16339

for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {

16340

int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];

16341

if (M < 0)

16342

continue;

16343

int Lane = (M % NumElts) / NumLaneElts;

16344

if ((0 <= SrcLane) && (SrcLane != Lane))

16345

return SDValue();

16346

SrcLane = Lane;

16347

int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);

16348

SubLaneMask[Elt] = LocalM;

16349

}

16350

16351

// Whole sub-lane is UNDEF.

16352

if (SrcLane < 0)

16353

continue;

16354

16355

// Attempt to match against the candidate repeated sub-lane masks.

16356

for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {

16357

auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {

16358

for (int i = 0; i != NumSubLaneElts; ++i) {

16359

if (M1[i] < 0 || M2[i] < 0)

16360

continue;

16361

if (M1[i] != M2[i])

16362

return false;

16363

}

16364

return true;

16365

};

16366

16367

auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];

16368

if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))

16369

continue;

16370

16371

// Merge the sub-lane mask into the matching repeated sub-lane mask.

16372

for (int i = 0; i != NumSubLaneElts; ++i) {

16373

int M = SubLaneMask[i];

16374

if (M < 0)

16375

continue;

16376

assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&(((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] ==
M) && "Unexpected mask element") ? static_cast<void
> (0) : __assert_fail ("(RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16377, __PRETTY_FUNCTION__))

16377

"Unexpected mask element")(((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] ==
M) && "Unexpected mask element") ? static_cast<void
> (0) : __assert_fail ("(RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16377, __PRETTY_FUNCTION__));

16378

RepeatedSubLaneMask[i] = M;

16379

}

16380

16381

// Track the top most source sub-lane - by setting the remaining to UNDEF

16382

// we can greatly simplify shuffle matching.

16383

int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;

16384

TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);

16385

Dst2SrcSubLanes[DstSubLane] = SrcSubLane;

16386

break;

16387

}

16388

16389

// Bail if we failed to find a matching repeated sub-lane mask.

16390

if (Dst2SrcSubLanes[DstSubLane] < 0)

16391

return SDValue();

16392

}

16393

assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&((0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes
&& "Unexpected source lane") ? static_cast<void>
(0) : __assert_fail ("0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && \"Unexpected source lane\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16394, __PRETTY_FUNCTION__))

16394

"Unexpected source lane")((0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes
&& "Unexpected source lane") ? static_cast<void>
(0) : __assert_fail ("0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && \"Unexpected source lane\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16394, __PRETTY_FUNCTION__));

16395

16396

// Create a repeating shuffle mask for the entire vector.

16397

SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);

16398

for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {

16399

int Lane = SubLane / SubLaneScale;

16400

auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];

16401

for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {

16402

int M = RepeatedSubLaneMask[Elt];

16403

if (M < 0)

16404

continue;

16405

int Idx = (SubLane * NumSubLaneElts) + Elt;

16406

RepeatedMask[Idx] = M + (Lane * NumLaneElts);

16407

}

16408

}

16409

SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

16410

16411

// Shuffle each source sub-lane to its destination.

16412

SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);

16413

for (int i = 0; i != NumElts; i += NumSubLaneElts) {

16414

int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];

16415

if (SrcSubLane < 0)

16416

continue;

16417

for (int j = 0; j != NumSubLaneElts; ++j)

16418

SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);

16419

}

16420

16421

return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),

16422

SubLaneMask);

16423

}

16424

16425

static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,

16426

bool &ForceV1Zero, bool &ForceV2Zero,

16427

unsigned &ShuffleImm, ArrayRef<int> Mask,

16428

const APInt &Zeroable) {

16429

int NumElts = VT.getVectorNumElements();

16430

assert(VT.getScalarSizeInBits() == 64 &&((VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts
== 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16432, __PRETTY_FUNCTION__))

16431

(NumElts == 2 || NumElts == 4 || NumElts == 8) &&((VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts
== 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16432, __PRETTY_FUNCTION__))

16432

"Unexpected data type for VSHUFPD")((VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts
== 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16432, __PRETTY_FUNCTION__));

16433

assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&((isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && "Illegal shuffle mask"
) ? static_cast<void> (0) : __assert_fail ("isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && \"Illegal shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16434, __PRETTY_FUNCTION__))

16434

"Illegal shuffle mask")((isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && "Illegal shuffle mask"
) ? static_cast<void> (0) : __assert_fail ("isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && \"Illegal shuffle mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16434, __PRETTY_FUNCTION__));

16435

16436

bool ZeroLane[2] = { true, true };

16437

for (int i = 0; i < NumElts; ++i)

16438

ZeroLane[i & 1] &= Zeroable[i];

16439

16440

// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..

16441

// Mask for V4F64; 0/1, 4/5, 2/3, 6/7..

16442

ShuffleImm = 0;

16443

bool ShufpdMask = true;

16444

bool CommutableMask = true;

16445

for (int i = 0; i < NumElts; ++i) {

16446

if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])

16447

continue;

16448

if (Mask[i] < 0)

16449

return false;

16450

int Val = (i & 6) + NumElts * (i & 1);

16451

int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);

16452

if (Mask[i] < Val || Mask[i] > Val + 1)

16453

ShufpdMask = false;

16454

if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)

16455

CommutableMask = false;

16456

ShuffleImm |= (Mask[i] % 2) << i;

16457

}

16458

16459

if (!ShufpdMask && !CommutableMask)

16460

return false;

16461

16462

if (!ShufpdMask && CommutableMask)

16463

std::swap(V1, V2);

16464

16465

ForceV1Zero = ZeroLane[0];

16466

ForceV2Zero = ZeroLane[1];

16467

return true;

16468

}

16469

16470

/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when
/// matchShuffleWithSHUFPD() accepts the mask.
///
/// Operands the matcher flags as "force zero" are replaced by a real zero
/// vector before the node is built.
///
/// \returns the SHUFP node, or an empty SDValue if the mask does not match.
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  unsigned Imm = 0;
  bool ZeroV1 = false;
  bool ZeroV2 = false;
  // May swap V1 and V2 if only the commuted form matches.
  const bool Matched = matchShuffleWithSHUFPD(VT, V1, V2, ZeroV1, ZeroV2, Imm,
                                              Mask, Zeroable);
  if (!Matched)
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ZeroV1)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ZeroV2)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  SDValue ImmOp = DAG.getTargetConstant(Imm, DL, MVT::i8);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
}

16493

16494

// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed

16495

// by zeroable elements in the remaining 24 elements. Turn this into two

16496

// vmovqb instructions shuffled together.

16497

static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,

16498

SDValue V1, SDValue V2,

16499

ArrayRef<int> Mask,

16500

const APInt &Zeroable,

16501

SelectionDAG &DAG) {

16502

assert(VT == MVT::v32i8 && "Unexpected type!")((VT == MVT::v32i8 && "Unexpected type!") ? static_cast
<void> (0) : __assert_fail ("VT == MVT::v32i8 && \"Unexpected type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16502, __PRETTY_FUNCTION__));

16503

16504

// The first 8 indices should be every 8th element.

16505

if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))

16506

return SDValue();

16507

16508

// Remaining elements need to be zeroable.

16509

if (Zeroable.countLeadingOnes() < (Mask.size() - 8))

16510

return SDValue();

16511

16512

V1 = DAG.getBitcast(MVT::v4i64, V1);

16513

V2 = DAG.getBitcast(MVT::v4i64, V2);

16514

16515

V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);

16516

V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);

16517

16518

// The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in

16519

// the upper bits of the result using an unpckldq.

16520

SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,

16521

{ 0, 1, 2, 3, 16, 17, 18, 19,

16522

4, 5, 6, 7, 20, 21, 22, 23 });

16523

// Insert the unpckldq into a zero vector to widen to v32i8.

16524

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,

16525

DAG.getConstant(0, DL, MVT::v32i8), Unpack,

16526

DAG.getIntPtrConstant(0, DL));

16527

}

16528

16529

16530

/// Handle lowering of 4-lane 64-bit floating point shuffles.

16531

///

16532

/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2

16533

/// isn't available.

16534

static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

16535

const APInt &Zeroable, SDValue V1, SDValue V2,

16536

const X86Subtarget &Subtarget,

16537

SelectionDAG &DAG) {

16538

assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v4f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16538, __PRETTY_FUNCTION__));

16539

assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v4f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16539, __PRETTY_FUNCTION__));

16540

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16540, __PRETTY_FUNCTION__));

16541

16542

if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,

16543

Subtarget, DAG))

16544

return V;

16545

16546

if (V2.isUndef()) {

16547

// Check for being able to broadcast a single element.

16548

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,

16549

Mask, Subtarget, DAG))

16550

return Broadcast;

16551

16552

// Use low duplicate instructions for masks that match their pattern.

16553

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))

16554

return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

16555

16556

if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {

16557

// Non-half-crossing single input shuffles can be lowered with an

16558

// interleaved permutation.

16559

unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |

16560

((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);

16561

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,

16562

DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));

16563

}

16564

16565

// With AVX2 we have direct support for this permutation.

16566

if (Subtarget.hasAVX2())

16567

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,

16568

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

16569

16570

// Try to create an in-lane repeating shuffle mask and then shuffle the

16571

// results into the target lanes.

16572

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

16573

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

16574

return V;

16575

16576

// Try to permute the lanes and then use a per-lane permute.

16577

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,

16578

Mask, DAG, Subtarget))

16579

return V;

16580

16581

// Otherwise, fall back.

16582

return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,

16583

DAG, Subtarget);

16584

}

16585

16586

// Use dedicated unpack instructions for masks that match their pattern.

16587

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))

16588

return V;

16589

16590

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,

16591

Zeroable, Subtarget, DAG))

16592

return Blend;

16593

16594

// Check if the blend happens to exactly fit that of SHUFPD.

16595

if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,

16596

Zeroable, Subtarget, DAG))

16597

return Op;

16598

16599

// If we have lane crossing shuffles AND they don't all come from the lower

16600

// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).

16601

// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently

16602

// canonicalize to a blend of splat which isn't necessary for this combine.

16603

if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&

16604

!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&

16605

(V1.getOpcode() != ISD::BUILD_VECTOR) &&

16606

(V2.getOpcode() != ISD::BUILD_VECTOR))

16607

if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,

16608

Mask, DAG))

16609

return Op;

16610

16611

// If we have one input in place, then we can permute the other input and

16612

// blend the result.

16613

if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))

16614

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,

16615

Subtarget, DAG);

16616

16617

// Try to create an in-lane repeating shuffle mask and then shuffle the

16618

// results into the target lanes.

16619

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

16620

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

16621

return V;

16622

16623

// Try to simplify this by merging 128-bit lanes to enable a lane-based

16624

// shuffle. However, if we have AVX2 and either inputs are already in place,

16625

// we will be able to shuffle even across lanes the other input in a single

16626

// instruction so skip this pattern.

16627

if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||

16628

isShuffleMaskInputInPlace(1, Mask))))

16629

if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(

16630

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

16631

return V;

16632

16633

// If we have VLX support, we can use VEXPAND.

16634

if (Subtarget.hasVLX())

16635

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,

16636

DAG, Subtarget))

16637

return V;

16638

16639

// If we have AVX2 then we always want to lower with a blend because an v4 we

16640

// can fully permute the elements.

16641

if (Subtarget.hasAVX2())

16642

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,

16643

Subtarget, DAG);

16644

16645

// Otherwise fall back on generic lowering.

16646

return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,

16647

Subtarget, DAG);

16648

}

16649

16650

/// Handle lowering of 4-lane 64-bit integer shuffles.

16651

///

16652

/// This routine is only called when we have AVX2 and thus a reasonable

16653

/// instruction set for v4i64 shuffling..

16654

static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

16655

const APInt &Zeroable, SDValue V1, SDValue V2,

16656

const X86Subtarget &Subtarget,

16657

SelectionDAG &DAG) {

16658

assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v4i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16658, __PRETTY_FUNCTION__));

16659

assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v4i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16659, __PRETTY_FUNCTION__));

16660

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16660, __PRETTY_FUNCTION__));

16661

assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!")((Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v4i64 with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16661, __PRETTY_FUNCTION__));

16662

16663

if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,

16664

Subtarget, DAG))

16665

return V;

16666

16667

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,

16668

Zeroable, Subtarget, DAG))

16669

return Blend;

16670

16671

// Check for being able to broadcast a single element.

16672

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,

16673

Subtarget, DAG))

16674

return Broadcast;

16675

16676

if (V2.isUndef()) {

16677

// When the shuffle is mirrored between the 128-bit lanes of the unit, we

16678

// can use lower latency instructions that will operate on both lanes.

16679

SmallVector<int, 2> RepeatedMask;

16680

if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {

16681

SmallVector<int, 4> PSHUFDMask;

16682

narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);

16683

return DAG.getBitcast(

16684

MVT::v4i64,

16685

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,

16686

DAG.getBitcast(MVT::v8i32, V1),

16687

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

16688

}

16689

16690

// AVX2 provides a direct instruction for permuting a single input across

16691

// lanes.

16692

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,

16693

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

16694

}

16695

16696

// Try to use shift instructions.

16697

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,

16698

Zeroable, Subtarget, DAG))

16699

return Shift;

16700

16701

// If we have VLX support, we can use VALIGN or VEXPAND.

16702

if (Subtarget.hasVLX()) {

16703

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,

16704

Subtarget, DAG))

16705

return Rotate;

16706

16707

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,

16708

DAG, Subtarget))

16709

return V;

16710

}

16711

16712

// Try to use PALIGNR.

16713

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,

16714

Subtarget, DAG))

16715

return Rotate;

16716

16717

// Use dedicated unpack instructions for masks that match their pattern.

16718

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))

16719

return V;

16720

16721

// If we have one input in place, then we can permute the other input and

16722

// blend the result.

16723

if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))

16724

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,

16725

Subtarget, DAG);

16726

16727

// Try to create an in-lane repeating shuffle mask and then shuffle the

16728

// results into the target lanes.

16729

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

16730

DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))

16731

return V;

16732

16733

// Try to simplify this by merging 128-bit lanes to enable a lane-based

16734

// shuffle. However, if we have AVX2 and either inputs are already in place,

16735

// we will be able to shuffle even across lanes the other input in a single

16736

// instruction so skip this pattern.

16737

if (!isShuffleMaskInputInPlace(0, Mask) &&

16738

!isShuffleMaskInputInPlace(1, Mask))

16739

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

16740

DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))

16741

return Result;

16742

16743

// Otherwise fall back on generic blend lowering.

16744

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,

16745

Subtarget, DAG);

16746

}

16747

16748

/// Handle lowering of 8-lane 32-bit floating point shuffles.

16749

///

16750

/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2

16751

/// isn't available.

16752

static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

16753

const APInt &Zeroable, SDValue V1, SDValue V2,

16754

const X86Subtarget &Subtarget,

16755

SelectionDAG &DAG) {

16756

assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v8f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16756, __PRETTY_FUNCTION__));

16757

assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v8f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16757, __PRETTY_FUNCTION__));

16758

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16758, __PRETTY_FUNCTION__));

16759

16760

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,

16761

Zeroable, Subtarget, DAG))

16762

return Blend;

16763

16764

// Check for being able to broadcast a single element.

16765

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,

16766

Subtarget, DAG))

16767

return Broadcast;

16768

16769

// If the shuffle mask is repeated in each 128-bit lane, we have many more

16770

// options to efficiently lower the shuffle.

16771

SmallVector<int, 4> RepeatedMask;

16772

if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {

16773

assert(RepeatedMask.size() == 4 &&((RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Repeated masks must be half the mask width!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16774, __PRETTY_FUNCTION__))

16774

"Repeated masks must be half the mask width!")((RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Repeated masks must be half the mask width!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16774, __PRETTY_FUNCTION__));

16775

16776

// Use even/odd duplicate instructions for masks that match their pattern.

16777

if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))

16778

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);

16779

if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))

16780

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

16781

16782

if (V2.isUndef())

16783

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,

16784

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

16785

16786

// Use dedicated unpack instructions for masks that match their pattern.

16787

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))

16788

return V;

16789

16790

// Otherwise, fall back to a SHUFPS sequence. Here it is important that we

16791

// have already handled any direct blends.

16792

return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);

16793

}

16794

16795

// Try to create an in-lane repeating shuffle mask and then shuffle the

16796

// results into the target lanes.

16797

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

16798

DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))

16799

return V;

16800

16801

// If we have a single input shuffle with different shuffle patterns in the

16802

// two 128-bit lanes use the variable mask to VPERMILPS.

16803

if (V2.isUndef()) {

16804

if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {

16805

SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);

16806

return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

16807

}

16808

if (Subtarget.hasAVX2()) {

16809

SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);

16810

return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

16811

}

16812

// Otherwise, fall back.

16813

return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,

16814

DAG, Subtarget);

16815

}

16816

16817

// Try to simplify this by merging 128-bit lanes to enable a lane-based

16818

// shuffle.

16819

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

16820

DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))

16821

return Result;

16822

16823

// If we have VLX support, we can use VEXPAND.

16824

if (Subtarget.hasVLX())

16825

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,

16826

DAG, Subtarget))

16827

return V;

16828

16829

// For non-AVX512 if the Mask is of 16bit elements in lane then try to split

16830

// since after split we get a more efficient code using vpunpcklwd and

16831

// vpunpckhwd instrs than vblend.

16832

if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))

16833

return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,

16834

DAG);

16835

16836

// If we have AVX2 then we always want to lower with a blend because at v8 we

16837

// can fully permute the elements.

16838

if (Subtarget.hasAVX2())

16839

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,

16840

Subtarget, DAG);

16841

16842

// Otherwise fall back on generic lowering.

16843

return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,

16844

Subtarget, DAG);

16845

}

16846

16847

/// Handle lowering of 8-lane 32-bit integer shuffles.

16848

///

16849

/// This routine is only called when we have AVX2 and thus a reasonable

16850

/// instruction set for v8i32 shuffling..

16851

static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

16852

const APInt &Zeroable, SDValue V1, SDValue V2,

16853

const X86Subtarget &Subtarget,

16854

SelectionDAG &DAG) {

16855

assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16855, __PRETTY_FUNCTION__));

16856

assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16856, __PRETTY_FUNCTION__));

16857

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16857, __PRETTY_FUNCTION__));

16858

assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")((Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v8i32 with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16858, __PRETTY_FUNCTION__));

16859

16860

// Whenever we can lower this as a zext, that instruction is strictly faster

16861

// than any alternative. It also allows us to fold memory operands into the

16862

// shuffle in many cases.

16863

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,

16864

Zeroable, Subtarget, DAG))

16865

return ZExt;

16866

16867

// For non-AVX512 if the Mask is of 16bit elements in lane then try to split

16868

// since after split we get a more efficient code than vblend by using

16869

// vpunpcklwd and vpunpckhwd instrs.

16870

if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&

16871

!Subtarget.hasAVX512())

16872

return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,

16873

DAG);

16874

16875

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,

16876

Zeroable, Subtarget, DAG))

16877

return Blend;

16878

16879

// Check for being able to broadcast a single element.

16880

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,

16881

Subtarget, DAG))

16882

return Broadcast;

16883

16884

// If the shuffle mask is repeated in each 128-bit lane we can use more

16885

// efficient instructions that mirror the shuffles across the two 128-bit

16886

// lanes.

16887

SmallVector<int, 4> RepeatedMask;

16888

bool Is128BitLaneRepeatedShuffle =

16889

is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);

16890

if (Is128BitLaneRepeatedShuffle) {

16891

assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((RepeatedMask.size() == 4 && "Unexpected repeated mask size!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16891, __PRETTY_FUNCTION__));

16892

if (V2.isUndef())

16893

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,

16894

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

16895

16896

// Use dedicated unpack instructions for masks that match their pattern.

16897

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))

16898

return V;

16899

}

16900

16901

// Try to use shift instructions.

16902

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,

16903

Zeroable, Subtarget, DAG))

16904

return Shift;

16905

16906

// If we have VLX support, we can use VALIGN or EXPAND.

16907

if (Subtarget.hasVLX()) {

16908

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,

16909

Subtarget, DAG))

16910

return Rotate;

16911

16912

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,

16913

DAG, Subtarget))

16914

return V;

16915

}

16916

16917

// Try to use byte rotation instructions.

16918

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,

16919

Subtarget, DAG))

16920

return Rotate;

16921

16922

// Try to create an in-lane repeating shuffle mask and then shuffle the

16923

// results into the target lanes.

16924

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

16925

DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))

16926

return V;

16927

16928

if (V2.isUndef()) {

16929

// Try to produce a fixed cross-128-bit lane permute followed by unpack

16930

// because that should be faster than the variable permute alternatives.

16931

if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))

16932

return V;

16933

16934

// If the shuffle patterns aren't repeated but it's a single input, directly

16935

// generate a cross-lane VPERMD instruction.

16936

SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);

16937

return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);

16938

}

16939

16940

// Assume that a single SHUFPS is faster than an alternative sequence of

16941

// multiple instructions (even if the CPU has a domain penalty).

16942

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

16943

if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {

16944

SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);

16945

SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);

16946

SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,

16947

CastV1, CastV2, DAG);

16948

return DAG.getBitcast(MVT::v8i32, ShufPS);

16949

}

16950

16951

// Try to simplify this by merging 128-bit lanes to enable a lane-based

16952

// shuffle.

16953

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

16954

DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))

16955

return Result;

16956

16957

// Otherwise fall back on generic blend lowering.

16958

return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,

16959

Subtarget, DAG);

16960

}

16961

16962

/// Handle lowering of 16-lane 16-bit integer shuffles.

16963

///

16964

/// This routine is only called when we have AVX2 and thus a reasonable

16965

/// instruction set for v16i16 shuffling..

16966

static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

16967

const APInt &Zeroable, SDValue V1, SDValue V2,

16968

const X86Subtarget &Subtarget,

16969

SelectionDAG &DAG) {

16970

assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v16i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16970, __PRETTY_FUNCTION__));

16971

assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v16i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16971, __PRETTY_FUNCTION__));

16972

assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 16 && \"Unexpected mask size for v16 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16972, __PRETTY_FUNCTION__));

16973

assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!")((Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v16i16 with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16973, __PRETTY_FUNCTION__));

16974

16975

// Whenever we can lower this as a zext, that instruction is strictly faster

16976

// than any alternative. It also allows us to fold memory operands into the

16977

// shuffle in many cases.

16978

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(

16979

DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

16980

return ZExt;

16981

16982

// Check for being able to broadcast a single element.

16983

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,

16984

Subtarget, DAG))

16985

return Broadcast;

16986

16987

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,

16988

Zeroable, Subtarget, DAG))

16989

return Blend;

16990

16991

// Use dedicated unpack instructions for masks that match their pattern.

16992

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))

16993

return V;

16994

16995

// Use dedicated pack instructions for masks that match their pattern.

16996

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,

16997

Subtarget))

16998

return V;

16999

17000

// Try to use lower using a truncation.

17001

if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,

17002

Subtarget, DAG))

17003

return V;

17004

17005

// Try to use shift instructions.

17006

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,

17007

Zeroable, Subtarget, DAG))

17008

return Shift;

17009

17010

// Try to use byte rotation instructions.

17011

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,

17012

Subtarget, DAG))

17013

return Rotate;

17014

17015

// Try to create an in-lane repeating shuffle mask and then shuffle the

17016

// results into the target lanes.

17017

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

17018

DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))

17019

return V;

17020

17021

if (V2.isUndef()) {

17022

// Try to use bit rotation instructions.

17023

if (SDValue Rotate =

17024

lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))

17025

return Rotate;

17026

17027

// Try to produce a fixed cross-128-bit lane permute followed by unpack

17028

// because that should be faster than the variable permute alternatives.

17029

if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))

17030

return V;

17031

17032

// There are no generalized cross-lane shuffle operations available on i16

17033

// element types.

17034

if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {

17035

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(

17036

DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))

17037

return V;

17038

17039

return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,

17040

DAG, Subtarget);

17041

}

17042

17043

SmallVector<int, 8> RepeatedMask;

17044

if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {

17045

// As this is a single-input shuffle, the repeated mask should be

17046

// a strictly valid v8i16 mask that we can pass through to the v8i16

17047

// lowering to handle even the v16 case.

17048

return lowerV8I16GeneralSingleInputShuffle(

17049

DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);

17050

}

17051

}

17052

17053

if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,

17054

Zeroable, Subtarget, DAG))

17055

return PSHUFB;

17056

17057

// AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).

17058

if (Subtarget.hasBWI())

17059

return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);

17060

17061

// Try to simplify this by merging 128-bit lanes to enable a lane-based

17062

// shuffle.

17063

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

17064

DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))

17065

return Result;

17066

17067

// Try to permute the lanes and then use a per-lane permute.

17068

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(

17069

DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))

17070

return V;

17071

17072

// Otherwise fall back on generic lowering.

17073

return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,

17074

Subtarget, DAG);

17075

}

17076

17077

/// Handle lowering of 32-lane 8-bit integer shuffles.

17078

///

17079

/// This routine is only called when we have AVX2 and thus a reasonable

17080

/// instruction set for v32i8 shuffling..

17081

static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17082

const APInt &Zeroable, SDValue V1, SDValue V2,

17083

const X86Subtarget &Subtarget,

17084

SelectionDAG &DAG) {

17085

assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v32i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17085, __PRETTY_FUNCTION__));

17086

assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v32i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17086, __PRETTY_FUNCTION__));

17087

assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 32 && \"Unexpected mask size for v32 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17087, __PRETTY_FUNCTION__));

17088

assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!")((Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v32i8 with AVX2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17088, __PRETTY_FUNCTION__));

17089

17090

// Whenever we can lower this as a zext, that instruction is strictly faster

17091

// than any alternative. It also allows us to fold memory operands into the

17092

// shuffle in many cases.

17093

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,

17094

Zeroable, Subtarget, DAG))

17095

return ZExt;

17096

17097

// Check for being able to broadcast a single element.

17098

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,

17099

Subtarget, DAG))

17100

return Broadcast;

17101

17102

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,

17103

Zeroable, Subtarget, DAG))

17104

return Blend;

17105

17106

// Use dedicated unpack instructions for masks that match their pattern.

17107

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))

17108

return V;

17109

17110

// Use dedicated pack instructions for masks that match their pattern.

17111

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,

17112

Subtarget))

17113

return V;

17114

17115

// Try to use lower using a truncation.

17116

if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,

17117

Subtarget, DAG))

17118

return V;

17119

17120

// Try to use shift instructions.

17121

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,

17122

Zeroable, Subtarget, DAG))

17123

return Shift;

17124

17125

// Try to use byte rotation instructions.

17126

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,

17127

Subtarget, DAG))

17128

return Rotate;

17129

17130

// Try to use bit rotation instructions.

17131

if (V2.isUndef())

17132

if (SDValue Rotate =

17133

lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))

17134

return Rotate;

17135

17136

// Try to create an in-lane repeating shuffle mask and then shuffle the

17137

// results into the target lanes.

17138

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

17139

DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))

17140

return V;

17141

17142

// There are no generalized cross-lane shuffle operations available on i8

17143

// element types.

17144

if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {

17145

// Try to produce a fixed cross-128-bit lane permute followed by unpack

17146

// because that should be faster than the variable permute alternatives.

17147

if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))

17148

return V;

17149

17150

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(

17151

DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))

17152

return V;

17153

17154

return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,

17155

DAG, Subtarget);

17156

}

17157

17158

if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,

17159

Zeroable, Subtarget, DAG))

17160

return PSHUFB;

17161

17162

// AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).

17163

if (Subtarget.hasVBMI())

17164

return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);

17165

17166

// Try to simplify this by merging 128-bit lanes to enable a lane-based

17167

// shuffle.

17168

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

17169

DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))

17170

return Result;

17171

17172

// Try to permute the lanes and then use a per-lane permute.

17173

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(

17174

DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))

17175

return V;

17176

17177

// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed

17178

// by zeroable elements in the remaining 24 elements. Turn this into two

17179

// vmovqb instructions shuffled together.

17180

if (Subtarget.hasVLX())

17181

if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,

17182

Mask, Zeroable, DAG))

17183

return V;

17184

17185

// Otherwise fall back on generic lowering.

17186

return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,

17187

Subtarget, DAG);

17188

}

17189

17190

/// High-level routine to lower various 256-bit x86 vector shuffles.

17191

///

17192

/// This routine either breaks down the specific type of a 256-bit x86 vector

17193

/// shuffle or splits it into two 128-bit shuffles and fuses the results back

17194

/// together based on the available instructions.

17195

static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,

17196

SDValue V1, SDValue V2, const APInt &Zeroable,

17197

const X86Subtarget &Subtarget,

17198

SelectionDAG &DAG) {

17199

// If we have a single input to the zero element, insert that into V1 if we

17200

// can do so cheaply.

17201

int NumElts = VT.getVectorNumElements();

17202

int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

17203

17204

if (NumV2Elements == 1 && Mask[0] >= NumElts)

17205

if (SDValue Insertion = lowerShuffleAsElementInsertion(

17206

DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))

17207

return Insertion;

17208

17209

// Handle special cases where the lower or upper half is UNDEF.

17210

if (SDValue V =

17211

lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))

17212

return V;

17213

17214

// There is a really nice hard cut-over between AVX1 and AVX2 that means we

17215

// can check for those subtargets here and avoid much of the subtarget

17216

// querying in the per-vector-type lowering routines. With AVX1 we have

17217

// essentially *zero* ability to manipulate a 256-bit vector with integer

17218

// types. Since we'll use floating point types there eventually, just

17219

// immediately cast everything to a float and operate entirely in that domain.

17220

if (VT.isInteger() && !Subtarget.hasAVX2()) {

17221

int ElementBits = VT.getScalarSizeInBits();

17222

if (ElementBits < 32) {

17223

// No floating point type available, if we can't use the bit operations

17224

// for masking/blending then decompose into 128-bit vectors.

17225

if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,

17226

Subtarget, DAG))

17227

return V;

17228

if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))

17229

return V;

17230

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

17231

}

17232

17233

MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),

17234

VT.getVectorNumElements());

17235

V1 = DAG.getBitcast(FpVT, V1);

17236

V2 = DAG.getBitcast(FpVT, V2);

17237

return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));

17238

}

17239

17240

switch (VT.SimpleTy) {

17241

case MVT::v4f64:

17242

return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17243

case MVT::v4i64:

17244

return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17245

case MVT::v8f32:

17246

return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17247

case MVT::v8i32:

17248

return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17249

case MVT::v16i16:

17250

return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17251

case MVT::v32i8:

17252

return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17253

17254

default:

17255

llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17255);

17256

}

17257

}

17258

17259

/// Try to lower a vector shuffle as a 128-bit shuffles.

17260

static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,

17261

const APInt &Zeroable, SDValue V1, SDValue V2,

17262

const X86Subtarget &Subtarget,

17263

SelectionDAG &DAG) {

17264

assert(VT.getScalarSizeInBits() == 64 &&((VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17265, __PRETTY_FUNCTION__))

17265

"Unexpected element type size for 128bit shuffle.")((VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17265, __PRETTY_FUNCTION__));

17266

17267

// To handle 256 bit vector requires VLX and most probably

17268

// function lowerV2X128VectorShuffle() is better solution.

17269

assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.")((VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."
) ? static_cast<void> (0) : __assert_fail ("VT.is512BitVector() && \"Unexpected vector size for 512bit shuffle.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17269, __PRETTY_FUNCTION__));

17270

17271

// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?

17272

SmallVector<int, 4> Widened128Mask;

17273

if (!canWidenShuffleElements(Mask, Widened128Mask))

17274

return SDValue();

17275

assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch")((Widened128Mask.size() == 4 && "Shuffle widening mismatch"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask.size() == 4 && \"Shuffle widening mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17275, __PRETTY_FUNCTION__));

17276

17277

// Try to use an insert into a zero vector.

17278

if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&

17279

(Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {

17280

unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;

17281

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);

17282

SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,

17283

DAG.getIntPtrConstant(0, DL));

17284

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,

17285

getZeroVector(VT, Subtarget, DAG, DL), LoV,

17286

DAG.getIntPtrConstant(0, DL));

17287

}

17288

17289

// Check for patterns which can be matched with a single insert of a 256-bit

17290

// subvector.

17291

bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});

17292

if (OnlyUsesV1 ||

17293

isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {

17294

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);

17295

SDValue SubVec =

17296

DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,

17297

DAG.getIntPtrConstant(0, DL));

17298

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,

17299

DAG.getIntPtrConstant(4, DL));

17300

}

17301

17302

// See if this is an insertion of the lower 128-bits of V2 into V1.

17303

bool IsInsert = true;

17304

int V2Index = -1;

17305

for (int i = 0; i < 4; ++i) {

17306

assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask[i] >= -1 && \"Illegal shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17306, __PRETTY_FUNCTION__));

17307

if (Widened128Mask[i] < 0)

17308

continue;

17309

17310

// Make sure all V1 subvectors are in place.

17311

if (Widened128Mask[i] < 4) {

17312

if (Widened128Mask[i] != i) {

17313

IsInsert = false;

17314

break;

17315

}

17316

} else {

17317

// Make sure we only have a single V2 index and its the lowest 128-bits.

17318

if (V2Index >= 0 || Widened128Mask[i] != 4) {

17319

IsInsert = false;

17320

break;

17321

}

17322

V2Index = i;

17323

}

17324

}

17325

if (IsInsert && V2Index >= 0) {

17326

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);

17327

SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,

17328

DAG.getIntPtrConstant(0, DL));

17329

return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);

17330

}

17331

17332

// See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane

17333

// UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where

17334

// possible we at least ensure the lanes stay sequential to help later

17335

// combines.

17336

SmallVector<int, 2> Widened256Mask;

17337

if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {

17338

Widened128Mask.clear();

17339

narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);

17340

}

17341

17342

// Try to lower to vshuf64x2/vshuf32x4.

17343

SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

17344

unsigned PermMask = 0;

17345

// Insure elements came from the same Op.

17346

for (int i = 0; i < 4; ++i) {

17347

assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask[i] >= -1 && \"Illegal shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17347, __PRETTY_FUNCTION__));

17348

if (Widened128Mask[i] < 0)

17349

continue;

17350

17351

SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;

17352

unsigned OpIndex = i / 2;

17353

if (Ops[OpIndex].isUndef())

17354

Ops[OpIndex] = Op;

17355

else if (Ops[OpIndex] != Op)

17356

return SDValue();

17357

17358

// Convert the 128-bit shuffle mask selection values into 128-bit selection

17359

// bits defined by a vshuf64x2 instruction's immediate control byte.

17360

PermMask |= (Widened128Mask[i] % 4) << (i * 2);

17361

}

17362

17363

return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],

17364

DAG.getTargetConstant(PermMask, DL, MVT::i8));

17365

}

17366

17367

/// Handle lowering of 8-lane 64-bit floating point shuffles.

17368

static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17369

const APInt &Zeroable, SDValue V1, SDValue V2,

17370

const X86Subtarget &Subtarget,

17371

SelectionDAG &DAG) {

17372

assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v8f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17372, __PRETTY_FUNCTION__));

17373

assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v8f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17373, __PRETTY_FUNCTION__));

17374

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17374, __PRETTY_FUNCTION__));

17375

17376

if (V2.isUndef()) {

17377

// Use low duplicate instructions for masks that match their pattern.

17378

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))

17379

return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

17380

17381

if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {

17382

// Non-half-crossing single input shuffles can be lowered with an

17383

// interleaved permutation.

17384

unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |

17385

((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |

17386

((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |

17387

((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);

17388

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,

17389

DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));

17390

}

17391

17392

SmallVector<int, 4> RepeatedMask;

17393

if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))

17394

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,

17395

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

17396

}

17397

17398

if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,

17399

V2, Subtarget, DAG))

17400

return Shuf128;

17401

17402

if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))

17403

return Unpck;

17404

17405

// Check if the blend happens to exactly fit that of SHUFPD.

17406

if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,

17407

Zeroable, Subtarget, DAG))

17408

return Op;

17409

17410

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,

17411

DAG, Subtarget))

17412

return V;

17413

17414

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,

17415

Zeroable, Subtarget, DAG))

17416

return Blend;

17417

17418

return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);

17419

}

17420

17421

/// Handle lowering of 16-lane 32-bit floating point shuffles.

17422

static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17423

const APInt &Zeroable, SDValue V1, SDValue V2,

17424

const X86Subtarget &Subtarget,

17425

SelectionDAG &DAG) {

17426

assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v16f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17426, __PRETTY_FUNCTION__));

17427

assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v16f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17427, __PRETTY_FUNCTION__));

17428

assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 16 && \"Unexpected mask size for v16 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17428, __PRETTY_FUNCTION__));

17429

17430

// If the shuffle mask is repeated in each 128-bit lane, we have many more

17431

// options to efficiently lower the shuffle.

17432

SmallVector<int, 4> RepeatedMask;

17433

if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {

17434

assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((RepeatedMask.size() == 4 && "Unexpected repeated mask size!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17434, __PRETTY_FUNCTION__));

17435

17436

// Use even/odd duplicate instructions for masks that match their pattern.

17437

if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))

17438

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);

17439

if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))

17440

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

17441

17442

if (V2.isUndef())

17443

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,

17444

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

17445

17446

// Use dedicated unpack instructions for masks that match their pattern.

17447

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))

17448

return V;

17449

17450

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,

17451

Zeroable, Subtarget, DAG))

17452

return Blend;

17453

17454

// Otherwise, fall back to a SHUFPS sequence.

17455

return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);

17456

}

17457

17458

// Try to create an in-lane repeating shuffle mask and then shuffle the

17459

// results into the target lanes.

17460

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

17461

DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))

17462

return V;

17463

17464

// If we have a single input shuffle with different shuffle patterns in the

17465

// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.

17466

if (V2.isUndef() &&

17467

!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {

17468

SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);

17469

return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);

17470

}

17471

17472

// If we have AVX512F support, we can use VEXPAND.

17473

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,

17474

V1, V2, DAG, Subtarget))

17475

return V;

17476

17477

return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);

17478

}

17479

17480

/// Handle lowering of 8-lane 64-bit integer shuffles.

17481

static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17482

const APInt &Zeroable, SDValue V1, SDValue V2,

17483

const X86Subtarget &Subtarget,

17484

SelectionDAG &DAG) {

17485

assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v8i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17485, __PRETTY_FUNCTION__));

17486

assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v8i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17486, __PRETTY_FUNCTION__));

17487

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17487, __PRETTY_FUNCTION__));

17488

17489

if (V2.isUndef()) {

17490

// When the shuffle is mirrored between the 128-bit lanes of the unit, we

17491

// can use lower latency instructions that will operate on all four

17492

// 128-bit lanes.

17493

SmallVector<int, 2> Repeated128Mask;

17494

if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {

17495

SmallVector<int, 4> PSHUFDMask;

17496

narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);

17497

return DAG.getBitcast(

17498

MVT::v8i64,

17499

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,

17500

DAG.getBitcast(MVT::v16i32, V1),

17501

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

17502

}

17503

17504

SmallVector<int, 4> Repeated256Mask;

17505

if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))

17506

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,

17507

getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));

17508

}

17509

17510

if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,

17511

V2, Subtarget, DAG))

17512

return Shuf128;

17513

17514

// Try to use shift instructions.

17515

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,

17516

Zeroable, Subtarget, DAG))

17517

return Shift;

17518

17519

// Try to use VALIGN.

17520

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,

17521

Subtarget, DAG))

17522

return Rotate;

17523

17524

// Try to use PALIGNR.

17525

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,

17526

Subtarget, DAG))

17527

return Rotate;

17528

17529

if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))

17530

return Unpck;

17531

// If we have AVX512F support, we can use VEXPAND.

17532

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,

17533

DAG, Subtarget))

17534

return V;

17535

17536

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,

17537

Zeroable, Subtarget, DAG))

17538

return Blend;

17539

17540

return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);

17541

}

17542

17543

/// Handle lowering of 16-lane 32-bit integer shuffles.

17544

static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17545

const APInt &Zeroable, SDValue V1, SDValue V2,

17546

const X86Subtarget &Subtarget,

17547

SelectionDAG &DAG) {

17548

assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v16i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17548, __PRETTY_FUNCTION__));

17549

assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v16i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17549, __PRETTY_FUNCTION__));

17550

assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 16 && \"Unexpected mask size for v16 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17550, __PRETTY_FUNCTION__));

17551

17552

// Whenever we can lower this as a zext, that instruction is strictly faster

17553

// than any alternative. It also allows us to fold memory operands into the

17554

// shuffle in many cases.

17555

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(

17556

DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

17557

return ZExt;

17558

17559

// If the shuffle mask is repeated in each 128-bit lane we can use more

17560

// efficient instructions that mirror the shuffles across the four 128-bit

17561

// lanes.

17562

SmallVector<int, 4> RepeatedMask;

17563

bool Is128BitLaneRepeatedShuffle =

17564

is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);

17565

if (Is128BitLaneRepeatedShuffle) {

17566

assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((RepeatedMask.size() == 4 && "Unexpected repeated mask size!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17566, __PRETTY_FUNCTION__));

17567

if (V2.isUndef())

17568

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,

17569

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

17570

17571

// Use dedicated unpack instructions for masks that match their pattern.

17572

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))

17573

return V;

17574

}

17575

17576

// Try to use shift instructions.

17577

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,

17578

Zeroable, Subtarget, DAG))

17579

return Shift;

17580

17581

// Try to use VALIGN.

17582

if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,

17583

Subtarget, DAG))

17584

return Rotate;

17585

17586

// Try to use byte rotation instructions.

17587

if (Subtarget.hasBWI())

17588

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,

17589

Subtarget, DAG))

17590

return Rotate;

17591

17592

// Assume that a single SHUFPS is faster than using a permv shuffle.

17593

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

17594

if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {

17595

SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);

17596

SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);

17597

SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,

17598

CastV1, CastV2, DAG);

17599

return DAG.getBitcast(MVT::v16i32, ShufPS);

17600

}

17601

17602

// Try to create an in-lane repeating shuffle mask and then shuffle the

17603

// results into the target lanes.

17604

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

17605

DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))

17606

return V;

17607

17608

// If we have AVX512F support, we can use VEXPAND.

17609

if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,

17610

DAG, Subtarget))

17611

return V;

17612

17613

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,

17614

Zeroable, Subtarget, DAG))

17615

return Blend;

17616

17617

return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);

17618

}

17619

17620

/// Handle lowering of 32-lane 16-bit integer shuffles.

17621

static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17622

const APInt &Zeroable, SDValue V1, SDValue V2,

17623

const X86Subtarget &Subtarget,

17624

SelectionDAG &DAG) {

17625

assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v32i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17625, __PRETTY_FUNCTION__));

17626

assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v32i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17626, __PRETTY_FUNCTION__));

17627

assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 32 && \"Unexpected mask size for v32 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17627, __PRETTY_FUNCTION__));

17628

assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")((Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"We can only lower v32i16 with AVX-512-BWI!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17628, __PRETTY_FUNCTION__));

17629

17630

// Whenever we can lower this as a zext, that instruction is strictly faster

17631

// than any alternative. It also allows us to fold memory operands into the

17632

// shuffle in many cases.

17633

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(

17634

DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

17635

return ZExt;

17636

17637

// Use dedicated unpack instructions for masks that match their pattern.

17638

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))

17639

return V;

17640

17641

// Use dedicated pack instructions for masks that match their pattern.

17642

if (SDValue V =

17643

lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))

17644

return V;

17645

17646

// Try to use shift instructions.

17647

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,

17648

Zeroable, Subtarget, DAG))

17649

return Shift;

17650

17651

// Try to use byte rotation instructions.

17652

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,

17653

Subtarget, DAG))

17654

return Rotate;

17655

17656

if (V2.isUndef()) {

17657

// Try to use bit rotation instructions.

17658

if (SDValue Rotate =

17659

lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))

17660

return Rotate;

17661

17662

SmallVector<int, 8> RepeatedMask;

17663

if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {

17664

// As this is a single-input shuffle, the repeated mask should be

17665

// a strictly valid v8i16 mask that we can pass through to the v8i16

17666

// lowering to handle even the v32 case.

17667

return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,

17668

RepeatedMask, Subtarget, DAG);

17669

}

17670

}

17671

17672

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,

17673

Zeroable, Subtarget, DAG))

17674

return Blend;

17675

17676

if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,

17677

Zeroable, Subtarget, DAG))

17678

return PSHUFB;

17679

17680

return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);

17681

}

17682

17683

/// Handle lowering of 64-lane 8-bit integer shuffles.

17684

static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

17685

const APInt &Zeroable, SDValue V1, SDValue V2,

17686

const X86Subtarget &Subtarget,

17687

SelectionDAG &DAG) {

17688

assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V1.getSimpleValueType() == MVT::v64i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17688, __PRETTY_FUNCTION__));

17689

assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"
) ? static_cast<void> (0) : __assert_fail ("V2.getSimpleValueType() == MVT::v64i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17689, __PRETTY_FUNCTION__));

17690

assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!")((Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"
) ? static_cast<void> (0) : __assert_fail ("Mask.size() == 64 && \"Unexpected mask size for v64 shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17690, __PRETTY_FUNCTION__));

17691

assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!")((Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"We can only lower v64i8 with AVX-512-BWI!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17691, __PRETTY_FUNCTION__));

17692

17693

// Whenever we can lower this as a zext, that instruction is strictly faster

17694

// than any alternative. It also allows us to fold memory operands into the

17695

// shuffle in many cases.

17696

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(

17697

DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

17698

return ZExt;

17699

17700

// Use dedicated unpack instructions for masks that match their pattern.

17701

if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))

17702

return V;

17703

17704

// Use dedicated pack instructions for masks that match their pattern.

17705

if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,

17706

Subtarget))

17707

return V;

17708

17709

// Try to use shift instructions.

17710

if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,

17711

Zeroable, Subtarget, DAG))

17712

return Shift;

17713

17714

// Try to use byte rotation instructions.

17715

if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,

17716

Subtarget, DAG))

17717

return Rotate;

17718

17719

// Try to use bit rotation instructions.

17720

if (V2.isUndef())

17721

if (SDValue Rotate =

17722

lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))

17723

return Rotate;

17724

17725

// Lower as AND if possible.

17726

if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,

17727

Zeroable, Subtarget, DAG))

17728

return Masked;

17729

17730

if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,

17731

Zeroable, Subtarget, DAG))

17732

return PSHUFB;

17733

17734

// VBMI can use VPERMV/VPERMV3 byte shuffles.

17735

if (Subtarget.hasVBMI())

17736

return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);

17737

17738

// Try to create an in-lane repeating shuffle mask and then shuffle the

17739

// results into the target lanes.

17740

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

17741

DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))

17742

return V;

17743

17744

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,

17745

Zeroable, Subtarget, DAG))

17746

return Blend;

17747

17748

// Try to simplify this by merging 128-bit lanes to enable a lane-based

17749

// shuffle.

17750

if (!V2.isUndef())

17751

if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(

17752

DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))

17753

return Result;

17754

17755

// FIXME: Implement direct support for this type!

17756

return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);

17757

}

17758

17759

/// High-level routine to lower various 512-bit x86 vector shuffles.

17760

///

17761

/// This routine either breaks down the specific type of a 512-bit x86 vector

17762

/// shuffle or splits it into two 256-bit shuffles and fuses the results back

17763

/// together based on the available instructions.

17764

static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,

17765

MVT VT, SDValue V1, SDValue V2,

17766

const APInt &Zeroable,

17767

const X86Subtarget &Subtarget,

17768

SelectionDAG &DAG) {

17769

assert(Subtarget.hasAVX512() &&((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17770, __PRETTY_FUNCTION__))

17770

"Cannot lower 512-bit vectors w/ basic ISA!")((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17770, __PRETTY_FUNCTION__));

17771

17772

// If we have a single input to the zero element, insert that into V1 if we

17773

// can do so cheaply.

17774

int NumElts = Mask.size();

17775

int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

17776

17777

if (NumV2Elements == 1 && Mask[0] >= NumElts)

17778

if (SDValue Insertion = lowerShuffleAsElementInsertion(

17779

DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))

17780

return Insertion;

17781

17782

// Handle special cases where the lower or upper half is UNDEF.

17783

if (SDValue V =

17784

lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))

17785

return V;

17786

17787

// Check for being able to broadcast a single element.

17788

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,

17789

Subtarget, DAG))

17790

return Broadcast;

17791

17792

if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {

17793

// Try using bit ops for masking and blending before falling back to

17794

// splitting.

17795

if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,

17796

Subtarget, DAG))

17797

return V;

17798

if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))

17799

return V;

17800

17801

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

17802

}

17803

17804

// Dispatch to each element type for lowering. If we don't have support for

17805

// specific element type shuffles at 512 bits, immediately split them and

17806

// lower them. Each lowering routine of a given type is allowed to assume that

17807

// the requisite ISA extensions for that element type are available.

17808

switch (VT.SimpleTy) {

17809

case MVT::v8f64:

17810

return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17811

case MVT::v16f32:

17812

return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17813

case MVT::v8i64:

17814

return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17815

case MVT::v16i32:

17816

return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17817

case MVT::v32i16:

17818

return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17819

case MVT::v64i8:

17820

return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

17821

17822

default:

17823

llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17823);

17824

}

17825

}

17826

17827

/// Try to lower a unary vXi1 shuffle as a single KSHIFTR.
///
/// The shuffle matches when every defined mask element selects the source
/// element a constant, strictly positive distance above its own position
/// (Mask[i] - i is the same positive value for all non-undef i). Undef
/// elements are ignored while matching.
///
/// \param DL        Debug location attached to every node created here.
/// \param Mask      Shuffle mask; entries are undef or in [0, Mask.size()).
/// \param VT        vXi1 type of the shuffle.
/// \param V1        The shuffled input.
/// \param V2        Must be undef for the match to be attempted.
/// \param Subtarget Target feature query object (DQI selects the wide type).
/// \param DAG       DAG in which the replacement nodes are built.
/// \returns the shifted value, or an empty SDValue if the pattern does not
///          match.
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  // Shuffle should be unary.
  if (!V2.isUndef())
    return SDValue();

  int ShiftAmt = -1;
  int NumElts = Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // The first non-undef element determines our shift amount.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      // Need to be shifting right.
      if (ShiftAmt <= 0)
        return SDValue();
    }
    // All non-undef elements must shift by the same amount.
    if (ShiftAmt != M - i)
      return SDValue();
  }
  assert(ShiftAmt >= 0 && "All undef?");

  // Great we found a shift right.
  // Masks narrower than 8 elements, or exactly 8 elements without DQI, are
  // widened (v8i1 with DQI, otherwise v16i1) before the shift is emitted.
  MVT WideVT = VT;
  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
    WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  // Place V1 in the low elements of an otherwise undef wide vector.
  SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
                            DAG.getUNDEF(WideVT), V1,
                            DAG.getIntPtrConstant(0, DL));
  Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
                    DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
  // Narrow back to the original type.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                     DAG.getIntPtrConstant(0, DL));
}

17869

17870

// Determine if this shuffle can be implemented with a KSHIFT instruction.

17871

// Returns the shift amount if possible or -1 if not. This is a simplified

17872

// version of matchShuffleAsShift.

17873

static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,

17874

int MaskOffset, const APInt &Zeroable) {

17875

int Size = Mask.size();

17876

17877

auto CheckZeros = [&](int Shift, bool Left) {

17878

for (int j = 0; j < Shift; ++j)

17879

if (!Zeroable[j + (Left ? 0 : (Size - Shift))])

17880

return false;

17881

17882

return true;

17883

};

17884

17885

auto MatchShift = [&](int Shift, bool Left) {

17886

unsigned Pos = Left ? Shift : 0;

17887

unsigned Low = Left ? 0 : Shift;

17888

unsigned Len = Size - Shift;

17889

return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);

17890

};

17891

17892

for (int Shift = 1; Shift != Size; ++Shift)

17893

for (bool Left : {true, false})

17894

if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {

17895

Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;

17896

return Shift;

17897

}

17898

17899

return -1;

17900

}

17901

17902

17903

// Lower vXi1 vector shuffles.

17904

// There is no a dedicated instruction on AVX-512 that shuffles the masks.

17905

// The only way to shuffle bits is to sign-extend the mask vector to SIMD

17906

// vector, shuffle and then truncate it back.

17907

static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,

17908

MVT VT, SDValue V1, SDValue V2,

17909

const APInt &Zeroable,

17910

const X86Subtarget &Subtarget,

17911

SelectionDAG &DAG) {

17912

assert(Subtarget.hasAVX512() &&((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17913, __PRETTY_FUNCTION__))

17913

"Cannot lower 512-bit vectors w/o basic ISA!")((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17913, __PRETTY_FUNCTION__));

17914

17915

int NumElts = Mask.size();

17916

17917

// Try to recognize shuffles that are just padding a subvector with zeros.

17918

int SubvecElts = 0;

17919

int Src = -1;

17920

for (int i = 0; i != NumElts; ++i) {

17921

if (Mask[i] >= 0) {

17922

// Grab the source from the first valid mask. All subsequent elements need

17923

// to use this same source.

17924

if (Src < 0)

17925

Src = Mask[i] / NumElts;

17926

if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)

17927

break;

17928

}

17929

17930

++SubvecElts;

17931

}

17932

assert(SubvecElts != NumElts && "Identity shuffle?")((SubvecElts != NumElts && "Identity shuffle?") ? static_cast
<void> (0) : __assert_fail ("SubvecElts != NumElts && \"Identity shuffle?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17932, __PRETTY_FUNCTION__));

17933

17934

// Clip to a power 2.

17935

SubvecElts = PowerOf2Floor(SubvecElts);

17936

17937

// Make sure the number of zeroable bits in the top at least covers the bits

17938

// not covered by the subvector.

17939

if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {

17940

assert(Src >= 0 && "Expected a source!")((Src >= 0 && "Expected a source!") ? static_cast<
void> (0) : __assert_fail ("Src >= 0 && \"Expected a source!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17940, __PRETTY_FUNCTION__));

17941

MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);

17942

SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,

17943

Src == 0 ? V1 : V2,

17944

DAG.getIntPtrConstant(0, DL));

17945

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,

17946

DAG.getConstant(0, DL, VT),

17947

Extract, DAG.getIntPtrConstant(0, DL));

17948

}

17949

17950

// Try a simple shift right with undef elements. Later we'll try with zeros.

17951

if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,

17952

DAG))

17953

return Shift;

17954

17955

// Try to match KSHIFTs.

17956

unsigned Offset = 0;

17957

for (SDValue V : { V1, V2 }) {

17958

unsigned Opcode;

17959

int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);

17960

if (ShiftAmt >= 0) {

17961

MVT WideVT = VT;

17962

if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)

17963

WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

17964

SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,

17965

DAG.getUNDEF(WideVT), V,

17966

DAG.getIntPtrConstant(0, DL));

17967

// Widened right shifts need two shifts to ensure we shift in zeroes.

17968

if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {

17969

int WideElts = WideVT.getVectorNumElements();

17970

// Shift left to put the original vector in the MSBs of the new size.

17971

Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,

17972

DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));

17973

// Increase the shift amount to account for the left shift.

17974

ShiftAmt += WideElts - NumElts;

17975

}

17976

17977

Res = DAG.getNode(Opcode, DL, WideVT, Res,

17978

DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));

17979

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,

17980

DAG.getIntPtrConstant(0, DL));

17981

}

17982

Offset += NumElts; // Increment for next iteration.

17983

}

17984

17985

17986

17987

MVT ExtVT;

17988

switch (VT.SimpleTy) {

17989

default:

17990

llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17990);

17991

case MVT::v2i1:

17992

ExtVT = MVT::v2i64;

17993

break;

17994

case MVT::v4i1:

17995

ExtVT = MVT::v4i32;

17996

break;

17997

case MVT::v8i1:

17998

// Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit

17999

// shuffle.

18000

ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;

18001

break;

18002

case MVT::v16i1:

18003

// Take 512-bit type, unless we are avoiding 512-bit types and have the

18004

// 256-bit operation available.

18005

ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;

18006

break;

18007

case MVT::v32i1:

18008

// Take 512-bit type, unless we are avoiding 512-bit types and have the

18009

// 256-bit operation available.

18010

assert(Subtarget.hasBWI() && "Expected AVX512BW support")((Subtarget.hasBWI() && "Expected AVX512BW support") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"Expected AVX512BW support\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18010, __PRETTY_FUNCTION__));

18011

ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;

18012

break;

18013

case MVT::v64i1:

18014

// Fall back to scalarization. FIXME: We can do better if the shuffle

18015

// can be partitioned cleanly.

18016

if (!Subtarget.useBWIRegs())

18017

return SDValue();

18018

ExtVT = MVT::v64i8;

18019

break;

18020

}

18021

18022

V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

18023

V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

18024

18025

SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);

18026

// i1 was sign extended we can use X86ISD::CVT2MASK.

18027

int NumElems = VT.getVectorNumElements();

18028

if ((Subtarget.hasBWI() && (NumElems >= 32)) ||

18029

(Subtarget.hasDQI() && (NumElems < 32)))

18030

return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),

18031

Shuffle, ISD::SETGT);

18032

18033

return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);

18034

}

18035

18036

/// Helper function that returns true if the shuffle mask should be

18037

/// commuted to improve canonicalization.

18038

static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {

18039

int NumElements = Mask.size();

18040

18041

int NumV1Elements = 0, NumV2Elements = 0;

18042

for (int M : Mask)

18043

if (M < 0)

18044

continue;

18045

else if (M < NumElements)

18046

++NumV1Elements;

18047

else

18048

++NumV2Elements;

18049

18050

// Commute the shuffle as needed such that more elements come from V1 than

18051

// V2. This allows us to match the shuffle pattern strictly on how many

18052

// elements come from V1 without handling the symmetric cases.

18053

if (NumV2Elements > NumV1Elements)

18054

return true;

18055

18056

assert(NumV1Elements > 0 && "No V1 indices")((NumV1Elements > 0 && "No V1 indices") ? static_cast
<void> (0) : __assert_fail ("NumV1Elements > 0 && \"No V1 indices\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18056, __PRETTY_FUNCTION__));

18057

18058

if (NumV2Elements == 0)

18059

return false;

18060

18061

// When the number of V1 and V2 elements are the same, try to minimize the

18062

// number of uses of V2 in the low half of the vector. When that is tied,

18063

// ensure that the sum of indices for V1 is equal to or lower than the sum

18064

// indices for V2. When those are equal, try to ensure that the number of odd

18065

// indices for V1 is lower than the number of odd indices for V2.

18066

if (NumV1Elements == NumV2Elements) {

18067

int LowV1Elements = 0, LowV2Elements = 0;

18068

for (int M : Mask.slice(0, NumElements / 2))

18069

if (M >= NumElements)

18070

++LowV2Elements;

18071

else if (M >= 0)

18072

++LowV1Elements;

18073

if (LowV2Elements > LowV1Elements)

18074

return true;

18075

if (LowV2Elements == LowV1Elements) {

18076

int SumV1Indices = 0, SumV2Indices = 0;

18077

for (int i = 0, Size = Mask.size(); i < Size; ++i)

18078

if (Mask[i] >= NumElements)

18079

SumV2Indices += i;

18080

else if (Mask[i] >= 0)

18081

SumV1Indices += i;

18082

if (SumV2Indices < SumV1Indices)

18083

return true;

18084

if (SumV2Indices == SumV1Indices) {

18085

int NumV1OddIndices = 0, NumV2OddIndices = 0;

18086

for (int i = 0, Size = Mask.size(); i < Size; ++i)

18087

if (Mask[i] >= NumElements)

18088

NumV2OddIndices += i % 2;

18089

else if (Mask[i] >= 0)

18090

NumV1OddIndices += i % 2;

18091

if (NumV2OddIndices < NumV1OddIndices)

18092

return true;

18093

}

18094

}

18095

}

18096

18097

return false;

18098

}

18099

18100

/// Top-level lowering for x86 vector shuffles.

18101

///

18102

/// This handles decomposition, canonicalization, and lowering of all x86

18103

/// vector shuffles. Most of the specific lowering strategies are encapsulated

18104

/// above in helper routines. The canonicalization attempts to widen shuffles

18105

/// to involve fewer lanes of wider elements, consolidate symmetric patterns

18106

/// s.t. only one of the two inputs needs to be tested, etc.

18107

static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,

18108

SelectionDAG &DAG) {

18109

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

18110

ArrayRef<int> OrigMask = SVOp->getMask();

18111

SDValue V1 = Op.getOperand(0);

18112

SDValue V2 = Op.getOperand(1);

18113

MVT VT = Op.getSimpleValueType();

18114

int NumElements = VT.getVectorNumElements();

18115

SDLoc DL(Op);

18116

bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

18117

18118

assert((VT.getSizeInBits() != 64 || Is1BitVector) &&(((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() != 64 || Is1BitVector) && \"Can't lower MMX shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18119, __PRETTY_FUNCTION__))

18119

"Can't lower MMX shuffles")(((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"
) ? static_cast<void> (0) : __assert_fail ("(VT.getSizeInBits() != 64 || Is1BitVector) && \"Can't lower MMX shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18119, __PRETTY_FUNCTION__));

18120

18121

bool V1IsUndef = V1.isUndef();

18122

bool V2IsUndef = V2.isUndef();

18123

if (V1IsUndef && V2IsUndef)

18124

return DAG.getUNDEF(VT);

18125

18126

// When we create a shuffle node we put the UNDEF node to second operand,

18127

// but in some cases the first operand may be transformed to UNDEF.

18128

// In this case we should just commute the node.

18129

if (V1IsUndef)

18130

return DAG.getCommutedVectorShuffle(*SVOp);

18131

18132

// Check for non-undef masks pointing at an undef vector and make the masks

18133

// undef as well. This makes it easier to match the shuffle based solely on

18134

// the mask.

18135

if (V2IsUndef &&

18136

any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {

18137

SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());

18138

for (int &M : NewMask)

18139

if (M >= NumElements)

18140

M = -1;

18141

return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);

18142

}

18143

18144

// Check for illegal shuffle mask element index values.

18145

int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);

18146

(void)MaskUpperLimit;

18147

assert(llvm::all_of(OrigMask,((llvm::all_of(OrigMask, [&](int M) { return -1 <= M &&
M < MaskUpperLimit; }) && "Out of bounds shuffle index"
) ? static_cast<void> (0) : __assert_fail ("llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18149, __PRETTY_FUNCTION__))

18148

[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&((llvm::all_of(OrigMask, [&](int M) { return -1 <= M &&
M < MaskUpperLimit; }) && "Out of bounds shuffle index"
) ? static_cast<void> (0) : __assert_fail ("llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18149, __PRETTY_FUNCTION__))

18149

"Out of bounds shuffle index")((llvm::all_of(OrigMask, [&](int M) { return -1 <= M &&
M < MaskUpperLimit; }) && "Out of bounds shuffle index"
) ? static_cast<void> (0) : __assert_fail ("llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18149, __PRETTY_FUNCTION__));

18150

18151

// We actually see shuffles that are entirely re-arrangements of a set of

18152

// zero inputs. This mostly happens while decomposing complex shuffles into

18153

// simple ones. Directly lower these as a buildvector of zeros.

18154

APInt KnownUndef, KnownZero;

18155

computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);

18156

18157

APInt Zeroable = KnownUndef | KnownZero;

18158

if (Zeroable.isAllOnesValue())

18159

return getZeroVector(VT, Subtarget, DAG, DL);

18160

18161

bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

18162

18163

// Try to collapse shuffles into using a vector type with fewer elements but

18164

// wider element types. We cap this to not form integers or floating point

18165

// elements wider than 64 bits, but it might be interesting to form i128

18166

// integers to handle flipping the low and high halves of AVX 256-bit vectors.

18167

SmallVector<int, 16> WidenedMask;

18168

if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&

18169

canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {

18170

// Shuffle mask widening should not interfere with a broadcast opportunity

18171

// by obfuscating the operands with bitcasts.

18172

// TODO: Avoid lowering directly from this top-level function: make this

18173

// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.

18174

if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,

18175

Subtarget, DAG))

18176

return Broadcast;

18177

18178

MVT NewEltVT = VT.isFloatingPoint()

18179

? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)

18180

: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);

18181

int NewNumElts = NumElements / 2;

18182

MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);

18183

// Make sure that the new vector type is legal. For example, v2f64 isn't

18184

// legal on SSE1.

18185

if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {

18186

if (V2IsZero) {

18187

// Modify the new Mask to take all zeros from the all-zero vector.

18188

// Choose indices that are blend-friendly.

18189

bool UsedZeroVector = false;

18190

assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&((find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!") ? static_cast<void>
(0) : __assert_fail ("find(WidenedMask, SM_SentinelZero) != WidenedMask.end() && \"V2's non-undef elements are used?!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18191, __PRETTY_FUNCTION__))

18191

"V2's non-undef elements are used?!")((find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!") ? static_cast<void>
(0) : __assert_fail ("find(WidenedMask, SM_SentinelZero) != WidenedMask.end() && \"V2's non-undef elements are used?!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18191, __PRETTY_FUNCTION__));

18192

for (int i = 0; i != NewNumElts; ++i)

18193

if (WidenedMask[i] == SM_SentinelZero) {

18194

WidenedMask[i] = i + NewNumElts;

18195

UsedZeroVector = true;

18196

}

18197

// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits

18198

// some elements to be undef.

18199

if (UsedZeroVector)

18200

V2 = getZeroVector(NewVT, Subtarget, DAG, DL);

18201

}

18202

V1 = DAG.getBitcast(NewVT, V1);

18203

V2 = DAG.getBitcast(NewVT, V2);

18204

return DAG.getBitcast(

18205

VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));

18206

}

18207

}

18208

18209

// Commute the shuffle if it will improve canonicalization.

18210

SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());

18211

if (canonicalizeShuffleMaskWithCommute(Mask)) {

18212

ShuffleVectorSDNode::commuteMask(Mask);

18213

std::swap(V1, V2);

18214

}

18215

18216

// For each vector width, delegate to a specialized lowering routine.

18217

if (VT.is128BitVector())

18218

return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

18219

18220

if (VT.is256BitVector())

18221

return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

18222

18223

if (VT.is512BitVector())

18224

return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

18225

18226

if (Is1BitVector)

18227

return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

18228

18229

llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18229);

18230

}

18231

18232

/// Try to lower a VSELECT instruction to a vector shuffle.

18233

static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,

18234

const X86Subtarget &Subtarget,

18235

SelectionDAG &DAG) {

18236

SDValue Cond = Op.getOperand(0);

18237

SDValue LHS = Op.getOperand(1);

18238

SDValue RHS = Op.getOperand(2);

18239

MVT VT = Op.getSimpleValueType();

18240

18241

// Only non-legal VSELECTs reach this lowering, convert those into generic

18242

// shuffles and re-use the shuffle lowering path for blends.

18243

if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {

18244

SmallVector<int, 32> Mask;

18245

if (createShuffleMaskFromVSELECT(Mask, Cond))

18246

return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);

18247

}

18248

18249

return SDValue();

18250

}

18251

18252

/// Custom lowering for VSELECT.
///
/// \returns \p Op itself when the node is left as it is, a replacement node
/// when the select is rewritten here, or an empty SDValue when this routine
/// declines to handle it (the comments below refer to that as "expansion").
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue TrueVal = Op.getOperand(1);
  SDValue FalseVal = Op.getOperand(2);

  // A vselect where the condition and both data operands are constants can be
  // optimized into a single vector load by
  // SelectionDAGLegalize::ExpandBUILD_VECTOR().
  auto IsConstantBuildVector = [](SDValue V) {
    return ISD::isBuildVectorOfConstantSDNodes(V.getNode());
  };
  if (IsConstantBuildVector(Cond) && IsConstantBuildVector(TrueVal) &&
      IsConstantBuildVector(FalseVal))
    return SDValue();

  // Try a blend-style vector shuffle first. This can handle all constant
  // condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltBits = Cond.getScalarValueSizeInBits();
  if (CondEltBits == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  unsigned EltBits = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();

  // Expand v32i16/v64i8 without BWI.
  const bool IsWideByteOrWordVT = VT == MVT::v32i16 || VT == MVT::v64i8;
  if (IsWideByteOrWordVT && !Subtarget.hasBWI())
    return SDValue();

  // On a 512-bit type a non-i1 condition has to be converted into an i1
  // condition so that the mask-based 512-bit blend instructions can be used.
  if (VT.getSizeInBits() == 512) {
    // Build the mask by testing the condition against zero ...
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue Zero = DAG.getConstant(0, dl, CondVT);
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, Zero, ISD::SETNE);
    // ... and select on that mask instead.
    return DAG.getSelect(dl, VT, Mask, TrueVal, FalseVal);
  }

  // SEXT/TRUNC cases where the mask doesn't match the destination size.
  if (CondEltBits != EltBits) {
    // If we don't have a sign splat, rely on the expansion.
    if (CondEltBits != DAG.ComputeNumSignBits(Cond))
      return SDValue();

    MVT NewCondVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElts);
    SDValue NewCond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
    return DAG.getNode(ISD::VSELECT, dl, VT, NewCond, TrueVal, FalseVal);
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return a
  // null value.

  // The byte blends for AVX vectors were introduced only in AVX2.
  if (VT == MVT::v32i8)
    return Subtarget.hasAVX2() ? Op : SDValue();

  if (VT == MVT::v8i16 || VT == MVT::v16i16) {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
    SDValue ByteCond = DAG.getBitcast(ByteVT, Cond);
    SDValue ByteTrue = DAG.getBitcast(ByteVT, TrueVal);
    SDValue ByteFalse = DAG.getBitcast(ByteVT, FalseVal);
    SDValue Select =
        DAG.getNode(ISD::VSELECT, dl, ByteVT, ByteCond, ByteTrue, ByteFalse);
    return DAG.getBitcast(VT, Select);
  }

  // Most of the vector types have blends past SSE4.1.
  return Op;
}

18341

18342

/// Lower an EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector.
/// The caller visible in this file only invokes it when the subtarget has
/// SSE4.1.
///
/// \returns a replacement node, \p Op itself for i32/i64 results, or an empty
/// SDValue when the source is not a 128-bit vector or the case is not handled
/// here.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
  SDLoc dl(Op);

  if (!Vec.getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    // If the index is 0, it's cheaper to do a move instead of a pextrb, unless
    // we're going to zero extend the register or fold the store.
    const bool PreferMove = llvm::isNullConstant(Idx) &&
                            !MayFoldIntoZeroExtend(Op) &&
                            !MayFoldIntoStore(Op);
    if (PreferMove) {
      SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, Vec);
      SDValue LowDWord =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, AsV4I32, Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, LowDWord);
    }

    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();

    SDNode *User = *Op.getNode()->use_begin();
    const bool IsStoreOfNonZeroIdx =
        User->getOpcode() == ISD::STORE && !isNullConstant(Idx);
    const bool IsBitcastToI32 = User->getOpcode() == ISD::BITCAST &&
                                User->getValueType(0) == MVT::i32;
    if (!IsStoreOfNonZeroIdx && !IsBitcastToI32)
      return SDValue();

    SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, Vec);
    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, AsV4I32, Idx);
    return DAG.getBitcast(MVT::f32, Extract);
  }

  // i32 and i64 extracts are returned unchanged.
  if (VT == MVT::i32 || VT == MVT::i64)
    return Op;

  return SDValue();
}

18388

18389

/// Extract one bit from mask vector, like v16i1 or v8i1.

18390

/// AVX-512 feature.

18391

static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,

18392

const X86Subtarget &Subtarget) {

18393

SDValue Vec = Op.getOperand(0);

18394

SDLoc dl(Vec);

18395

MVT VecVT = Vec.getSimpleValueType();

18396

SDValue Idx = Op.getOperand(1);

18397

auto* IdxC = dyn_cast<ConstantSDNode>(Idx);

18398

MVT EltVT = Op.getSimpleValueType();

18399

18400

assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&(((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI(
)) && "Unexpected vector type in ExtractBitFromMaskVector"
) ? static_cast<void> (0) : __assert_fail ("(VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && \"Unexpected vector type in ExtractBitFromMaskVector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18401, __PRETTY_FUNCTION__))

18401

"Unexpected vector type in ExtractBitFromMaskVector")(((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI(
)) && "Unexpected vector type in ExtractBitFromMaskVector"
) ? static_cast<void> (0) : __assert_fail ("(VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && \"Unexpected vector type in ExtractBitFromMaskVector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18401, __PRETTY_FUNCTION__));

18402

18403

// variable index can't be handled in mask registers,

18404

// extend vector to VR512/128

18405

if (!IdxC) {

18406

unsigned NumElts = VecVT.getVectorNumElements();

18407

// Extending v8i1/v16i1 to 512-bit get better performance on KNL

18408

// than extending to 128/256bit.

18409

MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;

18410

MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);

18411

SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);

18412

SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);

18413

return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);

18414

}

18415

18416

unsigned IdxVal = IdxC->getZExtValue();

18417

if (IdxVal == 0) // the operation is legal

18418

return Op;

18419

18420

// Extend to natively supported kshift.

18421

unsigned NumElems = VecVT.getVectorNumElements();

18422

MVT WideVecVT = VecVT;

18423

if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {

18424

WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

18425

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,

18426

DAG.getUNDEF(WideVecVT), Vec,

18427

DAG.getIntPtrConstant(0, dl));

18428

}

18429

18430

// Use kshiftr instruction to move to the lower element.

18431

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,

18432

DAG.getTargetConstant(IdxVal, dl, MVT::i8));

18433

18434

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,

18435

DAG.getIntPtrConstant(0, dl));

18436

}

18437

18438

/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// i1 element vectors are forwarded to ExtractBitFromMaskVector. Variable
/// indices are declined (empty SDValue). For 256/512-bit sources the
/// containing 128-bit chunk is extracted first; 128-bit sources are then
/// handled per result width.
///
/// \returns a replacement node, \p Op itself when the node is left as it is,
/// or an empty SDValue when the case is not handled here.
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput) than to
    // use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput). The IACA
    // tool was used to get the performance estimation
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // Example: extractelement <16 x i8> %a, i32 %i
    //
    // Register sequence - block throughput 3.00 cycles, bottleneck Port5,
    // 4 uops in total:
    //   vmovd   xmm1, edi
    //   vpshufb xmm0, xmm0, xmm1
    //   vpextrb eax, xmm0, 0x0
    //
    // Through memory - block throughput 1.00 cycles, bottleneck
    // PORT2_AGU, PORT3_AGU, Port4, 4 uops in total:
    //   vmovaps xmmword ptr [rsp-0x18], xmm0
    //   lea     rax, ptr [rsp-0x18]
    //   mov     al, byte ptr [rdi+rax*1]
    return SDValue();
  }

  unsigned IdxVal = IdxC->getZExtValue();

  // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
  // and then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    SDValue Chunk = extract128BitVector(Vec, IdxVal, DAG, dl);

    unsigned ElemsPerChunk =
        128 / VecVT.getVectorElementType().getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    unsigned IdxInChunk = IdxVal & (ElemsPerChunk - 1);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Chunk,
                       DAG.getIntPtrConstant(IdxInChunk, dl));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();
  const unsigned ResultBits = VT.getSizeInBits();

  if (ResultBits == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    const bool PreferMove = IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
                            !(Subtarget.hasSSE41() && MayFoldIntoStore(Op));
    if (PreferMove) {
      SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, Vec);
      SDValue LowDWord =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, AsV4I32, Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, LowDWord);
    }

    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to stack.
  if (ResultBits == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract the wider element that contains the requested byte, shift the
    // byte down to bit 0 when it is not already there, and truncate.
    auto ExtractByteVia = [&](MVT WideEltVT, MVT WideVecVT,
                              unsigned BytesPerElt) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WideEltVT,
                                DAG.getBitcast(WideVecVT, Vec),
                                DAG.getIntPtrConstant(IdxVal / BytesPerElt, dl));
      int ShiftVal = (IdxVal % BytesPerElt) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, WideEltVT, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i8));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    };

    // Extract either the lowest i32 or any i16, and extract the sub-byte.
    if (IdxVal / 4 == 0)
      return ExtractByteVia(MVT::i32, MVT::v4i32, 4);
    return ExtractByteVia(MVT::i16, MVT::v8i16, 2);
  }

  if (ResultBits == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = {static_cast<int>(IdxVal), -1, -1, -1};
    SDValue Shuffled =
        DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffled,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (ResultBits == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = {1, -1};
    SDValue Shuffled =
        DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffled,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

18581

18582

/// Insert one bit to mask vector, like v16i1 or v8i1.

18583

/// AVX-512 feature.

18584

static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,

18585

const X86Subtarget &Subtarget) {

18586

SDLoc dl(Op);

18587

SDValue Vec = Op.getOperand(0);

18588

SDValue Elt = Op.getOperand(1);

18589

SDValue Idx = Op.getOperand(2);

18590

MVT VecVT = Vec.getSimpleValueType();

18591

18592

if (!isa<ConstantSDNode>(Idx)) {

18593

// Non constant index. Extend source and destination,

18594

// insert element and then truncate the result.

18595

unsigned NumElts = VecVT.getVectorNumElements();

18596

MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;

18597

MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);

18598

SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,

18599

DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),

18600

DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);

18601

return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);

18602

}

18603

18604

// Copy into a k-register, extract to v1i1 and insert_subvector.

18605

SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);

18606

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);

18607

}

18608

18609

/// Custom lowering for INSERT_VECTOR_ELT.
///
/// i1 element vectors are forwarded to InsertBitToMaskVector. A non-constant
/// or out-of-range index is declined (empty SDValue). Otherwise the strategies
/// below are tried in order: blend with a zero/all-ones vector, 128-bit chunk
/// extract/insert for wider vectors, scalar-to-vector for insertion into
/// element 0 of an all-zeros vector, PINSRW/PINSRB, and the SSE4.1
/// BLENDI/INSERTPS/PINSR forms.
///
/// \returns a replacement node, \p Op itself when the node is left as it is,
/// or an empty SDValue when the case is not handled here.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDLoc dl(Op);
  SDValue SrcVec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
  if (!IdxC || IdxC->getAPIntValue().uge(NumElts))
    return SDValue();
  uint64_t IdxVal = IdxC->getZExtValue();

  const bool IsZeroElt = X86::isZeroNode(Elt);
  const bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(Elt);

  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
      EltVT.getSizeInBits() >= 16) {
    // Identity mask over the source vector, with the target lane redirected
    // to the same lane of the constant vector.
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i);
    BlendMask[IdxVal] = static_cast<int>(IdxVal + NumElts);

    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : getOnesVector(VT, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, SrcVec, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector,
  // insert into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      const bool CanBlendFP =
          Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32);
      const bool CanBlendI32 = Subtarget.hasAVX2() && EltVT == MVT::i32;
      if (CanBlendFP || CanBlendI32) {
        SDValue EltVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Elt);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, SrcVec, EltVec,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(SrcVec, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, Elt,
                    DAG.getIntPtrConstant(IdxIn128, dl));

    // Insert the changed part back into the bigger vector.
    return insert128BitVector(SrcVec, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // This will be just movd/movq/movss/movsd.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(SrcVec.getNode())) {
    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
        EltVT == MVT::i64) {
      SDValue EltVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Elt);
      return getShuffleVectorZeroOrUndef(EltVec, 0, true, Subtarget, DAG);
    }

    // We can't directly insert an i8 or i16 into a vector, so zero extend
    // it to i32 first.
    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
      SDValue WideElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
      SDValue WideVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, WideElt);
      WideVec = getShuffleVectorZeroOrUndef(WideVec, 0, true, Subtarget, DAG);
      return DAG.getBitcast(VT, WideVec);
    }
  }

  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    const bool IsWordInsert = VT == MVT::v8i16;
    if (IsWordInsert) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
    }
    unsigned Opc = IsWordInsert ? X86ISD::PINSRW : X86ISD::PINSRB;

    if (Elt.getValueType() != MVT::i32)
      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Elt);
    if (Idx.getValueType() != MVT::i32)
      Idx = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, SrcVec, Elt, Idx);
  }

  // Everything below requires SSE4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  if (EltVT == MVT::f32) {
    // Bits [7:6] of the constant are the source select. This will always be
    //   zero here. The DAG Combiner may combine an extract_elt index into
    //   these bits. For example (insert (extract, 3), 2) could be matched by
    //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    //   value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    //   combine either bitwise AND or insert of float 0.0 to set these bits.

    // If this is an insertion of 32-bits into the low 32-bits of a vector, we
    // prefer to generate a blend with immediate rather than an insertps.
    // Blends are simpler operations in hardware and so will always have equal
    // or better performance than insertps. But if optimizing for size and
    // there's a load folding opportunity, generate insertps because blendps
    // does not have a 32-bit memory operand form.
    bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
    const bool UseBlend = IdxVal == 0 && (!MinSize || !MayFoldLoad(Elt));

    // Both forms take the scalar as a v4f32 vector.
    SDValue EltVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Elt);
    if (UseBlend)
      return DAG.getNode(X86ISD::BLENDI, dl, VT, SrcVec, EltVec,
                         DAG.getTargetConstant(1, dl, MVT::i8));
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, SrcVec, EltVec,
                       DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
  }

  // PINSR* works with constant index.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return Op;

  return SDValue();
}

18754

18755

/// Custom lowering for SCALAR_TO_VECTOR.
///
/// A zero scalar becomes a zero vector. For result types that are not 128
/// bits wide the scalar is first placed in a 128-bit vector which is then
/// inserted at position 0 of an undef vector of the result type. The
/// remaining 128-bit integer types are routed through a v4i32
/// SCALAR_TO_VECTOR.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();
  SDValue Scalar = Op.getOperand(0);

  // It's always cheaper to replace a xor+movd with xorps and simplifies
  // further combines.
  if (X86::isZeroNode(Scalar))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If the result is wider than 128 bits, first insert into a 128-bit vector
  // and then insert that into the wide vector.
  if (!OpVT.is128BitVector()) {
    // Build the 128-bit vector type with the same element type.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    SDValue Low128 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Scalar);

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Low128, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  // Any-extend the scalar to i32, build a v4i32 from it and bitcast back.
  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt);
  return DAG.getBitcast(OpVT, AsV4I32);
}

18789

18790

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a

18791

// simple superregister reference or explicit instructions to insert

18792

// the upper bits of a vector.

18793

static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,

18794

SelectionDAG &DAG) {

18795

assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1)((Op.getSimpleValueType().getVectorElementType() == MVT::i1) ?
static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().getVectorElementType() == MVT::i1"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18795, __PRETTY_FUNCTION__));

18796

18797

return insert1BitVector(Op, DAG, Subtarget);

18798

}

18799

18800

/// Custom-lower an EXTRACT_SUBVECTOR of a vXi1 mask vector.
///
/// An extract at index 0 is returned unchanged. Otherwise the source vector is
/// (if necessary) widened to a type with a native kshift, shifted right by the
/// extraction index with X86ISD::KSHIFTR, and the result is extracted from
/// index 0 of the shifted vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  // Extracting from index 0 is already legal.
  const uint64_t IdxVal = Op.getConstantOperandVal(1);
  if (IdxVal == 0)
    return Op;

  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  const MVT SrcVT = Src.getSimpleValueType();
  const unsigned NumElems = SrcVT.getVectorNumElements();
  const bool HasDQI = Subtarget.hasDQI();

  // Extend to natively supported kshift: anything below 8 elements is
  // widened, and 8 elements are widened as well when DQI is unavailable.
  MVT ShiftVT = SrcVT;
  if (NumElems < 8 || (NumElems == 8 && !HasDQI)) {
    ShiftVT = HasDQI ? MVT::v8i1 : MVT::v16i1;
    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ShiftVT,
                      DAG.getUNDEF(ShiftVT), Src,
                      DAG.getIntPtrConstant(0, DL));
  }

  // Shift the requested elements down to the LSB.
  SDValue Shifted = DAG.getNode(X86ISD::KSHIFTR, DL, ShiftVT, Src,
                                DAG.getTargetConstant(IdxVal, DL, MVT::i8));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op.getValueType(), Shifted,
                     DAG.getIntPtrConstant(0, DL));
}

18831

18832

// Returns the appropriate wrapper opcode for a global reference.

18833

unsigned X86TargetLowering::getGlobalWrapperKind(

18834

const GlobalValue *GV, const unsigned char OpFlags) const {

18835

// References to absolute symbols are never PC-relative.

18836

if (GV && GV->isAbsoluteSymbolRef())

18837

return X86ISD::Wrapper;

18838

18839

CodeModel::Model M = getTargetMachine().getCodeModel();

18840

if (Subtarget.isPICStyleRIPRel() &&

18841

(M == CodeModel::Small || M == CodeModel::Kernel))

18842

return X86ISD::WrapperRIP;

18843

18844

// GOTPCREL references must always use RIP.

18845

if (OpFlags == X86II::MO_GOTPCREL)

18846

return X86ISD::WrapperRIP;

18847

18848

return X86ISD::Wrapper;

18849

}

18850

18851

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as

18852

// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is

18853

// one of the above mentioned nodes. It has to be wrapped because otherwise

18854

// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only

18855

// be used to form addressing mode. These wrapped nodes will be selected

18856

// into MOV32ri.

18857

SDValue

18858

X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {

18859

ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

18860

18861

// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the

18862

// global base reg.

18863

unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

18864

18865

auto PtrVT = getPointerTy(DAG.getDataLayout());

18866

SDValue Result = DAG.getTargetConstantPool(

18867

CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);

18868

SDLoc DL(CP);

18869

Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

18870

// With PIC, the address is actually $g + Offset.

18871

if (OpFlag) {

18872

Result =

18873

DAG.getNode(ISD::ADD, DL, PtrVT,

18874

DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

18875

}

18876

18877

return Result;

18878

}

18879

18880

/// Lower a JumpTable node: build the target jump table node, wrap it, and,
/// when the local-reference operand flag is non-zero, add the global base
/// register to the wrapped address.
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  auto *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg; the flag below is non-zero in that case.
  const unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);

  SDLoc DL(JT);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
  if (!OpFlag)
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  return DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                     Wrapped);
}

18900

18901

/// Lower an ExternalSymbol node that is not used as a call target by
/// forwarding to LowerGlobalOrExternal with ForCall set to false.
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  const bool ForCall = false;
  return LowerGlobalOrExternal(Op, DAG, ForCall);
}

18905

18906

/// Lower a BlockAddress node: build the target block address node, wrap it,
/// and add the global base register when the reference is relative to the
/// PIC base.
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  const unsigned char OpFlags = Subtarget.classifyBlockAddressReference();

  const auto *BASN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BASN->getBlockAddress();
  const int64_t Offset = BASN->getOffset();

  SDLoc DL(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Create the target block address node and wrap it.
  SDValue TargetBA = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetBA);

  if (!isGlobalRelativeToPICBase(OpFlags))
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  return DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(X86ISD::GlobalBaseReg, DL, PtrVT), Wrapped);
}

18926

18927

/// Creates target global address or external symbol nodes for calls or

18928

/// other uses.

18929

SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,

18930

bool ForCall) const {

18931

// Unpack the global address or external symbol.

18932

const SDLoc &dl = SDLoc(Op);

18933

const GlobalValue *GV = nullptr;

18934

int64_t Offset = 0;

18935

const char *ExternalSym = nullptr;

18936

if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {

18937

GV = G->getGlobal();

18938

Offset = G->getOffset();

18939

} else {

18940

const auto *ES = cast<ExternalSymbolSDNode>(Op);

18941

ExternalSym = ES->getSymbol();

18942

}

18943

18944

// Calculate some flags for address lowering.

18945

const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();

18946

unsigned char OpFlags;

18947

if (ForCall)

18948

OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);

18949

else

18950

OpFlags = Subtarget.classifyGlobalReference(GV, Mod);

18951

bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);

18952

bool NeedsLoad = isGlobalStubReference(OpFlags);

18953

18954

CodeModel::Model M = DAG.getTarget().getCodeModel();

18955

auto PtrVT = getPointerTy(DAG.getDataLayout());

18956

SDValue Result;

18957

18958

if (GV) {

18959

// Create a target global address if this is a global. If possible, fold the

18960

// offset into the global address reference. Otherwise, ADD it on later.

18961

int64_t GlobalOffset = 0;

18962

if (OpFlags == X86II::MO_NO_FLAG &&

18963

X86::isOffsetSuitableForCodeModel(Offset, M)) {

18964

std::swap(GlobalOffset, Offset);

18965

}

18966

Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);

18967

} else {

18968

// If this is not a global address, this must be an external symbol.

18969

Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);

18970

}

18971

18972

// If this is a direct call, avoid the wrapper if we don't need to do any

18973

// loads or adds. This allows SDAG ISel to match direct calls.

18974

if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)

18975

return Result;

18976

18977

Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

18978

18979

// With PIC, the address is actually $g + Offset.

18980

if (HasPICReg) {

18981

Result = DAG.getNode(ISD::ADD, dl, PtrVT,

18982

DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);

18983

}

18984

18985

// For globals that require a load from a stub to get the address, emit the

18986

// load.

18987

if (NeedsLoad)

18988

Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,

18989

MachinePointerInfo::getGOT(DAG.getMachineFunction()));

18990

18991

// If there was a non-zero offset that we didn't fold, create an explicit

18992

// addition for it.

18993

if (Offset != 0)

18994

Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,

18995

DAG.getConstant(Offset, dl, PtrVT));

18996

18997

return Result;

18998

}

18999

19000

/// Lower a GlobalAddress node that is not used as a call target by
/// forwarding to LowerGlobalOrExternal with ForCall set to false.
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const bool ForCall = false;
  return LowerGlobalOrExternal(Op, DAG, ForCall);
}

19004

19005

/// Emit an X86ISD::TLSADDR node (or X86ISD::TLSBASEADDR when \p LocalDynamic
/// is set) for \p GA and return a CopyFromReg of \p ReturnReg.
///
/// \param Chain         Incoming chain for the TLS node.
/// \param InFlag        Optional glue; when non-null its value is appended to
///                      the node's operands.
/// \param PtrVT         Value type of the returned register copy.
/// \param OperandFlags  Target flags placed on the target global address.
///
/// The function's frame info is marked as adjusting the stack and as having
/// calls.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc DL(GA);
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);

  X86ISD::NodeType CallType =
      LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;

  // Operands are the chain, the address and, when supplied, the incoming glue.
  SmallVector<SDValue, 3> Ops = {Chain, TGA};
  if (InFlag)
    Ops.push_back(*InFlag);
  SDValue TLSCall = DAG.getNode(CallType, DL, ChainAndGlue, Ops);

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Glue = TLSCall.getValue(1);
  return DAG.getCopyFromReg(TLSCall, DL, ReturnReg, PtrVT, Glue);
}

19035

19036

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit

19037

static SDValue

19038

LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,

19039

const EVT PtrVT) {

19040

SDValue InFlag;

19041

SDLoc dl(GA); // ? function entry point might be better

19042

SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,

19043

DAG.getNode(X86ISD::GlobalBaseReg,

19044

SDLoc(), PtrVT), InFlag);

19045

InFlag = Chain.getValue(1);

19046

19047

return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);

19048

}

19049

19050

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit

19051

static SDValue

19052

LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,

19053

const EVT PtrVT) {

19054

return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,

19055

X86::RAX, X86II::MO_TLSGD);

19056

}

19057

19058

/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
///
/// The base address of the module's TLS block is obtained through GetTLSADDR
/// (with LocalDynamic set), and the variable's x@dtpoff offset is added to it.
/// The per-function count of local-dynamic TLS accesses is incremented.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc DL(GA);

  // Record the access in the function info.
  auto *FuncInfo =
      DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
  FuncInfo->incNumLocalDynamicTLSAccesses();

  // Get the start address of the TLS block for this module.
  SDValue Base;
  if (!is64Bit) {
    // 32-bit: the global base register is passed in EBX, glued to the call.
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, X86::EBX, PICBase, SDValue());
    SDValue Glue = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  } else {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  const unsigned char OperandFlags = X86II::MO_DTPOFF;
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue DTPOff = DAG.getNode(X86ISD::Wrapper, DL, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, DL, PtrVT, DTPOff, Base);
}

19096

19097

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.

19098

static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,

19099

const EVT PtrVT, TLSModel::Model model,

19100

bool is64Bit, bool isPIC) {

19101

SDLoc dl(GA);

19102

19103

// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).

19104

Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),

19105

is64Bit ? 257 : 256));

19106

19107

SDValue ThreadPointer =

19108

DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),

19109

MachinePointerInfo(Ptr));

19110

19111

unsigned char OperandFlags = 0;

19112

// Most TLS accesses are not RIP relative, even on x86-64. One exception is

19113

// initialexec.

19114

unsigned WrapperKind = X86ISD::Wrapper;

19115

if (model == TLSModel::LocalExec) {

19116

OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;

19117

} else if (model == TLSModel::InitialExec) {

19118

if (is64Bit) {

19119

OperandFlags = X86II::MO_GOTTPOFF;

19120

WrapperKind = X86ISD::WrapperRIP;

19121

} else {

19122

OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;

19123

}

19124

} else {

19125

llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19125);

19126

}

19127

19128

// emit "addl x@ntpoff,%eax" (local exec)

19129

// or "addl x@indntpoff,%eax" (initial exec)

19130

// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)

19131

SDValue TGA =

19132

DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),

19133

GA->getOffset(), OperandFlags);

19134

SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

19135

19136

if (model == TLSModel::InitialExec) {

19137

if (isPIC && !is64Bit) {

19138

Offset = DAG.getNode(ISD::ADD, dl, PtrVT,

19139

DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),

19140

Offset);

19141

}

19142

19143

Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,

19144

MachinePointerInfo::getGOT(DAG.getMachineFunction()));

19145

}

19146

19147

// The address of the thread local variable is the add of the thread

19148

// pointer with the offset of the variable.

19149

return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);

19150

}

19151

19152

/// Lower ISD::GlobalTLSAddress.
///
/// Dispatches, in order, on: emulated TLS, ELF targets (by TLS model), Darwin
/// targets (a TLSCALL wrapped in a call sequence) and Windows targets
/// (implicit TLS through the thread's TLS array). Any other target is
/// unreachable.
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  auto *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  const bool PositionIndependent = isPositionIndependent();
  const bool Is64Bit = Subtarget.is64Bit();

  if (Subtarget.isTargetELF()) {
    const TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      return Is64Bit ? LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT)
                     : LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Is64Bit);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Is64Bit,
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    const unsigned WrapperKind =
        Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    const bool PIC32 = PositionIndependent && !Is64Bit;
    const unsigned char OpFlag =
        PIC32 ? X86II::MO_TLVP_PIC_BASE : X86II::MO_TLVP;

    SDLoc DL(Op);
    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, GA->getValueType(0),
                                             GA->getOffset(), OpFlag);
    SDValue Descriptor = DAG.getNode(WrapperKind, DL, PtrVT, TGA);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Descriptor =
          DAG.getNode(ISD::ADD, DL, PtrVT,
                      DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                      Descriptor);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Chain = DAG.getCALLSEQ_START(DAG.getEntryNode(), 0, 0, DL);
    SDValue CallOps[] = {Chain, Descriptor};
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, ChainAndGlue, CallOps);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    const unsigned RetReg = Is64Bit ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, RetReg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc DL(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *SegBase = Constant::getNullValue(
        Is64Bit ? Type::getInt8PtrTy(*DAG.getContext(), 256)
                : Type::getInt32PtrTy(*DAG.getContext(), 257));

    SDValue TlsArray;
    if (Is64Bit)
      TlsArray = DAG.getIntPtrConstant(0x58, DL);
    else if (Subtarget.isTargetWindowsGNU())
      TlsArray = DAG.getIntPtrConstant(0x2C, DL);
    else
      TlsArray = DAG.getExternalSymbol("_tls_array", PtrVT);

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, DL, Chain, TlsArray, MachinePointerInfo(SegBase));

    // Address of the TLS array slot holding this module's TLS block pointer.
    SDValue SlotAddr;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      SlotAddr = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue Index = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Is64Bit)
        Index = DAG.getExtLoad(ISD::ZEXTLOAD, DL, PtrVT, Chain, Index,
                               MachinePointerInfo(), MVT::i32);
      else
        Index = DAG.getLoad(PtrVT, DL, Chain, Index, MachinePointerInfo());

      // Scale the index by the pointer size to get a byte offset.
      const DataLayout &Layout = DAG.getDataLayout();
      SDValue Scale = DAG.getConstant(Log2_64_Ceil(Layout.getPointerSize()),
                                      DL, MVT::i8);
      Index = DAG.getNode(ISD::SHL, DL, PtrVT, Index, Scale);

      SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Index);
    }

    SDValue BlockBase =
        DAG.getLoad(PtrVT, DL, Chain, SlotAddr, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA =
        DAG.getTargetGlobalAddress(GV, DL, GA->getValueType(0),
                                   GA->getOffset(), X86II::MO_SECREL);
    SDValue SecRelOffset = DAG.getNode(X86ISD::Wrapper, DL, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, DL, PtrVT, BlockBase, SecRelOffset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

19296

19297

/// Lower SRA_PARTS and friends, which return two i32 values

19298

/// and take a 2 x i32 value to shift plus a shift amount.

19299

/// TODO: Can this be moved to general expansion code?

19300

static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {

19301

assert(Op.getNumOperands() == 3 && "Not a double-shift!")((Op.getNumOperands() == 3 && "Not a double-shift!") ?
static_cast<void> (0) : __assert_fail ("Op.getNumOperands() == 3 && \"Not a double-shift!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19301, __PRETTY_FUNCTION__));

19302

MVT VT = Op.getSimpleValueType();

19303

unsigned VTBits = VT.getSizeInBits();

19304

SDLoc dl(Op);

19305

bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;

19306

SDValue ShOpLo = Op.getOperand(0);

19307

SDValue ShOpHi = Op.getOperand(1);

19308

SDValue ShAmt = Op.getOperand(2);

19309

// ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and

19310

// ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away

19311

// during isel.

19312

SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,

19313

DAG.getConstant(VTBits - 1, dl, MVT::i8));

19314

SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,

19315

DAG.getConstant(VTBits - 1, dl, MVT::i8))

19316

: DAG.getConstant(0, dl, VT);

19317

19318

SDValue Tmp2, Tmp3;

19319

if (Op.getOpcode() == ISD::SHL_PARTS) {

19320

Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);

19321

Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);

19322

} else {

19323

Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);

19324

Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);

19325

}

19326

19327

// If the shift amount is larger or equal than the width of a part we can't

19328

// rely on the results of shld/shrd. Insert a test and select the appropriate

19329

// values for large shift amounts.

19330

SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,

19331

DAG.getConstant(VTBits, dl, MVT::i8));

19332

SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,

19333

DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);

19334

19335

SDValue Hi, Lo;

19336

if (Op.getOpcode() == ISD::SHL_PARTS) {

19337

Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);

19338

Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);

19339

} else {

19340

Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);

19341

Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);

19342

}

19343

19344

return DAG.getMergeValues({ Lo, Hi }, dl);

19345

}

19346

19347

/// Custom-lower ISD::FSHL / ISD::FSHR.
///
///  * Vector types (VBMI2 is asserted) map to X86ISD::VSHLD/VSHRD for a
///    constant splat amount and to X86ISD::VSHLDV/VSHRDV otherwise, with the
///    two value operands swapped for FSHR.
///  * i8 (and i16 when SHLD is slow and we are not optimizing for size) with
///    a non-constant amount is expanded through 32-bit shifts.
///  * Remaining i8 / slow-SHLD cases return an empty SDValue.
///  * i16 masks the amount to 4 bits and emits X86ISD::FSHL/FSHR; i32 and i64
///    return the node unchanged.
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
         "Unexpected funnel shift opcode!");

  SDLoc DL(Op);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);

  const bool IsFSHR = Op.getOpcode() == ISD::FSHR;

  if (VT.isVector()) {
    assert(Subtarget.hasVBMI2() && "Expected VBMI2");

    // The right-shift nodes take the value operands in the opposite order.
    if (IsFSHR)
      std::swap(Op0, Op1);

    // Uniform constant amount: use the immediate form, reduced modulo the
    // element width.
    APInt SplatAmt;
    if (X86::isConstantSplat(Amt, SplatAmt)) {
      uint64_t Imm = SplatAmt.urem(VT.getScalarSizeInBits());
      unsigned ImmOpc = IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD;
      return DAG.getNode(ImmOpc, DL, VT, Op0, Op1,
                         DAG.getTargetConstant(Imm, DL, MVT::i8));
    }

    unsigned VarOpc = IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV;
    return DAG.getNode(VarOpc, DL, VT, Op0, Op1, Amt);
  }
  assert(
      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
      "Unexpected funnel shift type!");

  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
  const bool OptForSize = DAG.shouldOptForSize();
  const bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();

  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
  const bool NarrowType = VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16);
  if (NarrowType && !isa<ConstantSDNode>(Amt)) {
    const unsigned BW = VT.getScalarSizeInBits();
    const EVT AmtVT = Amt.getValueType();
    SDValue Mask = DAG.getConstant(BW - 1, DL, AmtVT);
    SDValue HiShift = DAG.getConstant(BW, DL, AmtVT);

    SDValue Hi32 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
    SDValue Lo32 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
    SDValue MaskedAmt = DAG.getNode(ISD::AND, DL, AmtVT, Amt, Mask);

    // Concatenate both operands into a single 32-bit value.
    SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi32, HiShift);
    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Lo32);

    if (IsFSHR) {
      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, MaskedAmt);
    } else {
      Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, MaskedAmt);
      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
    }
    return DAG.getZExtOrTrunc(Res, DL, VT);
  }

  // Not handled here.
  if (VT == MVT::i8 || ExpandFunnel)
    return SDValue();

  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
  if (VT == MVT::i16) {
    const EVT AmtVT = Amt.getValueType();
    SDValue MaskedAmt = DAG.getNode(ISD::AND, DL, AmtVT, Amt,
                                    DAG.getConstant(15, DL, AmtVT));
    unsigned FSHOp = IsFSHR ? X86ISD::FSHR : X86ISD::FSHL;
    return DAG.getNode(FSHOp, DL, VT, Op0, Op1, MaskedAmt);
  }

  return Op;
}

19418

19419

// Try to use a packed vector operation to handle i64 on 32-bit targets when

19420

// AVX512DQ is enabled.

19421

static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,

19422

const X86Subtarget &Subtarget) {

19423

assert((Op.getOpcode() == ISD::SINT_TO_FP ||(((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD
::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP
|| Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19427, __PRETTY_FUNCTION__))

19424

Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||(((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD
::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP
|| Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19427, __PRETTY_FUNCTION__))

19425

Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||(((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD
::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP
|| Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19427, __PRETTY_FUNCTION__))

19426

Op.getOpcode() == ISD::UINT_TO_FP) &&(((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD
::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP
|| Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19427, __PRETTY_FUNCTION__))

19427

"Unexpected opcode!")(((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD
::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP
|| Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19427, __PRETTY_FUNCTION__));

19428

bool IsStrict = Op->isStrictFPOpcode();

19429

unsigned OpNo = IsStrict ? 1 : 0;

19430

SDValue Src = Op.getOperand(OpNo);

19431

MVT SrcVT = Src.getSimpleValueType();

19432

MVT VT = Op.getSimpleValueType();

19433

19434

if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||

19435

(VT != MVT::f32 && VT != MVT::f64))

19436

return SDValue();

19437

19438

// Pack the i64 into a vector, do the operation and extract.

19439

19440

// Using 256-bit to ensure result is 128-bits for f32 case.

19441

unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;

19442

MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);

19443

MVT VecVT = MVT::getVectorVT(VT, NumElts);

19444

19445

SDLoc dl(Op);

19446

SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);

19447

if (IsStrict) {

19448

SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},

19449

{Op.getOperand(0), InVec});

19450

SDValue Chain = CvtVec.getValue(1);

19451

SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,

19452

DAG.getIntPtrConstant(0, dl));

19453

return DAG.getMergeValues({Value, Chain}, dl);

19454

}

19455

19456

SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

19457

19458

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,

19459

DAG.getIntPtrConstant(0, dl));

19460

}

19461

19462

static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,

19463

const X86Subtarget &Subtarget) {

19464

switch (Opcode) {

19465

case ISD::SINT_TO_FP:

19466

// TODO: Handle wider types with AVX/AVX512.

19467

if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)

19468

return false;

19469

// CVTDQ2PS or (V)CVTDQ2PD

19470

return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

19471

19472

case ISD::UINT_TO_FP:

19473

// TODO: Handle wider types and i64 elements.

19474

if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)

19475

return false;

19476

// VCVTUDQ2PS or VCVTUDQ2PD

19477

return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;

19478

19479

default:

19480

return false;

19481

}

19482

}

19483

19484

/// Given a scalar cast operation that is extracted from a vector, try to

19485

/// vectorize the cast op followed by extraction. This will avoid an expensive

19486

/// round-trip between XMM and GPR.

19487

static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,

19488

const X86Subtarget &Subtarget) {

19489

// TODO: This could be enhanced to handle smaller integer types by peeking

19490

// through an extend.

19491

SDValue Extract = Cast.getOperand(0);

19492

MVT DestVT = Cast.getSimpleValueType();

19493

if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

19494

!isa<ConstantSDNode>(Extract.getOperand(1)))

19495

return SDValue();

19496

19497

// See if we have a 128-bit vector cast op for this type of cast.

19498

SDValue VecOp = Extract.getOperand(0);

19499

MVT FromVT = VecOp.getSimpleValueType();

19500

unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();

19501

MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);

19502

MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);

19503

if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))

19504

return SDValue();

19505

19506

// If we are extracting from a non-zero element, first shuffle the source

19507

// vector to allow extracting from element zero.

19508

SDLoc DL(Cast);

19509

if (!isNullConstant(Extract.getOperand(1))) {

19510

SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);

19511

Mask[0] = Extract.getConstantOperandVal(1);

19512

VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);

19513

}

19514

// If the source vector is wider than 128-bits, extract the low part. Do not

19515

// create an unnecessarily wide vector cast op.

19516

if (FromVT != Vec128VT)

19517

VecOp = extract128BitVector(VecOp, 0, DAG, DL);

19518

19519

// cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0

19520

// cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0

19521

SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);

19522

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,

19523

DAG.getIntPtrConstant(0, DL));

19524

}

19525

19526

/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),

19527

/// try to vectorize the cast ops. This will avoid an expensive round-trip

19528

/// between XMM and GPR.

19529

static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,

19530

const X86Subtarget &Subtarget) {

19531

// TODO: Allow FP_TO_UINT.

19532

SDValue CastToInt = CastToFP.getOperand(0);

19533

MVT VT = CastToFP.getSimpleValueType();

19534

if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())

19535

return SDValue();

19536

19537

MVT IntVT = CastToInt.getSimpleValueType();

19538

SDValue X = CastToInt.getOperand(0);

19539

MVT SrcVT = X.getSimpleValueType();

19540

if (SrcVT != MVT::f32 && SrcVT != MVT::f64)

19541

return SDValue();

19542

19543

// See if we have 128-bit vector cast instructions for this type of cast.

19544

// We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.

19545

if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||

19546

IntVT != MVT::i32)

19547

return SDValue();

19548

19549

unsigned SrcSize = SrcVT.getSizeInBits();

19550

unsigned IntSize = IntVT.getSizeInBits();

19551

unsigned VTSize = VT.getSizeInBits();

19552

MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);

19553

MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);

19554

MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

19555

19556

// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.

19557

unsigned ToIntOpcode =

19558

SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;

19559

unsigned ToFPOpcode =

19560

IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

19561

19562

// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0

19563

//

19564

// We are not defining the high elements (for example, zero them) because

19565

// that could nullify any performance advantage that we hoped to gain from

19566

// this vector op hack. We do not expect any adverse effects (like denorm

19567

// penalties) with cast ops.

19568

SDLoc DL(CastToFP);

19569

SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);

19570

SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);

19571

SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);

19572

SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);

19573

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);

19574

}

19575

19576

/// Custom lowering for (strict) int-to-FP conversions from v2i64/v4i64.
///
/// With AVX512DQ (and no VLX) the source is widened to v8i64 and converted
/// with the original opcode. Without AVX512DQ only the unsigned
/// v4i64 -> v4f32 case is handled, using per-element signed conversions;
/// everything else returns an empty SDValue. Strict opcodes return a
/// {value, chain} pair.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  const bool Strict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(Strict ? 1 : 0);

  if (Subtarget.hasDQI()) {
    assert(!Subtarget.hasVLX() && "Unexpected features");

    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");

    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideResVT = MVT::v8f64;
    if (VT == MVT::v4f32)
      WideResVT = MVT::v8f32;

    // Strict FP pads with zeros (rather than undef) so the extra lanes
    // cannot raise spurious exceptions.
    SDValue Padding = Strict ? DAG.getConstant(0, DL, MVT::v8i64)
                             : DAG.getUNDEF(MVT::v8i64);
    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Padding, Src,
                      DAG.getIntPtrConstant(0, DL));

    SDValue WideRes;
    SDValue OutChain;
    if (Strict) {
      WideRes = DAG.getNode(Op.getOpcode(), DL, {WideResVT, MVT::Other},
                            {Op->getOperand(0), Src});
      OutChain = WideRes.getValue(1);
    } else {
      WideRes = DAG.getNode(Op.getOpcode(), DL, WideResVT, Src);
    }

    // Narrow back down to the requested result type.
    SDValue Narrow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
                                 DAG.getIntPtrConstant(0, DL));
    if (!Strict)
      return Narrow;
    return DAG.getMergeValues({Narrow, OutChain}, DL);
  }

  // Without AVX512DQ only unsigned v4i64 -> v4f32 is handled here.
  const unsigned Opc = Op->getOpcode();
  const bool IsSigned =
      Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  if (IsSigned || VT != MVT::v4f32)
    return SDValue();

  // For inputs with the top bit set, convert (Src >> 1) | (Src & 1) as a
  // signed value and double the result afterwards; other inputs are
  // converted directly as signed values.
  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
  SDValue Halved = DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One);
  SDValue LowBit = DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One);
  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, Halved, LowBit);
  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);

  // Convert each of the four elements with a scalar signed conversion.
  SmallVector<SDValue, 4> EltCvts;
  SmallVector<SDValue, 4> EltChains;
  for (unsigned Lane = 0; Lane < 4; ++Lane) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
                              DAG.getIntPtrConstant(Lane, DL));
    if (!Strict) {
      EltCvts.push_back(DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt));
      continue;
    }
    SDValue StrictCvt =
        DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
                    {Op.getOperand(0), Elt});
    EltCvts.push_back(StrictCvt);
    EltChains.push_back(StrictCvt.getValue(1));
  }
  SDValue SignCvt = DAG.getBuildVector(VT, DL, EltCvts);

  // Doubling undoes the halving applied to the "negative" lanes.
  SDValue Doubled;
  SDValue OutChain;
  if (Strict) {
    OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, EltChains);
    Doubled = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
                          {OutChain, SignCvt, SignCvt});
    OutChain = Doubled.getValue(1);
  } else {
    Doubled = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
  }

  // Pick the doubled value for lanes that had the top bit set.
  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Doubled, SignCvt);

  if (!Strict)
    return Cvt;
  return DAG.getMergeValues({Cvt, OutChain}, DL);
}

19664

19665

/// Custom lowering for SINT_TO_FP / STRICT_SINT_TO_FP.
///
/// Tries, in order: vectorizing a cast of an extracted element, vectorizing
/// an fp->int->fp round trip, the vector cases, the cases that are already
/// legal, the AVX512DQ i64 path, i16 promotion, the f128 libcall, and
/// finally a store to a stack slot followed by FILD. Strict opcodes produce
/// a {value, chain} pair wherever a new node sequence is built.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  const bool Strict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(Strict ? 1 : 0);
  SDValue Chain = Strict ? Op->getOperand(0) : DAG.getEntryNode();
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (SDValue Vectorized = vectorizeExtractedCast(Op, DAG, Subtarget))
    return Vectorized;

  if (SDValue RoundTrip = lowerFPToIntToFP(Op, DAG, Subtarget))
    return RoundTrip;

  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
      return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);

    if (SrcVT != MVT::v2i32 || VT != MVT::v2f64)
      return SDValue();

    // v2i32 -> v2f64: widen the source to v4i32 with undef upper elements.
    // Note: Since v2f64 is a legal type, we don't need to zero extend the
    // source for strict FP.
    SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                  DAG.getUNDEF(SrcVT));
    if (Strict)
      return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                         {Chain, Widened});
    return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Widened);
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  const bool InSSEReg = isScalarFPTypeInSSEReg(VT);

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (InSSEReg &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit())))
    return Op;

  if (SDValue Packed = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
    return Packed;

  // SSE doesn't have an i16 conversion so we need to promote.
  if (SrcVT == MVT::i16 && (InSSEReg || VT == MVT::f128)) {
    SDValue Promoted = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
    if (Strict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {Chain, Promoted});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Promoted);
  }

  if (VT == MVT::f128)
    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));

  // Remaining cases: spill the integer to a stack slot and load it with FILD.
  SDValue Spill = Src;
  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    Spill = DAG.getBitcast(MVT::f64, Spill);
  }

  unsigned SlotSize = SrcVT.getStoreSize();
  Align SlotAlign(SlotSize);
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int FrameIdx = MF.getFrameInfo().CreateStackObject(SlotSize, SlotAlign, false);
  MachinePointerInfo SlotInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
  SDValue Slot = DAG.getFrameIndex(FrameIdx, PtrVT);
  Chain = DAG.getStore(Chain, dl, Spill, Slot, SlotInfo, SlotAlign);
  std::pair<SDValue, SDValue> Fild =
      BuildFILD(VT, SrcVT, dl, Chain, Slot, SlotInfo, SlotAlign, DAG);

  if (!Strict)
    return Fild.first;
  return DAG.getMergeValues({Fild.first, Fild.second}, dl);
}

19752

19753

/// Build an X86ISD::FILD that loads an integer of type \p SrcVT from
/// \p Pointer and produces a value of type \p DstVT.
///
/// When \p DstVT is a scalar FP type held in an SSE register, the FILD
/// produces f80 instead; that value is then written to a fresh stack slot
/// with X86ISD::FST and reloaded as \p DstVT.
///
/// \returns the {result value, output chain} pair.
std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
  const bool ResultInSSE = isScalarFPTypeInSSEReg(DstVT);

  // Build the FILD. It yields f80 when the final value must end up in an
  // SSE register, otherwise the destination type directly.
  SDVTList FILDTys = ResultInSSE ? DAG.getVTList(MVT::f80, MVT::Other)
                                 : DAG.getVTList(DstVT, MVT::Other);
  SDValue FILDOps[] = {Chain, Pointer};
  SDValue Result =
      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, FILDTys, FILDOps, SrcVT,
                              PtrInfo, Alignment, MachineMemOperand::MOLoad);
  Chain = Result.getValue(1);

  if (!ResultInSSE)
    return { Result, Chain };

  // Store the f80 value to a temporary stack slot as DstVT, then load it
  // back.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned SlotSize = DstVT.getStoreSize();
  int FrameIdx =
      MF.getFrameInfo().CreateStackObject(SlotSize, Align(SlotSize), false);
  auto PtrVT = getPointerTy(MF.getDataLayout());
  SDValue Slot = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDVTList FSTTys = DAG.getVTList(MVT::Other);
  SDValue FSTOps[] = {Chain, Result, Slot};
  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
      MachineMemOperand::MOStore, SlotSize, Align(SlotSize));
  Chain =
      DAG.getMemIntrinsicNode(X86ISD::FST, DL, FSTTys, FSTOps, DstVT, StoreMMO);

  Result = DAG.getLoad(
      DstVT, DL, Chain, Slot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
  Chain = Result.getValue(1);

  return { Result, Chain };
}

19793

19794

/// Horizontal vector math instructions may be slower than normal math with

19795

/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch

19796

/// implementation, and likely shuffle complexity of the alternate sequence.

19797

static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,

19798

const X86Subtarget &Subtarget) {

19799

bool IsOptimizingSize = DAG.shouldOptForSize();

19800

bool HasFastHOps = Subtarget.hasFastHorizontalOps();

19801

return !IsSingleSource || IsOptimizingSize || HasFastHOps;

19802

}

19803

19804

/// 64-bit unsigned integer to double expansion.

19805

static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,

19806

const X86Subtarget &Subtarget) {

19807

// This algorithm is not obvious. Here it is what we're trying to output:

19808

/*

19809

movq %rax, %xmm0

19810

punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }

19811

subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }

19812

#ifdef __SSE3__

19813

haddpd %xmm0, %xmm0

19814

#else

19815

pshufd $0x4e, %xmm0, %xmm1

19816

addpd %xmm1, %xmm0

19817

#endif

19818

*/

19819

19820

bool IsStrict = Op->isStrictFPOpcode();

19821

unsigned OpNo = IsStrict ? 1 : 0;

19822

SDLoc dl(Op);

19823

LLVMContext *Context = DAG.getContext();

19824

19825

// Build some magic constants.

19826

static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };

19827

Constant *C0 = ConstantDataVector::get(*Context, CV0);

19828

auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

19829

SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));

19830

19831

SmallVector<Constant*,2> CV1;

19832

CV1.push_back(

19833

ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),

19834

APInt(64, 0x4330000000000000ULL))));

19835

CV1.push_back(

19836

ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),

19837

APInt(64, 0x4530000000000000ULL))));

19838

Constant *C1 = ConstantVector::get(CV1);

19839

SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));

19840

19841

// Load the 64-bit value into an XMM register.

19842

SDValue XR1 =

19843

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));

19844

SDValue CLod0 = DAG.getLoad(

19845

MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,

19846

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));

19847

SDValue Unpck1 =

19848

getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

19849

19850

SDValue CLod1 = DAG.getLoad(

19851

MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,

19852

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));

19853

SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);

19854

SDValue Sub;

19855

SDValue Chain;

19856

// TODO: Are there any fast-math-flags to propagate here?

19857

if (IsStrict) {

19858

Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},

19859

{Op.getOperand(0), XR2F, CLod1});

19860

Chain = Sub.getValue(1);

19861

} else

19862

Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

19863

SDValue Result;

19864

19865

if (!IsStrict && Subtarget.hasSSE3() &&

19866

shouldUseHorizontalOp(true, DAG, Subtarget)) {

19867

// FIXME: Do we need a STRICT version of FHADD?

19868

Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);

19869

} else {

19870

SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});

19871

if (IsStrict) {

19872

Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},

19873

{Chain, Shuffle, Sub});

19874

Chain = Result.getValue(1);

19875

} else

19876

Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);

19877

}

19878

Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,

19879

DAG.getIntPtrConstant(0, dl));

19880

if (IsStrict)

19881

return DAG.getMergeValues({Result, Chain}, dl);

19882

19883

return Result;

19884

}

19885

19886

/// 32-bit unsigned integer to float expansion.

19887

static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,

19888

const X86Subtarget &Subtarget) {

19889

unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;

19890

SDLoc dl(Op);

19891

// FP constant to bias correct the final result.

19892

SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,

19893

MVT::f64);

19894

19895

// Load the 32-bit value into an XMM register.

19896

SDValue Load =

19897

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));

19898

19899

// Zero out the upper parts of the register.

19900

Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

19901

19902

// Or the load with the bias.

19903

SDValue Or = DAG.getNode(

19904

ISD::OR, dl, MVT::v2i64,

19905

DAG.getBitcast(MVT::v2i64, Load),

19906

DAG.getBitcast(MVT::v2i64,

19907

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));

19908

Or =

19909

DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,

19910

DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

19911

19912

if (Op.getNode()->isStrictFPOpcode()) {

19913

// Subtract the bias.

19914

// TODO: Are there any fast-math-flags to propagate here?

19915

SDValue Chain = Op.getOperand(0);

19916

SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},

19917

{Chain, Or, Bias});

19918

19919

if (Op.getValueType() == Sub.getValueType())

19920

return Sub;

19921

19922

// Handle final rounding.

19923

std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(

19924

Sub, Sub.getValue(1), dl, Op.getSimpleValueType());

19925

19926

return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);

19927

}

19928

19929

// Subtract the bias.

19930

// TODO: Are there any fast-math-flags to propagate here?

19931

SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

19932

19933

// Handle final rounding.

19934

return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());

19935

}

19936

19937

/// Custom lowering for (strict) unsigned v2i32 -> v2f64 conversion. Any
/// other result type returns an empty SDValue.
///
/// Without AVX512 the conversion is done with a 2^52 bias trick. With
/// AVX512 and VLX it maps onto X86ISD::CVTUI2P on a v4i32 source. With
/// AVX512 but no VLX the non-strict form is left to generic type
/// legalization, and the strict form is widened with a zero-padded source.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  const bool Strict = Op->isStrictFPOpcode();

  SDValue N0 = Op.getOperand(Strict ? 1 : 0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  if (!Subtarget.hasAVX512()) {
    // Zero extend to 2i64, OR with the floating point representation of 2^52.
    // This gives us the floating point equivalent of 2^52 + the i32 integer
    // since double has 52-bits of mantissa. Then subtract 2^52 in floating
    // point leaving just our i32 integers in double format.
    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
    SDValue Bias =
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
    SDValue BiasBits = DAG.getBitcast(MVT::v2i64, Bias);
    SDValue Biased = DAG.getNode(ISD::OR, DL, MVT::v2i64, Wide, BiasBits);
    Biased = DAG.getBitcast(MVT::v2f64, Biased);

    if (Strict)
      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
                         {Op.getOperand(0), Biased, Bias});
    return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Biased, Bias);
  }

  if (Subtarget.hasVLX()) {
    // Legalize to v4i32 type.
    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                     DAG.getUNDEF(MVT::v2i32));
    if (Strict)
      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
                         {Op.getOperand(0), N0});
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
  }

  // AVX512 without VLX: let generic type legalization widen the non-strict
  // form.
  if (!Strict)
    return SDValue();

  // Otherwise pad the integer input with 0s and widen the operation.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getConstant(0, DL, MVT::v2i32));
  SDValue WideRes = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
                                {Op.getOperand(0), N0});
  SDValue OutChain = WideRes.getValue(1);
  SDValue LowHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, WideRes,
                                DAG.getIntPtrConstant(0, DL));
  return DAG.getMergeValues({LowHalf, OutChain}, DL);
}

19989

19990

/// Custom-lower a (STRICT_)UINT_TO_FP whose source is v4i32 or v8i32.
///
/// Three strategies are tried in order:
///  1. AVX512 (VLX is asserted to be absent): widen the input to 512 bits,
///     emit the unsigned conversion on the wide type and extract the low
///     subvector. v8i32 -> v8f64 is returned unchanged.
///  2. AVX with v4i32 -> v4f64: zero-extend to v4i64, OR in the bit pattern
///     0x4330000000000000 (the double 2^52) and subtract that same double.
///  3. v4i32 -> v4f32 / v8i32 -> v8f32: build two floats per lane from the low
///     and high 16 bits of the input and add them.
///
/// Returns an empty SDValue when the destination type matches none of these.
/// For strict opcodes operand 0 is the chain and operand 1 is the input.
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  const bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
  const MVT VecIntVT = Src.getSimpleValueType();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  if (Subtarget.hasAVX512()) {
    // With AVX512, but not VLX we need to widen to get a 512-bit result type.
    assert(!Subtarget.hasVLX() && "Unexpected features");
    const MVT VT = Op->getSimpleValueType(0);

    // v8i32->v8f64 is legal with AVX512 so just return it.
    if (VT == MVT::v8f64)
      return Op;

    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    const bool ToF64 = VT == MVT::v4f64;
    const MVT WideFPVT = ToF64 ? MVT::v8f64 : MVT::v16f32;
    const MVT WideSrcVT = ToF64 ? MVT::v8i32 : MVT::v16i32;

    // The lanes added by widening are zero for strict FP so that converting
    // them cannot raise spurious exceptions; otherwise undef is sufficient.
    SDValue Padding;
    if (IsStrict)
      Padding = DAG.getConstant(0, DL, WideSrcVT);
    else
      Padding = DAG.getUNDEF(WideSrcVT);
    SDValue WideSrc = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideSrcVT, Padding,
                                  Src, DAG.getIntPtrConstant(0, DL));

    if (!IsStrict) {
      SDValue WideRes = DAG.getNode(ISD::UINT_TO_FP, DL, WideFPVT, WideSrc);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
                         DAG.getIntPtrConstant(0, DL));
    }

    // Strict form: thread the incoming chain through the wide conversion and
    // hand both the narrowed value and the outgoing chain back to the caller.
    SDValue WideRes =
        DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideFPVT, MVT::Other},
                    {Op->getOperand(0), WideSrc});
    SDValue OutChain = WideRes.getValue(1);
    SDValue NarrowRes = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
                                    DAG.getIntPtrConstant(0, DL));
    return DAG.getMergeValues({NarrowRes, OutChain}, DL);
  }

  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
      Op->getSimpleValueType(0) == MVT::v4f64) {
    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, Src);

    // Broadcast-load the f64 with bit pattern 0x4330000000000000 (2^52) from
    // the constant pool.
    Constant *BiasCst = ConstantFP::get(
        *DAG.getContext(),
        APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
    auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    SDValue BiasAddr = DAG.getConstantPool(BiasCst, PtrVT, Align(8));
    SDVTList LoadVTs = DAG.getVTList(MVT::v4f64, MVT::Other);
    SDValue LoadOps[] = {DAG.getEntryNode(), BiasAddr};
    SDValue VBias = DAG.getMemIntrinsicNode(
        X86ISD::VBROADCAST_LOAD, DL, LoadVTs, LoadOps, MVT::f64,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
        MachineMemOperand::MOLoad);

    // OR the zero-extended integer into the bias bits, reinterpret the result
    // as double and subtract the bias again.
    SDValue Biased = DAG.getNode(ISD::OR, DL, MVT::v4i64, Wide,
                                 DAG.getBitcast(MVT::v4i64, VBias));
    Biased = DAG.getBitcast(MVT::v4f64, Biased);

    if (!IsStrict)
      return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Biased, VBias);
    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
                       {Op.getOperand(0), Biased, VBias});
  }

  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;

  const bool Is128 = VecIntVT == MVT::v4i32;
  const MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // Any other destination (e.g. v4f64 without AVX) is not handled here.
  if (Op->getSimpleValueType(0) != VecFloatVT)
    return SDValue();

  // Both the #ifdef and the #else variant share the two splat constants and
  // the shift by 16.
  SDValue LowBias = DAG.getConstant(0x4b000000, DL, VecIntVT);
  SDValue HighBias = DAG.getConstant(0x53000000, DL, VecIntVT);
  SDValue ShiftAmt = DAG.getConstant(16, DL, VecIntVT);
  SDValue SrcHigh = DAG.getNode(ISD::SRL, DL, VecIntVT, Src, ShiftAmt);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    const MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // BLENDI of Val with the splat constant using immediate 0xaa, carried out
    // on the i16 view. The result stays in the i16 type because the caller
    // bitcasts it to float right away.
    auto BlendWithConstant = [&](SDValue Val, SDValue Cst) {
      SDValue CstAsI16 = DAG.getBitcast(VecI16VT, Cst);
      SDValue ValAsI16 = DAG.getBitcast(VecI16VT, Val);
      return DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, ValAsI16, CstAsI16,
                         DAG.getTargetConstant(0xaa, DL, MVT::i8));
    };
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    Low = BlendWithConstant(Src, LowBias);
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    High = BlendWithConstant(SrcHigh, HighBias);
  } else {
    SDValue Low16Mask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue SrcLow = DAG.getNode(ISD::AND, DL, VecIntVT, Src, Low16Mask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, SrcLow, LowBias);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, SrcHigh, HighBias);
  }

  // Splat of the float with bit pattern 0x53000080, i.e. 0x1.0p39f + 0x1.0p23f.
  SDValue HighCorrection = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  // NOTE: By using fsub of a positive constant instead of fadd of a negative
  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
  // enabled. See PR24512.
  SDValue HighAsFP = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue LowAsFP = DAG.getBitcast(VecFloatVT, Low);

  // return (float4) lo + fhi;
  if (!IsStrict) {
    SDValue FHigh =
        DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighAsFP, HighCorrection);
    return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowAsFP, FHigh);
  }

  // Strict form: the subtraction consumes the incoming chain and the addition
  // consumes the chain produced by the subtraction.
  SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
                              {Op.getOperand(0), HighAsFP, HighCorrection});
  return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
                     {FHigh.getValue(1), LowAsFP, FHigh});
}

20147

20148

/// Dispatch a vector (STRICT_)UINT_TO_FP to the helper matching the type of
/// its integer input. The input is operand 1 for strict opcodes and operand 0
/// otherwise. Any source type not listed below is a hard error.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  const bool IsStrict = Op.getNode()->isStrictFPOpcode();
  SDValue Input = Op.getOperand(IsStrict ? 1 : 0);
  const MVT InputVT = Input.getSimpleValueType();
  SDLoc Loc(Op);

  switch (InputVT.SimpleTy) {
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, Loc);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v2i64:
  case MVT::v4i64:
    return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  }
}

20168

20169

/// Custom lowering for UINT_TO_FP and STRICT_UINT_TO_FP.
///
/// \param Op  The conversion node. For strict opcodes operand 0 is the chain
///            and operand 1 the integer source; otherwise operand 0 is the
///            source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns \p Op itself when the node is left as is, a replacement value
///          (for strict opcodes on the paths built here, a node that also
///          produces a chain), or an empty SDValue for i64 -> f32 on 64-bit
///          targets once the earlier cases have been ruled out.
///
/// Cases are tried in this order: f128 destinations, vector destinations,
/// vectorizeExtractedCast, conversions returned unchanged under AVX512,
/// i32 zero-extended to i64 plus a signed conversion on 64-bit targets,
/// LowerI64IntToFP_AVX512DQ, the LowerUINT_TO_FP_i64 / LowerUINT_TO_FP_i32
/// helpers, and finally an FILD from a stack temporary.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  // Non-strict nodes have no chain operand; memory operations created below
  // hang off the entry node in that case.
  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  if (DstVT == MVT::f128)
    return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));

  if (DstVT.isVector())
    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
    return Extract;

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
    return Op;
  }

  // Promote i32 to i64 and use a signed conversion on 64-bit targets.
  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
                         {Chain, Src});
    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
  }

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
    return V;

  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  Align SlotAlign(8);
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
  if (SrcVT == MVT::i32) {
    // Store the i32 at offset 0 and a zero i32 at offset 4, then let FILD
    // read the whole slot as an i64.
    SDValue OffsetSlot =
        DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MPI.getWithOffset(4), SlotAlign);
    std::pair<SDValue, SDValue> Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
    if (IsStrict)
      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

    return Tmp.first;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Src;
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  }
  SDValue Store =
      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. We must be careful to do the computation in x87 extended
  // precision, not in SSE.
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot };
  SDValue Fild =
      DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
                              SlotAlign, MachineMemOperand::MOLoad);
  Chain = Fild.getValue(1);

  // Check whether the sign bit is set. Src is still the original i64 operand
  // on this path (it is only rewritten on the i32 path, which has returned).
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Src, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
  APInt FF(64, 0x5F80000000000000ULL);
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF), PtrVT);
  Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
  // The f32 load below may address the pool entry at offset 4, so it can only
  // rely on the alignment that the entry and that offset have in common.
  CPAlignment = commonAlignment(CPAlignment, 4);

  // Load the value out, extending it from f32 to f80.
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      CPAlignment);
  Chain = Fudge.getValue(1);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  if (IsStrict) {
    SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
                              {Chain, Fild, Fudge});
    // STRICT_FP_ROUND can't handle equal types.
    if (DstVT == MVT::f80)
      return Add;
    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
                       {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
  }
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}

20294

20295

// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation

20296

// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),

20297

// just return an SDValue().

20298

// Otherwise it is assumed to be a conversion from one of f32, f64 or f80

20299

// to i16, i32 or i64, and we lower it to a legal sequence and return the

20300

// result.

20301

SDValue

20302

X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,

20303

bool IsSigned, SDValue &Chain) const {

20304

bool IsStrict = Op->isStrictFPOpcode();

20305

SDLoc DL(Op);

20306

20307

EVT DstTy = Op.getValueType();

20308

SDValue Value = Op.getOperand(IsStrict ? 1 : 0);

20309

EVT TheVT = Value.getValueType();

20310

auto PtrVT = getPointerTy(DAG.getDataLayout());

20311

20312

if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {

20313

// f16 must be promoted before using the lowering in this routine.

20314

// fp128 does not use this lowering.

20315

return SDValue();

20316

}

20317

20318

// If using FIST to compute an unsigned i64, we'll need some fixup

20319

// to handle values above the maximum signed i64. A FIST is always

20320

// used for the 32-bit subtarget, but also for f80 on a 64-bit target.

20321

bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

20322

20323

// FIXME: This does not generate an invalid exception if the input does not

20324

// fit in i32. PR44019

20325

if (!IsSigned && DstTy != MVT::i64) {

20326

// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.

20327

// The low 32 bits of the fist result will have the correct uint32 result.

20328

assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT")((DstTy == MVT::i32 && "Unexpected FP_TO_UINT") ? static_cast
<void> (0) : __assert_fail ("DstTy == MVT::i32 && \"Unexpected FP_TO_UINT\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20328, __PRETTY_FUNCTION__));

20329

DstTy = MVT::i64;

20330

}

20331

20332

assert(DstTy.getSimpleVT() <= MVT::i64 &&((DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT
() >= MVT::i16 && "Unknown FP_TO_INT to lower!") ?
static_cast<void> (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20334, __PRETTY_FUNCTION__))

20333

DstTy.getSimpleVT() >= MVT::i16 &&((DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT
() >= MVT::i16 && "Unknown FP_TO_INT to lower!") ?
static_cast<void> (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20334, __PRETTY_FUNCTION__))

20334

"Unknown FP_TO_INT to lower!")((DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT
() >= MVT::i16 && "Unknown FP_TO_INT to lower!") ?
static_cast<void> (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20334, __PRETTY_FUNCTION__));

20335

20336

// We lower FP->int64 into FISTP64 followed by a load from a temporary

20337

// stack slot.

20338

MachineFunction &MF = DAG.getMachineFunction();

20339

unsigned MemSize = DstTy.getStoreSize();

20340

int SSFI =

20341

MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);

20342

SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

20343

20344

Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

20345

20346

SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

20347

20348

if (UnsignedFixup) {

20349

//

20350

// Conversion to unsigned i64 is implemented with a select,

20351

// depending on whether the source value fits in the range

20352

// of a signed i64. Let Thresh be the FP equivalent of

20353

// 0x8000000000000000ULL.

20354

//

20355

// Adjust = (Value < Thresh) ? 0 : 0x80000000;

20356

// FltOfs = (Value < Thresh) ? 0 : 0x80000000;

20357

// FistSrc = (Value - FltOfs);

20358

// Fist-to-mem64 FistSrc

20359

// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent

20360

// to XOR'ing the high 32 bits with Adjust.

20361

//

20362

// Being a power of 2, Thresh is exactly representable in all FP formats.

20363

// For X87 we'd like to use the smallest FP type for this constant, but

20364

// for DAG type consistency we have to match the FP operand type.

20365

20366

APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));

20367

LLVM_ATTRIBUTE_UNUSED__attribute__((__unused__)) APFloat::opStatus Status = APFloat::opOK;

20368

bool LosesInfo = false;

20369

if (TheVT == MVT::f64)

20370

// The rounding mode is irrelevant as the conversion should be exact.

20371

Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,

20372

&LosesInfo);

20373

else if (TheVT == MVT::f80)

20374

Status = Thresh.convert(APFloat::x87DoubleExtended(),

20375

APFloat::rmNearestTiesToEven, &LosesInfo);

20376

20377

assert(Status == APFloat::opOK && !LosesInfo &&((Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"
) ? static_cast<void> (0) : __assert_fail ("Status == APFloat::opOK && !LosesInfo && \"FP conversion should have been exact\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20378, __PRETTY_FUNCTION__))

20378

"FP conversion should have been exact")((Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"
) ? static_cast<void> (0) : __assert_fail ("Status == APFloat::opOK && !LosesInfo && \"FP conversion should have been exact\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20378, __PRETTY_FUNCTION__));

20379

20380

SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

20381

20382

EVT ResVT = getSetCCResultType(DAG.getDataLayout(),

20383

*DAG.getContext(), TheVT);

20384

SDValue Cmp;

20385

if (IsStrict) {

20386

Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, SDNodeFlags(),

20387

Chain, /*IsSignaling*/ true);

20388

Chain = Cmp.getValue(1);

20389

} else {

20390

Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);

20391

}

20392

20393

Adjust = DAG.getSelect(DL, MVT::i64, Cmp,

20394

DAG.getConstant(0, DL, MVT::i64),

20395

DAG.getConstant(APInt::getSignMask(64),

20396

DL, MVT::i64));

20397

SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,

20398

DAG.getConstantFP(0.0, DL, TheVT),

20399

ThreshVal);

20400

20401

if (IsStrict) {

20402

Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},

20403

{ Chain, Value, FltOfs });

20404

Chain = Value.getValue(1);

20405

} else

20406

Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);

20407

}

20408

20409

MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

20410

20411

// FIXME This causes a redundant load/store if the SSE-class value is already

20412

// in memory, such as if it is on the callstack.

20413

if (isScalarFPTypeInSSEReg(TheVT)) {

20414

assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!")((DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"
) ? static_cast<void> (0) : __assert_fail ("DstTy == MVT::i64 && \"Invalid FP_TO_SINT to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20414, __PRETTY_FUNCTION__));

20415

Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);

20416

SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);

20417

SDValue Ops[] = { Chain, StackSlot };

20418

20419

unsigned FLDSize = TheVT.getStoreSize();

20420

assert(FLDSize <= MemSize && "Stack slot not big enough")((FLDSize <= MemSize && "Stack slot not big enough"
) ? static_cast<void> (0) : __assert_fail ("FLDSize <= MemSize && \"Stack slot not big enough\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20420, __PRETTY_FUNCTION__));

20421

MachineMemOperand *MMO = MF.getMachineMemOperand(

20422

MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));

20423

Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);

20424

Chain = Value.getValue(1);

20425

}

20426

20427

// Build the FP_TO_INT*_IN_MEM

20428

MachineMemOperand *MMO = MF.getMachineMemOperand(

20429

MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));

20430

SDValue Ops[] = { Chain, Value, StackSlot };

20431

SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,

20432

DAG.getVTList(MVT::Other),

20433

Ops, DstTy, MMO);

20434

20435

SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);

20436

Chain = Res.getValue(1);

20437

20438

// If we need an unsigned fixup, XOR the result with adjust.

20439

if (UnsignedFixup)

20440

Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

20441

20442

return Res;

20443

}

20444

20445

/// Lower a vector ANY_EXTEND / ZERO_EXTEND.
///
/// v32i16 without BWI is handed to splitVectorIntUnary, and with Int256 the
/// node is returned unchanged. Otherwise the result is assembled from two
/// halves: an extend-in-register of the input for the low half and an
/// unpack-high of the input (against zero for ZERO_EXTEND, undef for
/// ANY_EXTEND) for the high half.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  const MVT VT = Op.getSimpleValueType();
  SDValue Input = Op.getOperand(0);
  const MVT InVT = Input.getSimpleValueType();
  SDLoc DL(Op);
  const unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  const unsigned InRegExtOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG);
  }

  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpmovzdq for 2 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 2 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  const MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue LowHalf = DAG.getNode(InRegExtOpc, DL, HalfVT, Input);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Input);
  if (Shuf && hasIdenticalHalvesShuffleMask(Shuf->getMask()))
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LowHalf, LowHalf);

  // The second unpack operand supplies the upper bits of each widened lane:
  // zero for a zero-extend, undef for an any-extend.
  SDValue ZeroVec = DAG.getConstant(0, DL, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  SDValue Filler = Undef;
  if (Opc == ISD::ZERO_EXTEND)
    Filler = ZeroVec;
  SDValue HighHalf = getUnpackh(DAG, DL, InVT, Input, Filler);
  HighHalf = DAG.getBitcast(HalfVT, HighHalf);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LowHalf, HighHalf);
}

20506

20507

// Helper to split and extend a v16i1 mask to v16i8 or v16i16.

20508

static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,

20509

const SDLoc &dl, SelectionDAG &DAG) {

20510

assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.")(((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::v16i8 || VT == MVT::v16i16) && \"Unexpected VT.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20510, __PRETTY_FUNCTION__));

20511

SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,

20512

DAG.getIntPtrConstant(0, dl));

20513

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,

20514

DAG.getIntPtrConstant(8, dl));

20515

Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);

20516

Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);

20517

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);

20518

return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

20519

}

20520

20521

/// Lower a ZERO_EXTEND whose operand is a vXi1 mask vector.
///
/// Results with non-i8 elements are built as SIGN_EXTEND followed by a
/// logical right shift. vXi8 results are built as a select between vectors of
/// ones and zeros, extending the element type when BWI is missing and widening
/// to 512 bits when VLX is missing, then truncating/extracting back to the
/// requested type.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  const MVT VT = Op->getSimpleValueType(0);
  SDValue Mask = Op->getOperand(0);
  MVT InVT = Mask.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned EltCount = VT.getVectorNumElements();

  // Everything except vXi8 can be done with a sign_extend and a shift, which
  // avoids a constant pool load.
  if (VT.getVectorElementType() != MVT::i8) {
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
    SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
    return DAG.getNode(ISD::SRL, DL, VT, SExt, ShiftAmt);
  }

  const bool HasBWI = Subtarget.hasBWI();

  // Without BWI, and if v16i32 is to be avoided, split and concatenate.
  if (!HasBWI && EltCount == 16 && !Subtarget.canExtendTo512DQ())
    return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, Mask, DL, DAG);

  // Without BWI the select is performed on i32 elements instead of i8.
  const MVT ExtVT = HasBWI ? VT : MVT::getVectorVT(MVT::i32, EltCount);

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    EltCount *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, EltCount);
    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                       Mask, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), EltCount);
  }

  // Pick 1 where the mask bit is set and 0 elsewhere.
  SDValue AllOnes = DAG.getConstant(1, DL, WideVT);
  SDValue AllZeros = DAG.getConstant(0, DL, WideVT);
  SDValue Result = DAG.getSelect(DL, WideVT, Mask, AllOnes, AllZeros);

  // Undo the element extension, if any.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(MVT::i8, EltCount);
    Result = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Result);
  }

  // Undo the widening, if any, by taking the low subvector.
  if (WideVT != VT) {
    SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result, ZeroIdx);
  }

  return Result;
}

20578

20579

/// Lower ZERO_EXTEND: vXi1 sources are handed to LowerZERO_EXTEND_Mask, every
/// other source goes to LowerAVXExtend (AVX is asserted for that path).
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  const MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

  if (IsMaskSource)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DAG, Subtarget);
}

20590

20591

/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.

20592

/// It makes use of the fact that vectors with enough leading sign/zero bits

20593

/// prevent the PACKSS/PACKUS from saturating the results.

20594

/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates

20595

/// within each 128-bit lane.

20596

static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,

20597

const SDLoc &DL, SelectionDAG &DAG,

20598

const X86Subtarget &Subtarget) {

20599

assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&(((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode") ? static_cast<void> (0) : __assert_fail
("(Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && \"Unexpected PACK opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20600, __PRETTY_FUNCTION__))

20600

"Unexpected PACK opcode")(((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode") ? static_cast<void> (0) : __assert_fail
("(Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && \"Unexpected PACK opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20600, __PRETTY_FUNCTION__));

20601

assert(DstVT.isVector() && "VT not a vector?")((DstVT.isVector() && "VT not a vector?") ? static_cast
<void> (0) : __assert_fail ("DstVT.isVector() && \"VT not a vector?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20601, __PRETTY_FUNCTION__));

20602

20603

// Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).

20604

if (!Subtarget.hasSSE2())

20605

return SDValue();

20606

20607

EVT SrcVT = In.getValueType();

20608

20609

// No truncation required, we might get here due to recursive calls.

20610

if (SrcVT == DstVT)

20611

return In;

20612

20613

// We only support vector truncation to 64bits or greater from a

20614

// 128bits or greater source.

20615

unsigned DstSizeInBits = DstVT.getSizeInBits();

20616

unsigned SrcSizeInBits = SrcVT.getSizeInBits();

20617

if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)

20618

return SDValue();

20619

20620

unsigned NumElems = SrcVT.getVectorNumElements();

20621

if (!isPowerOf2_32(NumElems))

20622

return SDValue();

20623

20624

LLVMContext &Ctx = *DAG.getContext();

20625

assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation")((DstVT.getVectorNumElements() == NumElems && "Illegal truncation"
) ? static_cast<void> (0) : __assert_fail ("DstVT.getVectorNumElements() == NumElems && \"Illegal truncation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20625, __PRETTY_FUNCTION__));

20626

assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation")((SrcSizeInBits > DstSizeInBits && "Illegal truncation"
) ? static_cast<void> (0) : __assert_fail ("SrcSizeInBits > DstSizeInBits && \"Illegal truncation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20626, __PRETTY_FUNCTION__));

20627

20628

EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

20629

20630

// Pack to the largest type possible:

20631

// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.

20632

EVT InVT = MVT::i16, OutVT = MVT::i8;

20633

if (SrcVT.getScalarSizeInBits() > 16 &&

20634

(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {

20635

InVT = MVT::i32;

20636

OutVT = MVT::i16;

20637

}

20638

20639

// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.

20640

if (SrcVT.is128BitVector()) {

20641

InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());

20642

OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());

20643

In = DAG.getBitcast(InVT, In);

20644

SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));

20645

Res = extractSubVector(Res, 0, DAG, DL, 64);

20646

return DAG.getBitcast(DstVT, Res);

20647

}

20648

20649

// Split lower/upper subvectors.

20650

SDValue Lo, Hi;

20651

std::tie(Lo, Hi) = splitVector(In, DAG, DL);

20652

20653

unsigned SubSizeInBits = SrcSizeInBits / 2;

20654

InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());

20655

OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

20656

20657

// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.

20658

if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {

20659

Lo = DAG.getBitcast(InVT, Lo);

20660

Hi = DAG.getBitcast(InVT, Hi);

20661

SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

20662

return DAG.getBitcast(DstVT, Res);

20663

}

20664

20665

// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.

20666

// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).

20667

if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {

20668

Lo = DAG.getBitcast(InVT, Lo);

20669

Hi = DAG.getBitcast(InVT, Hi);

20670

SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

20671

20672

// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),

20673

// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).

20674

// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.

20675

SmallVector<int, 64> Mask;

20676

int Scale = 64 / OutVT.getScalarSizeInBits();

20677

narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);

20678

Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

20679

20680

if (DstVT.is256BitVector())

20681

return DAG.getBitcast(DstVT, Res);

20682

20683

// If 512bit -> 128bit truncate another stage.

20684

EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

20685

Res = DAG.getBitcast(PackedVT, Res);

20686

return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);

20687

}

20688

20689

// Recursively pack lower/upper subvectors, concat result and pack again.

20690

assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater")((SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"
) ? static_cast<void> (0) : __assert_fail ("SrcSizeInBits >= 256 && \"Expected 256-bit vector or greater\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20690, __PRETTY_FUNCTION__));

20691

EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);

20692

Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);

20693

Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

20694

20695

PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

20696

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);

20697

return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);

20698

}

20699

20700

/// Lower a TRUNCATE whose result is a vXi1 mask vector.
///
/// The low bit of every source element is moved into the sign position
/// (unless ComputeNumSignBits shows every bit already equals the sign bit)
/// and the result is produced with a SETCC against zero. Sources with
/// elements of 16 bits or less are handled directly with BWI; otherwise they
/// are first sign-extended to wider elements, or split in two when 16
/// elements cannot be extended to 512 bits.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);
  MVT InVT = Src.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned SignBitIdx = InVT.getScalarSizeInBits() - 1;
  const bool HasNarrowElts = InVT.getScalarSizeInBits() <= 16;

  if (HasNarrowElts && Subtarget.hasBWI()) {
    // legal, will go to VPMOVB2M, VPMOVW2M
    if (DAG.ComputeNumSignBits(Src) < InVT.getScalarSizeInBits()) {
      // The lsb has to be shifted into the sign position. Packed byte shifts
      // are not supported natively, so do the shift on words.
      MVT WordVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
      Src = DAG.getNode(ISD::SHL, DL, WordVT,
                        DAG.getBitcast(WordVT, Src),
                        DAG.getConstant(SignBitIdx, DL, WordVT));
      Src = DAG.getBitcast(InVT, Src);
    }
    SDValue Zero = DAG.getConstant(0, DL, InVT);
    return DAG.getSetCC(DL, VT, Zero, Src, ISD::SETGT);
  }

  if (HasNarrowElts) {
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");

    // A wider, supported element type is required. With 8 elements we simply
    // extend to v8i32 or v8i64. With 16 elements we extend to v16i32 unless
    // 512-bit vectors are explicitly being avoided; in that case the input is
    // split into two 8 element vectors that can be extended to v8i32,
    // truncated and concatenated. A v16i8 input cannot be split directly, so
    // its high elements are shuffled to the low half and
    // sign_extend_vector_inreg is used.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
      SDValue LoHalf, HiHalf;
      if (InVT == MVT::v16i8) {
        LoHalf =
            DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Src);
        HiHalf = DAG.getVectorShuffle(
            InVT, DL, Src, Src,
            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
        HiHalf =
            DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, HiHalf);
      } else {
        assert(InVT == MVT::v16i16 && "Unexpected VT!");
        LoHalf = extract128BitVector(Src, 0, DAG, DL);
        HiHalf = extract128BitVector(Src, 8, DAG, DL);
      }
      // Now that the input is split, emit two truncates and a concat. The
      // truncates will trigger legalization to come back to this function.
      LoHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, LoHalf);
      HiHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, HiHalf);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoHalf, HiHalf);
    }

    // Either there are 8 elements or 512-bit vectors are allowed. With VLX
    // the narrowest vector that does the job is preferred, hence vXi32.
    MVT WideEltVT =
        Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
    MVT WideVT = MVT::getVectorVT(WideEltVT, NumElts);
    Src = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, Src);
    InVT = WideVT;
    SignBitIdx = InVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(Src) < InVT.getScalarSizeInBits()) {
    // The lsb has to be shifted into the sign position.
    SDValue ShiftAmt = DAG.getConstant(SignBitIdx, DL, InVT);
    Src = DAG.getNode(ISD::SHL, DL, InVT, Src, ShiftAmt);
  }

  SDValue Zero = DAG.getConstant(0, DL, InVT);

  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  if (Subtarget.hasDQI())
    return DAG.getSetCC(DL, VT, Zero, Src, ISD::SETGT);

  return DAG.getSetCC(DL, VT, Src, Zero, ISD::SETNE);
}

20780

20781

/// Custom lowering for vector TRUNCATE.
///
/// In order, this:
///  - handles a few illegal source types on behalf of the type legalizer,
///  - hands vXi1 results to LowerTruncateVecI1,
///  - leaves the node untouched (or splits v32i16 without BWI) on AVX512,
///  - tries PACKUS / PACKSS when enough leading zero / sign bits are known,
///  - finally lowers the remaining 256-bit -> 128-bit cases with shuffles.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);
  const MVT InVT = Src.getSimpleValueType();
  const unsigned SrcEltBits = InVT.getScalarSizeInBits();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  // If we're called by the type legalizer, handle a few cases.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
    const bool IsSplittableSrc =
        InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64;

    // Everything else is left to default legalization.
    if (!IsSplittableSrc || !VT.is128BitVector())
      return SDValue();

    assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
           "Unexpected subtarget!");
    // The default behavior is to truncate one step, concatenate, and then
    // truncate the remainder. Producing two 64-bit results and concatenating
    // those is preferred here.
    std::pair<SDValue, SDValue> SrcHalves = DAG.SplitVector(Src, DL);
    std::pair<EVT, EVT> DstHalfVTs = DAG.GetSplitDestVTs(VT);

    SDValue LoTrunc =
        DAG.getNode(ISD::TRUNCATE, DL, DstHalfVTs.first, SrcHalves.first);
    SDValue HiTrunc =
        DAG.getNode(ISD::TRUNCATE, DL, DstHalfVTs.second, SrcHalves.second);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoTrunc, HiTrunc);
  }

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    const bool HasBWI = Subtarget.hasBWI();

    if (InVT == MVT::v32i16 && !HasBWI) {
      assert(VT == MVT::v32i8 && "Unexpected VT!");
      return splitVectorIntUnary(Op, DAG);
    }

    // Word to byte is only available under BWI. Otherwise we have to promote
    // to v16i32 and then truncate that, but only if we haven't been asked to
    // avoid 512-bit vectors. The actual promotion to v16i32 will be handled
    // by isel patterns.
    if (InVT != MVT::v16i16 || HasBWI || Subtarget.canExtendTo512DQ())
      return Op;
  }

  const unsigned PackSignBits =
      std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  const unsigned PackZeroBits = Subtarget.hasSSE41() ? PackSignBits : 8;

  // Truncate with PACKUS if the vector has leading zero bits that extend all
  // the way to the packed/truncated value.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known = DAG.computeKnownBits(Src);
  if ((SrcEltBits - PackZeroBits) <= Known.countMinLeadingZeros()) {
    SDValue Packed =
        truncateVectorWithPACK(X86ISD::PACKUS, VT, Src, DL, DAG, Subtarget);
    if (Packed)
      return Packed;
  }

  // Truncate with PACKSS if the vector has sign bits that extend all the way
  // to the packed/truncated value.
  if ((SrcEltBits - PackSignBits) < DAG.ComputeNumSignBits(Src)) {
    SDValue Packed =
        truncateVectorWithPACK(X86ISD::PACKSS, VT, Src, DL, DAG, Subtarget);
    if (Packed)
      return Packed;
  }

  // Handle truncation of V256 to V128 using shuffles.
  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

  const bool HasInt256 = Subtarget.hasInt256();

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (HasInt256) {
      static const int EvenDwords256[] = {0, 2, 4, 6, -1, -1, -1, -1};
      SDValue Dwords = DAG.getBitcast(MVT::v8i32, Src);
      Dwords =
          DAG.getVectorShuffle(MVT::v8i32, DL, Dwords, Dwords, EvenDwords256);
      SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Dwords, ZeroIdx);
    }

    // Otherwise pick the even dwords out of the two 128-bit halves.
    SDValue LoIdx = DAG.getIntPtrConstant(0, DL);
    SDValue LoHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, Src, LoIdx);
    SDValue HiIdx = DAG.getIntPtrConstant(2, DL);
    SDValue HiHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, Src, HiIdx);
    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);
    static const int EvenDwords128[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, LoHalf, HiHalf, EvenDwords128);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (HasInt256) {
      SDValue Bytes = DAG.getBitcast(MVT::v32i8, Src);

      // The PSHUFB mask:
      static const int LowWordBytes256[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                            -1, -1, -1, -1, -1, -1, -1, -1,
                                            16, 17, 20, 21, 24, 25, 28, 29,
                                            -1, -1, -1, -1, -1, -1, -1, -1 };
      Bytes =
          DAG.getVectorShuffle(MVT::v32i8, DL, Bytes, Bytes, LowWordBytes256);
      SDValue Qwords = DAG.getBitcast(MVT::v4i64, Bytes);

      static const int GatherQwords[] = {0, 2, -1, -1};
      Qwords =
          DAG.getVectorShuffle(MVT::v4i64, DL, Qwords, Qwords, GatherQwords);
      SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
      Qwords =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, Qwords, ZeroIdx);
      return DAG.getBitcast(VT, Qwords);
    }

    SDValue LoIdx = DAG.getIntPtrConstant(0, DL);
    SDValue LoHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, Src, LoIdx);

    SDValue HiIdx = DAG.getIntPtrConstant(4, DL);
    SDValue HiHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, Src, HiIdx);

    LoHalf = DAG.getBitcast(MVT::v16i8, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v16i8, HiHalf);

    // The PSHUFB mask:
    static const int LowWordBytes128[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                          -1, -1, -1, -1, -1, -1, -1, -1};

    LoHalf = DAG.getVectorShuffle(MVT::v16i8, DL, LoHalf, LoHalf,
                                  LowWordBytes128);
    HiHalf = DAG.getVectorShuffle(MVT::v16i8, DL, HiHalf, HiHalf,
                                  LowWordBytes128);

    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);

    // The MOVLHPS Mask:
    static const int JoinLowHalves[] = {0, 1, 4, 5};
    SDValue Joined =
        DAG.getVectorShuffle(MVT::v4i32, DL, LoHalf, HiHalf, JoinLowHalves);
    return DAG.getBitcast(MVT::v8i16, Joined);
  }

  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
    // Use an AND to zero the upper bits for PACKUS.
    SDValue LowByteMask = DAG.getConstant(255, DL, InVT);
    SDValue Masked = DAG.getNode(ISD::AND, DL, InVT, Src, LowByteMask);

    SDValue LoIdx = DAG.getIntPtrConstant(0, DL);
    SDValue LoHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, Masked, LoIdx);
    SDValue HiIdx = DAG.getIntPtrConstant(8, DL);
    SDValue HiHalf =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, Masked, HiIdx);
    return DAG.getNode(X86ISD::PACKUS, DL, VT, LoHalf, HiHalf);
  }

  llvm_unreachable("All 256->128 cases should have been handled above!");
}

20935

20936

SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {

20937

bool IsStrict = Op->isStrictFPOpcode();

20938

bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||

20939

Op.getOpcode() == ISD::STRICT_FP_TO_SINT;

20940

MVT VT = Op->getSimpleValueType(0);

20941

SDValue Src = Op.getOperand(IsStrict ? 1 : 0);

20942

MVT SrcVT = Src.getSimpleValueType();

20943

SDLoc dl(Op);

20944

20945

if (VT.isVector()) {

20946

if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {

20947

MVT ResVT = MVT::v4i32;

20948

MVT TruncVT = MVT::v4i1;

20949

unsigned Opc;

20950

if (IsStrict)

20951

Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;

20952

else

20953

Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

20954

20955

if (!IsSigned && !Subtarget.hasVLX()) {

20956

assert(Subtarget.useAVX512Regs() && "Unexpected features!")((Subtarget.useAVX512Regs() && "Unexpected features!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.useAVX512Regs() && \"Unexpected features!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20956, __PRETTY_FUNCTION__));

20957

// Widen to 512-bits.

20958

ResVT = MVT::v8i32;

20959

TruncVT = MVT::v8i1;

20960

Opc = Op.getOpcode();

20961

// Need to concat with zero vector for strict fp to avoid spurious

20962

// exceptions.

20963

// TODO: Should we just do this for non-strict as well?

20964

SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)

20965

: DAG.getUNDEF(MVT::v8f64);

20966

Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,

20967

DAG.getIntPtrConstant(0, dl));

20968

}

20969

SDValue Res, Chain;

20970

if (IsStrict) {

20971

Res =

20972

DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});

20973

Chain = Res.getValue(1);

20974

} else {

20975

Res = DAG.getNode(Opc, dl, ResVT, Src);

20976

}

20977

20978

Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);

20979

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,

20980

DAG.getIntPtrConstant(0, dl));

20981

if (IsStrict)

20982

return DAG.getMergeValues({Res, Chain}, dl);

20983

return Res;

20984

}

20985

20986

// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.

20987

if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {

20988

assert(!IsSigned && "Expected unsigned conversion!")((!IsSigned && "Expected unsigned conversion!") ? static_cast
<void> (0) : __assert_fail ("!IsSigned && \"Expected unsigned conversion!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20988, __PRETTY_FUNCTION__));

20989

assert(Subtarget.useAVX512Regs() && "Requires avx512f")((Subtarget.useAVX512Regs() && "Requires avx512f") ? static_cast
<void> (0) : __assert_fail ("Subtarget.useAVX512Regs() && \"Requires avx512f\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20989, __PRETTY_FUNCTION__));

20990

return Op;

20991

}

20992

20993

// Widen vXi32 fp_to_uint with avx512f to 512-bit source.

20994

if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&

20995

(SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {

20996

assert(!IsSigned && "Expected unsigned conversion!")((!IsSigned && "Expected unsigned conversion!") ? static_cast
<void> (0) : __assert_fail ("!IsSigned && \"Expected unsigned conversion!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20996, __PRETTY_FUNCTION__));

20997

assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&((Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
"Unexpected features!") ? static_cast<void> (0) : __assert_fail
("Subtarget.useAVX512Regs() && !Subtarget.hasVLX() && \"Unexpected features!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20998, __PRETTY_FUNCTION__))

20998

"Unexpected features!")((Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
"Unexpected features!") ? static_cast<void> (0) : __assert_fail
("Subtarget.useAVX512Regs() && !Subtarget.hasVLX() && \"Unexpected features!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20998, __PRETTY_FUNCTION__));

20999

MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;

21000

MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;

21001

// Need to concat with zero vector for strict fp to avoid spurious

21002

// exceptions.

21003

// TODO: Should we just do this for non-strict as well?

21004

SDValue Tmp =

21005

IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);

21006

Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,

21007

DAG.getIntPtrConstant(0, dl));

21008

21009

SDValue Res, Chain;

21010

if (IsStrict) {

21011

Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},

21012

{Op->getOperand(0), Src});

21013

Chain = Res.getValue(1);

21014

} else {

21015

Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);

21016

}

21017

21018

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,

21019

DAG.getIntPtrConstant(0, dl));

21020

21021

if (IsStrict)

21022

return DAG.getMergeValues({Res, Chain}, dl);

21023

return Res;

21024

}

21025

21026

// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.

21027

if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&

21028

(SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {

21029

assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&((Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
!Subtarget.hasVLX() && "Unexpected features!") ? static_cast
<void> (0) : __assert_fail ("Subtarget.useAVX512Regs() && Subtarget.hasDQI() && !Subtarget.hasVLX() && \"Unexpected features!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21030, __PRETTY_FUNCTION__))

21030

!Subtarget.hasVLX() && "Unexpected features!")((Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
!Subtarget.hasVLX() && "Unexpected features!") ? static_cast
<void> (0) : __assert_fail ("Subtarget.useAVX512Regs() && Subtarget.hasDQI() && !Subtarget.hasVLX() && \"Unexpected features!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21030, __PRETTY_FUNCTION__));

21031

MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;

21032

// Need to concat with zero vector for strict fp to avoid spurious

21033

// exceptions.

21034

// TODO: Should we just do this for non-strict as well?

21035

SDValue Tmp =

21036

IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);

21037

Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,

21038

DAG.getIntPtrConstant(0, dl));

21039

21040

SDValue Res, Chain;

21041

if (IsStrict) {

21042

Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},

21043

{Op->getOperand(0), Src});

21044

Chain = Res.getValue(1);

21045

} else {

21046

Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);

21047

}

21048

21049

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,

21050

DAG.getIntPtrConstant(0, dl));

21051

21052

if (IsStrict)

21053

return DAG.getMergeValues({Res, Chain}, dl);

21054

return Res;

21055

}

21056

21057

if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {

21058

if (!Subtarget.hasVLX()) {

21059

// Non-strict nodes without VLX can be widened to v4f32->v4i64 by type

21060

// legalizer and then widened again by vector op legalization.

21061

if (!IsStrict)

21062

return SDValue();

21063

21064

SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);

21065

SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,

21066

{Src, Zero, Zero, Zero});

21067

Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},

21068

{Op->getOperand(0), Tmp});

21069

SDValue Chain = Tmp.getValue(1);

21070

Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,

21071

DAG.getIntPtrConstant(0, dl));

21072

if (IsStrict)

21073

return DAG.getMergeValues({Tmp, Chain}, dl);

21074

return Tmp;

21075

}

21076

21077

assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL")((Subtarget.hasDQI() && Subtarget.hasVLX() &&
"Requires AVX512DQVL") ? static_cast<void> (0) : __assert_fail
("Subtarget.hasDQI() && Subtarget.hasVLX() && \"Requires AVX512DQVL\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21077, __PRETTY_FUNCTION__));

21078

SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,

21079

DAG.getUNDEF(MVT::v2f32));

21080

if (IsStrict) {

21081

unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI

21082

: X86ISD::STRICT_CVTTP2UI;

21083

return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});

21084

}

21085

unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

21086

return DAG.getNode(Opc, dl, VT, Tmp);

21087

}

21088

21089

return SDValue();

21090

}

21091

21092

assert(!VT.isVector())((!VT.isVector()) ? static_cast<void> (0) : __assert_fail
("!VT.isVector()", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21092, __PRETTY_FUNCTION__));

21093

21094

bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);

21095

21096

if (!IsSigned && UseSSEReg) {

21097

// Conversions from f32/f64 with AVX512 should be legal.

21098

if (Subtarget.hasAVX512())

21099

return Op;

21100

21101

// Use default expansion for i64.

21102

if (VT == MVT::i64)

21103

return SDValue();

21104

21105

assert(VT == MVT::i32 && "Unexpected VT!")((VT == MVT::i32 && "Unexpected VT!") ? static_cast<
void> (0) : __assert_fail ("VT == MVT::i32 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21105, __PRETTY_FUNCTION__));

21106

21107

// Promote i32 to i64 and use a signed operation on 64-bit targets.

21108

// FIXME: This does not generate an invalid exception if the input does not

21109

// fit in i32. PR44019

21110

if (Subtarget.is64Bit()) {

21111

SDValue Res, Chain;

21112

if (IsStrict) {

21113

Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},

21114

{ Op.getOperand(0), Src });

21115

Chain = Res.getValue(1);

21116

} else

21117

Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);

21118

21119

Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

21120

if (IsStrict)

21121

return DAG.getMergeValues({ Res, Chain }, dl);

21122

return Res;

21123

}

21124

21125

// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can

21126

// use fisttp which will be handled later.

21127

if (!Subtarget.hasSSE3())

21128

return SDValue();

21129

}

21130

21131

// Promote i16 to i32 if we can use a SSE operation or the type is f128.

21132

// FIXME: This does not generate an invalid exception if the input does not

21133

// fit in i16. PR44019

21134

if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {

21135

assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!")((IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"
) ? static_cast<void> (0) : __assert_fail ("IsSigned && \"Expected i16 FP_TO_UINT to have been promoted!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21135, __PRETTY_FUNCTION__));

21136

SDValue Res, Chain;

21137

if (IsStrict) {

21138

Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},

21139

{ Op.getOperand(0), Src });

21140

Chain = Res.getValue(1);

21141

} else

21142

Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);

21143

21144

Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

21145

if (IsStrict)

21146

return DAG.getMergeValues({ Res, Chain }, dl);

21147

return Res;

21148

}

21149

21150

// If this is a FP_TO_SINT using SSEReg we're done.

21151

if (UseSSEReg && IsSigned)

21152

return Op;

21153

21154

// fp128 needs to use a libcall.

21155

if (SrcVT == MVT::f128) {

21156

RTLIB::Libcall LC;

21157

if (IsSigned)

21158

LC = RTLIB::getFPTOSINT(SrcVT, VT);

21159

else

21160

LC = RTLIB::getFPTOUINT(SrcVT, VT);

21161

21162

SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();

21163

MakeLibCallOptions CallOptions;

21164

std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,

21165

SDLoc(Op), Chain);

21166

21167

if (IsStrict)

21168

return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);

21169

21170

return Tmp.first;

21171

}

21172

21173

// Fall back to X87.

21174

SDValue Chain;

21175

if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {

21176

if (IsStrict)

21177

return DAG.getMergeValues({V, Chain}, dl);

21178

return V;

21179

}

21180

21181

llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21181);

21182

}

21183

21184

/// Custom lowering entry point for LRINT/LLRINT nodes.
///
/// \param Op  The node to lower; operand 0 is the floating-point source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns \p Op itself when the source type lives in an SSE register (the
///          node is already Legal), otherwise whatever LRINT_LLRINTHelper
///          produces for the node.
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
                                             SelectionDAG &DAG) const {
  const MVT OperandVT = Op.getOperand(0).getSimpleValueType();

  // Only sources that are not held in an SSE register need the helper; an
  // SSE-register source means the node can be kept as is.
  if (!isScalarFPTypeInSSEReg(OperandVT))
    return LRINT_LLRINTHelper(Op.getNode(), DAG);

  return Op;
}

21195

21196

/// Lower an LRINT/LLRINT node through a stack temporary and X86ISD::FIST.
///
/// The source value is stored to a stack slot with X86ISD::FIST using the
/// destination type, and the integer result is then loaded back from that
/// slot. When the source type lives in an SSE register it is first spilled to
/// the same slot and reloaded as f80 with X86ISD::FLD.
///
/// \param N   The node to lower; operand 0 is the floating-point source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The load of the converted value, or an empty SDValue when the
///          source type is not f32, f64 or f80.
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
                                              SelectionDAG &DAG) const {
  SDValue Value = N->getOperand(0);
  const EVT DstVT = N->getValueType(0);
  const EVT SrcVT = Value.getValueType();

  // f16 must be promoted before using the lowering in this routine.
  // fp128 does not use this lowering.
  const bool IsSupportedSrc =
      SrcVT == MVT::f32 || SrcVT == MVT::f64 || SrcVT == MVT::f80;
  if (!IsSupportedSrc)
    return SDValue();

  SDLoc DL(N);
  const bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);

  // If we're converting from SSE, the stack slot needs to hold both types.
  // Otherwise it only needs to hold the DstVT.
  SDValue Slot = DAG.CreateStackTemporary(DstVT, UseSSE ? SrcVT : DstVT);
  const int SlotFI = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();
  MachinePointerInfo SlotInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI);

  SDValue Chain = DAG.getEntryNode();
  if (UseSSE) {
    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");

    // Spill the SSE value to the slot, then reload it as an f80 via FLD.
    Chain = DAG.getStore(Chain, DL, Value, Slot, SlotInfo);
    SDValue LoadOps[] = {Chain, Slot};
    Value = DAG.getMemIntrinsicNode(
        X86ISD::FLD, DL, DAG.getVTList(MVT::f80, MVT::Other), LoadOps, SrcVT,
        SlotInfo, /*Align*/ None, MachineMemOperand::MOLoad);
    Chain = Value.getValue(1);
  }

  // Store the value to the slot as a DstVT integer with FIST.
  SDValue FistOps[] = {Chain, Value, Slot};
  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                  FistOps, DstVT, SlotInfo, /*Align*/ None,
                                  MachineMemOperand::MOStore);

  // The integer result is whatever FIST left in the slot.
  return DAG.getLoad(DstVT, DL, Chain, Slot, SlotInfo);
}

21239

21240

/// Custom lowering for FP_EXTEND and its strict variant.
///
/// An f128 result is handed to LowerF128Call with the matching FPEXT libcall.
/// Otherwise the source is expected to be v2f32: it is concatenated with an
/// undef v2f32 to form a v4f32 and extended with X86ISD::VFPEXT (or
/// X86ISD::STRICT_VFPEXT, which also takes the incoming chain).
///
/// \param Op  The node to lower; for strict nodes operand 0 is the chain and
///            operand 1 the source, otherwise operand 0 is the source.
/// \param DAG The DAG in which replacement nodes are created.
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  const bool IsStrict = Op->isStrictFPOpcode();
  const unsigned SrcOpIdx = IsStrict ? 1 : 0;

  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(SrcOpIdx);
  const MVT SVT = In.getSimpleValueType();

  // Extensions producing f128 are done by a library call.
  if (VT == MVT::f128)
    return LowerF128Call(Op, DAG, RTLIB::getFPEXT(SVT, VT));

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  // Widen the source to v4f32; the upper two elements are undef.
  SDValue UndefHalf = DAG.getUNDEF(SVT);
  SDValue Wide =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, UndefHalf);

  if (!IsStrict)
    return DAG.getNode(X86ISD::VFPEXT, DL, VT, Wide);

  return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
                     {Op->getOperand(0), Wide});
}

21262

21263

/// Custom lowering for FP_ROUND and its strict variant.
///
/// The node is returned unchanged unless the source type is f128, in which
/// case it is replaced by the matching FPROUND library call.
///
/// \param Op  The node to lower; for strict nodes operand 0 is the chain and
///            operand 1 the source, otherwise operand 0 is the source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns \p Op, the libcall result, or (for strict nodes) the merged
///          {result, chain} pair of the libcall.
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  const bool IsStrict = Op->isStrictFPOpcode();

  const MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  const MVT SVT = In.getSimpleValueType();

  // It's legal except when f128 is involved
  if (SVT != MVT::f128)
    return Op;

  const RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);

  // FP_ROUND node has a second operand indicating whether it is known to be
  // precise. That doesn't take part in the LibCall so we can't directly use
  // LowerF128Call.

  SDLoc dl(Op);
  MakeLibCallOptions CallOptions;

  // Only strict nodes carry an incoming chain to thread through the call.
  SDValue InChain;
  if (IsStrict)
    InChain = Op.getOperand(0);

  std::pair<SDValue, SDValue> CallResult =
      makeLibCall(DAG, LC, VT, In, CallOptions, dl, InChain);

  if (!IsStrict)
    return CallResult.first;

  return DAG.getMergeValues({ CallResult.first, CallResult.second }, dl);
}

21291

21292

/// Lower FP16_TO_FP (i16 -> f32), including the strict variant.
///
/// The i16 source is inserted into element 0 of an all-zero v8i16, converted
/// to v4f32 with X86ISD::CVTPH2PS (or X86ISD::STRICT_CVTPH2PS), and element 0
/// of the result is extracted as the f32 value.
///
/// \param Op  The node to lower; for strict nodes operand 0 is the chain and
///            operand 1 the source, otherwise operand 0 is the source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The f32 value, merged with the output chain for strict nodes.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
  const bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
         "Unexpected VT!");

  SDLoc dl(Op);

  // Place the half-precision bits in lane 0 of a zeroed v8i16.
  SDValue HalfVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
                                DAG.getConstant(0, dl, MVT::v8i16), Src,
                                DAG.getIntPtrConstant(0, dl));

  if (!IsStrict) {
    SDValue FloatVec = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, HalfVec);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, FloatVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // The strict conversion consumes the incoming chain and yields a new one.
  SDValue FloatVec =
      DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
                  {Op.getOperand(0), HalfVec});
  SDValue OutChain = FloatVec.getValue(1);

  SDValue Scalar = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, FloatVec,
                               DAG.getIntPtrConstant(0, dl));
  return DAG.getMergeValues({Scalar, OutChain}, dl);
}

21320

21321

/// Lower FP_TO_FP16 (f32 -> i16), including the strict variant.
///
/// The f32 source is placed in element 0 of a v4f32, converted to v8i16 with
/// X86ISD::CVTPS2PH (or X86ISD::STRICT_CVTPS2PH) using an immediate of 4, and
/// element 0 of the result is extracted as the i16 value. Strict nodes use a
/// zero vector for the remaining elements; non-strict nodes use
/// SCALAR_TO_VECTOR.
///
/// \param Op  The node to lower; for strict nodes operand 0 is the chain and
///            operand 1 the source, otherwise operand 0 is the source.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The i16 value, merged with the output chain for strict nodes.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
  const bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
         "Unexpected VT!");

  SDLoc dl(Op);

  if (!IsStrict) {
    // FIXME: Should we use zeros for upper elements for non-strict?
    SDValue FloatVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
    SDValue HalfVec = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, FloatVec,
                                  DAG.getTargetConstant(4, dl, MVT::i32));
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, HalfVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Strict: insert the source into lane 0 of a zero vector.
  SDValue FloatVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
                                 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
                                 DAG.getIntPtrConstant(0, dl));
  SDValue HalfVec = DAG.getNode(
      X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
      {Op.getOperand(0), FloatVec, DAG.getTargetConstant(4, dl, MVT::i32)});
  SDValue OutChain = HalfVec.getValue(1);

  SDValue Scalar = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, HalfVec,
                               DAG.getIntPtrConstant(0, dl));
  return DAG.getMergeValues({Scalar, OutChain}, dl);
}

21352

21353

/// Depending on uarch and/or optimizing for size, we might prefer to use a

21354

/// vector operation in place of the typical scalar operation.

21355

static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,

21356

const X86Subtarget &Subtarget) {

21357

// If both operands have other uses, this is probably not profitable.

21358

SDValue LHS = Op.getOperand(0);

21359

SDValue RHS = Op.getOperand(1);

21360

if (!LHS.hasOneUse() && !RHS.hasOneUse())

21361

return Op;

21362

21363

// FP horizontal add/sub were added with SSE3. Integer with SSSE3.

21364

bool IsFP = Op.getSimpleValueType().isFloatingPoint();

21365

if (IsFP && !Subtarget.hasSSE3())

21366

return Op;

21367

if (!IsFP && !Subtarget.hasSSSE3())

21368

return Op;

21369

21370

// Extract from a common vector.

21371

if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

21372

RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

21373

LHS.getOperand(0) != RHS.getOperand(0) ||

21374

!isa<ConstantSDNode>(LHS.getOperand(1)) ||

21375

!isa<ConstantSDNode>(RHS.getOperand(1)) ||

21376

!shouldUseHorizontalOp(true, DAG, Subtarget))

21377

return Op;

21378

21379

// Allow commuted 'hadd' ops.

21380

// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?

21381

unsigned HOpcode;

21382

switch (Op.getOpcode()) {

21383

case ISD::ADD: HOpcode = X86ISD::HADD; break;

21384

case ISD::SUB: HOpcode = X86ISD::HSUB; break;

21385

case ISD::FADD: HOpcode = X86ISD::FHADD; break;

21386

case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;

21387

default:

21388

llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21388);

21389

}

21390

unsigned LExtIndex = LHS.getConstantOperandVal(1);

21391

unsigned RExtIndex = RHS.getConstantOperandVal(1);

21392

if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&

21393

(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))

21394

std::swap(LExtIndex, RExtIndex);

21395

21396

if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))

21397

return Op;

21398

21399

SDValue X = LHS.getOperand(0);

21400

EVT VecVT = X.getValueType();

21401

unsigned BitWidth = VecVT.getSizeInBits();

21402

unsigned NumLanes = BitWidth / 128;

21403

unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;

21404

assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&(((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here") ? static_cast<
void> (0) : __assert_fail ("(BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && \"Not expecting illegal vector widths here\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21405, __PRETTY_FUNCTION__))

21405

"Not expecting illegal vector widths here")(((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here") ? static_cast<
void> (0) : __assert_fail ("(BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && \"Not expecting illegal vector widths here\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21405, __PRETTY_FUNCTION__));

21406

21407

// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit

21408

// equivalent, so extract the 256/512-bit source op to 128-bit if we can.

21409

SDLoc DL(Op);

21410

if (BitWidth == 256 || BitWidth == 512) {

21411

unsigned LaneIdx = LExtIndex / NumEltsPerLane;

21412

X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);

21413

LExtIndex %= NumEltsPerLane;

21414

}

21415

21416

// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0

21417

// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0

21418

// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1

21419

// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0

21420

SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);

21421

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,

21422

DAG.getIntPtrConstant(LExtIndex / 2, DL));

21423

}

21424

21425

/// Depending on uarch and/or optimizing for size, we might prefer to use a

21426

/// vector operation in place of the typical scalar operation.

21427

SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {

21428

assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&(((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::
f64) && "Only expecting float/double") ? static_cast<
void> (0) : __assert_fail ("(Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && \"Only expecting float/double\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21429, __PRETTY_FUNCTION__))

21429

"Only expecting float/double")(((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::
f64) && "Only expecting float/double") ? static_cast<
void> (0) : __assert_fail ("(Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && \"Only expecting float/double\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21429, __PRETTY_FUNCTION__));

21430

return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);

21431

}

21432

21433

/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.

21434

/// This mode isn't supported in hardware on X86. But as long as we aren't

21435

/// compiling with trapping math, we can emulate this with

21436

/// floor(X + copysign(nextafter(0.5, 0.0), X)).

21437

static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {

21438

SDValue N0 = Op.getOperand(0);

21439

SDLoc dl(Op);

21440

MVT VT = Op.getSimpleValueType();

21441

21442

// N0 += copysign(nextafter(0.5, 0.0), N0)

21443

const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

21444

bool Ignored;

21445

APFloat Point5Pred = APFloat(0.5f);

21446

Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);

21447

Point5Pred.next(/*nextDown*/true);

21448

21449

SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,

21450

DAG.getConstantFP(Point5Pred, dl, VT), N0);

21451

N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);

21452

21453

// Truncate the result to remove fraction.

21454

return DAG.getNode(ISD::FTRUNC, dl, VT, N0);

21455

}

21456

21457

/// The only differences between FABS and FNEG are the mask and the logic op.

21458

/// FNEG also has a folding opportunity for FNEG(FABS(x)).

21459

static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {

21460

assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&(((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG
) && "Wrong opcode for lowering FABS or FNEG.") ? static_cast
<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && \"Wrong opcode for lowering FABS or FNEG.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21461, __PRETTY_FUNCTION__))

21461

"Wrong opcode for lowering FABS or FNEG.")(((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG
) && "Wrong opcode for lowering FABS or FNEG.") ? static_cast
<void> (0) : __assert_fail ("(Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && \"Wrong opcode for lowering FABS or FNEG.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21461, __PRETTY_FUNCTION__));

21462

21463

bool IsFABS = (Op.getOpcode() == ISD::FABS);

21464

21465

// If this is a FABS and it has an FNEG user, bail out to fold the combination

21466

// into an FNABS. We'll lower the FABS after that if it is still in use.

21467

if (IsFABS)

21468

for (SDNode *User : Op->uses())

21469

if (User->getOpcode() == ISD::FNEG)

21470

return Op;

21471

21472

SDLoc dl(Op);

21473

MVT VT = Op.getSimpleValueType();

21474

21475

bool IsF128 = (VT == MVT::f128);

21476

assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFABSorFNEG\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21479, __PRETTY_FUNCTION__))

21477

VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFABSorFNEG\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21479, __PRETTY_FUNCTION__))

21478

VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFABSorFNEG\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21479, __PRETTY_FUNCTION__))

21479

"Unexpected type in LowerFABSorFNEG")(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFABSorFNEG\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21479, __PRETTY_FUNCTION__));

21480

21481

// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to

21482

// decide if we should generate a 16-byte constant mask when we only need 4 or

21483

// 8 bytes for the scalar case.

21484

21485

// There are no scalar bitwise logical SSE/AVX instructions, so we

21486

// generate a 16-byte vector constant and logic op even for the scalar case.

21487

// Using a 16-byte mask allows folding the load of the mask with

21488

// the logic op, so it can save (~4 bytes) on code size.

21489

bool IsFakeVector = !VT.isVector() && !IsF128;

21490

MVT LogicVT = VT;

21491

if (IsFakeVector)

21492

LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

21493

21494

unsigned EltBits = VT.getScalarSizeInBits();

21495

// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...

21496

APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :

21497

APInt::getSignMask(EltBits);

21498

const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

21499

SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

21500

21501

SDValue Op0 = Op.getOperand(0);

21502

bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);

21503

unsigned LogicOp = IsFABS ? X86ISD::FAND :

21504

IsFNABS ? X86ISD::FOR :

21505

X86ISD::FXOR;

21506

SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

21507

21508

if (VT.isVector() || IsF128)

21509

return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

21510

21511

// For the scalar case extend to a 128-bit vector, perform the logic op,

21512

// and extract the scalar result back out.

21513

Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);

21514

SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

21515

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,

21516

DAG.getIntPtrConstant(0, dl));

21517

}

21518

21519

static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {

21520

SDValue Mag = Op.getOperand(0);

21521

SDValue Sign = Op.getOperand(1);

21522

SDLoc dl(Op);

21523

21524

// If the sign operand is smaller, extend it first.

21525

MVT VT = Op.getSimpleValueType();

21526

if (Sign.getSimpleValueType().bitsLT(VT))

21527

Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

21528

21529

// And if it is bigger, shrink it first.

21530

if (Sign.getSimpleValueType().bitsGT(VT))

21531

Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

21532

21533

// At this point the operands and the result should have the same

21534

// type, and that won't be f80 since that is not custom lowered.

21535

bool IsF128 = (VT == MVT::f128);

21536

assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFCOPYSIGN\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21539, __PRETTY_FUNCTION__))

21537

VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFCOPYSIGN\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21539, __PRETTY_FUNCTION__))

21538

VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFCOPYSIGN\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21539, __PRETTY_FUNCTION__))

21539

"Unexpected type in LowerFCOPYSIGN")(((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT ==
MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT
::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"
) ? static_cast<void> (0) : __assert_fail ("(VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && \"Unexpected type in LowerFCOPYSIGN\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21539, __PRETTY_FUNCTION__));

21540

21541

const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

21542

21543

// Perform all scalar logic operations as 16-byte vectors because there are no

21544

// scalar FP logic instructions in SSE.

21545

// TODO: This isn't necessary. If we used scalar types, we might avoid some

21546

// unnecessary splats, but we might miss load folding opportunities. Should

21547

// this decision be based on OptimizeForSize?

21548

bool IsFakeVector = !VT.isVector() && !IsF128;

21549

MVT LogicVT = VT;

21550

if (IsFakeVector)

21551

LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

21552

21553

// The mask constants are automatically splatted for vector types.

21554

unsigned EltSizeInBits = VT.getScalarSizeInBits();

21555

SDValue SignMask = DAG.getConstantFP(

21556

APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

21557

SDValue MagMask = DAG.getConstantFP(

21558

APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);

21559

21560

// First, clear all bits but the sign bit from the second operand (sign).

21561

if (IsFakeVector)

21562

Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);

21563

SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

21564

21565

// Next, clear the sign bit from the first operand (magnitude).

21566

// TODO: If we had general constant folding for FP logic ops, this check

21567

// wouldn't be necessary.

21568

SDValue MagBits;

21569

if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {

21570

APFloat APF = Op0CN->getValueAPF();

21571

APF.clearSign();

21572

MagBits = DAG.getConstantFP(APF, dl, LogicVT);

21573

} else {

21574

// If the magnitude operand wasn't a constant, we need to AND out the sign.

21575

if (IsFakeVector)

21576

Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);

21577

MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);

21578

}

21579

21580

// OR the magnitude value with the sign bit.

21581

SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);

21582

return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,

21583

DAG.getIntPtrConstant(0, dl));

21584

}

21585

21586

/// Lower ISD::FGETSIGN of a scalar f32/f64 operand to
/// (AND (X86ISD::MOVMSK (SCALAR_TO_VECTOR x)), 1).
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);
  MVT ResVT = Op.getSimpleValueType();
  MVT SrcVT = Src.getSimpleValueType();

  assert((SrcVT == MVT::f32 || SrcVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Put the scalar into a 128-bit vector of the matching element type so
  // that MOVMSK can be applied to it.
  MVT VecVT = MVT::v2f64;
  if (SrcVT == MVT::f32)
    VecVT = MVT::v4f32;

  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Src);
  SDValue SignBits = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Vec);

  // Resize the i32 mask to the result type and keep only bit 0.
  SDValue Resized = DAG.getZExtOrTrunc(SignBits, DL, ResVT);
  return DAG.getNode(ISD::AND, DL, ResVT, Resized,
                     DAG.getConstant(1, DL, ResVT));
}

21603

21604

/// Helper for creating a X86ISD::SETCC node.

21605

static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,

21606

SelectionDAG &DAG) {

21607

return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,

21608

DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);

21609

}

21610

21611

/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))

21612

/// style scalarized (associative) reduction patterns. Partial reductions

21613

/// are supported when the pointer SrcMask is non-null.

21614

/// TODO - move this to SelectionDAG?

21615

static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,

21616

SmallVectorImpl<SDValue> &SrcOps,

21617

SmallVectorImpl<APInt> *SrcMask = nullptr) {

21618

SmallVector<SDValue, 8> Opnds;

21619

DenseMap<SDValue, APInt> SrcOpMap;

21620

EVT VT = MVT::Other;

21621

21622

// Recognize a special case where a vector is casted into wide integer to

21623

// test all 0s.

21624

assert(Op.getOpcode() == unsigned(BinOp) &&((Op.getOpcode() == unsigned(BinOp) && "Unexpected bit reduction opcode"
) ? static_cast<void> (0) : __assert_fail ("Op.getOpcode() == unsigned(BinOp) && \"Unexpected bit reduction opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21625, __PRETTY_FUNCTION__))

21625

"Unexpected bit reduction opcode")((Op.getOpcode() == unsigned(BinOp) && "Unexpected bit reduction opcode"
) ? static_cast<void> (0) : __assert_fail ("Op.getOpcode() == unsigned(BinOp) && \"Unexpected bit reduction opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21625, __PRETTY_FUNCTION__));

21626

Opnds.push_back(Op.getOperand(0));

21627

Opnds.push_back(Op.getOperand(1));

21628

21629

for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {

21630

SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;

21631

// BFS traverse all BinOp operands.

21632

if (I->getOpcode() == unsigned(BinOp)) {

21633

Opnds.push_back(I->getOperand(0));

21634

Opnds.push_back(I->getOperand(1));

21635

// Re-evaluate the number of nodes to be traversed.

21636

e += 2; // 2 more nodes (LHS and RHS) are pushed.

21637

continue;

21638

}

21639

21640

// Quit if a non-EXTRACT_VECTOR_ELT

21641

if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)

21642

return false;

21643

21644

// Quit if without a constant index.

21645

auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));

21646

if (!Idx)

21647

return false;

21648

21649

SDValue Src = I->getOperand(0);

21650

DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);

21651

if (M == SrcOpMap.end()) {

21652

VT = Src.getValueType();

21653

// Quit if not the same type.

21654

if (SrcOpMap.begin() != SrcOpMap.end() &&

21655

VT != SrcOpMap.begin()->first.getValueType())

21656

return false;

21657

unsigned NumElts = VT.getVectorNumElements();

21658

APInt EltCount = APInt::getNullValue(NumElts);

21659

M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;

21660

SrcOps.push_back(Src);

21661

}

21662

21663

// Quit if element already used.

21664

unsigned CIdx = Idx->getZExtValue();

21665

if (M->second[CIdx])

21666

return false;

21667

M->second.setBit(CIdx);

21668

}

21669

21670

if (SrcMask) {

21671

// Collect the source partial masks.

21672

for (SDValue &SrcOp : SrcOps)

21673

SrcMask->push_back(SrcOpMap[SrcOp]);

21674

} else {

21675

// Quit if not all elements are used.

21676

for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),

21677

E = SrcOpMap.end();

21678

I != E; ++I) {

21679

if (!I->second.isAllOnesValue())

21680

return false;

21681

}

21682

}

21683

21684

return true;

21685

}

21686

21687

// Helper function for comparing all bits of a vector against zero.

21688

static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,

21689

const APInt &Mask,

21690

const X86Subtarget &Subtarget,

21691

SelectionDAG &DAG, X86::CondCode &X86CC) {

21692

EVT VT = V.getValueType();

21693

assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&((Mask.getBitWidth() == VT.getScalarSizeInBits() && "Element Mask vs Vector bitwidth mismatch"
) ? static_cast<void> (0) : __assert_fail ("Mask.getBitWidth() == VT.getScalarSizeInBits() && \"Element Mask vs Vector bitwidth mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21694, __PRETTY_FUNCTION__))

21694

"Element Mask vs Vector bitwidth mismatch")((Mask.getBitWidth() == VT.getScalarSizeInBits() && "Element Mask vs Vector bitwidth mismatch"
) ? static_cast<void> (0) : __assert_fail ("Mask.getBitWidth() == VT.getScalarSizeInBits() && \"Element Mask vs Vector bitwidth mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21694, __PRETTY_FUNCTION__));

21695

21696

assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode")(((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"
) ? static_cast<void> (0) : __assert_fail ("(CC == ISD::SETEQ || CC == ISD::SETNE) && \"Unsupported ISD::CondCode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21696, __PRETTY_FUNCTION__));

21697

X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);

21698

21699

auto MaskBits = [&](SDValue Src) {

21700

if (Mask.isAllOnesValue())

21701

return Src;

21702

EVT SrcVT = Src.getValueType();

21703

SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);

21704

return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);

21705

};

21706

21707

// For sub-128-bit vector, cast to (legal) integer and compare with zero.

21708

if (VT.getSizeInBits() < 128) {

21709

EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

21710

if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))

21711

return SDValue();

21712

return DAG.getNode(X86ISD::CMP, DL, MVT::i32,

21713

DAG.getBitcast(IntVT, MaskBits(V)),

21714

DAG.getConstant(0, DL, IntVT));

21715

}

21716

21717

// Quit if not splittable to 128/256-bit vector.

21718

if (!isPowerOf2_32(VT.getSizeInBits()))

21719

return SDValue();

21720

21721

// Split down to 128/256-bit vector.

21722

unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;

21723

while (VT.getSizeInBits() > TestSize) {

21724

auto Split = DAG.SplitVector(V, DL);

21725

VT = Split.first.getValueType();

21726

V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);

21727

}

21728

21729

bool UsePTEST = Subtarget.hasSSE41();

21730

if (UsePTEST) {

21731

MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

21732

V = DAG.getBitcast(TestVT, MaskBits(V));

21733

return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);

21734

}

21735

21736

// Without PTEST, a masked v2i64 or-reduction is not faster than

21737

// scalarization.

21738

if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)

21739

return SDValue();

21740

21741

V = DAG.getBitcast(MVT::v16i8, MaskBits(V));

21742

V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,

21743

getZeroVector(MVT::v16i8, Subtarget, DAG, DL));

21744

V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);

21745

return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,

21746

DAG.getConstant(0xFFFF, DL, MVT::i32));

21747

}

21748

21749

// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to

21750

// CMP(MOVMSK(PCMPEQB(X,0))).

21751

static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,

21752

const SDLoc &DL,

21753

const X86Subtarget &Subtarget,

21754

SelectionDAG &DAG, SDValue &X86CC) {

21755

assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode")(((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"
) ? static_cast<void> (0) : __assert_fail ("(CC == ISD::SETEQ || CC == ISD::SETNE) && \"Unsupported ISD::CondCode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21755, __PRETTY_FUNCTION__));

21756

21757

if (!Subtarget.hasSSE2() || !Op->hasOneUse())

21758

return SDValue();

21759

21760

// Check whether we're masking/truncating an OR-reduction result, in which

21761

// case track the masked bits.

21762

APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());

21763

switch (Op.getOpcode()) {

21764

case ISD::TRUNCATE: {

21765

SDValue Src = Op.getOperand(0);

21766

Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),

21767

Op.getScalarValueSizeInBits());

21768

Op = Src;

21769

break;

21770

}

21771

case ISD::AND: {

21772

if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {

21773

Mask = Cst->getAPIntValue();

21774

Op = Op.getOperand(0);

21775

}

21776

break;

21777

}

21778

}

21779

21780

SmallVector<SDValue, 8> VecIns;

21781

if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {

21782

EVT VT = VecIns[0].getValueType();

21783

assert(llvm::all_of(VecIns,((llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType
(); }) && "Reduction source vector mismatch") ? static_cast
<void> (0) : __assert_fail ("llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType(); }) && \"Reduction source vector mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21785, __PRETTY_FUNCTION__))

21784

[VT](SDValue V) { return VT == V.getValueType(); }) &&((llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType
(); }) && "Reduction source vector mismatch") ? static_cast
<void> (0) : __assert_fail ("llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType(); }) && \"Reduction source vector mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21785, __PRETTY_FUNCTION__))

21785

"Reduction source vector mismatch")((llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType
(); }) && "Reduction source vector mismatch") ? static_cast
<void> (0) : __assert_fail ("llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType(); }) && \"Reduction source vector mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21785, __PRETTY_FUNCTION__));

21786

21787

// Quit if less than 128-bits or not splittable to 128/256-bit vector.

21788

if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))

21789

return SDValue();

21790

21791

// If more than one full vector is evaluated, OR them first before PTEST.

21792

for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;

21793

Slot += 2, e += 1) {

21794

// Each iteration will OR 2 nodes and append the result until there is

21795

// only 1 node left, i.e. the final OR'd value of all vectors.

21796

SDValue LHS = VecIns[Slot];

21797

SDValue RHS = VecIns[Slot + 1];

21798

VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));

21799

}

21800

21801

X86::CondCode CCode;

21802

if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,

21803

DAG, CCode)) {

21804

X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);

21805

return V;

21806

}

21807

}

21808

21809

if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

21810

ISD::NodeType BinOp;

21811

if (SDValue Match =

21812

DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {

21813

X86::CondCode CCode;

21814

if (SDValue V =

21815

LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {

21816

X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);

21817

return V;

21818

}

21819

}

21820

}

21821

21822

return SDValue();

21823

}

21824

21825

/// return true if \c Op has a use that doesn't just read flags.

21826

static bool hasNonFlagsUse(SDValue Op) {

21827

for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;

21828

++UI) {

21829

SDNode *User = *UI;

21830

unsigned UOpNo = UI.getOperandNo();

21831

if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {

21832

// Look pass truncate.

21833

UOpNo = User->use_begin().getOperandNo();

21834

User = *User->use_begin();

21835

}

21836

21837

if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&

21838

!(User->getOpcode() == ISD::SELECT && UOpNo == 0))

21839

return true;

21840

}

21841

return false;

21842

}

21843

21844

// Transform to an x86-specific ALU node with flags if there is a chance of

21845

// using an RMW op or only the flags are used. Otherwise, leave

21846

// the node alone and emit a 'cmp' or 'test' instruction.

21847

static bool isProfitableToUseFlagOp(SDValue Op) {

21848

for (SDNode *U : Op->uses())

21849

if (U->getOpcode() != ISD::CopyToReg &&

21850

U->getOpcode() != ISD::SETCC &&

21851

U->getOpcode() != ISD::STORE)

21852

return false;

21853

21854

return true;

21855

}

21856

21857

/// Emit nodes that will be selected as "test Op0,Op0", or something

21858

/// equivalent.

21859

static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,

21860

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

21861

// CF and OF aren't always set the way we want. Determine which

21862

// of these we need.

21863

bool NeedCF = false;

21864

bool NeedOF = false;

21865

switch (X86CC) {

21866

default: break;

21867

case X86::COND_A: case X86::COND_AE:

21868

case X86::COND_B: case X86::COND_BE:

21869

NeedCF = true;

21870

break;

21871

case X86::COND_G: case X86::COND_GE:

21872

case X86::COND_L: case X86::COND_LE:

21873

case X86::COND_O: case X86::COND_NO: {

21874

// Check if we really need to set the

21875

// Overflow flag. If NoSignedWrap is present

21876

// that is not actually needed.

21877

switch (Op->getOpcode()) {

21878

case ISD::ADD:

21879

case ISD::SUB:

21880

case ISD::MUL:

21881

case ISD::SHL:

21882

if (Op.getNode()->getFlags().hasNoSignedWrap())

21883

break;

21884

LLVM_FALLTHROUGH[[gnu::fallthrough]];

21885

default:

21886

NeedOF = true;

21887

break;

21888

}

21889

break;

21890

}

21891

}

21892

// See if we can use the EFLAGS value from the operand instead of

21893

// doing a separate TEST. TEST always sets OF and CF to 0, so unless

21894

// we prove that the arithmetic won't overflow, we can't use OF or CF.

21895

if (Op.getResNo() != 0 || NeedOF || NeedCF) {

21896

// Emit a CMP with 0, which is the TEST pattern.

21897

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,

21898

DAG.getConstant(0, dl, Op.getValueType()));

21899

}

21900

unsigned Opcode = 0;

21901

unsigned NumOperands = 0;

21902

21903

SDValue ArithOp = Op;

21904

21905

// NOTICE: In the code below we use ArithOp to hold the arithmetic operation

21906

// which may be the result of a CAST. We use the variable 'Op', which is the

21907

// non-casted variable when we check for possible users.

21908

switch (ArithOp.getOpcode()) {

21909

case ISD::AND:

21910

// If the primary 'and' result isn't used, don't bother using X86ISD::AND,

21911

// because a TEST instruction will be better.

21912

if (!hasNonFlagsUse(Op))

21913

break;

21914

21915

LLVM_FALLTHROUGH[[gnu::fallthrough]];

21916

case ISD::ADD:

21917

case ISD::SUB:

21918

case ISD::OR:

21919

case ISD::XOR:

21920

if (!isProfitableToUseFlagOp(Op))

21921

break;

21922

21923

// Otherwise use a regular EFLAGS-setting instruction.

21924

switch (ArithOp.getOpcode()) {

21925

default: llvm_unreachable("unexpected operator!")::llvm::llvm_unreachable_internal("unexpected operator!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21925);

21926

case ISD::ADD: Opcode = X86ISD::ADD; break;

21927

case ISD::SUB: Opcode = X86ISD::SUB; break;

21928

case ISD::XOR: Opcode = X86ISD::XOR; break;

21929

case ISD::AND: Opcode = X86ISD::AND; break;

21930

case ISD::OR: Opcode = X86ISD::OR; break;

21931

}

21932

21933

NumOperands = 2;

21934

break;

21935

case X86ISD::ADD:

21936

case X86ISD::SUB:

21937

case X86ISD::OR:

21938

case X86ISD::XOR:

21939

case X86ISD::AND:

21940

return SDValue(Op.getNode(), 1);

21941

case ISD::SSUBO:

21942

case ISD::USUBO: {

21943

// /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.

21944

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

21945

return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),

21946

Op->getOperand(1)).getValue(1);

21947

}

21948

default:

21949

break;

21950

}

21951

21952

if (Opcode == 0) {

21953

// Emit a CMP with 0, which is the TEST pattern.

21954

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,

21955

DAG.getConstant(0, dl, Op.getValueType()));

21956

}

21957

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

21958

SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

21959

21960

SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);

21961

DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);

21962

return SDValue(New.getNode(), 1);

21963

}

21964

21965

/// Emit nodes that will be selected as "cmp Op0,Op1", or something

21966

/// equivalent.

21967

static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,

21968

const SDLoc &dl, SelectionDAG &DAG,

21969

const X86Subtarget &Subtarget) {

21970

if (isNullConstant(Op1))

21971

return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

21972

21973

EVT CmpVT = Op0.getValueType();

21974

21975

assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||(((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32
|| CmpVT == MVT::i64) && "Unexpected VT!") ? static_cast
<void> (0) : __assert_fail ("(CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21976, __PRETTY_FUNCTION__))

21976

CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!")(((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32
|| CmpVT == MVT::i64) && "Unexpected VT!") ? static_cast
<void> (0) : __assert_fail ("(CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21976, __PRETTY_FUNCTION__));

21977

21978

// Only promote the compare up to I32 if it is a 16 bit operation

21979

// with an immediate. 16 bit immediates are to be avoided.

21980

if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&

21981

!DAG.getMachineFunction().getFunction().hasMinSize()) {

21982

ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);

21983

ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);

21984

// Don't do this if the immediate can fit in 8-bits.

21985

if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||

21986

(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {

21987

unsigned ExtendOp =

21988

isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

21989

if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {

21990

// For equality comparisons try to use SIGN_EXTEND if the input was

21991

// truncate from something with enough sign bits.

21992

if (Op0.getOpcode() == ISD::TRUNCATE) {

21993

SDValue In = Op0.getOperand(0);

21994

unsigned EffBits =

21995

In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;

21996

if (EffBits <= 16)

21997

ExtendOp = ISD::SIGN_EXTEND;

21998

} else if (Op1.getOpcode() == ISD::TRUNCATE) {

21999

SDValue In = Op1.getOperand(0);

22000

unsigned EffBits =

22001

In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;

22002

if (EffBits <= 16)

22003

ExtendOp = ISD::SIGN_EXTEND;

22004

}

22005

}

22006

22007

CmpVT = MVT::i32;

22008

Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);

22009

Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);

22010

}

22011

}

22012

22013

// Try to shrink i64 compares if the input has enough zero bits.

22014

// FIXME: Do this for non-constant compares for constant on LHS?

22015

if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&

22016

Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.

22017

cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&

22018

DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {

22019

CmpVT = MVT::i32;

22020

Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);

22021

Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);

22022

}

22023

22024

// 0-x == y --> x+y == 0

22025

// 0-x != y --> x+y != 0

22026

if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&

22027

Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {

22028

SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);

22029

SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);

22030

return Add.getValue(1);

22031

}

22032

22033

// x == 0-y --> x+y == 0

22034

// x != 0-y --> x+y != 0

22035

if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&

22036

Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {

22037

SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);

22038

SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));

22039

return Add.getValue(1);

22040

}

22041

22042

// Use SUB instead of CMP to enable CSE between SUB and CMP.

22043

SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);

22044

SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);

22045

return Sub.getValue(1);

22046

}

22047

22048

/// Check if replacement of SQRT with RSQRT should be disabled.

22049

bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {

22050

EVT VT = Op.getValueType();

22051

22052

// We never want to use both SQRT and RSQRT instructions for the same input.

22053

if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))

22054

return false;

22055

22056

if (VT.isVector())

22057

return Subtarget.hasFastVectorFSQRT();

22058

return Subtarget.hasFastScalarFSQRT();

22059

}

22060

22061

/// The minimum architected relative accuracy is 2^-12. We need one

22062

/// Newton-Raphson step to have a good float result (24 bits of precision).

22063

SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,

22064

SelectionDAG &DAG, int Enabled,

22065

int &RefinementSteps,

22066

bool &UseOneConstNR,

22067

bool Reciprocal) const {

22068

EVT VT = Op.getValueType();

22069

22070

// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.

22071

// It is likely not profitable to do this for f64 because a double-precision

22072

// rsqrt estimate with refinement on x86 prior to FMA requires at least 16

22073

// instructions: convert to single, rsqrtss, convert back to double, refine

22074

// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA

22075

// along with FMA, this could be a throughput win.

22076

// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32

22077

// after legalize types.

22078

if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||

22079

(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||

22080

(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||

22081

(VT == MVT::v8f32 && Subtarget.hasAVX()) ||

22082

(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {

22083

if (RefinementSteps == ReciprocalEstimate::Unspecified)

22084

RefinementSteps = 1;

22085

22086

UseOneConstNR = false;

22087

// There is no FSQRT for 512-bits, but there is RSQRT14.

22088

unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;

22089

return DAG.getNode(Opcode, SDLoc(Op), VT, Op);

22090

}

22091

return SDValue();

22092

}

22093

22094

/// The minimum architected relative accuracy is 2^-12. We need one

22095

/// Newton-Raphson step to have a good float result (24 bits of precision).

22096

SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,

22097

int Enabled,

22098

int &RefinementSteps) const {

22099

EVT VT = Op.getValueType();

22100

22101

// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.

22102

// It is likely not profitable to do this for f64 because a double-precision

22103

// reciprocal estimate with refinement on x86 prior to FMA requires

22104

// 15 instructions: convert to single, rcpss, convert back to double, refine

22105

// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA

22106

// along with FMA, this could be a throughput win.

22107

22108

if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||

22109

(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||

22110

(VT == MVT::v8f32 && Subtarget.hasAVX()) ||

22111

(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {

22112

// Enable estimate codegen with 1 refinement step for vector division.

22113

// Scalar division estimates are disabled because they break too much

22114

// real-world code. These defaults are intended to match GCC behavior.

22115

if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)

22116

return SDValue();

22117

22118

if (RefinementSteps == ReciprocalEstimate::Unspecified)

22119

RefinementSteps = 1;

22120

22121

// There is no FSQRT for 512-bits, but there is RCP14.

22122

unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;

22123

return DAG.getNode(Opcode, SDLoc(Op), VT, Op);

22124

}

22125

return SDValue();

22126

}

22127

22128

/// If we have at least two divisions that use the same divisor, convert to

22129

/// multiplication by a reciprocal. This may need to be adjusted for a given

22130

/// CPU if a division's cost is not at least twice the cost of a multiplication.

22131

/// This is because we still need one division to calculate the reciprocal and

22132

/// then we need two multiplies by that reciprocal as replacements for the

22133

/// original divisions.

22134

unsigned X86TargetLowering::combineRepeatedFPDivisors() const {

22135

return 2;

22136

}

22137

22138

/// Expand a signed division of N's first operand by the power-of-two (or
/// negated power-of-two) constant \p Divisor into compare/add/select/shift
/// nodes.
///
/// Returns SDValue(N, 0) when integer division is considered cheap, an empty
/// SDValue when this expansion is declined, and otherwise the replacement
/// value. Newly built intermediate nodes are appended to \p Created.
SDValue
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  EVT VT = N->getValueType(0);

  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(VT, Attr))
    return SDValue(N, 0); // Lower SDIV as SDIV

  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
         "Unexpected divisor!");

  // Only perform this transform if CMOV is supported otherwise the select
  // below will become a branch.
  if (!Subtarget.hasCMov())
    return SDValue();

  // fold (sdiv X, pow2)
  // FIXME: Support i8.
  const bool IsSupportedVT = VT == MVT::i16 || VT == MVT::i32 ||
                             (VT == MVT::i64 && Subtarget.is64Bit());
  if (!IsSupportedVT)
    return SDValue();

  const unsigned Lg2 = Divisor.countTrailingZeros();

  // If the divisor is 2 or -2, the default expansion is better.
  if (Lg2 == 1)
    return SDValue();

  SDLoc DL(N);
  SDValue Dividend = N->getOperand(0);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  APInt LowMask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
  SDValue Pow2MinusOne = DAG.getConstant(LowMask, DL, VT);

  // If the dividend is negative, we need to add (Pow2 - 1) to it before
  // shifting right.
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i8, Dividend, Zero, ISD::SETLT);
  SDValue Biased = DAG.getNode(ISD::ADD, DL, VT, Dividend, Pow2MinusOne);
  SDValue Selected = DAG.getNode(ISD::SELECT, DL, VT, IsNeg, Biased, Dividend);

  Created.push_back(IsNeg.getNode());
  Created.push_back(Biased.getNode());
  Created.push_back(Selected.getNode());

  // Divide by pow2.
  SDValue Shifted = DAG.getNode(ISD::SRA, DL, VT, Selected,
                                DAG.getConstant(Lg2, DL, MVT::i8));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (!Divisor.isNegative())
    return Shifted;

  Created.push_back(Shifted.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, Zero, Shifted);
}

22194

22195

/// Result of 'and' is compared against zero. Change to a BT node if possible.

22196

/// Returns the BT node and the condition code needed to use it.

22197

static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,

22198

const SDLoc &dl, SelectionDAG &DAG,

22199

SDValue &X86CC) {

22200

assert(And.getOpcode() == ISD::AND && "Expected AND node!")((And.getOpcode() == ISD::AND && "Expected AND node!"
) ? static_cast<void> (0) : __assert_fail ("And.getOpcode() == ISD::AND && \"Expected AND node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22200, __PRETTY_FUNCTION__));

22201

SDValue Op0 = And.getOperand(0);

22202

SDValue Op1 = And.getOperand(1);

22203

if (Op0.getOpcode() == ISD::TRUNCATE)

22204

Op0 = Op0.getOperand(0);

22205

if (Op1.getOpcode() == ISD::TRUNCATE)

22206

Op1 = Op1.getOperand(0);

22207

22208

SDValue Src, BitNo;

22209

if (Op1.getOpcode() == ISD::SHL)

22210

std::swap(Op0, Op1);

22211

if (Op0.getOpcode() == ISD::SHL) {

22212

if (isOneConstant(Op0.getOperand(0))) {

22213

// If we looked past a truncate, check that it's only truncating away

22214

// known zeros.

22215

unsigned BitWidth = Op0.getValueSizeInBits();

22216

unsigned AndBitWidth = And.getValueSizeInBits();

22217

if (BitWidth > AndBitWidth) {

22218

KnownBits Known = DAG.computeKnownBits(Op0);

22219

if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)

22220

return SDValue();

22221

}

22222

Src = Op1;

22223

BitNo = Op0.getOperand(1);

22224

}

22225

} else if (Op1.getOpcode() == ISD::Constant) {

22226

ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);

22227

uint64_t AndRHSVal = AndRHS->getZExtValue();

22228

SDValue AndLHS = Op0;

22229

22230

if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {

22231

Src = AndLHS.getOperand(0);

22232

BitNo = AndLHS.getOperand(1);

22233

} else {

22234

// Use BT if the immediate can't be encoded in a TEST instruction or we

22235

// are optimizing for size and the immedaite won't fit in a byte.

22236

bool OptForSize = DAG.shouldOptForSize();

22237

if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&

22238

isPowerOf2_64(AndRHSVal)) {

22239

Src = AndLHS;

22240

BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,

22241

Src.getValueType());

22242

}

22243

}

22244

}

22245

22246

// No patterns found, give up.

22247

if (!Src.getNode())

22248

return SDValue();

22249

22250

// If Src is i8, promote it to i32 with any_extend. There is no i8 BT

22251

// instruction. Since the shift amount is in-range-or-undefined, we know

22252

// that doing a bittest on the i32 value is ok. We extend to i32 because

22253

// the encoding for the i16 version is larger than the i32 version.

22254

// Also promote i16 to i32 for performance / code size reason.

22255

if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)

22256

Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

22257

22258

// See if we can use the 32-bit instruction instead of the 64-bit one for a

22259

// shorter encoding. Since the former takes the modulo 32 of BitNo and the

22260

// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is

22261

// known to be zero.

22262

if (Src.getValueType() == MVT::i64 &&

22263

DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))

22264

Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

22265

22266

// If the operand types disagree, extend the shift amount to match. Since

22267

// BT ignores high bits (like shifts) we can use anyextend.

22268

if (Src.getValueType() != BitNo.getValueType())

22269

BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

22270

22271

X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,

22272

dl, MVT::i8);

22273

return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);

22274

}

22275

22276

/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask

22277

/// CMPs.

22278

static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,

22279

SDValue &Op1, bool &IsAlwaysSignaling) {

22280

unsigned SSECC;

22281

bool Swap = false;

22282

22283

// SSE Condition code mapping:

22284

// 0 - EQ

22285

// 1 - LT

22286

// 2 - LE

22287

// 3 - UNORD

22288

// 4 - NEQ

22289

// 5 - NLT

22290

// 6 - NLE

22291

// 7 - ORD

22292

switch (SetCCOpcode) {

22293

default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22293);

22294

case ISD::SETOEQ:

22295

case ISD::SETEQ: SSECC = 0; break;

22296

case ISD::SETOGT:

22297

case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22298

case ISD::SETLT:

22299

case ISD::SETOLT: SSECC = 1; break;

22300

case ISD::SETOGE:

22301

case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22302

case ISD::SETLE:

22303

case ISD::SETOLE: SSECC = 2; break;

22304

case ISD::SETUO: SSECC = 3; break;

22305

case ISD::SETUNE:

22306

case ISD::SETNE: SSECC = 4; break;

22307

case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22308

case ISD::SETUGE: SSECC = 5; break;

22309

case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22310

case ISD::SETUGT: SSECC = 6; break;

22311

case ISD::SETO: SSECC = 7; break;

22312

case ISD::SETUEQ: SSECC = 8; break;

22313

case ISD::SETONE: SSECC = 12; break;

22314

}

22315

if (Swap)

22316

std::swap(Op0, Op1);

22317

22318

switch (SetCCOpcode) {

22319

default:

22320

IsAlwaysSignaling = true;

22321

break;

22322

case ISD::SETEQ:

22323

case ISD::SETOEQ:

22324

case ISD::SETUEQ:

22325

case ISD::SETNE:

22326

case ISD::SETONE:

22327

case ISD::SETUNE:

22328

case ISD::SETO:

22329

case ISD::SETUO:

22330

IsAlwaysSignaling = false;

22331

break;

22332

}

22333

22334

return SSECC;

22335

}

22336

22337

/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then

22338

/// concatenate the result back.

22339

static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {

22340

EVT VT = Op.getValueType();

22341

22342

assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation")((Op.getOpcode() == ISD::SETCC && "Unsupported operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getOpcode() == ISD::SETCC && \"Unsupported operation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22342, __PRETTY_FUNCTION__));

22343

assert(Op.getOperand(0).getValueType().isInteger() &&((Op.getOperand(0).getValueType().isInteger() && VT ==
Op.getOperand(0).getValueType() && "Unsupported VTs!"
) ? static_cast<void> (0) : __assert_fail ("Op.getOperand(0).getValueType().isInteger() && VT == Op.getOperand(0).getValueType() && \"Unsupported VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22344, __PRETTY_FUNCTION__))

22344

VT == Op.getOperand(0).getValueType() && "Unsupported VTs!")((Op.getOperand(0).getValueType().isInteger() && VT ==
Op.getOperand(0).getValueType() && "Unsupported VTs!"
) ? static_cast<void> (0) : __assert_fail ("Op.getOperand(0).getValueType().isInteger() && VT == Op.getOperand(0).getValueType() && \"Unsupported VTs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22344, __PRETTY_FUNCTION__));

22345

22346

SDLoc dl(Op);

22347

SDValue CC = Op.getOperand(2);

22348

22349

// Extract the LHS Lo/Hi vectors

22350

SDValue LHS1, LHS2;

22351

std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);

22352

22353

// Extract the RHS Lo/Hi vectors

22354

SDValue RHS1, RHS2;

22355

std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);

22356

22357

// Issue the operation on the smaller types and concatenate the result back

22358

EVT LoVT, HiVT;

22359

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

22360

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

22361

DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),

22362

DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));

22363

}

22364

22365

/// Rebuild a SETCC whose result is a vector of i1, rewriting a SETLT
/// condition into its operand-swapped form so that SETGT is used instead.
///
/// \param Op   the compare node: operands 0 and 1 are the values compared and
///             operand 2 is the condition code.
/// \param DAG  the DAG in which the new SETCC is built.
/// \returns the rebuilt SETCC.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  MVT MaskVT = Op.getSimpleValueType();
  assert(MaskVT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Every condition other than SETLT is passed through unchanged.
  if (Pred != ISD::SETLT)
    return DAG.getSetCC(DL, MaskVT, LHS, RHS, Pred);

  // Prefer SETGT over SETLT: exchange the operands and the condition.
  return DAG.getSetCC(DL, MaskVT, RHS, LHS,
                      ISD::getSetCCSwappedOperands(Pred));
}

22386

22387

/// Given a buildvector constant, return a new vector constant with each element

22388

/// incremented or decremented. If incrementing or decrementing would result in

22389

/// unsigned overflow or underflow or this is not a simple vector constant,

22390

/// return an empty value.

22391

static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {

22392

auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());

22393

if (!BV)

22394

return SDValue();

22395

22396

MVT VT = V.getSimpleValueType();

22397

MVT EltVT = VT.getVectorElementType();

22398

unsigned NumElts = VT.getVectorNumElements();

22399

SmallVector<SDValue, 8> NewVecC;

22400

SDLoc DL(V);

22401

for (unsigned i = 0; i < NumElts; ++i) {

22402

auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));

22403

if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)

22404

return SDValue();

22405

22406

// Avoid overflow/underflow.

22407

const APInt &EltC = Elt->getAPIntValue();

22408

if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))

22409

return SDValue();

22410

22411

NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));

22412

}

22413

22414

return DAG.getBuildVector(VT, DL, NewVecC);

22415

}

22416

22417

/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for

22418

/// Op0 u<= Op1:

22419

/// t = psubus Op0, Op1

22420

/// pcmpeq t, <0..0>

22421

static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,

22422

ISD::CondCode Cond, const SDLoc &dl,

22423

const X86Subtarget &Subtarget,

22424

SelectionDAG &DAG) {

22425

if (!Subtarget.hasSSE2())

22426

return SDValue();

22427

22428

MVT VET = VT.getVectorElementType();

22429

if (VET != MVT::i8 && VET != MVT::i16)

22430

return SDValue();

22431

22432

switch (Cond) {

22433

default:

22434

return SDValue();

22435

case ISD::SETULT: {

22436

// If the comparison is against a constant we can turn this into a

22437

// setule. With psubus, setule does not require a swap. This is

22438

// beneficial because the constant in the register is no longer

22439

// destructed as the destination so it can be hoisted out of a loop.

22440

// Only do this pre-AVX since vpcmp* is no longer destructive.

22441

if (Subtarget.hasAVX())

22442

return SDValue();

22443

SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);

22444

if (!ULEOp1)

22445

return SDValue();

22446

Op1 = ULEOp1;

22447

break;

22448

}

22449

case ISD::SETUGT: {

22450

// If the comparison is against a constant, we can turn this into a setuge.

22451

// This is beneficial because materializing a constant 0 for the PCMPEQ is

22452

// probably cheaper than XOR+PCMPGT using 2 different vector constants:

22453

// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0

22454

SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);

22455

if (!UGEOp1)

22456

return SDValue();

22457

Op1 = Op0;

22458

Op0 = UGEOp1;

22459

break;

22460

}

22461

// Psubus is better than flip-sign because it requires no inversion.

22462

case ISD::SETUGE:

22463

std::swap(Op0, Op1);

22464

break;

22465

case ISD::SETULE:

22466

break;

22467

}

22468

22469

SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);

22470

return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,

22471

DAG.getConstant(0, dl, VT));

22472

}

22473

22474

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,

22475

SelectionDAG &DAG) {

22476

bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||

22477

Op.getOpcode() == ISD::STRICT_FSETCCS;

22478

SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);

22479

SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);

22480

SDValue CC = Op.getOperand(IsStrict ? 3 : 2);

22481

MVT VT = Op->getSimpleValueType(0);

22482

ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();

22483

bool isFP = Op1.getSimpleValueType().isFloatingPoint();

22484

SDLoc dl(Op);

22485

22486

if (isFP) {

22487

#ifndef NDEBUG

22488

MVT EltVT = Op0.getSimpleValueType().getVectorElementType();

22489

assert(EltVT == MVT::f32 || EltVT == MVT::f64)((EltVT == MVT::f32 || EltVT == MVT::f64) ? static_cast<void
> (0) : __assert_fail ("EltVT == MVT::f32 || EltVT == MVT::f64"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22489, __PRETTY_FUNCTION__));

22490

#endif

22491

22492

bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;

22493

SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();

22494

22495

// If we have a strict compare with a vXi1 result and the input is 128/256

22496

// bits we can't use a masked compare unless we have VLX. If we use a wider

22497

// compare like we do for non-strict, we might trigger spurious exceptions

22498

// from the upper elements. Instead emit an AVX compare and convert to mask.

22499

unsigned Opc;

22500

if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&

22501

(!IsStrict || Subtarget.hasVLX() ||

22502

Op0.getSimpleValueType().is512BitVector())) {

22503

assert(VT.getVectorNumElements() <= 16)((VT.getVectorNumElements() <= 16) ? static_cast<void>
(0) : __assert_fail ("VT.getVectorNumElements() <= 16", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22503, __PRETTY_FUNCTION__));

22504

Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;

22505

} else {

22506

Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;

22507

// The SSE/AVX packed FP comparison nodes are defined with a

22508

// floating-point vector result that matches the operand type. This allows

22509

// them to work with an SSE1 target (integer vector types are not legal).

22510

VT = Op0.getSimpleValueType();

22511

}

22512

22513

SDValue Cmp;

22514

bool IsAlwaysSignaling;

22515

unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);

22516

if (!Subtarget.hasAVX()) {

22517

// TODO: We could use following steps to handle a quiet compare with

22518

// signaling encodings.

22519

// 1. Get ordered masks from a quiet ISD::SETO

22520

// 2. Use the masks to mask potential unordered elements in operand A, B

22521

// 3. Get the compare results of masked A, B

22522

// 4. Calculating final result using the mask and result from 3

22523

// But currently, we just fall back to scalar operations.

22524

if (IsStrict && IsAlwaysSignaling && !IsSignaling)

22525

return SDValue();

22526

22527

// Insert an extra signaling instruction to raise exception.

22528

if (IsStrict && !IsAlwaysSignaling && IsSignaling) {

22529

SDValue SignalCmp = DAG.getNode(

22530

Opc, dl, {VT, MVT::Other},

22531

{Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS

22532

// FIXME: It seems we need to update the flags of all new strict nodes.

22533

// Otherwise, mayRaiseFPException in MI will return false due to

22534

// NoFPExcept = false by default. However, I didn't find it in other

22535

// patches.

22536

SignalCmp->setFlags(Op->getFlags());

22537

Chain = SignalCmp.getValue(1);

22538

}

22539

22540

// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),

22541

// emit two comparisons and a logic op to tie them together.

22542

if (SSECC >= 8) {

22543

// LLVM predicate is SETUEQ or SETONE.

22544

unsigned CC0, CC1;

22545

unsigned CombineOpc;

22546

if (Cond == ISD::SETUEQ) {

22547

CC0 = 3; // UNORD

22548

CC1 = 0; // EQ

22549

CombineOpc = X86ISD::FOR;

22550

} else {

22551

assert(Cond == ISD::SETONE)((Cond == ISD::SETONE) ? static_cast<void> (0) : __assert_fail
("Cond == ISD::SETONE", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22551, __PRETTY_FUNCTION__));

22552

CC0 = 7; // ORD

22553

CC1 = 4; // NEQ

22554

CombineOpc = X86ISD::FAND;

22555

}

22556

22557

SDValue Cmp0, Cmp1;

22558

if (IsStrict) {

22559

Cmp0 = DAG.getNode(

22560

Opc, dl, {VT, MVT::Other},

22561

{Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});

22562

Cmp1 = DAG.getNode(

22563

Opc, dl, {VT, MVT::Other},

22564

{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});

22565

Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),

22566

Cmp1.getValue(1));

22567

} else {

22568

Cmp0 = DAG.getNode(

22569

Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));

22570

Cmp1 = DAG.getNode(

22571

Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));

22572

}

22573

Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);

22574

} else {

22575

if (IsStrict) {

22576

Cmp = DAG.getNode(

22577

Opc, dl, {VT, MVT::Other},

22578

{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});

22579

Chain = Cmp.getValue(1);

22580

} else

22581

Cmp = DAG.getNode(

22582

Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));

22583

}

22584

} else {

22585

// Handle all other FP comparisons here.

22586

if (IsStrict) {

22587

// Make a flip on already signaling CCs before setting bit 4 of AVX CC.

22588

SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;

22589

Cmp = DAG.getNode(

22590

Opc, dl, {VT, MVT::Other},

22591

{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});

22592

Chain = Cmp.getValue(1);

22593

} else

22594

Cmp = DAG.getNode(

22595

Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));

22596

}

22597

22598

if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {

22599

// We emitted a compare with an XMM/YMM result. Finish converting to a

22600

// mask register using a vptestm.

22601

EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();

22602

Cmp = DAG.getBitcast(CastVT, Cmp);

22603

Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,

22604

DAG.getConstant(0, dl, CastVT), ISD::SETNE);

22605

} else {

22606

// If this is SSE/AVX CMPP, bitcast the result back to integer to match

22607

// the result type of SETCC. The bitcast is expected to be optimized

22608

// away during combining/isel.

22609

Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

22610

}

22611

22612

if (IsStrict)

22613

return DAG.getMergeValues({Cmp, Chain}, dl);

22614

22615

return Cmp;

22616

}

22617

22618

assert(!IsStrict && "Strict SETCC only handles FP operands.")((!IsStrict && "Strict SETCC only handles FP operands."
) ? static_cast<void> (0) : __assert_fail ("!IsStrict && \"Strict SETCC only handles FP operands.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22618, __PRETTY_FUNCTION__));

22619

22620

MVT VTOp0 = Op0.getSimpleValueType();

22621

(void)VTOp0;

22622

assert(VTOp0 == Op1.getSimpleValueType() &&((VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"
) ? static_cast<void> (0) : __assert_fail ("VTOp0 == Op1.getSimpleValueType() && \"Expected operands with same type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22623, __PRETTY_FUNCTION__))

22623

"Expected operands with same type!")((VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"
) ? static_cast<void> (0) : __assert_fail ("VTOp0 == Op1.getSimpleValueType() && \"Expected operands with same type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22623, __PRETTY_FUNCTION__));

22624

assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&((VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorNumElements() == VTOp0.getVectorNumElements() && \"Invalid number of packed elements for source and destination!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22625, __PRETTY_FUNCTION__))

22625

"Invalid number of packed elements for source and destination!")((VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorNumElements() == VTOp0.getVectorNumElements() && \"Invalid number of packed elements for source and destination!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22625, __PRETTY_FUNCTION__));

22626

22627

// The non-AVX512 code below works under the assumption that source and

22628

// destination types are the same.

22629

assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&(((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasAVX512() || (VT == VTOp0)) && \"Value types for source and destination must be the same!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22630, __PRETTY_FUNCTION__))

22630

"Value types for source and destination must be the same!")(((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.hasAVX512() || (VT == VTOp0)) && \"Value types for source and destination must be the same!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22630, __PRETTY_FUNCTION__));

22631

22632

// The result is boolean, but operands are int/float

22633

if (VT.getVectorElementType() == MVT::i1) {

22634

// In AVX-512 architecture setcc returns mask with i1 elements,

22635

// But there is no compare instruction for i8 and i16 elements in KNL.

22636

assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&(((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()
) && "Unexpected operand type") ? static_cast<void
> (0) : __assert_fail ("(VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && \"Unexpected operand type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22637, __PRETTY_FUNCTION__))

22637

"Unexpected operand type")(((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()
) && "Unexpected operand type") ? static_cast<void
> (0) : __assert_fail ("(VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && \"Unexpected operand type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22637, __PRETTY_FUNCTION__));

22638

return LowerIntVSETCC_AVX512(Op, DAG);

22639

}

22640

22641

// Lower using XOP integer comparisons.

22642

if (VT.is128BitVector() && Subtarget.hasXOP()) {

22643

// Translate compare code to XOP PCOM compare mode.

22644

unsigned CmpMode = 0;

22645

switch (Cond) {

22646

default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22646);

22647

case ISD::SETULT:

22648

case ISD::SETLT: CmpMode = 0x00; break;

22649

case ISD::SETULE:

22650

case ISD::SETLE: CmpMode = 0x01; break;

22651

case ISD::SETUGT:

22652

case ISD::SETGT: CmpMode = 0x02; break;

22653

case ISD::SETUGE:

22654

case ISD::SETGE: CmpMode = 0x03; break;

22655

case ISD::SETEQ: CmpMode = 0x04; break;

22656

case ISD::SETNE: CmpMode = 0x05; break;

22657

}

22658

22659

// Are we comparing unsigned or signed integers?

22660

unsigned Opc =

22661

ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

22662

22663

return DAG.getNode(Opc, dl, VT, Op0, Op1,

22664

DAG.getTargetConstant(CmpMode, dl, MVT::i8));

22665

}

22666

22667

// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.

22668

// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.

22669

if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {

22670

SDValue BC0 = peekThroughBitcasts(Op0);

22671

if (BC0.getOpcode() == ISD::AND) {

22672

APInt UndefElts;

22673

SmallVector<APInt, 64> EltBits;

22674

if (getTargetConstantBitsFromNode(BC0.getOperand(1),

22675

VT.getScalarSizeInBits(), UndefElts,

22676

EltBits, false, false)) {

22677

if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {

22678

Cond = ISD::SETEQ;

22679

Op1 = DAG.getBitcast(VT, BC0.getOperand(1));

22680

}

22681

}

22682

}

22683

}

22684

22685

// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.

22686

if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&

22687

Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {

22688

ConstantSDNode *C1 = isConstOrConstSplat(Op1);

22689

if (C1 && C1->getAPIntValue().isPowerOf2()) {

22690

unsigned BitWidth = VT.getScalarSizeInBits();

22691

unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;

22692

22693

SDValue Result = Op0.getOperand(0);

22694

Result = DAG.getNode(ISD::SHL, dl, VT, Result,

22695

DAG.getConstant(ShiftAmt, dl, VT));

22696

Result = DAG.getNode(ISD::SRA, dl, VT, Result,

22697

DAG.getConstant(BitWidth - 1, dl, VT));

22698

return Result;

22699

}

22700

}

22701

22702

// Break 256-bit integer vector compare into smaller ones.

22703

if (VT.is256BitVector() && !Subtarget.hasInt256())

22704

return splitIntVSETCC(Op, DAG);

22705

22706

if (VT == MVT::v32i16 || VT == MVT::v64i8) {

22707

assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!")((!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.hasBWI() && \"Unexpected VT with AVX512BW!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22707, __PRETTY_FUNCTION__));

22708

return splitIntVSETCC(Op, DAG);

22709

}

22710

22711

// If this is a SETNE against the signed minimum value, change it to SETGT.

22712

// If this is a SETNE against the signed maximum value, change it to SETLT.

22713

// which will be swapped to SETGT.

22714

// Otherwise we use PCMPEQ+invert.

22715

APInt ConstValue;

22716

if (Cond == ISD::SETNE &&

22717

ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {

22718

if (ConstValue.isMinSignedValue())

22719

Cond = ISD::SETGT;

22720

else if (ConstValue.isMaxSignedValue())

22721

Cond = ISD::SETLT;

22722

}

22723

22724

// If both operands are known non-negative, then an unsigned compare is the

22725

// same as a signed compare and there's no need to flip signbits.

22726

// TODO: We could check for more general simplifications here since we're

22727

// computing known bits.

22728

bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&

22729

!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

22730

22731

// Special case: Use min/max operations for unsigned compares.

22732

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

22733

if (ISD::isUnsignedIntSetCC(Cond) &&

22734

(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&

22735

TLI.isOperationLegal(ISD::UMIN, VT)) {

22736

// If we have a constant operand, increment/decrement it and change the

22737

// condition to avoid an invert.

22738

if (Cond == ISD::SETUGT) {

22739

// X > C --> X >= (C+1) --> X == umax(X, C+1)

22740

if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {

22741

Op1 = UGTOp1;

22742

Cond = ISD::SETUGE;

22743

}

22744

}

22745

if (Cond == ISD::SETULT) {

22746

// X < C --> X <= (C-1) --> X == umin(X, C-1)

22747

if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {

22748

Op1 = ULTOp1;

22749

Cond = ISD::SETULE;

22750

}

22751

}

22752

bool Invert = false;

22753

unsigned Opc;

22754

switch (Cond) {

22755

default: llvm_unreachable("Unexpected condition code")::llvm::llvm_unreachable_internal("Unexpected condition code"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22755);

22756

case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22757

case ISD::SETULE: Opc = ISD::UMIN; break;

22758

case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];

22759

case ISD::SETUGE: Opc = ISD::UMAX; break;

22760

}

22761

22762

SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

22763

Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

22764

22765

// If the logical-not of the result is required, perform that now.

22766

if (Invert)

22767

Result = DAG.getNOT(dl, Result, VT);

22768

22769

return Result;

22770

}

22771

22772

// Try to use SUBUS and PCMPEQ.

22773

if (FlipSigns)

22774

if (SDValue V =

22775

LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))

22776

return V;

22777

22778

// We are handling one of the integer comparisons here. Since SSE only has

22779

// GT and EQ comparisons for integer, swapping operands and multiple

22780

// operations may be required for some comparisons.

22781

unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ

22782

: X86ISD::PCMPGT;

22783

bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||

22784

Cond == ISD::SETGE || Cond == ISD::SETUGE;

22785

bool Invert = Cond == ISD::SETNE ||

22786

(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

22787

22788

if (Swap)

22789

std::swap(Op0, Op1);

22790

22791

// Check that the operation in question is available (most are plain SSE2,

22792

// but PCMPGTQ and PCMPEQQ have different requirements).

22793

if (VT == MVT::v2i64) {

22794

if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {

22795

assert(Subtarget.hasSSE2() && "Don't know how to lower!")((Subtarget.hasSSE2() && "Don't know how to lower!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Don't know how to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22795, __PRETTY_FUNCTION__));

22796

22797

// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle

22798

// the odd elements over the even elements.

22799

if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {

22800

Op0 = DAG.getConstant(0, dl, MVT::v4i32);

22801

Op1 = DAG.getBitcast(MVT::v4i32, Op1);

22802

22803

SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);

22804

static const int MaskHi[] = { 1, 1, 3, 3 };

22805

SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

22806

22807

return DAG.getBitcast(VT, Result);

22808

}

22809

22810

if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {

22811

Op0 = DAG.getBitcast(MVT::v4i32, Op0);

22812

Op1 = DAG.getConstant(-1, dl, MVT::v4i32);

22813

22814

SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);

22815

static const int MaskHi[] = { 1, 1, 3, 3 };

22816

SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

22817

22818

return DAG.getBitcast(VT, Result);

22819

}

22820

22821

// Since SSE has no unsigned integer comparisons, we need to flip the sign

22822

// bits of the inputs before performing those operations. The lower

22823

// compare is always unsigned.

22824

SDValue SB;

22825

if (FlipSigns) {

22826

SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);

22827

} else {

22828

SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);

22829

}

22830

Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);

22831

Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);

22832

22833

// Cast everything to the right type.

22834

Op0 = DAG.getBitcast(MVT::v4i32, Op0);

22835

Op1 = DAG.getBitcast(MVT::v4i32, Op1);

22836

22837

// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))

22838

SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);

22839

SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

22840

22841

// Create masks for only the low parts/high parts of the 64 bit integers.

22842

static const int MaskHi[] = { 1, 1, 3, 3 };

22843

static const int MaskLo[] = { 0, 0, 2, 2 };

22844

SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);

22845

SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);

22846

SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

22847

22848

SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);

22849

Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

22850

22851

if (Invert)

22852

Result = DAG.getNOT(dl, Result, MVT::v4i32);

22853

22854

return DAG.getBitcast(VT, Result);

22855

}

22856

22857

if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {

22858

// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with

22859

// pcmpeqd + pshufd + pand.

22860

assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!")((Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && !FlipSigns && \"Don't know how to lower!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22860, __PRETTY_FUNCTION__));

22861

22862

// First cast everything to the right type.

22863

Op0 = DAG.getBitcast(MVT::v4i32, Op0);

22864

Op1 = DAG.getBitcast(MVT::v4i32, Op1);

22865

22866

// Do the compare.

22867

SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

22868

22869

// Make sure the lower and upper halves are both all-ones.

22870

static const int Mask[] = { 1, 0, 3, 2 };

22871

SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);

22872

Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

22873

22874

if (Invert)

22875

Result = DAG.getNOT(dl, Result, MVT::v4i32);

22876

22877

return DAG.getBitcast(VT, Result);

22878

}

22879

}

22880

22881

// Since SSE has no unsigned integer comparisons, we need to flip the sign

22882

// bits of the inputs before performing those operations.

22883

if (FlipSigns) {

22884

MVT EltVT = VT.getVectorElementType();

22885

SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,

22886

VT);

22887

Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);

22888

Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);

22889

}

22890

22891

SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

22892

22893

// If the logical-not of the result is required, perform that now.

22894

if (Invert)

22895

Result = DAG.getNOT(dl, Result, VT);

22896

22897

return Result;

22898

}

22899

22900

// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.

22901

static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,

22902

const SDLoc &dl, SelectionDAG &DAG,

22903

const X86Subtarget &Subtarget,

22904

SDValue &X86CC) {

22905

// Only support equality comparisons.

22906

if (CC != ISD::SETEQ && CC != ISD::SETNE)

22907

return SDValue();

22908

22909

// Must be a bitcast from vXi1.

22910

if (Op0.getOpcode() != ISD::BITCAST)

22911

return SDValue();

22912

22913

Op0 = Op0.getOperand(0);

22914

MVT VT = Op0.getSimpleValueType();

22915

if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&

22916

!(Subtarget.hasDQI() && VT == MVT::v8i1) &&

22917

!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))

22918

return SDValue();

22919

22920

X86::CondCode X86Cond;

22921

if (isNullConstant(Op1)) {

22922

X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;

22923

} else if (isAllOnesConstant(Op1)) {

22924

// C flag is set for all ones.

22925

X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;

22926

} else

22927

return SDValue();

22928

22929

// If the input is an AND, we can combine it's operands into the KTEST.

22930

bool KTestable = false;

22931

if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))

22932

KTestable = true;

22933

if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))

22934

KTestable = true;

22935

if (!isNullConstant(Op1))

22936

KTestable = false;

22937

if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {

22938

SDValue LHS = Op0.getOperand(0);

22939

SDValue RHS = Op0.getOperand(1);

22940

X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);

22941

return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);

22942

}

22943

22944

// If the input is an OR, we can combine it's operands into the KORTEST.

22945

SDValue LHS = Op0;

22946

SDValue RHS = Op0;

22947

if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {

22948

LHS = Op0.getOperand(0);

22949

RHS = Op0.getOperand(1);

22950

}

22951

22952

X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);

22953

return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);

22954

}

22955

22956

/// Emit flags for the given setcc condition and operands. Also returns the

22957

/// corresponding X86 condition code constant in X86CC.

22958

SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,

22959

ISD::CondCode CC, const SDLoc &dl,

22960

SelectionDAG &DAG,

22961

SDValue &X86CC) const {

22962

// Optimize to BT if possible.

22963

// Lower (X & (1 << N)) == 0 to BT(X, N).

22964

// Lower ((X >>u N) & 1) != 0 to BT(X, N).

22965

// Lower ((X >>s N) & 1) != 0 to BT(X, N).

22966

if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&

22967

(CC == ISD::SETEQ || CC == ISD::SETNE)) {

22968

if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))

22969

return BT;

22970

}

22971

22972

// Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.

22973

// TODO: We could do AND tree with all 1s as well by using the C flag.

22974

if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))

22975

if (SDValue CmpZ =

22976

MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))

22977

return CmpZ;

22978

22979

// Try to lower using KORTEST or KTEST.

22980

if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))

22981

return Test;

22982

22983

// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of

22984

// these.

22985

if ((isOneConstant(Op1) || isNullConstant(Op1)) &&

22986

(CC == ISD::SETEQ || CC == ISD::SETNE)) {

22987

// If the input is a setcc, then reuse the input setcc or use a new one with

22988

// the inverted condition.

22989

if (Op0.getOpcode() == X86ISD::SETCC) {

22990

bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);

22991

22992

X86CC = Op0.getOperand(0);

22993

if (Invert) {

22994

X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);

22995

CCode = X86::GetOppositeBranchCondition(CCode);

22996

X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);

22997

}

22998

22999

return Op0.getOperand(1);

23000

}

23001

}

23002

23003

// Try to use the carry flag from the add in place of an separate CMP for:

23004

// (seteq (add X, -1), -1). Similar for setne.

23005

if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&

23006

Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {

23007

if (isProfitableToUseFlagOp(Op0)) {

23008

SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);

23009

23010

SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),

23011

Op0.getOperand(1));

23012

DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);

23013

X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;

23014

X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);

23015

return SDValue(New.getNode(), 1);

23016

}

23017

}

23018

23019

X86::CondCode CondCode =

23020

TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);

23021

assert(CondCode != X86::COND_INVALID && "Unexpected condition code!")((CondCode != X86::COND_INVALID && "Unexpected condition code!"
) ? static_cast<void> (0) : __assert_fail ("CondCode != X86::COND_INVALID && \"Unexpected condition code!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23021, __PRETTY_FUNCTION__));

23022

23023

SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);

23024

X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);

23025

return EFLAGS;

23026

}

23027

23028

/// Lower SETCC / STRICT_FSETCC / STRICT_FSETCCS.
///
/// Vector results are forwarded to LowerVSETCC. Scalar results must be i8 and
/// become an X86ISD::SETCC of flags produced by an integer compare
/// (emitFlagsForSetcc) or an FP compare. For the strict opcodes the result is
/// merged with the output chain. Returns an empty SDValue when the FP
/// condition code cannot be translated.
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  const unsigned Opcode = Op.getOpcode();
  const bool IsSignaling = Opcode == ISD::STRICT_FSETCCS;
  const bool IsStrict = IsSignaling || Opcode == ISD::STRICT_FSETCC;

  MVT VT = Op->getSimpleValueType(0);
  if (VT.isVector())
    return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");

  // Strict nodes carry the chain as operand 0, shifting everything by one.
  const unsigned FirstArg = IsStrict ? 1 : 0;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Op0 = Op.getOperand(FirstArg);
  SDValue Op1 = Op.getOperand(FirstArg + 1);
  SDLoc dl(Op);
  ISD::CondCode CC =
      cast<CondCodeSDNode>(Op.getOperand(FirstArg + 2))->get();

  // Pair a result with the (current) chain for strict nodes only.
  auto WithChain = [&](SDValue Res) -> SDValue {
    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, dl);
    return Res;
  };

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets handled by emitFlagsForSetcc.
  if (Op0.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
                        IsSignaling);

    // If softenSetCCOperands returned a scalar, use it.
    if (!Op1.getNode()) {
      assert(Op0.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return WithChain(Op0);
    }
  }

  if (Op0.getSimpleValueType().isInteger()) {
    SDValue X86CC;
    SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
    return WithChain(DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS));
  }

  // Handle floating point.
  X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
  if (CondCode == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS;
  if (!IsStrict) {
    EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
  } else {
    const unsigned CmpOpc =
        IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP;
    EFLAGS = DAG.getNode(CmpOpc, dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
    // The strict compare produces the new chain as its second result.
    Chain = EFLAGS.getValue(1);
  }

  SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
  return WithChain(DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS));
}

23087

23088

/// Lower SETCCCARRY (LHS, RHS, Carry, Cond) to an X86ISD::SBB whose flags feed
/// a setcc. The incoming carry value is first turned back into a flag by
/// adding all-ones to it.
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CarryIn = Op.getOperand(2);
  SDValue CondOp = Op.getOperand(3);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(CondOp)->get());

  // Recreate the carry if needed.
  EVT CarryVT = CarryIn.getValueType();
  SDValue CarryFlags =
      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
                  DAG.getAllOnesConstant(DL, CarryVT))
          .getValue(1);

  // Subtract with borrow; only the flags result (value #1) is consumed.
  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS, CarryFlags);
  return getSETCC(CC, Sbb.getValue(1), DL, DAG);
}

23107

23108

// This function returns three things: the arithmetic computation itself

23109

// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The

23110

// flag and the condition code define the case in which the arithmetic

23111

// computation overflows.

23112

static std::pair<SDValue, SDValue>

23113

getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {

23114

assert(Op.getResNo() == 0 && "Unexpected result number!")((Op.getResNo() == 0 && "Unexpected result number!") ?
static_cast<void> (0) : __assert_fail ("Op.getResNo() == 0 && \"Unexpected result number!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23114, __PRETTY_FUNCTION__));

23115

SDValue Value, Overflow;

23116

SDValue LHS = Op.getOperand(0);

23117

SDValue RHS = Op.getOperand(1);

23118

unsigned BaseOp = 0;

23119

SDLoc DL(Op);

23120

switch (Op.getOpcode()) {

23121

default: llvm_unreachable("Unknown ovf instruction!")::llvm::llvm_unreachable_internal("Unknown ovf instruction!",
"/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23121);

23122

case ISD::SADDO:

23123

BaseOp = X86ISD::ADD;

23124

Cond = X86::COND_O;

23125

break;

23126

case ISD::UADDO:

23127

BaseOp = X86ISD::ADD;

23128

Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;

23129

break;

23130

case ISD::SSUBO:

23131

BaseOp = X86ISD::SUB;

23132

Cond = X86::COND_O;

23133

break;

23134

case ISD::USUBO:

23135

BaseOp = X86ISD::SUB;

23136

Cond = X86::COND_B;

23137

break;

23138

case ISD::SMULO:

23139

BaseOp = X86ISD::SMUL;

23140

Cond = X86::COND_O;

23141

break;

23142

case ISD::UMULO:

23143

BaseOp = X86ISD::UMUL;

23144

Cond = X86::COND_O;

23145

break;

23146

}

23147

23148

if (BaseOp) {

23149

// Also sets EFLAGS.

23150

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

23151

Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

23152

Overflow = Value.getValue(1);

23153

}

23154

23155

return std::make_pair(Value, Overflow);

23156

}

23157

23158

/// Lower an "add/sub/mul with overflow" node into the flag-setting arithmetic
/// node plus a setcc on its flags, merged into the node's two results.
///
/// The "brcond" lowering looks for this combo and may remove the "setcc"
/// instruction if the "setcc" has only one use.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);

  // first = arithmetic value, second = EFLAGS; OvfCC says how to read them.
  X86::CondCode OvfCC;
  std::pair<SDValue, SDValue> ArithAndFlags = getX86XALUOOp(OvfCC, Op, DAG);

  SDValue OverflowBit = getSETCC(OvfCC, ArithAndFlags.second, DL, DAG);
  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
                     ArithAndFlags.first, OverflowBit);
}

23172

23173

/// Return true if opcode is a X86 logical comparison.

23174

static bool isX86LogicalCmp(SDValue Op) {

23175

unsigned Opc = Op.getOpcode();

23176

if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||

23177

Opc == X86ISD::FCMP)

23178

return true;

23179

if (Op.getResNo() == 1 &&

23180

(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||

23181

Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||

23182

Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))

23183

return true;

23184

23185

return false;

23186

}

23187

23188

/// Return true if V is an ISD::TRUNCATE and every bit of its source that the
/// truncate discards is known to be zero.
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Wide = V.getOperand(0);
  unsigned WideBits = Wide.getValueSizeInBits();
  unsigned NarrowBits = V.getValueSizeInBits();

  // Mask covering exactly the high bits removed by the truncate.
  APInt DroppedBits = APInt::getHighBitsSet(WideBits, WideBits - NarrowBits);
  return DAG.MaskedValueIsZero(Wide, DroppedBits);
}

23197

23198

/// Lower a scalar select (Cond, TrueV, FalseV).
///
/// A series of special cases is tried first (SSE/AVX FP selects, AVX512
/// masked moves, branch-free sbb/or forms, SETCC_CARRY). Whatever remains is
/// emitted as an X86ISD::CMOV on a flags-producing node, promoting i8/i16
/// where the code below decides to.
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue TrueV = Op.getOperand(1);
  SDValue FalseV = Op.getOperand(2);
  SDLoc DL(Op);
  const MVT VT = TrueV.getSimpleValueType();
  SDValue CC;
  // Cleared once Cond is known to already be a flags-producing node.
  bool NeedTest = true;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CmpLHS = Cond.getOperand(0);
    SDValue CmpRHS = Cond.getOperand(1);
    bool IsAlwaysSignaling;
    const ISD::CondCode SetCCCode =
        cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    const unsigned SSECC =
        translateX86FSETCC(SetCCCode, CmpLHS, CmpRHS, IsAlwaysSignaling);

    if (Subtarget.hasAVX512()) {
      SDValue MaskCmp =
          DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CmpLHS, CmpRHS,
                      DAG.getTargetConstant(SSECC, DL, MVT::i8));
      assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, MaskCmp, TrueV, FalseV);
    }

    if (SSECC < 8 || Subtarget.hasAVX()) {
      SDValue FPCmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CmpLHS, CmpRHS,
                                  DAG.getTargetConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a +0.0 constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
      if (Subtarget.hasAVX() && !isNullFPConstant(TrueV) &&
          !isNullFPConstant(FalseV)) {
        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.
        const bool IsF32 = VT == MVT::f32;
        MVT VecVT = IsF32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VecTrue = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, TrueV);
        SDValue VecFalse =
            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, FalseV);
        SDValue VecCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, FPCmp);

        MVT VecCmpVT = IsF32 ? MVT::v4i32 : MVT::v2i64;
        VecCmp = DAG.getBitcast(VecCmpVT, VecCmp);

        SDValue VecSel = DAG.getSelect(DL, VecVT, VecCmp, VecTrue, VecFalse);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecSel,
                           DAG.getIntPtrConstant(0, DL));
      }

      // (~Cmp & FalseV) | (Cmp & TrueV)
      SDValue MaskedFalse = DAG.getNode(X86ISD::FANDN, DL, VT, FPCmp, FalseV);
      SDValue MaskedTrue = DAG.getNode(X86ISD::FAND, DL, VT, FPCmp, TrueV);
      return DAG.getNode(X86ISD::FOR, DL, VT, MaskedFalse, MaskedTrue);
    }
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
    SDValue MaskCond = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, MaskCond, TrueV, FalseV);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue LoweredCond = LowerSETCC(Cond, DAG)) {
      Cond = LoweredCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      TrueV = Op.getOperand(1);
      FalseV = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    SDValue CmpOp0 = Cmp.getOperand(0);
    const unsigned CondCode = Cond.getConstantOperandVal(0);
    const bool IsE = CondCode == X86::COND_E;
    const bool IsNE = CondCode == X86::COND_NE;

    // Special handling for __builtin_ffs(X) - 1 pattern which looks like
    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
    // handle to keep the CMP with 0. This should be removed by
    // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
    // cttz_zero_undef.
    auto IsFFSMinus1 = [&CmpOp0](SDValue Cttz, SDValue MinusOne) {
      return Cttz.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Cttz.hasOneUse() &&
             Cttz.getOperand(0) == CmpOp0 && isAllOnesConstant(MinusOne);
    };
    const bool KeepCmpForFFS =
        Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
        ((IsNE && IsFFSMinus1(TrueV, FalseV)) ||
         (IsE && IsFFSMinus1(FalseV, TrueV)));

    if (KeepCmpForFFS) {
      // Keep Cmp.
    } else if ((isAllOnesConstant(TrueV) || isAllOnesConstant(FalseV)) &&
               (IsE || IsNE)) {
      const bool TrueIsAllOnes = isAllOnesConstant(TrueV);
      // Y is the arm that is not the all-ones constant.
      SDValue Y = isAllOnesConstant(FalseV) ? TrueV : FalseV;

      const EVT CmpVT = CmpOp0.getValueType();
      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
      SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) && TrueIsAllOnes == IsNE) {
        SDValue CmpZero = DAG.getConstant(0, DL, CmpVT);
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpZero, CmpOp0);
        SDValue ResZero = DAG.getConstant(0, DL, Op.getValueType());
        return DAG.getNode(X86ISD::SBB, DL, VTs, ResZero, ResZero,
                           Neg.getValue(1));
      }

      // Flags of (x - 1), consumed by the sbb below.
      SDValue One = DAG.getConstant(1, DL, CmpVT);
      Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);

      SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
      // Res = 0 or -1.
      SDValue Res =
          DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));

      if (TrueIsAllOnes != IsE)
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
    } else if (!Subtarget.hasCMov() && IsE &&
               CmpOp0.getOpcode() == ISD::AND &&
               isOneConstant(CmpOp0.getOperand(1))) {
      // Match FalseV being an XOR or OR that has TrueV as one of its operands:
      // ( a , a op b) || ( b , a op b)
      const unsigned FalseOpc = FalseV.getOpcode();
      if ((FalseOpc == ISD::XOR || FalseOpc == ISD::OR) &&
          (FalseV.getOperand(0) == TrueV || FalseV.getOperand(1) == TrueV)) {
        // Src1 is the other operand of the or/xor, Src2 is the shared one.
        SDValue Src1 = FalseV.getOperand(0) == TrueV ? FalseV.getOperand(1)
                                                     : FalseV.getOperand(0);
        SDValue Src2 = TrueV;

        // we need mask of all zeros or ones with same size of the other
        // operands.
        SDValue Bit;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        if (CmpSz > VT.getSizeInBits())
          Bit = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Bit = DAG.getNode(
              ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Bit = CmpOp0;

        // -(and (x, 0x1))
        SDValue Mask =
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Bit);
        // Mask & z
        SDValue Masked = DAG.getNode(ISD::AND, DL, VT, Mask, Src1);
        // And Op y
        return DAG.getNode(FalseOpc, DL, VT, Masked, Src2);
      }
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  switch (Cond.getOpcode()) {
  case X86ISD::SETCC:
  case X86ISD::SETCC_CARRY: {
    CC = Cond.getOperand(0);
    SDValue FlagsSrc = Cond.getOperand(1);

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(FlagsSrc) && !IllegalFPCMov) ||
        FlagsSrc.getOpcode() == X86ISD::BT) { // FIXME
      Cond = FlagsSrc;
      NeedTest = false;
    }
    break;
  }
  case ISD::USUBO:
  case ISD::SSUBO:
  case ISD::UADDO:
  case ISD::SADDO:
  case ISD::UMULO:
  case ISD::SMULO: {
    // Use the flags of the X86 arithmetic node directly; the arithmetic value
    // itself (first) is not needed here.
    X86::CondCode X86Cond;
    std::pair<SDValue, SDValue> ArithAndFlags =
        getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
    Cond = ArithAndFlags.second;

    CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
    NeedTest = false;
    break;
  }
  default:
    break;
  }

  if (NeedTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue BTCC;
      if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
        CC = BTCC;
        Cond = BT;
        NeedTest = false;
      }
    }
  }

  // Nothing produced flags yet: test the condition value against zero.
  if (NeedTest) {
    CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
  }

  // Select between -1 and 0 on the carry flag of a SUB. With
  // C = setcc_carry(COND_B, flags), the code below produces:
  //   COND_B  ? -1 :  0  ->  C
  //   COND_B  ?  0 : -1  -> ~C
  //   COND_AE ? -1 :  0  -> ~C
  //   COND_AE ?  0 : -1  ->  C
  if (Cond.getOpcode() == X86ISD::SUB) {
    const unsigned CarryCC = cast<ConstantSDNode>(CC)->getZExtValue();
    const bool IsB = CarryCC == X86::COND_B;

    if ((CarryCC == X86::COND_AE || IsB) &&
        (isAllOnesConstant(TrueV) || isAllOnesConstant(FalseV)) &&
        (isNullConstant(TrueV) || isNullConstant(FalseV))) {
      SDValue Res =
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
      if (isAllOnesConstant(TrueV) != IsB)
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 && TrueV.getOpcode() == ISD::TRUNCATE &&
      FalseV.getOpcode() == ISD::TRUNCATE) {
    SDValue WideTrue = TrueV.getOperand(0);
    SDValue WideFalse = FalseV.getOperand(0);
    if (WideTrue.getValueType() == WideFalse.getValueType() &&
        // Exclude CopyFromReg to avoid partial register stalls.
        WideTrue.getOpcode() != ISD::CopyFromReg &&
        WideFalse.getOpcode() != ISD::CopyFromReg) {
      SDValue WideCmov = DAG.getNode(X86ISD::CMOV, DL, WideTrue.getValueType(),
                                     WideFalse, WideTrue, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), WideCmov);
    }
  }

  // Or finally, promote i8 cmovs if we have CMOV,
  //                 or i16 cmovs if it won't prevent folding a load.
  // FIXME: we should not limit promotion of i8 case to only when the CMOV is
  //        legal, but EmitLoweredSelect() can not deal with these extensions
  //        being inserted between two CMOV's. (in i16 case too TBN)
  //        https://bugs.llvm.org/show_bug.cgi?id=40974
  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
      (Op.getValueType() == MVT::i16 && !MayFoldLoad(TrueV) &&
       !MayFoldLoad(FalseV))) {
    TrueV = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueV);
    FalseV = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseV);
    SDValue PromotedOps[] = {FalseV, TrueV, CC, Cond};
    SDValue PromotedCmov =
        DAG.getNode(X86ISD::CMOV, DL, MVT::i32, PromotedOps);
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), PromotedCmov);
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDValue CmovOps[] = {FalseV, TrueV, CC, Cond};
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), CmovOps);
}

23488

23489

/// Lower an extension whose operand is a vector of i1 elements.
///
/// The requested result type is first adjusted to a type the extension is
/// emitted on (i8/i16 elements become i32 when BWI is not supported, and
/// vectors narrower than 512 bits are widened to 512 bits when VLX is not
/// supported). The extended value is then truncated and/or extracted back to
/// the requested type.
///
/// \param Op        The extension node; operand 0 must have i1 elements.
/// \param Subtarget Queried for BWI / DQI / VLX support.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns The lowered value.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc DL(Op);
  const unsigned ExtOpcode = Op.getOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");

  const MVT ResultEltVT = VT.getVectorElementType();
  unsigned EltCount = VT.getVectorNumElements();

  // When the scalar type is i8/i16 and BWI is not supported, extend to i32
  // elements instead; the result is truncated again further down.
  MVT PromotedVT = VT;
  const bool PromoteToI32 =
      !Subtarget.hasBWI() && ResultEltVT.getSizeInBits() <= 16;
  if (PromoteToI32) {
    // If v16i32 is to be avoided, split and concatenate instead.
    if (EltCount == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ExtOpcode, VT, In, DL, DAG);

    PromotedVT = MVT::getVectorVT(MVT::i32, EltCount);
  }

  // When VLX is not supported, operate on a full 512-bit vector: the mask is
  // inserted at index 0 of an undef mask with correspondingly more elements.
  MVT WorkVT = PromotedVT;
  if (!PromotedVT.is512BitVector() && !Subtarget.hasVLX()) {
    EltCount *= 512 / PromotedVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, EltCount);
    SDValue UndefMask = DAG.getUNDEF(InVT);
    SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, UndefMask, In, ZeroIdx);
    WorkVT = MVT::getVectorVT(PromotedVT.getVectorElementType(), EltCount);
  }

  // Either emit the original opcode on the working type, or fall back to a
  // select between all-ones and zero controlled by the mask.
  const unsigned WorkEltBits = WorkVT.getVectorElementType().getSizeInBits();
  const bool EmitExtendDirectly =
      (Subtarget.hasDQI() && WorkEltBits >= 32) ||
      (Subtarget.hasBWI() && WorkEltBits <= 16);

  SDValue Result;
  if (!EmitExtendDirectly) {
    SDValue AllOnes = DAG.getConstant(-1, DL, WorkVT);
    SDValue AllZeros = DAG.getConstant(0, DL, WorkVT);
    Result = DAG.getSelect(DL, WorkVT, In, AllOnes, AllZeros);
  } else {
    Result = DAG.getNode(ExtOpcode, DL, WorkVT, In);
  }

  // Undo the i32 promotion, if any.
  if (VT != PromotedVT) {
    WorkVT = MVT::getVectorVT(ResultEltVT, EltCount);
    Result = DAG.getNode(ISD::TRUNCATE, DL, WorkVT, Result);
  }

  // Undo the 512-bit widening, if any, by extracting the low subvector.
  if (WorkVT != VT)
    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
                         DAG.getIntPtrConstant(0, DL));

  return Result;
}

23545

23546

/// Lower a vector any-extend.
///
/// Sources with i1 elements are forwarded to LowerSIGN_EXTEND_Mask; every
/// other source is handed to LowerAVXExtend, which requires AVX.
///
/// \param Op        The extension node to lower.
/// \param Subtarget Subtarget used for the AVX check and passed to helpers.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns Whatever the selected helper returns.
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();
  const bool SrcIsMask = SrcVT.getVectorElementType() == MVT::i1;

  if (!SrcIsMask) {
    assert(Subtarget.hasAVX() && "Expected AVX support");
    return LowerAVXExtend(Op, DAG, Subtarget);
  }

  return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
}

23557

23558

// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.

23559

// For sign extend this needs to handle all vector sizes and SSE4.1 and

23560

// non-SSE4.1 targets. For zero extend this should only handle inputs of

23561

// MVT::v64i8 when BWI is not supported, but AVX512 is.

23562

static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,

23563

const X86Subtarget &Subtarget,

23564

SelectionDAG &DAG) {

23565

SDValue In = Op->getOperand(0);

23566

MVT VT = Op->getSimpleValueType(0);

23567

MVT InVT = In.getSimpleValueType();

23568

23569

MVT SVT = VT.getVectorElementType();

23570

MVT InSVT = InVT.getVectorElementType();

23571

assert(SVT.getSizeInBits() > InSVT.getSizeInBits())((SVT.getSizeInBits() > InSVT.getSizeInBits()) ? static_cast
<void> (0) : __assert_fail ("SVT.getSizeInBits() > InSVT.getSizeInBits()"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23571, __PRETTY_FUNCTION__));

23572

23573

if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)

23574

return SDValue();

23575

if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)

23576

return SDValue();

23577

if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&

23578

!(VT.is256BitVector() && Subtarget.hasAVX()) &&

23579

!(VT.is512BitVector() && Subtarget.hasAVX512()))

23580

return SDValue();

23581

23582

SDLoc dl(Op);

23583

unsigned Opc = Op.getOpcode();

23584

unsigned NumElts = VT.getVectorNumElements();

23585

23586

// For 256-bit vectors, we only need the lower (128-bit) half of the input.

23587

// For 512-bit vectors, we need 128-bits or 256-bits.

23588

if (InVT.getSizeInBits() > 128) {

23589

// Input needs to be at least the same number of elements as output, and

23590

// at least 128-bits.

23591

int InSize = InSVT.getSizeInBits() * NumElts;

23592

In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));

23593

InVT = In.getSimpleValueType();

23594

}

23595

23596

// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,

23597

// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still

23598

// need to be handled here for 256/512-bit results.

23599

if (Subtarget.hasInt256()) {

23600

assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension")((VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() > 128 && \"Unexpected 128-bit vector extension\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23600, __PRETTY_FUNCTION__));

23601

23602

if (InVT.getVectorNumElements() != NumElts)

23603

return DAG.getNode(Op.getOpcode(), dl, VT, In);

23604

23605

// FIXME: Apparently we create inreg operations that could be regular

23606

// extends.

23607

unsigned ExtOpc =

23608

Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND

23609

: ISD::ZERO_EXTEND;

23610

return DAG.getNode(ExtOpc, dl, VT, In);

23611

}

23612

23613

// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.

23614

if (Subtarget.hasAVX()) {

23615

assert(VT.is256BitVector() && "256-bit vector expected")((VT.is256BitVector() && "256-bit vector expected") ?
static_cast<void> (0) : __assert_fail ("VT.is256BitVector() && \"256-bit vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23615, __PRETTY_FUNCTION__));

23616

MVT HalfVT = VT.getHalfNumVectorElementsVT();

23617

int HalfNumElts = HalfVT.getVectorNumElements();

23618

23619

unsigned NumSrcElts = InVT.getVectorNumElements();

23620

SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);

23621

for (int i = 0; i != HalfNumElts; ++i)

23622

HiMask[i] = HalfNumElts + i;

23623

23624

SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);

23625

SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);

23626

Hi = DAG.getNode(Opc, dl, HalfVT, Hi);

23627

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

23628

}

23629

23630

// We should only get here for sign extend.

23631

assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!")((Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("Opc == ISD::SIGN_EXTEND_VECTOR_INREG && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23631, __PRETTY_FUNCTION__));

23632

assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs")((VT.is128BitVector() && InVT.is128BitVector() &&
"Unexpected VTs") ? static_cast<void> (0) : __assert_fail
("VT.is128BitVector() && InVT.is128BitVector() && \"Unexpected VTs\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23632, __PRETTY_FUNCTION__));

23633

23634

// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.

23635

SDValue Curr = In;

23636

SDValue SignExt = Curr;

23637

23638

// As SRAI is only available on i16/i32 types, we expand only up to i32

23639

// and handle i64 separately.

23640

if (InVT != MVT::v4i32) {

23641

MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;

23642

23643

unsigned DestWidth = DestVT.getScalarSizeInBits();

23644

unsigned Scale = DestWidth / InSVT.getSizeInBits();

23645

23646

unsigned InNumElts = InVT.getVectorNumElements();

23647

unsigned DestElts = DestVT.getVectorNumElements();

23648

23649

// Build a shuffle mask that takes each input element and places it in the

23650

// MSBs of the new element size.

23651

SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);

23652

for (unsigned i = 0; i != DestElts; ++i)

23653

Mask[i * Scale + (Scale - 1)] = i;

23654

23655

Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);

23656

Curr = DAG.getBitcast(DestVT, Curr);

23657

23658

unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();

23659

SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,

23660

DAG.getTargetConstant(SignExtShift, dl, MVT::i8));

23661

}

23662

23663

if (VT == MVT::v2i64) {

23664

assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT")((Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"
) ? static_cast<void> (0) : __assert_fail ("Curr.getValueType() == MVT::v4i32 && \"Unexpected input VT\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23664, __PRETTY_FUNCTION__));

23665

SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);

23666

SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);

23667

SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});

23668

SignExt = DAG.getBitcast(VT, SignExt);

23669

}

23670

23671

return SignExt;

23672

}

23673

23674

/// Lower a vector sign extension.
///
/// Sources with i1 elements are forwarded to LowerSIGN_EXTEND_Mask. A v32i16
/// result without BWI is split via splitVectorIntUnary. With AVX2 (hasInt256)
/// the node is returned unchanged. Otherwise the input is cut into two halves
/// that are extended separately and concatenated.
///
/// \param Op        The extension node to lower.
/// \param Subtarget Queried for BWI / AVX2 support.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns The lowered value (possibly \p Op itself).
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();

  // Mask sources are lowered separately.
  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  const bool IsV32i16WithoutBWI = VT == MVT::v32i16 && !Subtarget.hasBWI();
  if (IsV32i16WithoutBWI) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG);
  }

  // With AVX2 the node is kept as it is.
  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //   sign extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // The input vector is divided into two parts (for v4i32 the high shuffle
  // mask is {2, 3, -1, -1}), each part is extended with the vpmovsx
  // instruction (v4i32 -> v2i64; v8i16 -> v4i32), and the two results are
  // concatenated back to the original VT.
  SDLoc DL(Op);
  const MVT HalfResVT = VT.getHalfNumVectorElementsVT();
  SDValue LoExt =
      DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, HalfResVT, In);

  const unsigned HalfSrcElts = InVT.getVectorNumElements() / 2;
  SmallVector<int,8> HiMask(InVT.getVectorNumElements(), -1);
  for (unsigned Slot = 0; Slot != HalfSrcElts; ++Slot)
    HiMask[Slot] = Slot + HalfSrcElts;

  SDValue HiSrc = DAG.getVectorShuffle(InVT, DL, In, In, HiMask);
  SDValue HiExt =
      DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, HalfResVT, HiSrc);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoExt, HiExt);
}

23725

23726

/// Change a vector store into a pair of half-size vector stores.

23727

static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {

23728

SDValue StoredVal = Store->getValue();

23729

assert((StoredVal.getValueType().is256BitVector() ||(((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType
().is512BitVector()) && "Expecting 256/512-bit op") ?
static_cast<void> (0) : __assert_fail ("(StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && \"Expecting 256/512-bit op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23731, __PRETTY_FUNCTION__))

23730

StoredVal.getValueType().is512BitVector()) &&(((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType
().is512BitVector()) && "Expecting 256/512-bit op") ?
static_cast<void> (0) : __assert_fail ("(StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && \"Expecting 256/512-bit op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23731, __PRETTY_FUNCTION__))

23731

"Expecting 256/512-bit op")(((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType
().is512BitVector()) && "Expecting 256/512-bit op") ?
static_cast<void> (0) : __assert_fail ("(StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && \"Expecting 256/512-bit op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23731, __PRETTY_FUNCTION__));

23732

23733

// Splitting volatile memory ops is not allowed unless the operation was not

23734

// legal to begin with. Assume the input store is legal (this transform is

23735

// only used for targets with AVX). Note: It is possible that we have an

23736

// illegal type like v2i128, and so we could allow splitting a volatile store

23737

// in that case if that is important.

23738

if (!Store->isSimple())

23739

return SDValue();

23740

23741

SDLoc DL(Store);

23742

SDValue Value0, Value1;

23743

std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);

23744

unsigned HalfOffset = Value0.getValueType().getStoreSize();

23745

SDValue Ptr0 = Store->getBasePtr();

23746

SDValue Ptr1 =

23747

DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);

23748

SDValue Ch0 =

23749

DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),

23750

Store->getOriginalAlign(),

23751

Store->getMemOperand()->getFlags());

23752

SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,

23753

Store->getPointerInfo().getWithOffset(HalfOffset),

23754

Store->getOriginalAlign(),

23755

Store->getMemOperand()->getFlags());

23756

return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);

23757

}

23758

23759

/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar

23760

/// type.

23761

static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,

23762

SelectionDAG &DAG) {

23763

SDValue StoredVal = Store->getValue();

23764

assert(StoreVT.is128BitVector() &&((StoreVT.is128BitVector() && StoredVal.getValueType(
).is128BitVector() && "Expecting 128-bit op") ? static_cast
<void> (0) : __assert_fail ("StoreVT.is128BitVector() && StoredVal.getValueType().is128BitVector() && \"Expecting 128-bit op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23765, __PRETTY_FUNCTION__))

23765

StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op")((StoreVT.is128BitVector() && StoredVal.getValueType(
).is128BitVector() && "Expecting 128-bit op") ? static_cast
<void> (0) : __assert_fail ("StoreVT.is128BitVector() && StoredVal.getValueType().is128BitVector() && \"Expecting 128-bit op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23765, __PRETTY_FUNCTION__));

23766

StoredVal = DAG.getBitcast(StoreVT, StoredVal);

23767

23768

// Splitting volatile memory ops is not allowed unless the operation was not

23769

// legal to begin with. We are assuming the input op is legal (this transform

23770

// is only used for targets with AVX).

23771

if (!Store->isSimple())

23772

return SDValue();

23773

23774

MVT StoreSVT = StoreVT.getScalarType();

23775

unsigned NumElems = StoreVT.getVectorNumElements();

23776

unsigned ScalarSize = StoreSVT.getStoreSize();

23777

23778

SDLoc DL(Store);

23779

SmallVector<SDValue, 4> Stores;

23780

for (unsigned i = 0; i != NumElems; ++i) {

23781

unsigned Offset = i * ScalarSize;

23782

SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),

23783

TypeSize::Fixed(Offset), DL);

23784

SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,

23785

DAG.getIntPtrConstant(i, DL));

23786

SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,

23787

Store->getPointerInfo().getWithOffset(Offset),

23788

Store->getOriginalAlign(),

23789

Store->getMemOperand()->getFlags());

23790

Stores.push_back(Ch);

23791

}

23792

return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);

23793

}

23794

23795

/// Custom lowering for stores.
///
/// Three situations are handled:
///  * vectors of i1 (at most 8 elements) are stored as an i8 scalar;
///  * 256-bit stores (and v32i16 / v64i8 stores without BWI) whose value
///    passes the collectConcatOps check and has a single use are split in
///    two via splitVectorStore;
///  * 64-bit vectors are widened and their low 64 bits are stored.
///
/// \param Op        The store node to lower.
/// \param Subtarget Queried for the available vector features.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns The replacement store, or an empty SDValue for truncating stores
/// and for wide stores that are not split.
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
  SDLoc DL(St);
  SDValue StoredVal = St->getValue();

  // Emits a store of Val that reuses the chain, address, pointer info,
  // original alignment and flags of the store being lowered.
  auto EmitStoreOf = [&](SDValue Val) {
    return DAG.getStore(St->getChain(), DL, Val, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  };

  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
  const EVT OrigVT = StoredVal.getValueType();
  if (OrigVT.isVector() && OrigVT.getVectorElementType() == MVT::i1) {
    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
           "Unexpected VT");
    assert(!St->isTruncatingStore() && "Expected non-truncating store");
    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
           "Expected AVX512F without AVX512DQI");

    // Place the mask in the low part of a v16i1, reinterpret that as i16 and
    // keep only the low 8 bits.
    SDValue UndefMask = DAG.getUNDEF(MVT::v16i1);
    SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, UndefMask,
                            StoredVal, ZeroIdx);
    StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
    StoredVal = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, StoredVal);

    return EmitStoreOf(StoredVal);
  }

  if (St->isTruncatingStore())
    return SDValue();

  // If this is a 256-bit store of concatenated ops, we are better off splitting
  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
  // and each half can execute independently. Some cores would split the op into
  // halves anyway, so the concat (vinsertf128) is purely an extra op.
  MVT StoreVT = StoredVal.getSimpleValueType();
  const bool Is512WithoutBWI =
      (StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
      !Subtarget.hasBWI();
  if (StoreVT.is256BitVector() || Is512WithoutBWI) {
    SmallVector<SDValue, 4> CatOps;
    const bool ShouldSplit =
        StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps);
    return ShouldSplit ? splitVectorStore(St, DAG) : SDValue();
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
         "Unexpected VT");
  assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
             TargetLowering::TypeWidenVector && "Unexpected type action!");

  // Widen the 64-bit vector by concatenating it with undef.
  EVT WidenedVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
  SDValue UndefHalf = DAG.getUNDEF(StoreVT);
  StoredVal =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, WidenedVT, StoredVal, UndefHalf);

  if (Subtarget.hasSSE2()) {
    // Cast the widened vector to a v2x64 type, extract the single 64-bit
    // element and store it.
    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
    MVT TwoEltVT = MVT::getVectorVT(StVT, 2);
    StoredVal = DAG.getBitcast(TwoEltVT, StoredVal);
    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StVT, StoredVal,
                            DAG.getIntPtrConstant(0, DL));

    return EmitStoreOf(StoredVal);
  }

  // SSE1-only targets use the VEXTRACT_STORE memory intrinsic node instead.
  assert(Subtarget.hasSSE1() && "Expected SSE");
  SDVTList ResultTys = DAG.getVTList(MVT::Other);
  SDValue ExtractStoreOps[] = {St->getChain(), StoredVal, St->getBasePtr()};
  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, DL, ResultTys,
                                 ExtractStoreOps, MVT::i64,
                                 St->getMemOperand());
}

23867

23868

// Lower vector extended loads using a shuffle. If SSSE3 is not available we

23869

// may emit an illegal shuffle but the expansion is still better than scalar

23870

// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise

23871

// we'll emit a shuffle and a arithmetic shift.

23872

// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.

23873

// TODO: It is possible to support ZExt by zeroing the undef values during

23874

// the shuffle phase or after the shuffle.

23875

static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,

23876

SelectionDAG &DAG) {

23877

MVT RegVT = Op.getSimpleValueType();

23878

assert(RegVT.isVector() && "We only custom lower vector loads.")((RegVT.isVector() && "We only custom lower vector loads."
) ? static_cast<void> (0) : __assert_fail ("RegVT.isVector() && \"We only custom lower vector loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23878, __PRETTY_FUNCTION__));

23879

assert(RegVT.isInteger() &&((RegVT.isInteger() && "We only custom lower integer vector loads."
) ? static_cast<void> (0) : __assert_fail ("RegVT.isInteger() && \"We only custom lower integer vector loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23880, __PRETTY_FUNCTION__))

23880

"We only custom lower integer vector loads.")((RegVT.isInteger() && "We only custom lower integer vector loads."
) ? static_cast<void> (0) : __assert_fail ("RegVT.isInteger() && \"We only custom lower integer vector loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23880, __PRETTY_FUNCTION__));

23881

23882

LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());

23883

SDLoc dl(Ld);

23884

23885

// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.

23886

if (RegVT.getVectorElementType() == MVT::i1) {

23887

assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load")((EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"
) ? static_cast<void> (0) : __assert_fail ("EVT(RegVT) == Ld->getMemoryVT() && \"Expected non-extending load\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23887, __PRETTY_FUNCTION__));

23888

assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT")((RegVT.getVectorNumElements() <= 8 && "Unexpected VT"
) ? static_cast<void> (0) : __assert_fail ("RegVT.getVectorNumElements() <= 8 && \"Unexpected VT\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23888, __PRETTY_FUNCTION__));

23889

assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&((Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI") ? static_cast<void>
(0) : __assert_fail ("Subtarget.hasAVX512() && !Subtarget.hasDQI() && \"Expected AVX512F without AVX512DQI\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23890, __PRETTY_FUNCTION__))

23890

"Expected AVX512F without AVX512DQI")((Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI") ? static_cast<void>
(0) : __assert_fail ("Subtarget.hasAVX512() && !Subtarget.hasDQI() && \"Expected AVX512F without AVX512DQI\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23890, __PRETTY_FUNCTION__));

23891

23892

SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),

23893

Ld->getPointerInfo(), Ld->getOriginalAlign(),

23894

Ld->getMemOperand()->getFlags());

23895

23896

// Replace chain users with the new chain.

23897

assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!")((NewLd->getNumValues() == 2 && "Loads must carry a chain!"
) ? static_cast<void> (0) : __assert_fail ("NewLd->getNumValues() == 2 && \"Loads must carry a chain!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23897, __PRETTY_FUNCTION__));

23898

23899

SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);

23900

Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,

23901

DAG.getBitcast(MVT::v16i1, Val),

23902

DAG.getIntPtrConstant(0, dl));

23903

return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);

23904

}

23905

23906

return SDValue();

23907

}

23908

23909

/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes

23910

/// each of which has no other use apart from the AND / OR.

23911

static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {

23912

Opc = Op.getOpcode();

23913

if (Opc != ISD::OR && Opc != ISD::AND)

23914

return false;

23915

return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&

23916

Op.getOperand(0).hasOneUse() &&

23917

Op.getOperand(1).getOpcode() == X86ISD::SETCC &&

23918

Op.getOperand(1).hasOneUse());

23919

}

23920

23921

/// Lower a conditional branch (operands: chain, condition, destination) into
/// one or two X86ISD::BRCOND nodes.
///
/// Handled condition shapes, in order:
///  * ISD::SETCC (non-f128 operands):
///      - eq/ne comparison of an overflow-intrinsic result against 0 or 1,
///      - integer comparisons (via emitFlagsForSetcc),
///      - FP SETOEQ / SETUNE, emitted as two branches on one X86ISD::FCMP,
///      - any other FP condition code (via TranslateX86CC).
///  * a bare overflow-intrinsic result,
///  * anything else: (Cond & 1) != 0.
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Both helpers read Chain/Dest at call time, so updates made below (the
  // SETOEQ successor swap, chaining of the first of two branches) are seen.
  auto CondCodeConstant = [&](X86::CondCode Code) {
    return DAG.getTargetConstant(Code, dl, MVT::i8);
  };
  auto BranchOn = [&](SDValue CCVal, SDValue Flags) {
    return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Flags);
  };
  // Branch to Dest if the FP compare says "not equal" or "unordered"
  // (COND_NE, then COND_P), sharing a single FCMP node.
  auto BranchIfNotEqualOrUnordered = [&](SDValue LHS, SDValue RHS) {
    SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
    Chain = BranchOn(CondCodeConstant(X86::COND_NE), Cmp);
    return BranchOn(CondCodeConstant(X86::COND_P), Cmp);
  };

  const bool IsNonF128SetCC =
      Cond.getOpcode() == ISD::SETCC &&
      Cond.getOperand(0).getValueType() != MVT::f128;
  if (IsNonF128SetCC) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Special case for
    //   setcc([su]{add,sub,mul}o == 0)
    //   setcc([su]{add,sub,mul}o != 1)
    const bool IsEqualityCC = CC == ISD::SETEQ || CC == ISD::SETNE;
    if (ISD::isOverflowIntrOpRes(LHS) && IsEqualityCC &&
        (isNullConstant(RHS) || isOneConstant(RHS))) {
      SDValue Value, Overflow;
      X86::CondCode X86Cond;
      std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);

      // "== 0" and "!= 1" both mean "no overflow": invert the condition.
      if ((CC == ISD::SETEQ) == isNullConstant(RHS))
        X86Cond = X86::GetOppositeBranchCondition(X86Cond);

      return BranchOn(CondCodeConstant(X86Cond), Overflow);
    }

    if (LHS.getSimpleValueType().isInteger()) {
      SDValue CCVal;
      SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
      return BranchOn(CCVal, EFLAGS);
    }

    switch (CC) {
    case ISD::SETOEQ: {
      // For FCMP_OEQ, we can emit two branches instead of an explicit AND
      // instruction with a separate test. However, we only do this if this
      // block doesn't have a fall-through edge, because this requires an
      // explicit jmp when the condition is false.
      SDNode *User =
          Op.getNode()->hasOneUse() ? *Op.getNode()->use_begin() : nullptr;

      // Look for an unconditional branch following this conditional branch.
      // We need this because we need to reverse the successors in order to
      // implement FCMP_OEQ.
      if (User && User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User);
        (void)NewBR;
        Dest = FalseBB;

        return BranchIfNotEqualOrUnordered(LHS, RHS);
      }
      // Otherwise fall out to the generic lowering below.
      break;
    }
    case ISD::SETUNE:
      // For FCMP_UNE, we can emit two branches instead of an explicit OR
      // instruction with a separate test.
      return BranchIfNotEqualOrUnordered(LHS, RHS);
    default: {
      X86::CondCode X86Cond =
          TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
      SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
      return BranchOn(CondCodeConstant(X86Cond), Cmp);
    }
    }
  }

  if (ISD::isOverflowIntrOpRes(Cond)) {
    SDValue Value, Overflow;
    X86::CondCode X86Cond;
    std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

    return BranchOn(CondCodeConstant(X86Cond), Overflow);
  }

  // Look past the truncate if the high bits are known zero.
  if (isTruncWithZeroHighBitsInput(Cond, DAG))
    Cond = Cond.getOperand(0);

  EVT CondVT = Cond.getValueType();

  // Add an AND with 1 if we don't already have one.
  if (Cond.getOpcode() != ISD::AND || !isOneConstant(Cond.getOperand(1)))
    Cond =
        DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));

  // Branch when the masked condition is non-zero.
  SDValue Zero = DAG.getConstant(0, dl, CondVT);
  SDValue CCVal;
  SDValue EFLAGS = emitFlagsForSetcc(Cond, Zero, ISD::SETNE, dl, DAG, CCVal);
  return BranchOn(CCVal, EFLAGS);
}

24037

24038

// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.

24039

// Calls to _alloca are needed to probe the stack when allocating more than 4k

24040

// bytes in one go. Touching the stack at 4K increments is necessary to ensure

24041

// that the guard pages used by the OS virtual memory manager are allocated in

24042

// correct sequence.

24043

SDValue

24044

X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

24045

SelectionDAG &DAG) const {

24046

MachineFunction &MF = DAG.getMachineFunction();

24047

bool SplitStack = MF.shouldSplitStack();

24048

bool EmitStackProbeCall = hasStackProbeSymbol(MF);

24049

bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||

24050

SplitStack || EmitStackProbeCall;

24051

SDLoc dl(Op);

24052

24053

// Get the inputs.

24054

SDNode *Node = Op.getNode();

24055

SDValue Chain = Op.getOperand(0);

24056

SDValue Size = Op.getOperand(1);

24057

MaybeAlign Alignment(Op.getConstantOperandVal(2));

24058

EVT VT = Node->getValueType(0);

24059

24060

// Chain the dynamic stack allocation so that it doesn't modify the stack

24061

// pointer when other instructions are using the stack.

24062

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

24063

24064

bool Is64Bit = Subtarget.is64Bit();

24065

MVT SPTy = getPointerTy(DAG.getDataLayout());

24066

24067

SDValue Result;

24068

if (!Lower) {

24069

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

24070

unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();

24071

assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"((SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? static_cast
<void> (0) : __assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24072, __PRETTY_FUNCTION__))

24072

" not tell us which reg is the stack pointer!")((SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? static_cast
<void> (0) : __assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24072, __PRETTY_FUNCTION__));

24073

24074

const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

24075

const Align StackAlign = TFI.getStackAlign();

24076

if (hasInlineStackProbe(MF)) {

24077

MachineRegisterInfo &MRI = MF.getRegInfo();

24078

24079

const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);

24080

Register Vreg = MRI.createVirtualRegister(AddrRegClass);

24081

Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);

24082

Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,

24083

DAG.getRegister(Vreg, SPTy));

24084

} else {

24085

SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);

24086

Chain = SP.getValue(1);

24087

Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value

24088

}

24089

if (Alignment && *Alignment > StackAlign)

24090

Result =

24091

DAG.getNode(ISD::AND, dl, VT, Result,

24092

DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));

24093

Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain

24094

} else if (SplitStack) {

24095

MachineRegisterInfo &MRI = MF.getRegInfo();

24096

24097

if (Is64Bit) {

24098

// The 64 bit implementation of segmented stacks needs to clobber both r10

24099

// r11. This makes it impossible to use it along with nested parameters.

24100

const Function &F = MF.getFunction();

24101

for (const auto &A : F.args()) {

24102

if (A.hasNestAttr())

24103

report_fatal_error("Cannot use segmented stacks with functions that "

24104

"have nested arguments.");

24105

}

24106

}

24107

24108

const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);

24109

Register Vreg = MRI.createVirtualRegister(AddrRegClass);

24110

Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);

24111

Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,

24112

DAG.getRegister(Vreg, SPTy));

24113

} else {

24114

SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

24115

Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);

24116

MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

24117

24118

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

24119

Register SPReg = RegInfo->getStackRegister();

24120

SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);

24121

Chain = SP.getValue(1);

24122

24123

if (Alignment) {

24124

SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),

24125

DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));

24126

Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);

24127

}

24128

24129

Result = SP;

24130

}

24131

24132

Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),

24133

DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

24134

24135

SDValue Ops[2] = {Result, Chain};

24136

return DAG.getMergeValues(Ops, dl);

24137

}

24138

24139

/// Lower va_start (operands: chain, va_list pointer, source value).
///
/// On 32-bit targets and for Win64 calling conventions the va_list is a
/// single pointer: the address of the VarArgsFrameIndex slot is stored to it.
/// Otherwise the four fields of __va_list_tag are initialised with four
/// independent stores that are joined by a TokenFactor.
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SDValue Chain = Op.getOperand(0);
  SDValue VAListPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  const bool IsPointerVAList =
      !Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  if (IsPointerVAList) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Chain, DL, FR, VAListPtr, MachinePointerInfo(SV));
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  const bool IsLP64 = Subtarget.isTarget64BitLP64();

  // gp_offset, at byte offset 0.
  SDValue FieldAddr = VAListPtr;
  SDValue GPOffset =
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32);
  SDValue GPStore =
      DAG.getStore(Chain, DL, GPOffset, FieldAddr, MachinePointerInfo(SV));

  // fp_offset, at byte offset 4.
  FieldAddr = DAG.getMemBasePlusOffset(FieldAddr, TypeSize::Fixed(4), DL);
  SDValue FPOffset =
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32);
  SDValue FPStore =
      DAG.getStore(Chain, DL, FPOffset, FieldAddr, MachinePointerInfo(SV, 4));

  // Pointer to overflow_arg_area, at byte offset 8.
  FieldAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  SDValue OverflowStore =
      DAG.getStore(Chain, DL, OVFIN, FieldAddr, MachinePointerInfo(SV, 8));

  // Pointer to reg_save_area: one pointer further on, i.e. byte offset 16
  // for LP64 and 12 otherwise.
  FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
                          DAG.getIntPtrConstant(IsLP64 ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  SDValue RegSaveStore = DAG.getStore(
      Chain, DL, RSFIN, FieldAddr, MachinePointerInfo(SV, IsLP64 ? 16 : 12));

  SDValue MemOps[] = {GPStore, FPStore, OverflowStore, RegSaveStore};
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}

24195

24196

/// Lower va_arg (operands: chain, va_list pointer, source value, alignment)
/// for 64-bit targets.
///
/// Win64 calling conventions defer to the generic expansion. Otherwise an
/// X86ISD::VAARG_64 node is emitted, which produces the argument's address
/// and a chain; the argument itself is then loaded from that address.
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());
  }

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned ArgAlign = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
  uint8_t ArgMode;
  if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.

    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  } else {
    assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
           "Unhandled argument type in LowerVAARG");
    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
  }

  // Insert VAARG_64 node into the DAG
  // VAARG_64 returns two values: Variable Argument Address, Chain
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(ArgAlign, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(
      X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
      /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, VAARG.getValue(1), VAARG,
                     MachinePointerInfo());
}

24250

24251

/// Lower va_copy (operands: chain, destination va_list pointer, source
/// va_list pointer, destination source value, source source value) for
/// 64-bit targets.
///
/// Win64 calling conventions defer to the generic expansion. Otherwise the
/// whole va_list structure is copied with a memcpy.
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
        DAG.getMachineFunction().getFunction().getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  // The struct's size and alignment depend on the pointer width. With 64-bit
  // pointers (LP64) it is 4 + 4 + 8 + 8 = 24 bytes, 8-byte aligned. With
  // 32-bit pointers it is 4 + 4 + 4 + 4 = 16 bytes, 4-byte aligned, which
  // matches LowerVASTART placing reg_save_area at offset 12 in that case.
  // Copying 24 bytes unconditionally would run past the end of the smaller
  // va_list and claim more alignment than it has.
  const bool IsLP64 = Subtarget.isTarget64BitLP64();
  const uint64_t VAListSize = IsLP64 ? 24 : 16;
  const Align VAListAlign(IsLP64 ? 8 : 4);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(VAListSize, DL), VAListAlign,
                       /*isVolatile*/ false, false, false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}

24272

24273

// Helper to get immediate/variable SSE shift opcode from other shift opcodes.

24274

static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {

24275

switch (Opc) {

24276

case ISD::SHL:

24277

case X86ISD::VSHL:

24278

case X86ISD::VSHLI:

24279

return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;

24280

case ISD::SRL:

24281

case X86ISD::VSRL:

24282

case X86ISD::VSRLI:

24283

return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;

24284

case ISD::SRA:

24285

case X86ISD::VSRA:

24286

case X86ISD::VSRAI:

24287

return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;

24288

}

24289

llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24289);

24290

}

24291

24292

/// Handle vector element shifts where the shift amount is a constant.

24293

/// Takes immediate version of shift as input.

24294

static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,

24295

SDValue SrcOp, uint64_t ShiftAmt,

24296

SelectionDAG &DAG) {

24297

MVT ElementType = VT.getVectorElementType();

24298

24299

// Bitcast the source vector to the output type, this is mainly necessary for

24300

// vXi8/vXi64 shifts.

24301

if (VT != SrcOp.getSimpleValueType())

24302

SrcOp = DAG.getBitcast(VT, SrcOp);

24303

24304

// Fold this packed shift into its first operand if ShiftAmt is 0.

24305

if (ShiftAmt == 0)

24306

return SrcOp;

24307

24308

// Check for ShiftAmt >= element width

24309

if (ShiftAmt >= ElementType.getSizeInBits()) {

24310

if (Opc == X86ISD::VSRAI)

24311

ShiftAmt = ElementType.getSizeInBits() - 1;

24312

else

24313

return DAG.getConstant(0, dl, VT);

24314

}

24315

24316

assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)(((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD
::VSRAI) && "Unknown target vector shift-by-constant node"
) ? static_cast<void> (0) : __assert_fail ("(Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && \"Unknown target vector shift-by-constant node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24317, __PRETTY_FUNCTION__))

24317

&& "Unknown target vector shift-by-constant node")(((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD
::VSRAI) && "Unknown target vector shift-by-constant node"
) ? static_cast<void> (0) : __assert_fail ("(Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && \"Unknown target vector shift-by-constant node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24317, __PRETTY_FUNCTION__));

24318

24319

// Fold this packed vector shift into a build vector if SrcOp is a

24320

// vector of Constants or UNDEFs.

24321

if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {

24322

SmallVector<SDValue, 8> Elts;

24323

unsigned NumElts = SrcOp->getNumOperands();

24324

24325

switch (Opc) {

24326

default: llvm_unreachable("Unknown opcode!")::llvm::llvm_unreachable_internal("Unknown opcode!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24326);

24327

case X86ISD::VSHLI:

24328

for (unsigned i = 0; i != NumElts; ++i) {

24329

SDValue CurrentOp = SrcOp->getOperand(i);

24330

if (CurrentOp->isUndef()) {

24331

// Must produce 0s in the correct bits.

24332

Elts.push_back(DAG.getConstant(0, dl, ElementType));

24333

continue;

24334

}

24335

auto *ND = cast<ConstantSDNode>(CurrentOp);

24336

const APInt &C = ND->getAPIntValue();

24337

Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));

24338

}

24339

break;

24340

case X86ISD::VSRLI:

24341

for (unsigned i = 0; i != NumElts; ++i) {

24342

SDValue CurrentOp = SrcOp->getOperand(i);

24343

if (CurrentOp->isUndef()) {

24344

// Must produce 0s in the correct bits.

24345

Elts.push_back(DAG.getConstant(0, dl, ElementType));

24346

continue;

24347

}

24348

auto *ND = cast<ConstantSDNode>(CurrentOp);

24349

const APInt &C = ND->getAPIntValue();

24350

Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));

24351

}

24352

break;

24353

case X86ISD::VSRAI:

24354

for (unsigned i = 0; i != NumElts; ++i) {

24355

SDValue CurrentOp = SrcOp->getOperand(i);

24356

if (CurrentOp->isUndef()) {

24357

// All shifted in bits must be the same so use 0.

24358

Elts.push_back(DAG.getConstant(0, dl, ElementType));

24359

continue;

24360

}

24361

auto *ND = cast<ConstantSDNode>(CurrentOp);

24362

const APInt &C = ND->getAPIntValue();

24363

Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));

24364

}

24365

break;

24366

}

24367

24368

return DAG.getBuildVector(VT, dl, Elts);

24369

}

24370

24371

return DAG.getNode(Opc, dl, VT, SrcOp,

24372

DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));

24373

}

24374

24375

/// Handle vector element shifts where the shift amount may or may not be a

24376

/// constant. Takes immediate version of shift as input.

24377

static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,

24378

SDValue SrcOp, SDValue ShAmt,

24379

const X86Subtarget &Subtarget,

24380

SelectionDAG &DAG) {

24381

MVT SVT = ShAmt.getSimpleValueType();

24382

assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!")(((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"
) ? static_cast<void> (0) : __assert_fail ("(SVT == MVT::i32 || SVT == MVT::i64) && \"Unexpected value type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24382, __PRETTY_FUNCTION__));

24383

24384

// Catch shift-by-constant.

24385

if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))

24386

return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,

24387

CShAmt->getZExtValue(), DAG);

24388

24389

// Change opcode to non-immediate version.

24390

Opc = getTargetVShiftUniformOpcode(Opc, true);

24391

24392

// Need to build a vector containing shift amount.

24393

// SSE/AVX packed shifts only use the lower 64-bit of the shift count.

24394

// +====================+============+=======================================+

24395

// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |

24396

// +====================+============+=======================================+

24397

// | i64 | Yes, No | Use ShAmt as lowest elt |

24398

// | i32 | Yes | zero-extend in-reg |

24399

// | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |

24400

// | (i32 zext(i16/i8)) | No | byte-shift-in-reg |

24401

// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |

24402

// +====================+============+=======================================+

24403

24404

if (SVT == MVT::i64)

24405

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);

24406

else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&

24407

ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

24408

(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||

24409

ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {

24410

ShAmt = ShAmt.getOperand(0);

24411

MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;

24412

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);

24413

if (Subtarget.hasSSE41())

24414

ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),

24415

MVT::v2i64, ShAmt);

24416

else {

24417

SDValue ByteShift = DAG.getTargetConstant(

24418

(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);

24419

ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);

24420

ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,

24421

ByteShift);

24422

ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,

24423

ByteShift);

24424

}

24425

} else if (Subtarget.hasSSE41() &&

24426

ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

24427

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);

24428

ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),

24429

MVT::v2i64, ShAmt);

24430

} else {

24431

SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),

24432

DAG.getUNDEF(SVT)};

24433

ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);

24434

}

24435

24436

// The return type has to be a 128-bit type with the same element

24437

// type as the input type.

24438

MVT EltVT = VT.getVectorElementType();

24439

MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

24440

24441

ShAmt = DAG.getBitcast(ShVT, ShAmt);

24442

return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);

24443

}

24444

24445

/// Return Mask with the necessary casting or extending

24446

/// for \p Mask according to \p MaskVT when lowering masking intrinsics

24447

static SDValue getMaskNode(SDValue Mask, MVT MaskVT,

24448

const X86Subtarget &Subtarget, SelectionDAG &DAG,

24449

const SDLoc &dl) {

24450

24451

if (isAllOnesConstant(Mask))

24452

return DAG.getConstant(1, dl, MaskVT);

24453

if (X86::isZeroNode(Mask))

24454

return DAG.getConstant(0, dl, MaskVT);

24455

24456

assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!")((MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!"
) ? static_cast<void> (0) : __assert_fail ("MaskVT.bitsLE(Mask.getSimpleValueType()) && \"Unexpected mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24456, __PRETTY_FUNCTION__));

24457

24458

if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {

24459

assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!")((MaskVT == MVT::v64i1 && "Expected v64i1 mask!") ? static_cast
<void> (0) : __assert_fail ("MaskVT == MVT::v64i1 && \"Expected v64i1 mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24459, __PRETTY_FUNCTION__));

24460

assert(Subtarget.hasBWI() && "Expected AVX512BW target!")((Subtarget.hasBWI() && "Expected AVX512BW target!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"Expected AVX512BW target!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24460, __PRETTY_FUNCTION__));

24461

// In case 32bit mode, bitcast i64 is illegal, extend/split it.

24462

SDValue Lo, Hi;

24463

Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,

24464

DAG.getConstant(0, dl, MVT::i32));

24465

Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,

24466

DAG.getConstant(1, dl, MVT::i32));

24467

24468

Lo = DAG.getBitcast(MVT::v32i1, Lo);

24469

Hi = DAG.getBitcast(MVT::v32i1, Hi);

24470

24471

return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);

24472

} else {

24473

MVT BitcastVT = MVT::getVectorVT(MVT::i1,

24474

Mask.getSimpleValueType().getSizeInBits());

24475

// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements

24476

// are extracted by EXTRACT_SUBVECTOR.

24477

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,

24478

DAG.getBitcast(BitcastVT, Mask),

24479

DAG.getIntPtrConstant(0, dl));

24480

}

24481

}

24482

24483

/// Return (and \p Op, \p Mask) for compare instructions or

24484

/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the

24485

/// necessary casting or extending for \p Mask when lowering masking intrinsics

24486

static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,

24487

SDValue PreservedSrc,

24488

const X86Subtarget &Subtarget,

24489

SelectionDAG &DAG) {

24490

MVT VT = Op.getSimpleValueType();

24491

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

24492

unsigned OpcodeSelect = ISD::VSELECT;

24493

SDLoc dl(Op);

24494

24495

if (isAllOnesConstant(Mask))

24496

return Op;

24497

24498

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

24499

24500

if (PreservedSrc.isUndef())

24501

PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);

24502

return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);

24503

}

24504

24505

/// Creates an SDNode for a predicated scalar operation.

24506

/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).

24507

/// The mask is coming as MVT::i8 and it should be transformed

24508

/// to MVT::v1i1 while lowering masking intrinsics.

24509

/// The main difference between ScalarMaskingNode and VectorMaskingNode is using

24510

/// "X86select" instead of "vselect". We just can't create the "vselect" node

24511

/// for a scalar instruction.

24512

static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,

24513

SDValue PreservedSrc,

24514

const X86Subtarget &Subtarget,

24515

SelectionDAG &DAG) {

24516

24517

if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))

24518

if (MaskConst->getZExtValue() & 0x1)

24519

return Op;

24520

24521

MVT VT = Op.getSimpleValueType();

24522

SDLoc dl(Op);

24523

24524

assert(Mask.getValueType() == MVT::i8 && "Unexpect type")((Mask.getValueType() == MVT::i8 && "Unexpect type") ?
static_cast<void> (0) : __assert_fail ("Mask.getValueType() == MVT::i8 && \"Unexpect type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24524, __PRETTY_FUNCTION__));

24525

SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,

24526

DAG.getBitcast(MVT::v8i1, Mask),

24527

DAG.getIntPtrConstant(0, dl));

24528

if (Op.getOpcode() == X86ISD::FSETCCM ||

24529

Op.getOpcode() == X86ISD::FSETCCM_SAE ||

24530

Op.getOpcode() == X86ISD::VFPCLASSS)

24531

return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

24532

24533

if (PreservedSrc.isUndef())

24534

PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);

24535

return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);

24536

}

24537

24538

static int getSEHRegistrationNodeSize(const Function *Fn) {

24539

if (!Fn->hasPersonalityFn())

24540

report_fatal_error(

24541

"querying registration node size for function without personality");

24542

// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See

24543

// WinEHStatePass for the full struct definition.

24544

switch (classifyEHPersonality(Fn->getPersonalityFn())) {

24545

case EHPersonality::MSVC_X86SEH: return 24;

24546

case EHPersonality::MSVC_CXX: return 16;

24547

default: break;

24548

}

24549

report_fatal_error(

24550

"can only recover FP for 32-bit MSVC EH personality functions");

24551

}

24552

24553

/// When the MSVC runtime transfers control to us, either to an outlined

24554

/// function or when returning to a parent frame after catching an exception, we

24555

/// recover the parent frame pointer by doing arithmetic on the incoming EBP.

24556

/// Here's the math:

24557

/// RegNodeBase = EntryEBP - RegNodeSize

24558

/// ParentFP = RegNodeBase - ParentFrameOffset

24559

/// Subtracting RegNodeSize takes us to the offset of the registration node, and

24560

/// subtracting the offset (negative on x86) takes us back to the parent FP.

24561

static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,

24562

SDValue EntryEBP) {

24563

MachineFunction &MF = DAG.getMachineFunction();

24564

SDLoc dl;

24565

24566

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

24567

MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

24568

24569

// It's possible that the parent function no longer has a personality function

24570

// if the exceptional code was optimized away, in which case we just return

24571

// the incoming EBP.

24572

if (!Fn->hasPersonalityFn())

24573

return EntryEBP;

24574

24575

// Get an MCSymbol that will ultimately resolve to the frame offset of the EH

24576

// registration, or the .set_setframe offset.

24577

MCSymbol *OffsetSym =

24578

MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(

24579

GlobalValue::dropLLVMManglingEscape(Fn->getName()));

24580

SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);

24581

SDValue ParentFrameOffset =

24582

DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

24583

24584

// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after

24585

// prologue to RBP in the parent function.

24586

const X86Subtarget &Subtarget =

24587

static_cast<const X86Subtarget &>(DAG.getSubtarget());

24588

if (Subtarget.is64Bit())

24589

return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

24590

24591

int RegNodeSize = getSEHRegistrationNodeSize(Fn);

24592

// RegNodeBase = EntryEBP - RegNodeSize

24593

// ParentFP = RegNodeBase - ParentFrameOffset

24594

SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,

24595

DAG.getConstant(RegNodeSize, dl, PtrVT));

24596

return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);

24597

}

24598

24599

SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,

24600

SelectionDAG &DAG) const {

24601

// Helper to detect if the operand is CUR_DIRECTION rounding mode.

24602

auto isRoundModeCurDirection = [](SDValue Rnd) {

24603

if (auto *C = dyn_cast<ConstantSDNode>(Rnd))

24604

return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;

24605

24606

return false;

24607

};

24608

auto isRoundModeSAE = [](SDValue Rnd) {

24609

if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {

24610

unsigned RC = C->getZExtValue();

24611

if (RC & X86::STATIC_ROUNDING::NO_EXC) {

24612

// Clear the NO_EXC bit and check remaining bits.

24613

RC ^= X86::STATIC_ROUNDING::NO_EXC;

24614

// As a convenience we allow no other bits or explicitly

24615

// current direction.

24616

return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;

24617

}

24618

}

24619

24620

return false;

24621

};

24622

auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {

24623

if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {

24624

RC = C->getZExtValue();

24625

if (RC & X86::STATIC_ROUNDING::NO_EXC) {

24626

// Clear the NO_EXC bit and check remaining bits.

24627

RC ^= X86::STATIC_ROUNDING::NO_EXC;

24628

return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||

24629

RC == X86::STATIC_ROUNDING::TO_NEG_INF ||

24630

RC == X86::STATIC_ROUNDING::TO_POS_INF ||

24631

RC == X86::STATIC_ROUNDING::TO_ZERO;

24632

}

24633

}

24634

24635

return false;

24636

};

24637

24638

SDLoc dl(Op);

24639

unsigned IntNo = Op.getConstantOperandVal(0);

24640

MVT VT = Op.getSimpleValueType();

24641

const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);

24642

24643

if (IntrData) {

24644

switch(IntrData->Type) {

24645

case INTR_TYPE_1OP: {

24646

// We specify 2 possible opcodes for intrinsics with rounding modes.

24647

// First, we check if the intrinsic may have non-default rounding mode,

24648

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

24649

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

24650

if (IntrWithRoundingModeOpcode != 0) {

24651

SDValue Rnd = Op.getOperand(2);

24652

unsigned RC = 0;

24653

if (isRoundModeSAEToX(Rnd, RC))

24654

return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),

24655

Op.getOperand(1),

24656

DAG.getTargetConstant(RC, dl, MVT::i32));

24657

if (!isRoundModeCurDirection(Rnd))

24658

return SDValue();

24659

}

24660

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

24661

Op.getOperand(1));

24662

}

24663

case INTR_TYPE_1OP_SAE: {

24664

SDValue Sae = Op.getOperand(2);

24665

24666

unsigned Opc;

24667

if (isRoundModeCurDirection(Sae))

24668

Opc = IntrData->Opc0;

24669

else if (isRoundModeSAE(Sae))

24670

Opc = IntrData->Opc1;

24671

else

24672

return SDValue();

24673

24674

return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));

24675

}

24676

case INTR_TYPE_2OP: {

24677

SDValue Src2 = Op.getOperand(2);

24678

24679

// We specify 2 possible opcodes for intrinsics with rounding modes.

24680

// First, we check if the intrinsic may have non-default rounding mode,

24681

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

24682

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

24683

if (IntrWithRoundingModeOpcode != 0) {

24684

SDValue Rnd = Op.getOperand(3);

24685

unsigned RC = 0;

24686

if (isRoundModeSAEToX(Rnd, RC))

24687

return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),

24688

Op.getOperand(1), Src2,

24689

DAG.getTargetConstant(RC, dl, MVT::i32));

24690

if (!isRoundModeCurDirection(Rnd))

24691

return SDValue();

24692

}

24693

24694

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

24695

Op.getOperand(1), Src2);

24696

}

24697

case INTR_TYPE_2OP_SAE: {

24698

SDValue Sae = Op.getOperand(3);

24699

24700

unsigned Opc;

24701

if (isRoundModeCurDirection(Sae))

24702

Opc = IntrData->Opc0;

24703

else if (isRoundModeSAE(Sae))

24704

Opc = IntrData->Opc1;

24705

else

24706

return SDValue();

24707

24708

return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),

24709

Op.getOperand(2));

24710

}

24711

case INTR_TYPE_3OP:

24712

case INTR_TYPE_3OP_IMM8: {

24713

SDValue Src1 = Op.getOperand(1);

24714

SDValue Src2 = Op.getOperand(2);

24715

SDValue Src3 = Op.getOperand(3);

24716

24717

if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&

24718

Src3.getValueType() != MVT::i8) {

24719

Src3 = DAG.getTargetConstant(

24720

cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);

24721

}

24722

24723

// We specify 2 possible opcodes for intrinsics with rounding modes.

24724

// First, we check if the intrinsic may have non-default rounding mode,

24725

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

24726

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

24727

if (IntrWithRoundingModeOpcode != 0) {

24728

SDValue Rnd = Op.getOperand(4);

24729

unsigned RC = 0;

24730

if (isRoundModeSAEToX(Rnd, RC))

24731

return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),

24732

Src1, Src2, Src3,

24733

DAG.getTargetConstant(RC, dl, MVT::i32));

24734

if (!isRoundModeCurDirection(Rnd))

24735

return SDValue();

24736

}

24737

24738

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

24739

{Src1, Src2, Src3});

24740

}

24741

case INTR_TYPE_4OP_IMM8: {

24742

assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant)((Op.getOperand(4)->getOpcode() == ISD::TargetConstant) ? static_cast
<void> (0) : __assert_fail ("Op.getOperand(4)->getOpcode() == ISD::TargetConstant"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24742, __PRETTY_FUNCTION__));

24743

SDValue Src4 = Op.getOperand(4);

24744

if (Src4.getValueType() != MVT::i8) {

24745

Src4 = DAG.getTargetConstant(

24746

cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);

24747

}

24748

24749

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

24750

Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),

24751

Src4);

24752

}

24753

case INTR_TYPE_1OP_MASK: {

24754

SDValue Src = Op.getOperand(1);

24755

SDValue PassThru = Op.getOperand(2);

24756

SDValue Mask = Op.getOperand(3);

24757

// We add rounding mode to the Node when

24758

// - RC Opcode is specified and

24759

// - RC is not "current direction".

24760

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

24761

if (IntrWithRoundingModeOpcode != 0) {

24762

SDValue Rnd = Op.getOperand(4);

24763

unsigned RC = 0;

24764

if (isRoundModeSAEToX(Rnd, RC))

24765

return getVectorMaskingNode(

24766

DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),

24767

Src, DAG.getTargetConstant(RC, dl, MVT::i32)),

24768

Mask, PassThru, Subtarget, DAG);

24769

if (!isRoundModeCurDirection(Rnd))

24770

return SDValue();

24771

}

24772

return getVectorMaskingNode(

24773

DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,

24774

Subtarget, DAG);

24775

}

24776

case INTR_TYPE_1OP_MASK_SAE: {

24777

SDValue Src = Op.getOperand(1);

24778

SDValue PassThru = Op.getOperand(2);

24779

SDValue Mask = Op.getOperand(3);

24780

SDValue Rnd = Op.getOperand(4);

24781

24782

unsigned Opc;

24783

if (isRoundModeCurDirection(Rnd))

24784

Opc = IntrData->Opc0;

24785

else if (isRoundModeSAE(Rnd))

24786

Opc = IntrData->Opc1;

24787

else

24788

return SDValue();

24789

24790

return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,

24791

Subtarget, DAG);

24792

}

24793

case INTR_TYPE_SCALAR_MASK: {

24794

SDValue Src1 = Op.getOperand(1);

24795

SDValue Src2 = Op.getOperand(2);

24796

SDValue passThru = Op.getOperand(3);

24797

SDValue Mask = Op.getOperand(4);

24798

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

24799

// There are 2 kinds of intrinsics in this group:

24800

// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands

24801

// (2) With rounding mode and sae - 7 operands.

24802

bool HasRounding = IntrWithRoundingModeOpcode != 0;

24803

if (Op.getNumOperands() == (5U + HasRounding)) {

24804

if (HasRounding) {

24805

SDValue Rnd = Op.getOperand(5);

24806

unsigned RC = 0;

24807

if (isRoundModeSAEToX(Rnd, RC))

24808

return getScalarMaskingNode(

24809

DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,

24810

DAG.getTargetConstant(RC, dl, MVT::i32)),

24811

Mask, passThru, Subtarget, DAG);

24812

if (!isRoundModeCurDirection(Rnd))

24813

return SDValue();

24814

}

24815

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,

24816

Src2),

24817

Mask, passThru, Subtarget, DAG);

24818

}

24819

24820

assert(Op.getNumOperands() == (6U + HasRounding) &&((Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"
) ? static_cast<void> (0) : __assert_fail ("Op.getNumOperands() == (6U + HasRounding) && \"Unexpected intrinsic form\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24821, __PRETTY_FUNCTION__))

24821

"Unexpected intrinsic form")((Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"
) ? static_cast<void> (0) : __assert_fail ("Op.getNumOperands() == (6U + HasRounding) && \"Unexpected intrinsic form\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24821, __PRETTY_FUNCTION__));

24822

SDValue RoundingMode = Op.getOperand(5);

24823

unsigned Opc = IntrData->Opc0;

24824

if (HasRounding) {

24825

SDValue Sae = Op.getOperand(6);

24826

if (isRoundModeSAE(Sae))

24827

Opc = IntrWithRoundingModeOpcode;

24828

else if (!isRoundModeCurDirection(Sae))

24829

return SDValue();

24830

}

24831

return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,

24832

Src2, RoundingMode),

24833

Mask, passThru, Subtarget, DAG);

24834

}

24835

case INTR_TYPE_SCALAR_MASK_RND: {

24836

SDValue Src1 = Op.getOperand(1);

24837

SDValue Src2 = Op.getOperand(2);

24838

SDValue passThru = Op.getOperand(3);

24839

SDValue Mask = Op.getOperand(4);

24840

SDValue Rnd = Op.getOperand(5);

24841

24842

SDValue NewOp;

24843

unsigned RC = 0;

24844

if (isRoundModeCurDirection(Rnd))

24845

NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);

24846

else if (isRoundModeSAEToX(Rnd, RC))

24847

NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,

24848

DAG.getTargetConstant(RC, dl, MVT::i32));

24849

else

24850

return SDValue();

24851

24852

return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);

24853

}

24854

case INTR_TYPE_SCALAR_MASK_SAE: {

24855

SDValue Src1 = Op.getOperand(1);

24856

SDValue Src2 = Op.getOperand(2);

24857

SDValue passThru = Op.getOperand(3);

24858

SDValue Mask = Op.getOperand(4);

24859

SDValue Sae = Op.getOperand(5);

24860

unsigned Opc;

24861

if (isRoundModeCurDirection(Sae))

24862

Opc = IntrData->Opc0;

24863

else if (isRoundModeSAE(Sae))

24864

Opc = IntrData->Opc1;

24865

else

24866

return SDValue();

24867

24868

return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),

24869

Mask, passThru, Subtarget, DAG);

24870

}

24871

case INTR_TYPE_2OP_MASK: {

24872

SDValue Src1 = Op.getOperand(1);

24873

SDValue Src2 = Op.getOperand(2);

24874

SDValue PassThru = Op.getOperand(3);

24875

SDValue Mask = Op.getOperand(4);

24876

SDValue NewOp;

24877

if (IntrData->Opc1 != 0) {

24878

SDValue Rnd = Op.getOperand(5);

24879

unsigned RC = 0;

24880

if (isRoundModeSAEToX(Rnd, RC))

24881

NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,

24882

DAG.getTargetConstant(RC, dl, MVT::i32));

24883

else if (!isRoundModeCurDirection(Rnd))

24884

return SDValue();

24885

}

24886

if (!NewOp)

24887

NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);

24888

return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);

24889

}

24890

case INTR_TYPE_2OP_MASK_SAE: {

24891

SDValue Src1 = Op.getOperand(1);

24892

SDValue Src2 = Op.getOperand(2);

24893

SDValue PassThru = Op.getOperand(3);

24894

SDValue Mask = Op.getOperand(4);

24895

24896

unsigned Opc = IntrData->Opc0;

24897

if (IntrData->Opc1 != 0) {

24898

SDValue Sae = Op.getOperand(5);

24899

if (isRoundModeSAE(Sae))

24900

Opc = IntrData->Opc1;

24901

else if (!isRoundModeCurDirection(Sae))

24902

return SDValue();

24903

}

24904

24905

return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),

24906

Mask, PassThru, Subtarget, DAG);

24907

}

24908

case INTR_TYPE_3OP_SCALAR_MASK_SAE: {

24909

SDValue Src1 = Op.getOperand(1);

24910

SDValue Src2 = Op.getOperand(2);

24911

SDValue Src3 = Op.getOperand(3);

24912

SDValue PassThru = Op.getOperand(4);

24913

SDValue Mask = Op.getOperand(5);

24914

SDValue Sae = Op.getOperand(6);

24915

unsigned Opc;

24916

if (isRoundModeCurDirection(Sae))

24917

Opc = IntrData->Opc0;

24918

else if (isRoundModeSAE(Sae))

24919

Opc = IntrData->Opc1;

24920

else

24921

return SDValue();

24922

24923

return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),

24924

Mask, PassThru, Subtarget, DAG);

24925

}

24926

case INTR_TYPE_3OP_MASK_SAE: {

24927

SDValue Src1 = Op.getOperand(1);

24928

SDValue Src2 = Op.getOperand(2);

24929

SDValue Src3 = Op.getOperand(3);

24930

SDValue PassThru = Op.getOperand(4);

24931

SDValue Mask = Op.getOperand(5);

24932

24933

unsigned Opc = IntrData->Opc0;

24934

if (IntrData->Opc1 != 0) {

24935

SDValue Sae = Op.getOperand(6);

24936

if (isRoundModeSAE(Sae))

24937

Opc = IntrData->Opc1;

24938

else if (!isRoundModeCurDirection(Sae))

24939

return SDValue();

24940

}

24941

return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),

24942

Mask, PassThru, Subtarget, DAG);

24943

}

24944

case BLENDV: {

24945

SDValue Src1 = Op.getOperand(1);

24946

SDValue Src2 = Op.getOperand(2);

24947

SDValue Src3 = Op.getOperand(3);

24948

24949

EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();

24950

Src3 = DAG.getBitcast(MaskVT, Src3);

24951

24952

// Reverse the operands to match VSELECT order.

24953

return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);

24954

}

24955

case VPERM_2OP : {

24956

SDValue Src1 = Op.getOperand(1);

24957

SDValue Src2 = Op.getOperand(2);

24958

24959

// Swap Src1 and Src2 in the node creation

24960

return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);

24961

}

24962

case IFMA_OP:

24963

// NOTE: We need to swizzle the operands to pass the multiply operands

24964

// first.

24965

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

24966

Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));

24967

case FPCLASSS: {

24968

SDValue Src1 = Op.getOperand(1);

24969

SDValue Imm = Op.getOperand(2);

24970

SDValue Mask = Op.getOperand(3);

24971

SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);

24972

SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),

24973

Subtarget, DAG);

24974

// Need to fill with zeros to ensure the bitcast will produce zeroes

24975

// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.

24976

SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,

24977

DAG.getConstant(0, dl, MVT::v8i1),

24978

FPclassMask, DAG.getIntPtrConstant(0, dl));

24979

return DAG.getBitcast(MVT::i8, Ins);

24980

}

24981

24982

case CMP_MASK_CC: {

24983

MVT MaskVT = Op.getSimpleValueType();

24984

SDValue CC = Op.getOperand(3);

24985

SDValue Mask = Op.getOperand(4);

24986

// We specify 2 possible opcodes for intrinsics with rounding modes.

24987

// First, we check if the intrinsic may have non-default rounding mode,

24988

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

24989

if (IntrData->Opc1 != 0) {

24990

SDValue Sae = Op.getOperand(5);

24991

if (isRoundModeSAE(Sae))

24992

return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),

24993

Op.getOperand(2), CC, Mask, Sae);

24994

if (!isRoundModeCurDirection(Sae))

24995

return SDValue();

24996

}

24997

//default rounding mode

24998

return DAG.getNode(IntrData->Opc0, dl, MaskVT,

24999

{Op.getOperand(1), Op.getOperand(2), CC, Mask});

25000

}

25001

case CMP_MASK_SCALAR_CC: {

25002

SDValue Src1 = Op.getOperand(1);

25003

SDValue Src2 = Op.getOperand(2);

25004

SDValue CC = Op.getOperand(3);

25005

SDValue Mask = Op.getOperand(4);

25006

25007

SDValue Cmp;

25008

if (IntrData->Opc1 != 0) {

25009

SDValue Sae = Op.getOperand(5);

25010

if (isRoundModeSAE(Sae))

25011

Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);

25012

else if (!isRoundModeCurDirection(Sae))

25013

return SDValue();

25014

}

25015

//default rounding mode

25016

if (!Cmp.getNode())

25017

Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

25018

25019

SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),

25020

Subtarget, DAG);

25021

// Need to fill with zeros to ensure the bitcast will produce zeroes

25022

// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.

25023

SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,

25024

DAG.getConstant(0, dl, MVT::v8i1),

25025

CmpMask, DAG.getIntPtrConstant(0, dl));

25026

return DAG.getBitcast(MVT::i8, Ins);

25027

}

25028

case COMI: { // Comparison intrinsics

25029

ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;

25030

SDValue LHS = Op.getOperand(1);

25031

SDValue RHS = Op.getOperand(2);

25032

// Some conditions require the operands to be swapped.

25033

if (CC == ISD::SETLT || CC == ISD::SETLE)

25034

std::swap(LHS, RHS);

25035

25036

SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);

25037

SDValue SetCC;

25038

switch (CC) {

25039

case ISD::SETEQ: { // (ZF = 0 and PF = 0)

25040

SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);

25041

SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);

25042

SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);

25043

break;

25044

}

25045

case ISD::SETNE: { // (ZF = 1 or PF = 1)

25046

SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);

25047

SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);

25048

SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);

25049

break;

25050

}

25051

case ISD::SETGT: // (CF = 0 and ZF = 0)

25052

case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.

25053

SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);

25054

break;

25055

}

25056

case ISD::SETGE: // CF = 0

25057

case ISD::SETLE: // Condition opposite to GE. Operands swapped above.

25058

SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);

25059

break;

25060

default:

25061

llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25061);

25062

}

25063

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

25064

}

25065

case COMI_RM: { // Comparison intrinsics with Sae

25066

SDValue LHS = Op.getOperand(1);

25067

SDValue RHS = Op.getOperand(2);

25068

unsigned CondVal = Op.getConstantOperandVal(3);

25069

SDValue Sae = Op.getOperand(4);

25070

25071

SDValue FCmp;

25072

if (isRoundModeCurDirection(Sae))

25073

FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,

25074

DAG.getTargetConstant(CondVal, dl, MVT::i8));

25075

else if (isRoundModeSAE(Sae))

25076

FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,

25077

DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);

25078

else

25079

return SDValue();

25080

// Need to fill with zeros to ensure the bitcast will produce zeroes

25081

// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.

25082

SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,

25083

DAG.getConstant(0, dl, MVT::v16i1),

25084

FCmp, DAG.getIntPtrConstant(0, dl));

25085

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,

25086

DAG.getBitcast(MVT::i16, Ins));

25087

}

25088

case VSHIFT:

25089

return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),

25090

Op.getOperand(1), Op.getOperand(2), Subtarget,

25091

DAG);

25092

case COMPRESS_EXPAND_IN_REG: {

25093

SDValue Mask = Op.getOperand(3);

25094

SDValue DataToCompress = Op.getOperand(1);

25095

SDValue PassThru = Op.getOperand(2);

25096

if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is

25097

return Op.getOperand(1);

25098

25099

// Avoid false dependency.

25100

if (PassThru.isUndef())

25101

PassThru = DAG.getConstant(0, dl, VT);

25102

25103

return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,

25104

Mask);

25105

}

25106

case FIXUPIMM:

25107

case FIXUPIMM_MASKZ: {

25108

SDValue Src1 = Op.getOperand(1);

25109

SDValue Src2 = Op.getOperand(2);

25110

SDValue Src3 = Op.getOperand(3);

25111

SDValue Imm = Op.getOperand(4);

25112

SDValue Mask = Op.getOperand(5);

25113

SDValue Passthru = (IntrData->Type == FIXUPIMM)

25114

? Src1

25115

: getZeroVector(VT, Subtarget, DAG, dl);

25116

25117

unsigned Opc = IntrData->Opc0;

25118

if (IntrData->Opc1 != 0) {

25119

SDValue Sae = Op.getOperand(6);

25120

if (isRoundModeSAE(Sae))

25121

Opc = IntrData->Opc1;

25122

else if (!isRoundModeCurDirection(Sae))

25123

return SDValue();

25124

}

25125

25126

SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);

25127

25128

if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)

25129

return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);

25130

25131

return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);

25132

}

25133

case ROUNDP: {

25134

assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode")((IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"
) ? static_cast<void> (0) : __assert_fail ("IntrData->Opc0 == X86ISD::VRNDSCALE && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25134, __PRETTY_FUNCTION__));

25135

// Clear the upper bits of the rounding immediate so that the legacy

25136

// intrinsic can't trigger the scaling behavior of VRNDSCALE.

25137

auto Round = cast<ConstantSDNode>(Op.getOperand(2));

25138

SDValue RoundingMode =

25139

DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);

25140

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

25141

Op.getOperand(1), RoundingMode);

25142

}

25143

case ROUNDS: {

25144

assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode")((IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"
) ? static_cast<void> (0) : __assert_fail ("IntrData->Opc0 == X86ISD::VRNDSCALES && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25144, __PRETTY_FUNCTION__));

25145

// Clear the upper bits of the rounding immediate so that the legacy

25146

// intrinsic can't trigger the scaling behavior of VRNDSCALE.

25147

auto Round = cast<ConstantSDNode>(Op.getOperand(3));

25148

SDValue RoundingMode =

25149

DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);

25150

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

25151

Op.getOperand(1), Op.getOperand(2), RoundingMode);

25152

}

25153

case BEXTRI: {

25154

assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode")((IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode"
) ? static_cast<void> (0) : __assert_fail ("IntrData->Opc0 == X86ISD::BEXTR && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25154, __PRETTY_FUNCTION__));

25155

25156

// The control is a TargetConstant, but we need to convert it to a

25157

// ConstantSDNode.

25158

uint64_t Imm = Op.getConstantOperandVal(2);

25159

SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());

25160

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

25161

Op.getOperand(1), Control);

25162

}

25163

// ADC/ADCX/SBB

25164

case ADX: {

25165

SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

25166

SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);

25167

25168

SDValue Res;

25169

// If the carry in is zero, then we should just use ADD/SUB instead of

25170

// ADC/SBB.

25171

if (isNullConstant(Op.getOperand(1))) {

25172

Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),

25173

Op.getOperand(3));

25174

} else {

25175

SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),

25176

DAG.getConstant(-1, dl, MVT::i8));

25177

Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),

25178

Op.getOperand(3), GenCF.getValue(1));

25179

}

25180

SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);

25181

SDValue Results[] = { SetCC, Res };

25182

return DAG.getMergeValues(Results, dl);

25183

}

25184

case CVTPD2PS_MASK:

25185

case CVTPD2DQ_MASK:

25186

case CVTQQ2PS_MASK:

25187

case TRUNCATE_TO_REG: {

25188

SDValue Src = Op.getOperand(1);

25189

SDValue PassThru = Op.getOperand(2);

25190

SDValue Mask = Op.getOperand(3);

25191

25192

if (isAllOnesConstant(Mask))

25193

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

25194

25195

MVT SrcVT = Src.getSimpleValueType();

25196

MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());

25197

Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

25198

return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),

25199

{Src, PassThru, Mask});

25200

}

25201

case CVTPS2PH_MASK: {

25202

SDValue Src = Op.getOperand(1);

25203

SDValue Rnd = Op.getOperand(2);

25204

SDValue PassThru = Op.getOperand(3);

25205

SDValue Mask = Op.getOperand(4);

25206

25207

if (isAllOnesConstant(Mask))

25208

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);

25209

25210

MVT SrcVT = Src.getSimpleValueType();

25211

MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());

25212

Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

25213

return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,

25214

PassThru, Mask);

25215

25216

}

25217

case CVTNEPS2BF16_MASK: {

25218

SDValue Src = Op.getOperand(1);

25219

SDValue PassThru = Op.getOperand(2);

25220

SDValue Mask = Op.getOperand(3);

25221

25222

if (ISD::isBuildVectorAllOnes(Mask.getNode()))

25223

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

25224

25225

// Break false dependency.

25226

if (PassThru.isUndef())

25227

PassThru = DAG.getConstant(0, dl, PassThru.getValueType());

25228

25229

return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,

25230

Mask);

25231

}

25232

default:

25233

break;

25234

}

25235

}

25236

25237

switch (IntNo) {

25238

default: return SDValue(); // Don't custom lower most intrinsics.

25239

25240

// ptest and testp intrinsics. The intrinsic these come from are designed to

25241

// return an integer value, not just an instruction so lower it to the ptest

25242

// or testp pattern and a setcc for the result.

25243

case Intrinsic::x86_avx512_ktestc_b:

25244

case Intrinsic::x86_avx512_ktestc_w:

25245

case Intrinsic::x86_avx512_ktestc_d:

25246

case Intrinsic::x86_avx512_ktestc_q:

25247

case Intrinsic::x86_avx512_ktestz_b:

25248

case Intrinsic::x86_avx512_ktestz_w:

25249

case Intrinsic::x86_avx512_ktestz_d:

25250

case Intrinsic::x86_avx512_ktestz_q:

25251

case Intrinsic::x86_sse41_ptestz:

25252

case Intrinsic::x86_sse41_ptestc:

25253

case Intrinsic::x86_sse41_ptestnzc:

25254

case Intrinsic::x86_avx_ptestz_256:

25255

case Intrinsic::x86_avx_ptestc_256:

25256

case Intrinsic::x86_avx_ptestnzc_256:

25257

case Intrinsic::x86_avx_vtestz_ps:

25258

case Intrinsic::x86_avx_vtestc_ps:

25259

case Intrinsic::x86_avx_vtestnzc_ps:

25260

case Intrinsic::x86_avx_vtestz_pd:

25261

case Intrinsic::x86_avx_vtestc_pd:

25262

case Intrinsic::x86_avx_vtestnzc_pd:

25263

case Intrinsic::x86_avx_vtestz_ps_256:

25264

case Intrinsic::x86_avx_vtestc_ps_256:

25265

case Intrinsic::x86_avx_vtestnzc_ps_256:

25266

case Intrinsic::x86_avx_vtestz_pd_256:

25267

case Intrinsic::x86_avx_vtestc_pd_256:

25268

case Intrinsic::x86_avx_vtestnzc_pd_256: {

25269

unsigned TestOpc = X86ISD::PTEST;

25270

X86::CondCode X86CC;

25271

switch (IntNo) {

25272

default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25272);

25273

case Intrinsic::x86_avx512_ktestc_b:

25274

case Intrinsic::x86_avx512_ktestc_w:

25275

case Intrinsic::x86_avx512_ktestc_d:

25276

case Intrinsic::x86_avx512_ktestc_q:

25277

// CF = 1

25278

TestOpc = X86ISD::KTEST;

25279

X86CC = X86::COND_B;

25280

break;

25281

case Intrinsic::x86_avx512_ktestz_b:

25282

case Intrinsic::x86_avx512_ktestz_w:

25283

case Intrinsic::x86_avx512_ktestz_d:

25284

case Intrinsic::x86_avx512_ktestz_q:

25285

TestOpc = X86ISD::KTEST;

25286

X86CC = X86::COND_E;

25287

break;

25288

case Intrinsic::x86_avx_vtestz_ps:

25289

case Intrinsic::x86_avx_vtestz_pd:

25290

case Intrinsic::x86_avx_vtestz_ps_256:

25291

case Intrinsic::x86_avx_vtestz_pd_256:

25292

TestOpc = X86ISD::TESTP;

25293

LLVM_FALLTHROUGH[[gnu::fallthrough]];

25294

case Intrinsic::x86_sse41_ptestz:

25295

case Intrinsic::x86_avx_ptestz_256:

25296

// ZF = 1

25297

X86CC = X86::COND_E;

25298

break;

25299

case Intrinsic::x86_avx_vtestc_ps:

25300

case Intrinsic::x86_avx_vtestc_pd:

25301

case Intrinsic::x86_avx_vtestc_ps_256:

25302

case Intrinsic::x86_avx_vtestc_pd_256:

25303

TestOpc = X86ISD::TESTP;

25304

LLVM_FALLTHROUGH[[gnu::fallthrough]];

25305

case Intrinsic::x86_sse41_ptestc:

25306

case Intrinsic::x86_avx_ptestc_256:

25307

// CF = 1

25308

X86CC = X86::COND_B;

25309

break;

25310

case Intrinsic::x86_avx_vtestnzc_ps:

25311

case Intrinsic::x86_avx_vtestnzc_pd:

25312

case Intrinsic::x86_avx_vtestnzc_ps_256:

25313

case Intrinsic::x86_avx_vtestnzc_pd_256:

25314

TestOpc = X86ISD::TESTP;

25315

LLVM_FALLTHROUGH[[gnu::fallthrough]];

25316

case Intrinsic::x86_sse41_ptestnzc:

25317

case Intrinsic::x86_avx_ptestnzc_256:

25318

// ZF and CF = 0

25319

X86CC = X86::COND_A;

25320

break;

25321

}

25322

25323

SDValue LHS = Op.getOperand(1);

25324

SDValue RHS = Op.getOperand(2);

25325

SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);

25326

SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);

25327

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

25328

}

25329

25330

case Intrinsic::x86_sse42_pcmpistria128:

25331

case Intrinsic::x86_sse42_pcmpestria128:

25332

case Intrinsic::x86_sse42_pcmpistric128:

25333

case Intrinsic::x86_sse42_pcmpestric128:

25334

case Intrinsic::x86_sse42_pcmpistrio128:

25335

case Intrinsic::x86_sse42_pcmpestrio128:

25336

case Intrinsic::x86_sse42_pcmpistris128:

25337

case Intrinsic::x86_sse42_pcmpestris128:

25338

case Intrinsic::x86_sse42_pcmpistriz128:

25339

case Intrinsic::x86_sse42_pcmpestriz128: {

25340

unsigned Opcode;

25341

X86::CondCode X86CC;

25342

switch (IntNo) {

25343

default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25343); // Can't reach here.

25344

case Intrinsic::x86_sse42_pcmpistria128:

25345

Opcode = X86ISD::PCMPISTR;

25346

X86CC = X86::COND_A;

25347

break;

25348

case Intrinsic::x86_sse42_pcmpestria128:

25349

Opcode = X86ISD::PCMPESTR;

25350

X86CC = X86::COND_A;

25351

break;

25352

case Intrinsic::x86_sse42_pcmpistric128:

25353

Opcode = X86ISD::PCMPISTR;

25354

X86CC = X86::COND_B;

25355

break;

25356

case Intrinsic::x86_sse42_pcmpestric128:

25357

Opcode = X86ISD::PCMPESTR;

25358

X86CC = X86::COND_B;

25359

break;

25360

case Intrinsic::x86_sse42_pcmpistrio128:

25361

Opcode = X86ISD::PCMPISTR;

25362

X86CC = X86::COND_O;

25363

break;

25364

case Intrinsic::x86_sse42_pcmpestrio128:

25365

Opcode = X86ISD::PCMPESTR;

25366

X86CC = X86::COND_O;

25367

break;

25368

case Intrinsic::x86_sse42_pcmpistris128:

25369

Opcode = X86ISD::PCMPISTR;

25370

X86CC = X86::COND_S;

25371

break;

25372

case Intrinsic::x86_sse42_pcmpestris128:

25373

Opcode = X86ISD::PCMPESTR;

25374

X86CC = X86::COND_S;

25375

break;

25376

case Intrinsic::x86_sse42_pcmpistriz128:

25377

Opcode = X86ISD::PCMPISTR;

25378

X86CC = X86::COND_E;

25379

break;

25380

case Intrinsic::x86_sse42_pcmpestriz128:

25381

Opcode = X86ISD::PCMPESTR;

25382

X86CC = X86::COND_E;

25383

break;

25384

}

25385

SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());

25386

SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);

25387

SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);

25388

SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);

25389

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

25390

}

25391

25392

case Intrinsic::x86_sse42_pcmpistri128:

25393

case Intrinsic::x86_sse42_pcmpestri128: {

25394

unsigned Opcode;

25395

if (IntNo == Intrinsic::x86_sse42_pcmpistri128)

25396

Opcode = X86ISD::PCMPISTR;

25397

else

25398

Opcode = X86ISD::PCMPESTR;

25399

25400

SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());

25401

SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);

25402

return DAG.getNode(Opcode, dl, VTs, NewOps);

25403

}

25404

25405

case Intrinsic::x86_sse42_pcmpistrm128:

25406

case Intrinsic::x86_sse42_pcmpestrm128: {

25407

unsigned Opcode;

25408

if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)

25409

Opcode = X86ISD::PCMPISTR;

25410

else

25411

Opcode = X86ISD::PCMPESTR;

25412

25413

SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());

25414

SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);

25415

return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);

25416

}

25417

25418

case Intrinsic::eh_sjlj_lsda: {

25419

MachineFunction &MF = DAG.getMachineFunction();

25420

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

25421

MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

25422

auto &Context = MF.getMMI().getContext();

25423

MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +

25424

Twine(MF.getFunctionNumber()));

25425

return DAG.getNode(getGlobalWrapperKind(), dl, VT,

25426

DAG.getMCSymbol(S, PtrVT));

25427

}

25428

25429

case Intrinsic::x86_seh_lsda: {

25430

// Compute the symbol for the LSDA. We know it'll get emitted later.

25431

MachineFunction &MF = DAG.getMachineFunction();

25432

SDValue Op1 = Op.getOperand(1);

25433

auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());

25434

MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(

25435

GlobalValue::dropLLVMManglingEscape(Fn->getName()));

25436

25437

// Generate a simple absolute symbol reference. This intrinsic is only

25438

// supported on 32-bit Windows, which isn't PIC.

25439

SDValue Result = DAG.getMCSymbol(LSDASym, VT);

25440

return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);

25441

}

25442

25443

case Intrinsic::eh_recoverfp: {

25444

SDValue FnOp = Op.getOperand(1);

25445

SDValue IncomingFPOp = Op.getOperand(2);

25446

GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);

25447

auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);

25448

if (!Fn)

25449

report_fatal_error(

25450

"llvm.eh.recoverfp must take a function as the first argument");

25451

return recoverFramePointer(DAG, Fn, IncomingFPOp);

25452

}

25453

25454

case Intrinsic::localaddress: {

25455

// Returns one of the stack, base, or frame pointer registers, depending on

25456

// which is used to reference local variables.

25457

MachineFunction &MF = DAG.getMachineFunction();

25458

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

25459

unsigned Reg;

25460

if (RegInfo->hasBasePointer(MF))

25461

Reg = RegInfo->getBaseRegister();

25462

else { // Handles the SP or FP case.

25463

bool CantUseFP = RegInfo->needsStackRealignment(MF);

25464

if (CantUseFP)

25465

Reg = RegInfo->getPtrSizedStackRegister(MF);

25466

else

25467

Reg = RegInfo->getPtrSizedFrameRegister(MF);

25468

}

25469

return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);

25470

}

25471

25472

case Intrinsic::x86_avx512_vp2intersect_q_512:

25473

case Intrinsic::x86_avx512_vp2intersect_q_256:

25474

case Intrinsic::x86_avx512_vp2intersect_q_128:

25475

case Intrinsic::x86_avx512_vp2intersect_d_512:

25476

case Intrinsic::x86_avx512_vp2intersect_d_256:

25477

case Intrinsic::x86_avx512_vp2intersect_d_128: {

25478

MVT MaskVT = Op.getSimpleValueType();

25479

25480

SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);

25481

SDLoc DL(Op);

25482

25483

SDValue Operation =

25484

DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,

25485

Op->getOperand(1), Op->getOperand(2));

25486

25487

SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,

25488

MaskVT, Operation);

25489

SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,

25490

MaskVT, Operation);

25491

return DAG.getMergeValues({Result0, Result1}, DL);

25492

}

25493

case Intrinsic::x86_mmx_pslli_w:

25494

case Intrinsic::x86_mmx_pslli_d:

25495

case Intrinsic::x86_mmx_pslli_q:

25496

case Intrinsic::x86_mmx_psrli_w:

25497

case Intrinsic::x86_mmx_psrli_d:

25498

case Intrinsic::x86_mmx_psrli_q:

25499

case Intrinsic::x86_mmx_psrai_w:

25500

case Intrinsic::x86_mmx_psrai_d: {

25501

SDLoc DL(Op);

25502

SDValue ShAmt = Op.getOperand(2);

25503

// If the argument is a constant, convert it to a target constant.

25504

if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {

25505

// Clamp out of bounds shift amounts since they will otherwise be masked

25506

// to 8-bits which may make it no longer out of bounds.

25507

unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);

25508

if (ShiftAmount == 0)

25509

return Op.getOperand(1);

25510

25511

return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),

25512

Op.getOperand(0), Op.getOperand(1),

25513

DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));

25514

}

25515

25516

unsigned NewIntrinsic;

25517

switch (IntNo) {

25518

default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25518); // Can't reach here.

25519

case Intrinsic::x86_mmx_pslli_w:

25520

NewIntrinsic = Intrinsic::x86_mmx_psll_w;

25521

break;

25522

case Intrinsic::x86_mmx_pslli_d:

25523

NewIntrinsic = Intrinsic::x86_mmx_psll_d;

25524

break;

25525

case Intrinsic::x86_mmx_pslli_q:

25526

NewIntrinsic = Intrinsic::x86_mmx_psll_q;

25527

break;

25528

case Intrinsic::x86_mmx_psrli_w:

25529

NewIntrinsic = Intrinsic::x86_mmx_psrl_w;

25530

break;

25531

case Intrinsic::x86_mmx_psrli_d:

25532

NewIntrinsic = Intrinsic::x86_mmx_psrl_d;

25533

break;

25534

case Intrinsic::x86_mmx_psrli_q:

25535

NewIntrinsic = Intrinsic::x86_mmx_psrl_q;

25536

break;

25537

case Intrinsic::x86_mmx_psrai_w:

25538

NewIntrinsic = Intrinsic::x86_mmx_psra_w;

25539

break;

25540

case Intrinsic::x86_mmx_psrai_d:

25541

NewIntrinsic = Intrinsic::x86_mmx_psra_d;

25542

break;

25543

}

25544

25545

// The vector shift intrinsics with scalars uses 32b shift amounts but

25546

// the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an

25547

// MMX register.

25548

ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);

25549

return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),

25550

DAG.getTargetConstant(NewIntrinsic, DL,

25551

getPointerTy(DAG.getDataLayout())),

25552

Op.getOperand(1), ShAmt);

25553

}

25554

}

25555

}

25556

25557

/// Build an X86ISD::MGATHER memory-intrinsic node for an AVX2-style gather.
///
/// \p Op is the original intrinsic node; it must be a MemIntrinsicSDNode, and
/// its memory VT and memory operand are reused for the new node. \p ScaleOp
/// has to be a ConstantSDNode, otherwise an empty SDValue is returned. \p Mask
/// is bitcast to the integer-element form of its own vector type.
///
/// Note: the \p Opc parameter is not read by this function.
///
/// \returns the gathered value merged with the output chain.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  // Only a constant scale can be lowered.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDLoc DL(Op);
  const MVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);

  EVT IntMaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::Other);

  // When the pass-through is undef, or every lane is known to be loaded, feed
  // a zero vector instead so the instruction has no register dependency.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  const bool PassThruUnused =
      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
  if (PassThruUnused)
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, DL);

  // The node takes the mask as an integer vector.
  Mask = DAG.getBitcast(IntMaskVT, Mask);

  auto *MemNode = cast<MemIntrinsicSDNode>(Op);

  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  SDValue Gather = DAG.getMemIntrinsicNode(X86ISD::MGATHER, DL, ResultVTs,
                                           GatherOps, MemNode->getMemoryVT(),
                                           MemNode->getMemOperand());
  return DAG.getMergeValues({Gather, Gather.getValue(1)}, DL);
}

25588

25589

/// Build an X86ISD::MGATHER memory-intrinsic node for a gather intrinsic whose
/// mask may be supplied either as a scalar or as a vXi1 vector.
///
/// \p Op is the original intrinsic node; it must be a MemIntrinsicSDNode, and
/// its memory VT and memory operand are reused for the new node. \p ScaleOp
/// has to be a ConstantSDNode, otherwise an empty SDValue is returned.
///
/// \returns the gathered value merged with the output chain.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  MVT ResVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Only a constant scale can be lowered.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  const MVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);

  // The mask has one bit per element of whichever of the index vector and the
  // result vector has fewer elements.
  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  const unsigned ResultElts = ResVT.getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, ResultElts));

  // Two flavours of the gather intrinsics exist: one with a scalar mask and
  // one with a vXi1 mask. Bring a mismatching mask into vXi1 form.
  if (Mask.getValueType() != MaskVT)
    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, DL);

  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::Other);

  // When the pass-through is undef, or every lane is known to be loaded, feed
  // a zero vector instead so the instruction has no register dependency.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  const bool PassThruUnused =
      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
  if (PassThruUnused)
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, DL);

  auto *MemNode = cast<MemIntrinsicSDNode>(Op);

  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  SDValue Gather = DAG.getMemIntrinsicNode(X86ISD::MGATHER, DL, ResultVTs,
                                           GatherOps, MemNode->getMemoryVT(),
                                           MemNode->getMemOperand());
  return DAG.getMergeValues({Gather, Gather.getValue(1)}, DL);
}

25626

25627

/// Build an X86ISD::MSCATTER memory-intrinsic node for a scatter intrinsic
/// whose mask may be supplied either as a scalar or as a vXi1 vector.
///
/// \p Op is the original intrinsic node; it must be a MemIntrinsicSDNode, and
/// its memory VT and memory operand are reused for the new node. \p ScaleOp
/// has to be a ConstantSDNode, otherwise an empty SDValue is returned.
///
/// Note: the \p Opc parameter is not read by this function.
///
/// \returns the new node, whose only result type is MVT::Other.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  // Only a constant scale can be lowered.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDLoc DL(Op);
  const MVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);

  // The mask has one bit per element of whichever of the index vector and the
  // stored vector has fewer elements.
  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  const unsigned DataElts = Src.getSimpleValueType().getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, DataElts));

  // Two flavours of the scatter intrinsics exist: one with a scalar mask and
  // one with a vXi1 mask. Bring a mismatching mask into vXi1 form.
  if (Mask.getValueType() != MaskVT)
    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, DL);

  auto *MemNode = cast<MemIntrinsicSDNode>(Op);

  SDVTList ChainOnlyVTs = DAG.getVTList(MVT::Other);
  SDValue ScatterOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, DL, ChainOnlyVTs,
                                 ScatterOps, MemNode->getMemoryVT(),
                                 MemNode->getMemOperand());
}

25657

25658

/// Emit the machine node \p Opc for a gather/scatter prefetch intrinsic.
///
/// \p ScaleOp has to be a ConstantSDNode, otherwise an empty SDValue is
/// returned. \p Mask is converted to a vXi1 mask with one bit per element of
/// \p Index. The machine node receives its operands in the order
/// {mask, base, scale, index, displacement, segment, chain}, where the
/// displacement is the constant 0 and the segment is register 0.
///
/// \returns result 0 of the machine node, which has type MVT::Other.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  // Only a constant scale can be lowered.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDLoc DL(Op);
  const MVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);
  SDValue DispImm = DAG.getTargetConstant(0, DL, MVT::i32);
  SDValue SegmentReg = DAG.getRegister(0, MVT::i32);

  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, IndexElts);
  SDValue VecMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, DL);

  SDValue PrefetchOps[] = {VecMask, Base,       ScaleImm, Index,
                           DispImm, SegmentReg, Chain};
  SDNode *Prefetch = DAG.getMachineNode(Opc, DL, MVT::Other, PrefetchOps);
  return SDValue(Prefetch, 0);
}

25679

25680

/// Handles the lowering of builtin intrinsics with chain that return their

25681

/// value into registers EDX:EAX.

25682

/// If operand ScrReg is a valid register identifier, then operand 2 of N is

25683

/// copied to SrcReg. The assumption is that SrcReg is an implicit input to

25684

/// TargetOpcode.

25685

/// Returns a Glue value which can be used to add extra copy-from-reg if the

25686

/// expanded intrinsics implicitly defines extra registers (i.e. not just

25687

/// EDX:EAX).

25688

static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,

25689

SelectionDAG &DAG,

25690

unsigned TargetOpcode,

25691

unsigned SrcReg,

25692

const X86Subtarget &Subtarget,

25693

SmallVectorImpl<SDValue> &Results) {

25694

SDValue Chain = N->getOperand(0);

25695

SDValue Glue;

25696

25697

if (SrcReg) {

25698

assert(N->getNumOperands() == 3 && "Unexpected number of operands!")((N->getNumOperands() == 3 && "Unexpected number of operands!"
) ? static_cast<void> (0) : __assert_fail ("N->getNumOperands() == 3 && \"Unexpected number of operands!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25698, __PRETTY_FUNCTION__));

25699

Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);

25700

Glue = Chain.getValue(1);

25701

}

25702

25703

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

25704

SDValue N1Ops[] = {Chain, Glue};

25705

SDNode *N1 = DAG.getMachineNode(

25706

TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));

25707

Chain = SDValue(N1, 0);

25708

25709

// Reads the content of XCR and returns it in registers EDX:EAX.

25710

SDValue LO, HI;

25711

if (Subtarget.is64Bit()) {

25712

LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));

25713

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,

25714

LO.getValue(2));

25715

} else {

25716

LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));

25717

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,

25718

LO.getValue(2));

25719

}

25720

Chain = HI.getValue(1);

25721

Glue = HI.getValue(2);

25722

25723

if (Subtarget.is64Bit()) {

25724

// Merge the two 32-bit values into a 64-bit one.

25725

SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,

25726

DAG.getConstant(32, DL, MVT::i8));

25727

Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));

25728

Results.push_back(Chain);

25729

return Glue;

25730

}

25731

25732

// Use a buildpair to merge the two 32-bit values into a 64-bit one.

25733

SDValue Ops[] = { LO, HI };

25734

SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);

25735

Results.push_back(Pair);

25736

Results.push_back(Chain);

25737

return Glue;

25738

}

25739

25740

/// Handles the lowering of builtin intrinsics that read the time stamp counter

25741

/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower

25742

/// READCYCLECOUNTER nodes.

25743

static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,

25744

SelectionDAG &DAG,

25745

const X86Subtarget &Subtarget,

25746

SmallVectorImpl<SDValue> &Results) {

25747

// The processor's time-stamp counter (a 64-bit MSR) is stored into the

25748

// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR

25749

// and the EAX register is loaded with the low-order 32 bits.

25750

SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,

25751

/* NoRegister */0, Subtarget,

25752

Results);

25753

if (Opcode != X86::RDTSCP)

25754

return;

25755

25756

SDValue Chain = Results[1];

25757

// Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into

25758

// the ECX register. Add 'ecx' explicitly to the chain.

25759

SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);

25760

Results[1] = ecx;

25761

Results.push_back(ecx.getValue(1));

25762

}

25763

25764

/// Custom-lowers a READCYCLECOUNTER node by expanding it as X86::RDTSC and
/// merging the values that the expansion produces.
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc Loc(Op);
  SmallVector<SDValue, 3> Parts;
  getReadTimeStampCounter(Op.getNode(), Loc, X86::RDTSC, DAG, Subtarget,
                          Parts);
  return DAG.getMergeValues(Parts, Loc);
}

25772

25773

/// Records the frame index passed as operand 2 of \p Op in the function's
/// WinEHFuncInfo (EHRegNodeFrameIndex).
///
/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
/// is not a FrameIndexSDNode.
///
/// \returns operand 0 of \p Op (the incoming chain); no DAG nodes are created.
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  SDValue InChain = Op.getOperand(0);
  SDValue RegNodeOp = Op.getOperand(2);

  WinEHFuncInfo *WinEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
  if (WinEHInfo == nullptr)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // The operand must have been lowered to a frame index, i.e. it has to come
  // from a static alloca; remember which frame index it is.
  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(RegNodeOp);
  if (FrameIdxNode == nullptr)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  WinEHInfo->EHRegNodeFrameIndex = FrameIdxNode->getIndex();

  // Hand back the chain operand unchanged.
  return InChain;
}

25790

25791

/// Records the frame index passed as operand 2 of \p Op in the function's
/// WinEHFuncInfo (EHGuardFrameIndex).
///
/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
/// is not a FrameIndexSDNode.
///
/// \returns operand 0 of \p Op (the incoming chain); no DAG nodes are created.
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  SDValue InChain = Op.getOperand(0);
  SDValue GuardOp = Op.getOperand(2);

  WinEHFuncInfo *WinEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
  if (WinEHInfo == nullptr)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // The operand must have been lowered to a frame index, i.e. it has to come
  // from a static alloca; remember which frame index it is.
  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(GuardOp);
  if (FrameIdxNode == nullptr)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  WinEHInfo->EHGuardFrameIndex = FrameIdxNode->getIndex();

  // Hand back the chain operand unchanged.
  return InChain;
}

25808

25809

/// Emit Truncating Store with signed or unsigned saturation.

25810

static SDValue

25811

EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,

25812

SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,

25813

SelectionDAG &DAG) {

25814

SDVTList VTs = DAG.getVTList(MVT::Other);

25815

SDValue Undef = DAG.getUNDEF(Ptr.getValueType());

25816

SDValue Ops[] = { Chain, Val, Ptr, Undef };

25817

unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;

25818

return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);

25819

}

25820

25821

/// Emit Masked Truncating Store with signed or unsigned saturation.

25822

static SDValue

25823

EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,

25824

SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,

25825

MachineMemOperand *MMO, SelectionDAG &DAG) {

25826

SDVTList VTs = DAG.getVTList(MVT::Other);

25827

SDValue Ops[] = { Chain, Val, Ptr, Mask };

25828

unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;

25829

return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);

25830

}

25831

25832

/// Custom-lower an INTRINSIC_W_CHAIN node.
///
/// Operand 0 of \p Op is the incoming chain and operand 1 is the intrinsic
/// ID; the intrinsic's own arguments start at operand 2. Intrinsics that have
/// no entry in the with-chain intrinsic table are handled by the first switch
/// (and yield an empty SDValue when not recognised there); table-driven
/// intrinsics are dispatched on IntrData->Type.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = Op.getConstantOperandVal(1);
  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_rdpkru: {
      SDLoc dl(Op);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      // Create a RDPKRU node and pass 0 to the ECX parameter.
      return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
                         DAG.getConstant(0, dl, MVT::i32));
    }
    case llvm::Intrinsic::x86_wrpkru: {
      SDLoc dl(Op);
      // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
      // to the EDX and ECX parameters.
      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
                         Op.getOperand(0), Op.getOperand(2),
                         DAG.getConstant(0, dl, MVT::i32),
                         DAG.getConstant(0, dl, MVT::i32));
    }
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during FinalizeISel in EmitInstrWithCustomInserter.
      return Op;
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64:
    case Intrinsic::x86_umwait:
    case Intrinsic::x86_tpause: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_umwait:
        Opcode = X86ISD::UMWAIT;
        break;
      case Intrinsic::x86_tpause:
        Opcode = X86ISD::TPAUSE;
        break;
      case Intrinsic::x86_lwpins32:
      case Intrinsic::x86_lwpins64:
        Opcode = X86ISD::LWPINS;
        break;
      }

      // The node's first result feeds a SETCC on COND_B; that flag-derived
      // value and the node's chain are returned as the intrinsic's results.
      SDValue Operation =
          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                         Operation.getValue(1));
    }
    case Intrinsic::x86_enqcmd:
    case Intrinsic::x86_enqcmds: {
      SDLoc dl(Op);
      SDValue Chain = Op.getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic!");
      case Intrinsic::x86_enqcmd:
        Opcode = X86ISD::ENQCMD;
        break;
      case Intrinsic::x86_enqcmds:
        Opcode = X86ISD::ENQCMDS;
        break;
      }
      // Same shape as above, but the result is derived from COND_E.
      SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
                                      Op.getOperand(3));
      SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                         Operation.getValue(1));
    }
    case Intrinsic::x86_mwaitx: {
      // If the current function needs the base pointer, RBX,
      // we shouldn't use mwaitx directly.
      // Indeed the lowering of that instruction will clobber
      // that register and since RBX will be a reserved register
      // the register allocator will not make sure its value will
      // be properly saved and restored around this live-range.
      SDLoc dl(Op);
      unsigned Opcode = X86ISD::MWAITX_DAG;
      SDValue Chain = DAG.getNode(Opcode, dl, MVT::Other,
                                  {Op->getOperand(0), Op->getOperand(2),
                                   Op->getOperand(3), Op->getOperand(4)});
      return Chain;
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                     DAG.getConstant(1, dl, Op->getValueType(1)),
                     DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
                     SDValue(Result.getNode(), 1)};
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    // Operand order: (chain, id, src, base, index, mask, scale).
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    // Operand order: (chain, id, base, mask, index, src, scale).
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src   = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    const APInt &HintVal = Op.getConstantOperandAPInt(6);
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    // Hint 2 selects the table's second opcode, hint 3 the first.
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask  = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base  = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC:
  // GetExtended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;

    // RDPMC uses ECX to select the index of the performance counter to read.
    // XGETBV uses ECX to select the index of the XCR register to return.
    // The result is stored into registers EDX:EAX.
    expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
                                Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    // The node is expected to be a MemIntrinsicSDNode. cast<> performs that
    // check itself in assertion-enabled builds, so no separate null test is
    // needed.
    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
      SDValue Offset = DAG.getUNDEF(VMask.getValueType());

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
                                MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
                                true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      // An all-ones mask needs no masking, so use the unmasked saturating
      // store.
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  }
}

26082

26083

/// Lower a RETURNADDR node. Marks the return address as taken, then loads
/// it: for depth 0 from the return-address frame index, for deeper frames
/// from one slot above the frame address produced by LowerFRAMEADDR.
/// Returns an empty SDValue when verifyReturnAddressArgumentIsConstant
/// reports a problem with the depth operand.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth == 0) {
    // Innermost frame: just load the return address.
    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                       MachinePointerInfo());
  }

  // Outer frame: the load address is the frame address plus one slot.
  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
  SDValue SlotSize =
      DAG.getConstant(Subtarget.getRegisterInfo()->getSlotSize(), dl, PtrVT);
  SDValue RetAddrPtr = DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, SlotSize);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrPtr,
                     MachinePointerInfo());
}

26109

26110

/// Lower ADDROFRETURNADDR: flag the return address as taken and hand back
/// the frame index that holds it.
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  FrameInfo.setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

26115

26116

/// Lower a FRAMEADDR node.
///
/// Marks the frame address as taken. On targets using Windows CFI a fixed
/// frame object is created once, cached in X86MachineFunctionInfo, and its
/// frame index is returned regardless of the requested depth. Otherwise the
/// pointer-sized frame register is read and dereferenced once per requested
/// depth level (operand 0).
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the frame address and remember its index
      // so later queries reuse it.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MFI.CreateFixedObject(
          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = Op.getConstantOperandVal(0);
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each iteration loads through the current frame address to reach the
  // next frame out.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

26153

26154

// FIXME? Maybe this could be a TableGen attribute on some registers and

26155

// this table could be generated automatically from RegInfo.

26156

Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,

26157

const MachineFunction &MF) const {

26158

const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

26159

26160

Register Reg = StringSwitch<unsigned>(RegName)

26161

.Case("esp", X86::ESP)

26162

.Case("rsp", X86::RSP)

26163

.Case("ebp", X86::EBP)

26164

.Case("rbp", X86::RBP)

26165

.Default(0);

26166

26167

if (Reg == X86::EBP || Reg == X86::RBP) {

26168

if (!TFI.hasFP(MF))

26169

report_fatal_error("register " + StringRef(RegName) +

26170

" is allocatable: function has no frame pointer");

26171

#ifndef NDEBUG

26172

else {

26173

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

26174

Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);

26175

assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&(((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26176, __PRETTY_FUNCTION__))

26176

"Invalid Frame Register!")(((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26176, __PRETTY_FUNCTION__));

26177

}

26178

#endif

26179

}

26180

26181

if (Reg)

26182

return Reg;

26183

26184

report_fatal_error("Invalid register name global variable");

26185

}

26186

26187

/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice
/// the target's slot size.
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  return DAG.getIntPtrConstant(2 * SlotSize, dl);
}

26192

26193

/// Pick the register carrying the exception pointer: RDX/EDX for the
/// CoreCLR personality, RAX/EAX otherwise. The 64-bit register is chosen
/// when the subtarget is 64-bit LP64.
Register X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  const bool IsLP64 = Subtarget.isTarget64BitLP64();
  const bool IsCoreCLR =
      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;

  if (IsCoreCLR)
    return IsLP64 ? X86::RDX : X86::EDX;

  return IsLP64 ? X86::RAX : X86::EAX;
}

26200

26201

/// Return the register carrying the exception selector: RDX on 64-bit LP64
/// subtargets, EDX otherwise. Must not be called for funclet-based
/// personalities (asserted).
Register X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

26207

26208

bool X86TargetLowering::needsFixedCatchObjects() const {

26209

return Subtarget.isTargetWin64();

26210

}

26211

26212

/// Lower an EH_RETURN node.
///
/// Operands are (chain, offset, handler). The handler is stored at
/// frame-register + slot-size + offset, that address is copied into RCX/ECX
/// (chosen by pointer width), and an X86ISD::EH_RETURN node taking that
/// register is emitted.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  SDLoc dl      (Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  // Address of the slot that receives the handler:
  // frame + slot size + caller-supplied offset.
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                       dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

26237

26238

/// Lower EH_SJLJ_SETJMP to the X86-specific node, which produces an i32
/// value and a chain from the original chain and buffer operands.
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // On non-64-bit subtargets the global base register may be needed once
  // the pseudo is expanded after isel, i.e. after the CGBR pass has run.
  // Requesting it here lets that pass insert the defining code; otherwise a
  // virtual register could end up referenced without a definition.
  if (!Subtarget.is64Bit())
    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

  SDVTList ResultVTs = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue InChain = Op.getOperand(0);
  SDValue Buffer = Op.getOperand(1);
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultVTs, InChain, Buffer);
}

26255

26256

/// Lower EH_SJLJ_LONGJMP to the X86-specific chain-only node, forwarding the
/// first two operands unchanged.
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue InChain = Op.getOperand(0);
  SDValue Buffer = Op.getOperand(1);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, InChain, Buffer);
}

26262

26263

/// Lower EH_SJLJ_SETUP_DISPATCH to the X86-specific chain-only node; only
/// the incoming chain is forwarded.
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue InChain = Op.getOperand(0);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, InChain);
}

26269

26270

/// Lower ADJUST_TRAMPOLINE: the result is operand 0 passed through
/// unchanged, so no new nodes are created.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  SDValue Trmp = Op.getOperand(0);
  return Trmp;
}

26273

26274

/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
/// machine code into the buffer at \c Trmp.
///
/// Operands: (chain, trampoline, nested function, 'nest' value,
/// trampoline SrcValue, function SrcValue).
///
/// 64-bit layout written here (byte offsets from \c Trmp):
///    0: REX.WB + MOV64ri opcode for r11   2: nested function pointer
///   10: REX.WB + MOV64ri opcode for r10  12: 'nest' value
///   20: REX.WB + JMP64r opcode           22: ModRM byte selecting r11
///
/// 32-bit layout:
///    0: MOV32ri opcode for the nest register   1: 'nest' value
///    5: JMP opcode                             6: displacement to the
///                                                 nested function, relative
///                                                 to Trmp + 10
///
/// All stores hang off the incoming chain and are joined by a TokenFactor.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2), Align(2));

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12), Align(2));

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        const DataLayout &DL = DAG.getDataLayout();
        unsigned InRegCount = 0;
        // Attribute indices for parameters start at 1.
        unsigned Idx = 1;

        for (Type *ParamTy : FTy->params()) {
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            // FIXME: should only count parameters that are lowered to integers.
            // Each started 32 bits of an inreg parameter counts as one
            // register.
            InRegCount += (DL.getTypeSizeInBits(ParamTy) + 31) / 32;
          }
          ++Idx;
        }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
    case CallingConv::Tail:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // The jump displacement is relative to the end of the trampoline
    // (Trmp + 10).
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1), Align(1));

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] =
        DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
                     MachinePointerInfo(TrmpAddr, 5), Align(1));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6), Align(1));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

26417

26418

/// Lower FLT_ROUNDS_ by reading the FP control word and translating its
/// rounding-control field.
///
/// The field occupies bits 11:10 of the control word:
///   00 round to nearest, 01 round to -inf, 10 round to +inf, 11 round to 0
/// while FLT_ROUNDS uses:
///   0 round to 0, 1 round to nearest, 2 round to +inf, 3 round to -inf
///   (-1 meaning undefined).
///
/// The mapping is done with a packed table of four 2-bit entries, 0x2d,
/// indexed by twice the field value:
///   (0x2d >> ((CW & 0xc00) >> 9)) & 3
///
/// Returns the converted value merged with the updated chain.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MVT ResultVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // A 2-byte stack slot receives the control word.
  int SlotFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
  SDValue SlotAddr =
      DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
  MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(MF, SlotFI);

  // Store the control word to the slot ...
  SDValue StoreOps[] = {Op.getOperand(0), SlotAddr};
  SDValue Chain = DAG.getMemIntrinsicNode(
      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), StoreOps, MVT::i16,
      SlotInfo, Align(2), MachineMemOperand::MOStore);

  // ... and read it back as an i16.
  SDValue ControlWord =
      DAG.getLoad(MVT::i16, DL, Chain, SlotAddr, SlotInfo, Align(2));
  Chain = ControlWord.getValue(1);

  // Isolate bits 11:10 and turn them into a shift amount of 0, 2, 4 or 6.
  SDValue RoundingBits = DAG.getNode(ISD::AND, DL, MVT::i16, ControlWord,
                                     DAG.getConstant(0xc00, DL, MVT::i16));
  SDValue ShiftAmt = DAG.getNode(ISD::SRL, DL, MVT::i16, RoundingBits,
                                 DAG.getConstant(9, DL, MVT::i8));
  ShiftAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShiftAmt);

  // Index the packed lookup table and keep the selected 2-bit entry.
  SDValue Table = DAG.getConstant(0x2d, DL, MVT::i32);
  SDValue Selected = DAG.getNode(ISD::SRL, DL, MVT::i32, Table, ShiftAmt);
  SDValue Mode = DAG.getNode(ISD::AND, DL, MVT::i32, Selected,
                             DAG.getConstant(3, DL, MVT::i32));

  Mode = DAG.getZExtOrTrunc(Mode, DL, ResultVT);

  return DAG.getMergeValues({Mode, Chain}, DL);
}

26481

26482

/// Lower a vector CTLZ using native supported vector CTLZ instruction.

26483

//

26484

// i8/i16 vector implemented using dword LZCNT vector instruction

26485

// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,

26486

// split the vector, perform the operation on its Lo and Hi parts and

26487

// concatenate the results.

26488

static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,

26489

const X86Subtarget &Subtarget) {

26490

assert(Op.getOpcode() == ISD::CTLZ)((Op.getOpcode() == ISD::CTLZ) ? static_cast<void> (0) :
__assert_fail ("Op.getOpcode() == ISD::CTLZ", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26490, __PRETTY_FUNCTION__));

26491

SDLoc dl(Op);

26492

MVT VT = Op.getSimpleValueType();

26493

MVT EltVT = VT.getVectorElementType();

26494

unsigned NumElems = VT.getVectorNumElements();

26495

26496

assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&(((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26497, __PRETTY_FUNCTION__))

26497

"Unsupported element type")(((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26497, __PRETTY_FUNCTION__));

26498

26499

// Split vector, it's Lo and Hi parts will be handled in next iteration.

26500

if (NumElems > 16 ||

26501

(NumElems == 16 && !Subtarget.canExtendTo512DQ()))

26502

return splitVectorIntUnary(Op, DAG);

26503

26504

MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);

26505

assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&(((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation") ? static_cast<void
> (0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26506, __PRETTY_FUNCTION__))

26506

"Unsupported value type for operation")(((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation") ? static_cast<void
> (0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26506, __PRETTY_FUNCTION__));

26507

26508

// Use native supported vector instruction vplzcntd.

26509

Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));

26510

SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);

26511

SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);

26512

SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

26513

26514

return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);

26515

}

26516

26517

// Lower CTLZ using a PSHUFB lookup table implementation.

26518

static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,

26519

const X86Subtarget &Subtarget,

26520

SelectionDAG &DAG) {

26521

MVT VT = Op.getSimpleValueType();

26522

int NumElts = VT.getVectorNumElements();

26523

int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);

26524

MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

26525

26526

// Per-nibble leading zero PSHUFB lookup table.

26527

const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,

26528

/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,

26529

/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,

26530

/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

26531

26532

SmallVector<SDValue, 64> LUTVec;

26533

for (int i = 0; i < NumBytes; ++i)

26534

LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));

26535

SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

26536

26537

// Begin by bitcasting the input to byte vector, then split those bytes

26538

// into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.

26539

// If the hi input nibble is zero then we add both results together, otherwise

26540

// we just take the hi result (by masking the lo result to zero before the

26541

// add).

26542

SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));

26543

SDValue Zero = DAG.getConstant(0, DL, CurrVT);

26544

26545

SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);

26546

SDValue Lo = Op0;

26547

SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);

26548

SDValue HiZ;

26549

if (CurrVT.is512BitVector()) {

26550

MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());

26551

HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);

26552

HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);

26553

} else {

26554

HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

26555

}

26556

26557

Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);

26558

Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);

26559

Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);

26560

SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

26561

26562

// Merge result back from vXi8 back to VT, working on the lo/hi halves

26563

// of the current vector width in the same way we did for the nibbles.

26564

// If the upper half of the input element is zero then add the halves'

26565

// leading zero counts together, otherwise just use the upper half's.

26566

// Double the width of the result until we are at target width.

26567

while (CurrVT != VT) {

26568

int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();

26569

int CurrNumElts = CurrVT.getVectorNumElements();

26570

MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);

26571

MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);

26572

SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

26573

26574

// Check if the upper half of the input element is zero.

26575

if (CurrVT.is512BitVector()) {

26576

MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());

26577

HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),

26578

DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);

26579

HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);

26580

} else {

26581

HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),

26582

DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);

26583

}

26584

HiZ = DAG.getBitcast(NextVT, HiZ);

26585

26586

// Move the upper/lower halves to the lower bits as we'll be extending to

26587

// NextVT. Mask the lower result to zero if HiZ is true and add the results

26588

// together.

26589

SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);

26590

SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);

26591

SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);

26592

R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);

26593

Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);

26594

CurrVT = NextVT;

26595

}

26596

26597

return Res;

26598

}

26599

26600

// Pick a lowering strategy for a vector CTLZ: the CDI-based lowering when it
// is usable, a split into narrower vectors when the full width is not
// supported, and the PSHUFB lookup table otherwise.
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI()) {
    // vXi8 vectors need to be promoted to 512-bits for vXi32.
    if (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8)
      return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
  }

  // Decompose 256-bit ops into smaller 128-bit ops, and 512-bit ops into
  // smaller 256-bit ops, when the wider form is unavailable.
  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
  if (Split256 || Split512)
    return splitVectorIntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

26621

26622

// Lower a count-leading-zeros node. Vectors are forwarded to
// LowerVectorCTLZ; scalars are lowered to a BSR whose result (the index of
// the highest set bit) is converted to a count by xor-ing with NumBits - 1.
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  const unsigned NumBits = VT.getSizeInBits();
  const bool IsPlainCTLZ = Op.getOpcode() == ISD::CTLZ;
  const bool IsI8 = VT == MVT::i8;

  // Zero extend i8 to i32 since there is not an i8 bsr.
  MVT ScanVT = VT;
  SDValue Src = Op.getOperand(0);
  if (IsI8) {
    ScanVT = MVT::i32;
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, ScanVT, Src);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList BsrVTs = DAG.getVTList(ScanVT, MVT::i32);
  SDValue Res = DAG.getNode(X86ISD::BSR, dl, BsrVTs, Src);

  if (IsPlainCTLZ) {
    // If src is zero (i.e. bsr sets ZF), select 2 * NumBits - 1 instead of
    // the bsr result, so that the xor below returns NumBits.
    SDValue CMovOps[] = {Res,
                         DAG.getConstant(NumBits + NumBits - 1, dl, ScanVT),
                         DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
                         Res.getValue(1)};
    Res = DAG.getNode(X86ISD::CMOV, dl, ScanVT, CMovOps);
  }

  // Finally xor with NumBits-1.
  SDValue BitIndexMask = DAG.getConstant(NumBits - 1, dl, ScanVT);
  Res = DAG.getNode(ISD::XOR, dl, ScanVT, Res, BitIndexMask);

  // Undo the widening performed for i8 inputs.
  return IsI8 ? DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Res) : Res;
}

26660

26661

// Lower a scalar CTTZ to a BSF followed by a CMOV that supplies the bit width
// when the source is zero.
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  const unsigned NumBits = VT.getScalarSizeInBits();
  SDValue Src = Op.getOperand(0);
  SDLoc dl(Op);

  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDValue Bsf = DAG.getNode(X86ISD::BSF, dl, DAG.getVTList(VT, MVT::i32), Src);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue CMovOps[] = {Bsf, DAG.getConstant(NumBits, dl, VT),
                       DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
                       Bsf.getValue(1)};
  return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
}

26681

26682

// Custom lowering shared by ADD and SUB: scalar i16/i32 go through
// lowerAddSubToHorizontalOp, i1 element types become XOR, and the remaining
// vector types are split.
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  const MVT VT = Op.getSimpleValueType();

  switch (VT.SimpleTy) {
  case MVT::i16:
  case MVT::i32:
    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
  default:
    break;
  }

  // On i1 elements the operation is emitted as an exclusive-or.
  if (VT.getScalarType() == MVT::i1) {
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT, LHS, RHS);
  }

  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG);

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return splitVectorIntBinary(Op, DAG);
}

26700

26701

// Custom lowering for the saturating add/sub nodes.
//  - i1 element types are rewritten as logic ops.
//  - 128-bit unsigned forms are expanded with a compare and select when the
//    matching UMIN/UMAX is not legal; otherwise an empty SDValue is returned
//    to request the default expansion.
//  - The remaining vector types are split.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  const MVT VT = Op.getSimpleValueType();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  const unsigned Opcode = Op.getOpcode();

  if (VT.getScalarType() == MVT::i1) {
    SDLoc dl(Op);
    const bool IsAddSat = Opcode == ISD::UADDSAT || Opcode == ISD::SADDSAT;
    const bool IsSubSat = Opcode == ISD::USUBSAT || Opcode == ISD::SSUBSAT;

    // *addsat i1 X, Y --> X | Y
    if (IsAddSat)
      return DAG.getNode(ISD::OR, dl, VT, X, Y);

    // *subsat i1 X, Y --> X & ~Y
    if (IsSubSat)
      return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));

    llvm_unreachable("Expected saturated arithmetic opcode");
  }

  if (VT.is128BitVector()) {
    // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT CmpVT =
        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDLoc DL(Op);

    switch (Opcode) {
    case ISD::UADDSAT:
      if (!TLI.isOperationLegal(ISD::UMIN, VT)) {
        // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
        SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, X, Y);
        SDValue Overflowed = DAG.getSetCC(DL, CmpVT, X, Sum, ISD::SETUGT);
        return DAG.getSelect(DL, VT, Overflowed,
                             DAG.getAllOnesConstant(DL, VT), Sum);
      }
      break;
    case ISD::USUBSAT:
      if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
        // usubsat X, Y --> (X >u Y) ? X - Y : 0
        SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, X, Y);
        SDValue NoBorrow = DAG.getSetCC(DL, CmpVT, X, Y, ISD::SETUGT);
        return DAG.getSelect(DL, VT, NoBorrow, Diff,
                             DAG.getConstant(0, DL, VT));
      }
      break;
    default:
      break;
    }

    // Use default expansion.
    return SDValue();
  }

  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG);

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return splitVectorIntBinary(Op, DAG);
}

26751

26752

// Custom lowering for ABS.
//  - Scalar i16/i32/i64: negate with a flag-producing SUB and pick between
//    the input and its negation with a CMOV.
//  - v2i64/v4i64 with SSE4.1: blend between X and 0 - X, using X as the
//    blend selector.
//  - Vector widths that are not supported natively are split.
// Returns an empty SDValue when the default expansion should be used.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();

  const bool IsCMovScalar =
      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
  if (IsCMovScalar) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);
    SDVTList NegVTs = DAG.getVTList(VT, MVT::i32);
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, NegVTs, Zero, Src);
    SDValue NegFlags = SDValue(Neg.getNode(), 1);
    SDValue CMovOps[] = {Src, Neg,
                         DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
                         NegFlags};
    return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOps);
  }

  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
  const bool IsBlendableI64 = VT == MVT::v2i64 || VT == MVT::v4i64;
  if (IsBlendableI64 && Subtarget.hasSSE41()) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Negated = DAG.getNode(ISD::SUB, DL, VT, Zero, Src);
    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Negated, Src);
  }

  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    assert(VT.isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return splitVectorIntUnary(Op, DAG);
  }

  const bool Is512BitBW = VT == MVT::v32i16 || VT == MVT::v64i8;
  if (Is512BitBW && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  // Default to expand.
  return SDValue();
}

26788

26789

// Custom lowering for SMIN/SMAX/UMIN/UMAX: split the types that have no
// direct support, map v8i16 UMIN/UMAX onto the signed forms by flipping the
// sign bits, and expand everything else to a compare and select.
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();

  // For AVX1 cases, split to use legal ops (everything but v4i64). The
  // v32i16/v64i8 types are split as well.
  const bool SplitAVX1 =
      VT.getScalarType() != MVT::i64 && VT.is256BitVector();
  const bool Split512BW = VT == MVT::v32i16 || VT == MVT::v64i8;
  if (SplitAVX1 || Split512BW)
    return splitVectorIntBinary(Op, DAG);

  SDLoc DL(Op);
  const unsigned Opcode = Op.getOpcode();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
  // using the SMIN/SMAX instructions and flipping the signbit back.
  if (VT == MVT::v8i16) {
    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
           "Unexpected MIN/MAX opcode");
    SDValue SignBit = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
    SDValue FlippedLHS = DAG.getNode(ISD::XOR, DL, VT, LHS, SignBit);
    SDValue FlippedRHS = DAG.getNode(ISD::XOR, DL, VT, RHS, SignBit);
    const unsigned SignedOpcode = Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX;
    SDValue Signed =
        DAG.getNode(SignedOpcode, DL, VT, FlippedLHS, FlippedRHS);
    return DAG.getNode(ISD::XOR, DL, VT, Signed, SignBit);
  }

  // Else, expand to a compare/select: choose the condition under which the
  // first operand is the result.
  ISD::CondCode CC;
  switch (Opcode) {
  case ISD::SMIN:
    CC = ISD::SETLT;
    break;
  case ISD::SMAX:
    CC = ISD::SETGT;
    break;
  case ISD::UMIN:
    CC = ISD::SETULT;
    break;
  case ISD::UMAX:
    CC = ISD::SETUGT;
    break;
  default:
    llvm_unreachable("Unknown MINMAX opcode");
  }

  SDValue PickLHS = DAG.getSetCC(DL, VT, LHS, RHS, CC);
  return DAG.getSelect(DL, VT, PickLHS, LHS, RHS);
}

26830

26831

// Custom lowering for vector MUL.
//  - i1 element types become AND.
//  - Types that are too wide for the subtarget are split.
//  - vXi8 is multiplied in i16 elements and narrowed back.
//  - v4i32 (SSE2 without SSE4.1) uses two PMULUDQ plus shuffles.
//  - vXi64 (without DQI) is assembled from 32x32->64 PMULUDQ partial products.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  const MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into 128-bit ops. Likewise split v32i16/v64i8 when
  // BWI is unavailable.
  const bool Is512BitBW = VT == MVT::v32i16 || VT == MVT::v64i8;
  if ((VT.is256BitVector() && !Subtarget.hasInt256()) ||
      (Is512BitBW && !Subtarget.hasBWI()))
    return splitVectorIntBinary(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul by widening to v8i16/v16i16/v32i16 vectors
  // (the contents of the added high bytes do not matter), multiplying, and
  // narrowing the products back to bytes.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    const unsigned NumElts = VT.getVectorNumElements();

    // When a single i16 vector with the same element count is usable, do the
    // whole multiply in it.
    const bool CanWidenWhole =
        (VT == MVT::v16i8 && Subtarget.hasInt256()) ||
        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW());
    if (CanWidenWhole) {
      const MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts);
      SDValue WideA = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, A);
      SDValue WideB = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, B);
      SDValue WideMul = DAG.getNode(ISD::MUL, dl, WideVT, WideA, WideB);
      return DAG.getNode(ISD::TRUNCATE, dl, VT, WideMul);
    }

    const MVT HalfVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // Extract the lo/hi parts to any extend to i16.
    // Only the low byte of each pmullw result element is kept below, so it
    // doesn't matter what's in the high byte of each 16-bit element.
    SDValue Undef = DAG.getUNDEF(VT);
    SDValue ALo = DAG.getBitcast(HalfVT, getUnpackl(DAG, dl, VT, A, Undef));
    SDValue AHi = DAG.getBitcast(HalfVT, getUnpackh(DAG, dl, VT, A, Undef));

    SDValue BLo, BHi;
    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
      // If B is a build vector of constants, manually unpackl/unpackh: within
      // each group of 16 bytes the first 8 go to the lo vector and the last 8
      // to the hi vector.
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (unsigned Lane = 0; Lane != NumElts; Lane += 16) {
        for (unsigned Idx = Lane, End = Lane + 8; Idx != End; ++Idx) {
          LoOps.push_back(
              DAG.getAnyExtOrTrunc(B.getOperand(Idx), dl, MVT::i16));
          HiOps.push_back(
              DAG.getAnyExtOrTrunc(B.getOperand(Idx + 8), dl, MVT::i16));
        }
      }

      BLo = DAG.getBuildVector(HalfVT, dl, LoOps);
      BHi = DAG.getBuildVector(HalfVT, dl, HiOps);
    } else {
      BLo = DAG.getBitcast(HalfVT, getUnpackl(DAG, dl, VT, B, Undef));
      BHi = DAG.getBitcast(HalfVT, getUnpackh(DAG, dl, VT, B, Undef));
    }

    // Multiply, keep only the lower 8 bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, HalfVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, HalfVT, AHi, BHi);
    SDValue LowByteMask = DAG.getConstant(255, dl, HalfVT);
    RLo = DAG.getNode(ISD::AND, dl, HalfVT, RLo, LowByteMask);
    RHi = DAG.getNode(ISD::AND, dl, HalfVT, RHi, LowByteMask);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");

    // Extract the odd parts by moving them into the even positions.
    static const int OddToEvenMask[] = {1, -1, 3, -1};
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, OddToEvenMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, OddToEvenMask);

    // Multiply the even parts.
    SDValue Evens =
        DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                    DAG.getBitcast(MVT::v2i64, A),
                    DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds =
        DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                    DAG.getBitcast(MVT::v2i64, Aodds),
                    DAG.getBitcast(MVT::v2i64, Bodds));

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int InterleaveMask[] = {0, 4, 2, 6};
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, InterleaveMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  const KnownBits AKnown = DAG.computeKnownBits(A);
  const KnownBits BKnown = DAG.computeKnownBits(B);

  // True when every bit selected by Mask is known to be zero.
  auto IsKnownZero = [](const APInt &Mask, const KnownBits &Known) {
    return Mask.isSubsetOf(Known.Zero);
  };

  const APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  const bool ALoIsZero = IsKnownZero(LowerBitsMask, AKnown);
  const bool BLoIsZero = IsKnownZero(LowerBitsMask, BKnown);

  const APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  const bool AHiIsZero = IsKnownZero(UpperBitsMask, AKnown);
  const bool BHiIsZero = IsKnownZero(UpperBitsMask, BKnown);

  SDValue Zero = DAG.getConstant(0, dl, VT);

  // Only multiply lo/hi halves that aren't known to be zero; a partial
  // product with a known-zero factor stays the zero constant.
  SDValue AloBlo = Zero;
  if (!(ALoIsZero || BLoIsZero))
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

  SDValue AloBhi = Zero;
  if (!(ALoIsZero || BHiIsZero)) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!(AHiIsZero || BLoIsZero)) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
  }

  // Sum the cross products, move them into the upper 32 bits, and add the
  // low product.
  SDValue Cross = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  SDValue Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Cross, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

26978

26979

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,

26980

SelectionDAG &DAG) {

26981

SDLoc dl(Op);

26982

MVT VT = Op.getSimpleValueType();

26983

bool IsSigned = Op->getOpcode() == ISD::MULHS;

26984

unsigned NumElts = VT.getVectorNumElements();

26985

SDValue A = Op.getOperand(0);

26986

SDValue B = Op.getOperand(1);

26987

26988

// Decompose 256-bit ops into 128-bit ops.

26989

if (VT.is256BitVector() && !Subtarget.hasInt256())

26990

return splitVectorIntBinary(Op, DAG);

26991

26992

if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())

26993

return splitVectorIntBinary(Op, DAG);

26994

26995

if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {

26996

assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||(((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT ==
MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::
v16i32 && Subtarget.hasAVX512())) ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26998, __PRETTY_FUNCTION__))

26997

(VT == MVT::v8i32 && Subtarget.hasInt256()) ||(((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT ==
MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::
v16i32 && Subtarget.hasAVX512())) ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26998, __PRETTY_FUNCTION__))

26998

(VT == MVT::v16i32 && Subtarget.hasAVX512()))(((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT ==
MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::
v16i32 && Subtarget.hasAVX512())) ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26998, __PRETTY_FUNCTION__));

26999

27000

// PMULxD operations multiply each even value (starting at 0) of LHS with

27001

// the related value of RHS and produce a widened result.

27002

// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>

27003

// => <2 x i64> <ae|cg>

27004

//

27005

// In other words, to have all the results, we need to perform two PMULxD:

27006

// 1. one with the even values.

27007

// 2. one with the odd values.

27008

// To achieve #2, we need to place the odd values at an even position.

27009

//

27010

// Place the odd value at an even position (basically, shift all values 1

27011

// step to the left):

27012

const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,

27013

9, -1, 11, -1, 13, -1, 15, -1};

27014

// <a|b|c|d> => <b|undef|d|undef>

27015

SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,

27016

makeArrayRef(&Mask[0], NumElts));

27017

// <e|f|g|h> => <f|undef|h|undef>

27018

SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,

27019

makeArrayRef(&Mask[0], NumElts));

27020

27021

// Emit two multiplies, one for the lower 2 ints and one for the higher 2

27022

// ints.

27023

MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);

27024

unsigned Opcode =

27025

(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;

27026

// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>

27027

// => <2 x i64> <ae|cg>

27028

SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,

27029

DAG.getBitcast(MulVT, A),

27030

DAG.getBitcast(MulVT, B)));

27031

27032

// => <2 x i64> <bf|dh>

27033

SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,

27034

DAG.getBitcast(MulVT, Odd0),

27035

DAG.getBitcast(MulVT, Odd1)));

27036

27037

// Shuffle it back into the right order.

27038

SmallVector<int, 16> ShufMask(NumElts);

27039

for (int i = 0; i != (int)NumElts; ++i)

27040

ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

27041

27042

SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

27043

27044

// If we have a signed multiply but no PMULDQ fix up the result of an

27045

// unsigned multiply.

27046

if (IsSigned && !Subtarget.hasSSE41()) {

27047

SDValue Zero = DAG.getConstant(0, dl, VT);

27048

SDValue T1 = DAG.getNode(ISD::AND, dl, VT,

27049

DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);

27050

SDValue T2 = DAG.getNode(ISD::AND, dl, VT,

27051

DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);

27052

27053

SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);

27054

Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);

27055

}

27056

27057

return Res;

27058

}

27059

27060

// Only i8 vectors should need custom lowering after this.

27061

assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||(((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget
.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI
())) && "Unsupported vector type") ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27063, __PRETTY_FUNCTION__))

27062

(VT == MVT::v64i8 && Subtarget.hasBWI())) &&(((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget
.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI
())) && "Unsupported vector type") ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27063, __PRETTY_FUNCTION__))

27063

"Unsupported vector type")(((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget
.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI
())) && "Unsupported vector type") ? static_cast<void
> (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27063, __PRETTY_FUNCTION__));

27064

27065

// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,

27066

// logical shift down the upper half and pack back to i8.

27067

27068

// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack

27069

// and then ashr/lshr the upper bits down to the lower bits before multiply.

27070

unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

27071

27072

if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||

27073

(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {

27074

MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);

27075

SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);

27076

SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);

27077

SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);

27078

Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);

27079

return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);

27080

}

27081

27082

// For vXi8 we will unpack the low and high half of each 128 bit lane to widen

27083

// to a vXi16 type. Do the multiplies, shift the results and pack the half

27084

// lane results back together.

27085

27086

MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

27087

27088

static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,

27089

-1, -1, -1, -1, -1, -1, -1, -1};

27090

27091

// Extract the lo parts and zero/sign extend to i16.

27092

// Only use SSE4.1 instructions for signed v16i8 where using unpack requires

27093

// shifts to sign extend. Using unpack for unsigned only requires an xor to

27094

// create zeros and a copy due to tied register constraints pre-AVX. But using

27095

// zero_extend_vector_inreg would require an additional pshufd for the high

27096

// part.

27097

27098

SDValue ALo, AHi;

27099

if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {

27100

ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);

27101

27102

AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);

27103

AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);

27104

} else if (IsSigned) {

27105

ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));

27106

AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));

27107

27108

ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);

27109

AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);

27110

} else {

27111

ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,

27112

DAG.getConstant(0, dl, VT)));

27113

AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,

27114

DAG.getConstant(0, dl, VT)));

27115

}

27116

27117

SDValue BLo, BHi;

27118

if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {

27119

// If the RHS is a constant, manually unpackl/unpackh and extend.

27120

SmallVector<SDValue, 16> LoOps, HiOps;

27121

for (unsigned i = 0; i != NumElts; i += 16) {

27122

for (unsigned j = 0; j != 8; ++j) {

27123

SDValue LoOp = B.getOperand(i + j);

27124

SDValue HiOp = B.getOperand(i + j + 8);

27125

27126

if (IsSigned) {

27127

LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);

27128

HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);

27129

} else {

27130

LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);

27131

HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);

27132

}

27133

27134

LoOps.push_back(LoOp);

27135

HiOps.push_back(HiOp);

27136

}

27137

}

27138

27139

BLo = DAG.getBuildVector(ExVT, dl, LoOps);

27140

BHi = DAG.getBuildVector(ExVT, dl, HiOps);

27141

} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {

27142

BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);

27143

27144

BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);

27145

BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);

27146

} else if (IsSigned) {

27147

BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));

27148

BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));

27149

27150

BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);

27151

BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);

27152

} else {

27153

BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,

27154

DAG.getConstant(0, dl, VT)));

27155

BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,

27156

DAG.getConstant(0, dl, VT)));

27157

}

27158

27159

// Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and

27160

// pack back to vXi8.

27161

SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);

27162

SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);

27163

RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);

27164

RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);

27165

27166

// Bitcast back to VT and then pack all the even elements from Lo and Hi.

27167

return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);

27168

}

27169

27170

/// Lower a 128-bit integer SDIV/UDIV/SREM/UREM on Win64 to a runtime library
/// call.
///
/// Every operand is stored to its own 16-byte aligned stack temporary and the
/// address of that temporary is passed as the argument. The call is built with
/// a v2i64 return type and its result is bitcast back to the node's type.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  // Select the libcall and the signedness of its result from the opcode.
  RTLIB::Libcall Call;
  bool SignedResult;
  switch (Op->getOpcode()) {
  case ISD::SDIV:
    Call = RTLIB::SDIV_I128;
    SignedResult = true;
    break;
  case ISD::UDIV:
    Call = RTLIB::UDIV_I128;
    SignedResult = false;
    break;
  case ISD::SREM:
    Call = RTLIB::SREM_I128;
    SignedResult = true;
    break;
  case ISD::UREM:
    Call = RTLIB::UREM_I128;
    SignedResult = false;
    break;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }

  SDLoc dl(Op);
  SDValue Chain = DAG.getEntryNode();

  // Spill each operand and record a pointer-typed argument for it. The stores
  // are threaded onto a single chain that becomes the call's input chain.
  TargetLowering::ArgListTy Args;
  for (unsigned Idx = 0, NumOps = Op->getNumOperands(); Idx != NumOps; ++Idx) {
    SDValue Arg = Op->getOperand(Idx);
    EVT ArgVT = Arg.getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");

    SDValue Slot = DAG.CreateStackTemporary(ArgVT, 16);
    int FrameIdx = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();
    MachinePointerInfo SlotInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
    Chain = DAG.getStore(Chain, dl, Arg, Slot, SlotInfo, Align(16));

    TargetLowering::ArgListEntry Entry;
    Entry.Node = Slot;
    Entry.Ty = PointerType::get(ArgVT.getTypeForEVT(*DAG.getContext()), 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call),
                                         getPointerTy(DAG.getDataLayout()));
  Type *RetTy = static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext());

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(getLibcallCallingConv(Call), RetTy, Callee,
                    std::move(Args))
      .setInRegister()
      .setSExtResult(SignedResult)
      .setZExtResult(!SignedResult);

  // Only the returned value is needed; the output chain is dropped.
  return DAG.getBitcast(VT, LowerCallTo(CLI).first);
}

27226

27227

// Return true if the required (according to Opcode) shift-imm form is natively

27228

// supported by the Subtarget

27229

static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,

27230

unsigned Opcode) {

27231

if (VT.getScalarSizeInBits() < 16)

27232

return false;

27233

27234

if (VT.is512BitVector() && Subtarget.hasAVX512() &&

27235

(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))

27236

return true;

27237

27238

bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||

27239

(VT.is256BitVector() && Subtarget.hasInt256());

27240

27241

bool AShift = LShift && (Subtarget.hasAVX512() ||

27242

(VT != MVT::v2i64 && VT != MVT::v4i64));

27243

return (Opcode == ISD::SRA) ? AShift : LShift;

27244

}

27245

27246

// The shift amount is a variable, but it is the same for all vector lanes.

27247

// These instructions are defined together with shift-immediate.

27248

static

27249

bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,

27250

unsigned Opcode) {

27251

return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);

27252

}

27253

27254

// Return true if the required (according to Opcode) variable-shift form is

27255

// natively supported by the Subtarget

27256

static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,

27257

unsigned Opcode) {

27258

27259

if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)

27260

return false;

27261

27262

// vXi16 supported only on AVX-512, BWI

27263

if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())

27264

return false;

27265

27266

if (Subtarget.hasAVX512())

27267

return true;

27268

27269

bool LShift = VT.is128BitVector() || VT.is256BitVector();

27270

bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;

27271

return (Opcode == ISD::SRA) ? AShift : LShift;

27272

}

27273

27274

/// Try to lower a vector shift (SHL/SRL/SRA) whose amount is a constant splat.
///
/// Returns an empty SDValue when the amount is not a constant splat or when
/// none of the strategies below applies, and UNDEF when the splat amount is
/// greater than or equal to the element width.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  const unsigned Opc = Op.getOpcode();
  unsigned X86Opc = getTargetVShiftUniformOpcode(Opc, false);

  // Emulate a 64-bit element arithmetic right shift by operating on the
  // 32-bit halves of each element and blending the partial results.
  auto LowerSRA64 = [&](uint64_t ShAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
    }

    if (ShAmt >= 32) {
      // The upper i32 of the result is the splatted sign; the lower i32 is
      // the source's upper i32 arithmetically shifted by the remainder.
      SDValue SignHalf =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue ShiftedHalf = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT,
                                                       Ex, ShAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, SignHalf, ShiftedHalf,
                                  {5, 1, 7, 3});
      else if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, SignHalf, ShiftedHalf,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // Arithmetic-shift the i32 halves for the upper part, logically shift
      // the whole i64 for the lower part, then take the lower i32 from the
      // latter.
      SDValue HiPart = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                  ShAmt, DAG);
      SDValue LoPart =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShAmt, DAG);
      LoPart = DAG.getBitcast(ExVT, LoPart);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, HiPart, LoPart, {4, 1, 6, 3});
      else if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, HiPart, LoPart,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };

  // Only constant splat amounts are handled here.
  APInt SplatAmt;
  if (!X86::isConstantSplat(Amt, SplatAmt))
    return SDValue();

  // If the shift amount is out of range, return undef.
  if (SplatAmt.uge(VT.getScalarSizeInBits()))
    return DAG.getUNDEF(VT);

  uint64_t ShiftAmt = SplatAmt.getZExtValue();

  // Directly supported: emit the target shift-by-immediate node.
  if (SupportedVectorShiftWithImm(VT, Subtarget, Opc))
    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

  // i64 SRA needs to be performed as partial shifts.
  const bool IsEmulatedSRA64Type =
      (!Subtarget.hasXOP() && VT == MVT::v2i64) ||
      (Subtarget.hasInt256() && VT == MVT::v4i64);
  if (IsEmulatedSRA64Type && Opc == ISD::SRA)
    return LowerSRA64(ShiftAmt);

  const bool IsByteVector = VT == MVT::v16i8 ||
                            (Subtarget.hasInt256() && VT == MVT::v32i8) ||
                            (Subtarget.hasBWI() && VT == MVT::v64i8);
  if (!IsByteVector)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

  // A left shift by one is just R + R.
  if (Opc == ISD::SHL && ShiftAmt == 1)
    return DAG.getNode(ISD::ADD, dl, VT, R, R);

  // ashr(R, 7) === cmp_slt(R, 0)
  if (Opc == ISD::SRA && ShiftAmt == 7) {
    SDValue Zeros = DAG.getConstant(0, dl, VT);
    if (VT.is512BitVector()) {
      assert(VT == MVT::v64i8 && "Unexpected element type!");
      SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
    }
    return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
  }

  // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
  if (VT == MVT::v16i8 && Subtarget.hasXOP())
    return SDValue();

  switch (Opc) {
  case ISD::SHL: {
    // Shift as i16 elements, then clear the low bits of every byte that were
    // shifted in from the neighbouring byte.
    SDValue Wide = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
                                              ShiftAmt, DAG);
    Wide = DAG.getBitcast(VT, Wide);
    APInt KeepMask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
    return DAG.getNode(ISD::AND, dl, VT, Wide,
                       DAG.getConstant(KeepMask, dl, VT));
  }
  case ISD::SRL: {
    // Shift as i16 elements, then clear the high bits of every byte that were
    // shifted in from the neighbouring byte.
    SDValue Wide = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
                                              ShiftAmt, DAG);
    Wide = DAG.getBitcast(VT, Wide);
    return DAG.getNode(ISD::AND, dl, VT, Wide,
                       DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
  }
  case ISD::SRA: {
    // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
    SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

    SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
    Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
    Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
    return Res;
  }
  default:
    llvm_unreachable("Unknown shift opcode.");
  }
}

27397

27398

/// Try to lower a vector shift whose (non-constant) amount is the same in
/// every lane.
///
/// Two situations are recognised:
///  * the amount vector is a splat according to DAG.getSplatValue(), and
///  * for v2i64, the amount is a bitcast of a BUILD_VECTOR whose operands
///    repeat with the period of one 64-bit element (this mainly shows up on
///    32-bit targets where i64 is expanded into high and low parts).
///
/// Returns an empty SDValue when neither applies or the subtarget lacks the
/// required uniform-shift support.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);

  if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
      MVT EltVT = VT.getVectorElementType();
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      // Amounts for elements narrower than i32 are widened to i32. No integer
      // element type lies strictly between i32 and i64, so no other widening
      // case exists.
      if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }

    // vXi8 shifts - shift as v8i16 + mask result.
    if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
         (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
         VT == MVT::v64i8) &&
        !Subtarget.hasXOP()) {
      unsigned NumElts = VT.getVectorNumElements();
      MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
      if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
        // SRA is performed as a logical right shift followed by a sign fixup.
        unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
        unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

        // Create the mask using vXi16 shifts. For shift-rights we need to move
        // the upper byte down before splatting the vXi8 mask.
        SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
        BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
                                      BaseShAmt, Subtarget, DAG);
        if (Opcode != ISD::SHL)
          BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
                                               8, DAG);
        BitMask = DAG.getBitcast(VT, BitMask);
        BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
                                       SmallVector<int, 64>(NumElts, 0));

        // Shift the data as i16 elements and mask off the bits that crossed
        // a byte boundary.
        SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
                                          DAG.getBitcast(ExtVT, R), BaseShAmt,
                                          Subtarget, DAG);
        Res = DAG.getBitcast(VT, Res);
        Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);

        if (Opcode == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
          // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
          SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
          SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
                                         BaseShAmt, Subtarget, DAG);
          SignMask = DAG.getBitcast(VT, SignMask);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
        }
        return Res;
      }
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    // Number of BUILD_VECTOR operands that make up one 64-bit shift amount.
    unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();

    // Every later group of Ratio operands must repeat the first group;
    // otherwise the 64-bit amounts differ between lanes.
    for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio)
      for (unsigned j = 0; j != Ratio; ++j)
        if (Amt.getOperand(j) != Amt.getOperand(i + j))
          return SDValue();

    // Amt now refers to the BUILD_VECTOR, so the shift node is built from the
    // original (bitcast) amount operand.
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}

27484

27485

// Convert a shift/rotate left amount to a multiplication scale factor.

27486

static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,

27487

const X86Subtarget &Subtarget,

27488

SelectionDAG &DAG) {

27489

MVT VT = Amt.getSimpleValueType();

27490

if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||

27491

(Subtarget.hasInt256() && VT == MVT::v16i16) ||

27492

(!Subtarget.hasAVX512() && VT == MVT::v16i8)))

27493

return SDValue();

27494

27495

if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {

27496

SmallVector<SDValue, 8> Elts;

27497

MVT SVT = VT.getVectorElementType();

27498

unsigned SVTBits = SVT.getSizeInBits();

27499

APInt One(SVTBits, 1);

27500

unsigned NumElems = VT.getVectorNumElements();

27501

27502

for (unsigned i = 0; i != NumElems; ++i) {

27503

SDValue Op = Amt->getOperand(i);

27504

if (Op->isUndef()) {

27505

Elts.push_back(Op);

27506

continue;

27507

}

27508

27509

ConstantSDNode *ND = cast<ConstantSDNode>(Op);

27510

APInt C(SVTBits, ND->getZExtValue());

27511

uint64_t ShAmt = C.getZExtValue();

27512

if (ShAmt >= SVTBits) {

27513

Elts.push_back(DAG.getUNDEF(SVT));

27514

continue;

27515

}

27516

Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));

27517

}

27518

return DAG.getBuildVector(VT, dl, Elts);

27519

}

27520

27521

// If the target doesn't support variable shifts, use either FP conversion

27522

// or integer multiplication to avoid shifting each element individually.

27523

if (VT == MVT::v4i32) {

27524

Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

27525

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,

27526

DAG.getConstant(0x3f800000U, dl, VT));

27527

Amt = DAG.getBitcast(MVT::v4f32, Amt);

27528

return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);

27529

}

27530

27531

// AVX2 can more effectively perform this as a zext/trunc to/from v8i32.

27532

if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {

27533

SDValue Z = DAG.getConstant(0, dl, VT);

27534

SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));

27535

SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));

27536

Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);

27537

Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);

27538

if (Subtarget.hasSSE41())

27539

return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

27540

27541

return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),

27542

DAG.getBitcast(VT, Hi),

27543

{0, 2, 4, 6, 8, 10, 12, 14});

27544

}

27545

27546

return SDValue();

27547

}

27548

27549

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,

27550

SelectionDAG &DAG) {

27551

MVT VT = Op.getSimpleValueType();

27552

SDLoc dl(Op);

27553

SDValue R = Op.getOperand(0);

27554

SDValue Amt = Op.getOperand(1);

27555

unsigned EltSizeInBits = VT.getScalarSizeInBits();

27556

bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

27557

27558

unsigned Opc = Op.getOpcode();

27559

unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);

27560

unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);

27561

27562

assert(VT.isVector() && "Custom lowering only for vector shifts!")((VT.isVector() && "Custom lowering only for vector shifts!"
) ? static_cast<void> (0) : __assert_fail ("VT.isVector() && \"Custom lowering only for vector shifts!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27562, __PRETTY_FUNCTION__));

27563

assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!")((Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Only custom lower when we have SSE2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27563, __PRETTY_FUNCTION__));

27564

27565

if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))

27566

return V;

27567

27568

if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))

27569

return V;

27570

27571

if (SupportedVectorVarShift(VT, Subtarget, Opc))

27572

return Op;

27573

27574

// XOP has 128-bit variable logical/arithmetic shifts.

27575

// +ve/-ve Amt = shift left/right.

27576

if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||

27577

VT == MVT::v8i16 || VT == MVT::v16i8)) {

27578

if (Opc == ISD::SRL || Opc == ISD::SRA) {

27579

SDValue Zero = DAG.getConstant(0, dl, VT);

27580

Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);

27581

}

27582

if (Opc == ISD::SHL || Opc == ISD::SRL)

27583

return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);

27584

if (Opc == ISD::SRA)

27585

return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);

27586

}

27587

27588

// 2i64 vector logical shifts can efficiently avoid scalarization - do the

27589

// shifts per-lane and then shuffle the partial results back together.

27590

if (VT == MVT::v2i64 && Opc != ISD::SRA) {

27591

// Splat the shift amounts so the scalar shifts above will catch it.

27592

SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});

27593

SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});

27594

SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);

27595

SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);

27596

return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});

27597

}

27598

27599

// i64 vector arithmetic shift can be emulated with the transform:

27600

// M = lshr(SIGN_MASK, Amt)

27601

// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)

27602

if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&

27603

Opc == ISD::SRA) {

27604

SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);

27605

SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);

27606

R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

27607

R = DAG.getNode(ISD::XOR, dl, VT, R, M);

27608

R = DAG.getNode(ISD::SUB, dl, VT, R, M);

27609

return R;

27610

}

27611

27612

// If possible, lower this shift as a sequence of two shifts by

27613

// constant plus a BLENDing shuffle instead of scalarizing it.

27614

// Example:

27615

// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))

27616

//

27617

// Could be rewritten as:

27618

// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))

27619

//

27620

// The advantage is that the two shifts from the example would be

27621

// lowered as X86ISD::VSRLI nodes in parallel before blending.

27622

if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||

27623

(VT == MVT::v16i16 && Subtarget.hasInt256()))) {

27624

SDValue Amt1, Amt2;

27625

unsigned NumElts = VT.getVectorNumElements();

27626

SmallVector<int, 8> ShuffleMask;

27627

for (unsigned i = 0; i != NumElts; ++i) {

27628

SDValue A = Amt->getOperand(i);

27629

if (A.isUndef()) {

27630

ShuffleMask.push_back(SM_SentinelUndef);

27631

continue;

27632

}

27633

if (!Amt1 || Amt1 == A) {

27634

ShuffleMask.push_back(i);

27635

Amt1 = A;

27636

continue;

27637

}

27638

if (!Amt2 || Amt2 == A) {

27639

ShuffleMask.push_back(i + NumElts);

27640

Amt2 = A;

27641

continue;

27642

}

27643

break;

27644

}

27645

27646

// Only perform this blend if we can perform it without loading a mask.

27647

if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&

27648

(VT != MVT::v16i16 ||

27649

is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&

27650

(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||

27651

canWidenShuffleElements(ShuffleMask))) {

27652

auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);

27653

auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);

27654

if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&

27655

Cst2->getAPIntValue().ult(EltSizeInBits)) {

27656

SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,

27657

Cst1->getZExtValue(), DAG);

27658

SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,

27659

Cst2->getZExtValue(), DAG);

27660

return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);

27661

}

27662

}

27663

}

27664

27665

// If possible, lower this packed shift into a vector multiply instead of

27666

// expanding it into a sequence of scalar shifts.

27667

if (Opc == ISD::SHL)

27668

if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))

27669

return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

27670

27671

// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we

27672

// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).

27673

if (Opc == ISD::SRL && ConstantAmt &&

27674

(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {

27675

SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);

27676

SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);

27677

if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {

27678

SDValue Zero = DAG.getConstant(0, dl, VT);

27679

SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);

27680

SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);

27681

return DAG.getSelect(dl, VT, ZAmt, R, Res);

27682

}

27683

}

27684

27685

// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we

27686

// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).

27687

// TODO: Special case handling for shift by 0/1, really we can afford either

27688

// of these cases in pre-SSE41/XOP/AVX512 but not both.

27689

if (Opc == ISD::SRA && ConstantAmt &&

27690

(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&

27691

((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&

27692

!Subtarget.hasAVX512()) ||

27693

DAG.isKnownNeverZero(Amt))) {

27694

SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);

27695

SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);

27696

if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {

27697

SDValue Amt0 =

27698

DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);

27699

SDValue Amt1 =

27700

DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);

27701

SDValue Sra1 =

27702

getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);

27703

SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);

27704

Res = DAG.getSelect(dl, VT, Amt0, R, Res);

27705

return DAG.getSelect(dl, VT, Amt1, Sra1, Res);

27706

}

27707

}

27708

27709

// v4i32 Non Uniform Shifts.

27710

// If the shift amount is constant we can shift each lane using the SSE2

27711

// immediate shifts, else we need to zero-extend each lane to the lower i64

27712

// and shift using the SSE2 variable shifts.

27713

// The separate results can then be blended together.

27714

if (VT == MVT::v4i32) {

27715

SDValue Amt0, Amt1, Amt2, Amt3;

27716

if (ConstantAmt) {

27717

Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});

27718

Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});

27719

Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});

27720

Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});

27721

} else {

27722

// The SSE2 shifts use the lower i64 as the same shift amount for

27723

// all lanes and the upper i64 is ignored. On AVX we're better off

27724

// just zero-extending, but for SSE just duplicating the top 16-bits is

27725

// cheaper and has the same effect for out of range values.

27726

if (Subtarget.hasAVX()) {

27727

SDValue Z = DAG.getConstant(0, dl, VT);

27728

Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});

27729

Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});

27730

Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});

27731

Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});

27732

} else {

27733

SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);

27734

SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,

27735

{4, 5, 6, 7, -1, -1, -1, -1});

27736

Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,

27737

{0, 1, 1, 1, -1, -1, -1, -1});

27738

Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,

27739

{2, 3, 3, 3, -1, -1, -1, -1});

27740

Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,

27741

{0, 1, 1, 1, -1, -1, -1, -1});

27742

Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,

27743

{2, 3, 3, 3, -1, -1, -1, -1});

27744

}

27745

}

27746

27747

unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;

27748

SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));

27749

SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));

27750

SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));

27751

SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));

27752

27753

// Merge the shifted lane results optimally with/without PBLENDW.

27754

// TODO - ideally shuffle combining would handle this.

27755

if (Subtarget.hasSSE41()) {

27756

SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});

27757

SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});

27758

return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});

27759

}

27760

SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});

27761

SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});

27762

return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});

27763

}

27764

27765

// It's worth extending once and using the vXi16/vXi32 shifts for smaller

27766

// types, but without AVX512 the extra overheads to get from vXi8 to vXi32

27767

// make the existing SSE solution better.

27768

// NOTE: We honor preferred vector width before promoting to 512-bits.

27769

if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||

27770

(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||

27771

(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||

27772

(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||

27773

(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {

27774

assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&(((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8
) && "Unexpected vector type") ? static_cast<void>
(0) : __assert_fail ("(!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27775, __PRETTY_FUNCTION__))

27775

"Unexpected vector type")(((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8
) && "Unexpected vector type") ? static_cast<void>
(0) : __assert_fail ("(!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27775, __PRETTY_FUNCTION__));

27776

MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;

27777

MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());

27778

unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

27779

R = DAG.getNode(ExtOpc, dl, ExtVT, R);

27780

Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);

27781

return DAG.getNode(ISD::TRUNCATE, dl, VT,

27782

DAG.getNode(Opc, dl, ExtVT, R, Amt));

27783

}

27784

27785

// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we

27786

// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.

27787

if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&

27788

(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||

27789

(VT == MVT::v64i8 && Subtarget.hasBWI())) &&

27790

!Subtarget.hasXOP()) {

27791

int NumElts = VT.getVectorNumElements();

27792

SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);

27793

27794

// Extend constant shift amount to vXi16 (it doesn't matter if the type

27795

// isn't legal).

27796

MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);

27797

Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);

27798

Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);

27799

Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);

27800

assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&((ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected") ? static_cast<void> (
0) : __assert_fail ("ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) && \"Constant build vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27801, __PRETTY_FUNCTION__))

27801

"Constant build vector expected")((ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected") ? static_cast<void> (
0) : __assert_fail ("ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) && \"Constant build vector expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27801, __PRETTY_FUNCTION__));

27802

27803

if (VT == MVT::v16i8 && Subtarget.hasInt256()) {

27804

R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)

27805

: DAG.getZExtOrTrunc(R, dl, ExVT);

27806

R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);

27807

R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);

27808

return DAG.getZExtOrTrunc(R, dl, VT);

27809

}

27810

27811

SmallVector<SDValue, 16> LoAmt, HiAmt;

27812

for (int i = 0; i != NumElts; i += 16) {

27813

for (int j = 0; j != 8; ++j) {

27814

LoAmt.push_back(Amt.getOperand(i + j));

27815

HiAmt.push_back(Amt.getOperand(i + j + 8));

27816

}

27817

}

27818

27819

MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);

27820

SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);

27821

SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

27822

27823

SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));

27824

SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));

27825

LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);

27826

HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);

27827

LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);

27828

HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);

27829

LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);

27830

HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);

27831

return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);

27832

}

27833

27834

if (VT == MVT::v16i8 ||

27835

(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||

27836

(VT == MVT::v64i8 && Subtarget.hasBWI())) {

27837

MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

27838

27839

auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {

27840

if (VT.is512BitVector()) {

27841

// On AVX512BW targets we make use of the fact that VSELECT lowers

27842

// to a masked blend which selects bytes based just on the sign bit

27843

// extracted to a mask.

27844

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

27845

V0 = DAG.getBitcast(VT, V0);

27846

V1 = DAG.getBitcast(VT, V1);

27847

Sel = DAG.getBitcast(VT, Sel);

27848

Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,

27849

ISD::SETGT);

27850

return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));

27851

} else if (Subtarget.hasSSE41()) {

27852

// On SSE41 targets we can use PBLENDVB which selects bytes based just

27853

// on the sign bit.

27854

V0 = DAG.getBitcast(VT, V0);

27855

V1 = DAG.getBitcast(VT, V1);

27856

Sel = DAG.getBitcast(VT, Sel);

27857

return DAG.getBitcast(SelVT,

27858

DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));

27859

}

27860

// On pre-SSE41 targets we test for the sign bit by comparing to

27861

// zero - a negative value will set all bits of the lanes to true

27862

// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.

27863

SDValue Z = DAG.getConstant(0, dl, SelVT);

27864

SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);

27865

return DAG.getSelect(dl, SelVT, C, V0, V1);

27866

};

27867

27868

// Turn 'a' into a mask suitable for VSELECT: a = a << 5;

27869

// We can safely do this using i16 shifts as we're only interested in

27870

// the 3 lower bits of each byte.

27871

Amt = DAG.getBitcast(ExtVT, Amt);

27872

Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);

27873

Amt = DAG.getBitcast(VT, Amt);

27874

27875

if (Opc == ISD::SHL || Opc == ISD::SRL) {

27876

// r = VSELECT(r, shift(r, 4), a);

27877

SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));

27878

R = SignBitSelect(VT, Amt, M, R);

27879

27880

// a += a

27881

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

27882

27883

// r = VSELECT(r, shift(r, 2), a);

27884

M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));

27885

R = SignBitSelect(VT, Amt, M, R);

27886

27887

// a += a

27888

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

27889

27890

// return VSELECT(r, shift(r, 1), a);

27891

M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));

27892

R = SignBitSelect(VT, Amt, M, R);

27893

return R;

27894

}

27895

27896

if (Opc == ISD::SRA) {

27897

// For SRA we need to unpack each byte to the higher byte of a i16 vector

27898

// so we can correctly sign extend. We don't care what happens to the

27899

// lower byte.

27900

SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);

27901

SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);

27902

SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);

27903

SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);

27904

ALo = DAG.getBitcast(ExtVT, ALo);

27905

AHi = DAG.getBitcast(ExtVT, AHi);

27906

RLo = DAG.getBitcast(ExtVT, RLo);

27907

RHi = DAG.getBitcast(ExtVT, RHi);

27908

27909

// r = VSELECT(r, shift(r, 4), a);

27910

SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);

27911

SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);

27912

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

27913

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

27914

27915

// a += a

27916

ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);

27917

AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

27918

27919

// r = VSELECT(r, shift(r, 2), a);

27920

MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);

27921

MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);

27922

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

27923

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

27924

27925

// a += a

27926

ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);

27927

AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

27928

27929

// r = VSELECT(r, shift(r, 1), a);

27930

MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);

27931

MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);

27932

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

27933

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

27934

27935

// Logical shift the result back to the lower byte, leaving a zero upper

27936

// byte meaning that we can safely pack with PACKUSWB.

27937

RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);

27938

RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);

27939

return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);

27940

}

27941

}

27942

27943

if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {

27944

MVT ExtVT = MVT::v8i32;

27945

SDValue Z = DAG.getConstant(0, dl, VT);

27946

SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);

27947

SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);

27948

SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);

27949

SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);

27950

ALo = DAG.getBitcast(ExtVT, ALo);

27951

AHi = DAG.getBitcast(ExtVT, AHi);

27952

RLo = DAG.getBitcast(ExtVT, RLo);

27953

RHi = DAG.getBitcast(ExtVT, RHi);

27954

SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);

27955

SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);

27956

Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);

27957

Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);

27958

return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

27959

}

27960

27961

if (VT == MVT::v8i16) {

27962

// If we have a constant shift amount, the non-SSE41 path is best as

27963

// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.

27964

bool UseSSE41 = Subtarget.hasSSE41() &&

27965

!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

27966

27967

auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {

27968

// On SSE41 targets we can use PBLENDVB which selects bytes based just on

27969

// the sign bit.

27970

if (UseSSE41) {

27971

MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);

27972

V0 = DAG.getBitcast(ExtVT, V0);

27973

V1 = DAG.getBitcast(ExtVT, V1);

27974

Sel = DAG.getBitcast(ExtVT, Sel);

27975

return DAG.getBitcast(

27976

VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));

27977

}

27978

// On pre-SSE41 targets we splat the sign bit - a negative value will

27979

// set all bits of the lanes to true and VSELECT uses that in

27980

// its OR(AND(V0,C),AND(V1,~C)) lowering.

27981

SDValue C =

27982

getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);

27983

return DAG.getSelect(dl, VT, C, V0, V1);

27984

};

27985

27986

// Turn 'a' into a mask suitable for VSELECT: a = a << 12;

27987

if (UseSSE41) {

27988

// On SSE41 targets we need to replicate the shift mask in both

27989

// bytes for PBLENDVB.

27990

Amt = DAG.getNode(

27991

ISD::OR, dl, VT,

27992

getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),

27993

getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));

27994

} else {

27995

Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);

27996

}

27997

27998

// r = VSELECT(r, shift(r, 8), a);

27999

SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);

28000

R = SignBitSelect(Amt, M, R);

28001

28002

// a += a

28003

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

28004

28005

// r = VSELECT(r, shift(r, 4), a);

28006

M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);

28007

R = SignBitSelect(Amt, M, R);

28008

28009

// a += a

28010

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

28011

28012

// r = VSELECT(r, shift(r, 2), a);

28013

M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);

28014

R = SignBitSelect(Amt, M, R);

28015

28016

// a += a

28017

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

28018

28019

// return VSELECT(r, shift(r, 1), a);

28020

M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);

28021

R = SignBitSelect(Amt, M, R);

28022

return R;

28023

}

28024

28025

// Decompose 256-bit shifts into 128-bit shifts.

28026

if (VT.is256BitVector())

28027

return splitVectorIntBinary(Op, DAG);

28028

28029

if (VT == MVT::v32i16 || VT == MVT::v64i8)

28030

return splitVectorIntBinary(Op, DAG);

28031

28032

return SDValue();

28033

}

28034

28035

/// Custom-lower a vector rotate node (ISD::ROTL, or ISD::ROTR on the AVX512
/// path) for X86.
///
/// \param Op        The rotate node. Operand 0 is the value being rotated and
///                  operand 1 is the per-element rotation amount.
/// \param Subtarget Queried for AVX512 / XOP / AVX2 / SSE4.1 availability to
///                  pick a lowering strategy.
/// \param DAG       The DAG in which all replacement nodes are created.
/// \returns The lowered value; \p Op itself when the node is left unchanged
///          (AVX512 variable rotates and XOP per-element rotates); or an empty
///          SDValue() when no custom lowering is produced here (uniform
///          constant amounts and constant vXi8 amounts).
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.isVector() && "Custom lowering only for vector rotates!");

  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  int NumElts = VT.getVectorNumElements();

  // Check for constant splat rotation amount.
  APInt CstSplatValue;
  bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);

  // Rotation amounts are taken modulo the element width; reduce the splat
  // constant once so every immediate path below shares the same value.
  uint64_t CstRotAmt = IsCstSplat ? CstSplatValue.urem(EltSizeInBits) : 0;

  // Check for splat rotate by zero.
  if (IsCstSplat && CstRotAmt == 0)
    return R;

  // AVX512 implicitly uses modulo rotation amounts.
  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
    // Attempt to rotate by immediate.
    if (IsCstSplat) {
      unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
      return DAG.getNode(RotOpc, DL, VT, R,
                         DAG.getTargetConstant(CstRotAmt, DL, MVT::i8));
    }

    // Else, fall-back on VPROLV/VPRORV.
    return Op;
  }

  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
  // XOP implicitly uses modulo rotation amounts.
  if (Subtarget.hasXOP()) {
    if (VT.is256BitVector())
      return splitVectorIntBinary(Op, DAG);
    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

    // Attempt to rotate by immediate.
    if (IsCstSplat)
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getTargetConstant(CstRotAmt, DL, MVT::i8));

    // Use general rotate by variable (per-element).
    return Op;
  }

  // Split 256-bit integers on pre-AVX2 targets.
  if (VT.is256BitVector() && !Subtarget.hasAVX2())
    return splitVectorIntBinary(Op, DAG);

  assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
          ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
           Subtarget.hasAVX2())) &&
         "Only vXi32/vXi16/vXi8 vector rotates supported");

  // Rotate by a uniform constant - expand back to shifts.
  if (IsCstSplat)
    return SDValue();

  bool IsSplatAmt = DAG.isSplatValue(Amt);

  // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
  // the amount bit.
  if (EltSizeInBits == 8 && !IsSplatAmt) {
    if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
      return SDValue();

    // We don't need ModuloAmt here as we just peek at individual bits.
    MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // Select V0 where the sign bit of the corresponding Sel lane is set,
    // otherwise V1.
    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (Subtarget.hasSSE41()) {
        // On SSE41 targets we can use PBLENDVB which selects bytes based just
        // on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT,
                              DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = DAG.getConstant(0, DL, SelVT);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
      return DAG.getSelect(DL, SelVT, C, V0, V1);
    };

    // Rotate every byte of V left by the constant Imm, built as
    // OR(SHL(V, Imm), SRL(V, 8 - Imm)).
    auto RotateBytesLeft = [&](SDValue V, unsigned Imm) {
      SDValue Shl =
          DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Imm, DL, VT));
      SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, V,
                                DAG.getConstant(EltSizeInBits - Imm, DL, VT));
      return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    // r = VSELECT(r, rot(r, 4), a);
    SDValue M = RotateBytesLeft(R, 4);
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // r = VSELECT(r, rot(r, 2), a);
    M = RotateBytesLeft(R, 2);
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // return VSELECT(r, rot(r, 1), a);
    M = RotateBytesLeft(R, 1);
    return SignBitSelect(VT, Amt, M, R);
  }

  // ISD::ROT* uses modulo rotate amounts.
  Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
                    DAG.getConstant(EltSizeInBits - 1, DL, VT));

  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
  bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
                        SupportedVectorVarShift(VT, Subtarget, ISD::SRL);

  // Fallback for splats + all supported variable shifts.
  // Fallback for non-constants AVX2 vXi16 as well.
  if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
    // rotl(R, Amt) == OR(SHL(R, Amt), SRL(R, EltSizeInBits - Amt)).
    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
  }

  // As with shifts, convert the rotation amount to a multiplication factor.
  SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
  assert(Scale && "Failed to convert ROTL amount to scale");

  // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
  if (EltSizeInBits == 16) {
    SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
    SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
    return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
  }

  // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
  // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
  // that can then be OR'd with the lower 32-bits.
  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
  static const int OddMask[] = {1, -1, 3, -1};
  SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
  SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);

  SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R),
                              DAG.getBitcast(MVT::v2i64, Scale));
  SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R13),
                              DAG.getBitcast(MVT::v2i64, Scale13));
  Res02 = DAG.getBitcast(VT, Res02);
  Res13 = DAG.getBitcast(VT, Res13);

  // Gather the low 32-bit halves and the high (wrapped) 32-bit halves of the
  // four products back into lane order, then OR them together.
  return DAG.getNode(ISD::OR, DL, VT,
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}

28218

28219

/// Returns true if the operand type is exactly twice the native width, and

28220

/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.

28221

/// Used to know whether to use cmpxchg8/16b when expanding atomic operations

28222

/// (otherwise we leave them alone to become __sync_fetch_and_... calls).

28223

bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {

28224

unsigned OpWidth = MemType->getPrimitiveSizeInBits();

28225

28226

if (OpWidth == 64)

28227

return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();

28228

if (OpWidth == 128)

28229

return Subtarget.hasCmpxchg16b();

28230

28231

return false;

28232

}

28233

28234

/// Return true if the atomic store \p SI must be expanded at the IR level
/// (i.e. it needs cmpxchg8b/16b, see needsCmpXchgNb).
///
/// A 64-bit store on a 32-bit subtarget is never expanded when implicit
/// floating-point use is permitted (no soft-float, no NoImplicitFloat
/// attribute on the function) and SSE1 or x87 is available.
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  Type *ValTy = SI->getValueOperand()->getType();

  const bool NoImplicitFloat =
      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
  const bool Is64BitValOn32BitTarget =
      ValTy->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit();
  const bool MayUseFPRegs = !Subtarget.useSoftFloat() && !NoImplicitFloat &&
                            (Subtarget.hasSSE1() || Subtarget.hasX87());

  if (Is64BitValOn32BitTarget && MayUseFPRegs)
    return false;

  return needsCmpXchgNb(ValTy);
}

28246

28247

// Note: this turns large loads into lock cmpxchg8b/16b.

28248

// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?

28249

TargetLowering::AtomicExpansionKind

28250

X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {

28251

Type *MemType = LI->getType();

28252

28253

// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we

28254

// can use movq to do the load. If we have X87 we can load into an 80-bit

28255

// X87 register and store it to a stack temporary.

28256

bool NoImplicitFloatOps =

28257

LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);

28258

if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&

28259

!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&

28260

(Subtarget.hasSSE1() || Subtarget.hasX87()))

28261

return AtomicExpansionKind::None;

28262

28263

return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg

28264

: AtomicExpansionKind::None;

28265

}

28266

28267

/// Choose the IR-level expansion strategy for the atomic read-modify-write
/// instruction \p AI.
///
/// Returns AtomicExpansionKind::CmpXChg when the operation has to be rewritten
/// as a compare-exchange loop and AtomicExpansionKind::None when it is left
/// untouched.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  const unsigned NativeBits = Subtarget.is64Bit() ? 64 : 32;
  Type *ValTy = AI->getType();

  // Operands wider than a native register: use cmpxchg8b/16b if it is
  // available, otherwise leave the instruction alone so that it defaults to a
  // library call.
  if (ValTy->getPrimitiveSizeInBits() > NativeBits) {
    if (needsCmpXchgNb(ValTy))
      return AtomicExpansionKind::CmpXChg;
    return AtomicExpansionKind::None;
  }

  switch (AI->getOperation()) {
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;

  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // With an unused result a "lock"-prefixed normal instruction is enough;
    // only a used result forces the compare-exchange loop.
    if (AI->use_empty())
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;

  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    // These always require a non-trivial set of data operations on x86, so a
    // cmpxchg loop is mandatory.
    return AtomicExpansionKind::CmpXChg;

  default:
    llvm_unreachable("Unknown atomic operation");
  }
}

28307

28308

/// Try to replace the idempotent atomic read-modify-write \p AI with an
/// mfence followed by an atomic load of the same location.
///
/// On success \p AI is erased, all of its uses are redirected to the new load
/// and that load is returned. Returns nullptr, leaving \p AI untouched, when:
///  - the accessed type is wider than the native register width,
///  - \p AI is an unused `atomicrmw or %p, 0` (handled by lowerAtomicArith),
///  - the sync scope is SyncScope::SingleThread, or
///  - the subtarget has no mfence instruction.
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
  // lowering available in lowerAtomicArith.
  // TODO: push more cases through this path.
  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
        AI->use_empty())
      return nullptr;

  IRBuilder<> Builder(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // AcquireRelease orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load. Align is expressed in bytes, so the
  // natural alignment of the access is its bit width divided by 8; passing the
  // bit width itself would claim an alignment eight times too strong (e.g. 32
  // bytes for an i32).
  const Align NaturalAlign(
      static_cast<uint64_t>(AI->getType()->getPrimitiveSizeInBits()) / 8);
  LoadInst *Loaded = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), NaturalAlign);
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

28374

28375

bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {

28376

if (!SI.isUnordered())

28377

return false;

28378

return ExperimentalUnorderedISEL;

28379

}

28380

bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {

28381

if (!LI.isUnordered())

28382

return false;

28383

return ExperimentalUnorderedISEL;

28384

}

28385

28386

28387

/// Emit a locked operation on a stack location which does not change any

28388

/// memory location, but does involve a lock prefix. Location is chosen to be

28389

/// a) very likely accessed only by a single thread to minimize cache traffic,

28390

/// and b) definitely dereferenceable. Returns the new Chain result.

28391

static SDValue emitLockedStackOp(SelectionDAG &DAG,

28392

const X86Subtarget &Subtarget, SDValue Chain,

28393

const SDLoc &DL) {

28394

// Implementation notes:

28395

// 1) LOCK prefix creates a full read/write reordering barrier for memory

28396

// operations issued by the current processor. As such, the location

28397

// referenced is not relevant for the ordering properties of the instruction.

28398

// See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual,

28399

// 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions

28400

// 2) Using an immediate operand appears to be the best encoding choice

28401

// here since it doesn't require an extra register.

28402

// 3) OR appears to be very slightly faster than ADD. (Though, the difference

28403

// is small enough it might just be measurement noise.)

28404

// 4) When choosing offsets, there are several contributing factors:

28405

// a) If there's no redzone, we default to TOS. (We could allocate a cache

28406

// line aligned stack object to improve this case.)

28407

// b) To minimize our chances of introducing a false dependence, we prefer

28408

// to offset the stack usage from TOS slightly.

28409

// c) To minimize concerns about cross thread stack usage - in particular,

28410

// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which

28411

// captures state in the TOS frame and accesses it from many threads -

28412

// we want to use an offset such that the offset is in a distinct cache

28413

// line from the TOS frame.

28414

//

28415

// For a general discussion of the tradeoffs and benchmark results, see:

28416

// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

28417

28418

auto &MF = DAG.getMachineFunction();

28419

auto &TFL = *Subtarget.getFrameLowering();

28420

const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;

28421

28422

if (Subtarget.is64Bit()) {

28423

SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);

28424

SDValue Ops[] = {

28425

DAG.getRegister(X86::RSP, MVT::i64), // Base

28426

DAG.getTargetConstant(1, DL, MVT::i8), // Scale

28427

DAG.getRegister(0, MVT::i64), // Index

28428

DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp

28429

DAG.getRegister(0, MVT::i16), // Segment.

28430

Zero,

28431

Chain};

28432

SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,

28433

MVT::Other, Ops);

28434

return SDValue(Res, 1);

28435

}

28436

28437

SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);

28438

SDValue Ops[] = {

28439

DAG.getRegister(X86::ESP, MVT::i32), // Base

28440

DAG.getTargetConstant(1, DL, MVT::i8), // Scale

28441

DAG.getRegister(0, MVT::i32), // Index

28442

DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp

28443

DAG.getRegister(0, MVT::i16), // Segment.

28444

Zero,

28445

Chain

28446

};

28447

SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,

28448

MVT::Other, Ops);

28449

return SDValue(Res, 1);

28450

}

28451

28452

/// Lower an atomic fence node. Operand 0 is the incoming chain, operand 1 the
/// atomic ordering and operand 2 the synchronization scope.
///
/// Only a sequentially-consistent, system-scope fence gets a real
/// instruction (MFENCE if available, otherwise a locked stack operation).
/// Every other fence becomes an X86ISD::MEMBARRIER.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  const auto Ordering =
      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
  const auto Scope = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));

  const bool NeedsFenceInstr =
      Ordering == AtomicOrdering::SequentiallyConsistent &&
      Scope == SyncScope::System;

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  if (!NeedsFenceInstr)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain);

  if (Subtarget.hasMFence())
    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain);

  // No MFENCE on this subtarget: fall back to a lock-prefixed stack access.
  return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}

28474

28475

/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 is copied into the accumulator register matching the value type
/// (AL/AX/EAX/RAX), the locked cmpxchg node is built from the chain, operands
/// 1 and 3 (presumably the address and the replacement value -- confirm
/// against the node definition) and the access size in bytes, and the result
/// is read back from the same accumulator register. The returned MERGE_VALUES
/// node carries the loaded value, a COND_E flag test as the success value, and
/// the output chain.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  const MVT ValVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Pick the accumulator register and operand size for the value type.
  unsigned AccReg = 0;
  unsigned SizeInBytes = 0;
  switch (ValVT.SimpleTy) {
  case MVT::i8:
    AccReg = X86::AL;
    SizeInBytes = 1;
    break;
  case MVT::i16:
    AccReg = X86::AX;
    SizeInBytes = 2;
    break;
  case MVT::i32:
    AccReg = X86::EAX;
    SizeInBytes = 4;
    break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    AccReg = X86::RAX;
    SizeInBytes = 8;
    break;
  default:
    llvm_unreachable("Invalid value type!");
  }

  // Move operand 2 into the accumulator; the copy produces chain + glue.
  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
                                    Op.getOperand(2), SDValue());

  SDValue SizeOp = DAG.getTargetConstant(SizeInBytes, DL, MVT::i8);
  SDValue CmpXchgOps[] = {CopyIn.getValue(0), Op.getOperand(1),
                          Op.getOperand(3), SizeOp, CopyIn.getValue(1)};
  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, ResultTys,
                                            CmpXchgOps, ValVT, MMO);

  // Read the accumulator back, then EFLAGS, both glued to the cmpxchg.
  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, ValVT,
                                       CmpXchg.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, CopyOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), CopyOut, Success,
                     EFLAGS.getValue(1));
}

28512

28513

// Create MOVMSKB, taking into account whether we need to split for AVX1.

28514

static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,

28515

const X86Subtarget &Subtarget) {

28516

MVT InVT = V.getSimpleValueType();

28517

28518

if (InVT == MVT::v64i8) {

28519

SDValue Lo, Hi;

28520

std::tie(Lo, Hi) = DAG.SplitVector(V, DL);

28521

Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);

28522

Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);

28523

Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);

28524

Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);

28525

Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,

28526

DAG.getConstant(32, DL, MVT::i8));

28527

return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);

28528

}

28529

if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {

28530

SDValue Lo, Hi;

28531

std::tie(Lo, Hi) = DAG.SplitVector(V, DL);

28532

Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);

28533

Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);

28534

Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,

28535

DAG.getConstant(16, DL, MVT::i8));

28536

return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);

28537

}

28538

28539

return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);

28540

}

28541

28542

/// Custom-lower a BITCAST node. Three situations are handled:
///  1. i64 -> v64i1 (32-bit mode with BWI): split into two i32 halves, bitcast
///     each to v32i1 and concatenate.
///  2. v16i1/v32i1 -> scalar integer (no AVX512): sign-extend to bytes and use
///     MOVMSK.
///  3. v2i32/v4i16/v8i8/i64 sources: only i64 -> f64 and vector -> x86mmx are
///     lowered here; for anything else an empty SDValue is returned so the
///     conversion gets expanded.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  SDValue Src = Op.getOperand(0);
  const MVT SrcVT = Src.getSimpleValueType();
  const MVT DstVT = Op.getSimpleValueType();

  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
  // half to v32i1 and concatenating the result.
  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
    assert(Subtarget.hasBWI() && "Expected BWI target");
    SDLoc dl(Op);
    auto BitcastHalf = [&](unsigned HalfIdx) {
      SDValue Half = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                                 DAG.getIntPtrConstant(HalfIdx, dl));
      return DAG.getBitcast(MVT::v32i1, Half);
    };
    SDValue Lo = BitcastHalf(0);
    SDValue Hi = BitcastHalf(1);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  }

  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
  const bool IsMaskSrc = SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1;
  if (IsMaskSrc && DstVT.isScalarInteger()) {
    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
    SDLoc DL(Op);
    MVT ByteVT = MVT::v32i8;
    if (SrcVT == MVT::v16i1)
      ByteVT = MVT::v16i8;
    SDValue Bytes = DAG.getSExtOrTrunc(Src, DL, ByteVT);
    SDValue Mask = getPMOVMSKB(DL, Bytes, DAG, Subtarget);
    return DAG.getZExtOrTrunc(Mask, DL, DstVT);
  }

  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
          SrcVT == MVT::i64) && "Unexpected VT!");

  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
  const bool IsI64ToF64 = DstVT == MVT::f64 && SrcVT == MVT::i64;
  const bool IsVecToMMX = DstVT == MVT::x86mmx && SrcVT.isVector();
  if (!IsI64ToF64 && !IsVecToMMX)
    // This conversion needs to be expanded.
    return SDValue();

  SDLoc dl(Op);
  if (SrcVT.isVector()) {
    // Widen the input vector by doubling its element count, filling the upper
    // half with undef. Example: from MVT::v2i32 to MVT::v4i32.
    const MVT WideVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
                                        SrcVT.getVectorNumElements() * 2);
    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src,
                      DAG.getUNDEF(SrcVT));
  } else {
    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
           "Unexpected source type in LowerBITCAST");
    Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  }

  // Reinterpret the 128-bit value as two 64-bit lanes of the right flavour.
  MVT TwoLaneVT = MVT::v2i64;
  if (DstVT == MVT::f64)
    TwoLaneVT = MVT::v2f64;
  Src = DAG.getNode(ISD::BITCAST, dl, TwoLaneVT, Src);

  if (DstVT == MVT::x86mmx)
    return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
                     DAG.getIntPtrConstant(0, dl));
}

28605

28606

/// Compute the horizontal sum of bytes in V for the elements of VT.

28607

///

28608

/// Requires V to be a byte vector and VT to be an integer vector type with

28609

/// wider elements than V's type. The width of the elements of VT determines

28610

/// how many bytes of V are summed horizontally to produce each element of the

28611

/// result.

28612

static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,

28613

const X86Subtarget &Subtarget,

28614

SelectionDAG &DAG) {

28615

SDLoc DL(V);

28616

MVT ByteVecVT = V.getSimpleValueType();

28617

MVT EltVT = VT.getVectorElementType();

28618

assert(ByteVecVT.getVectorElementType() == MVT::i8 &&((ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."
) ? static_cast<void> (0) : __assert_fail ("ByteVecVT.getVectorElementType() == MVT::i8 && \"Expected value to have byte element type.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28619, __PRETTY_FUNCTION__))

28619

"Expected value to have byte element type.")((ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."
) ? static_cast<void> (0) : __assert_fail ("ByteVecVT.getVectorElementType() == MVT::i8 && \"Expected value to have byte element type.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28619, __PRETTY_FUNCTION__));

28620

assert(EltVT != MVT::i8 &&((EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"
) ? static_cast<void> (0) : __assert_fail ("EltVT != MVT::i8 && \"Horizontal byte sum only makes sense for wider elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28621, __PRETTY_FUNCTION__))

28621

"Horizontal byte sum only makes sense for wider elements!")((EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"
) ? static_cast<void> (0) : __assert_fail ("EltVT != MVT::i8 && \"Horizontal byte sum only makes sense for wider elements!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28621, __PRETTY_FUNCTION__));

28622

unsigned VecSize = VT.getSizeInBits();

28623

assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!")((ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"
) ? static_cast<void> (0) : __assert_fail ("ByteVecVT.getSizeInBits() == VecSize && \"Cannot change vector size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28623, __PRETTY_FUNCTION__));

28624

28625

// PSADBW instruction horizontally add all bytes and leave the result in i64

28626

// chunks, thus directly computes the pop count for v2i64 and v4i64.

28627

if (EltVT == MVT::i64) {

28628

SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);

28629

MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);

28630

V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);

28631

return DAG.getBitcast(VT, V);

28632

}

28633

28634

if (EltVT == MVT::i32) {

28635

// We unpack the low half and high half into i32s interleaved with zeros so

28636

// that we can use PSADBW to horizontally sum them. The most useful part of

28637

// this is that it lines up the results of two PSADBW instructions to be

28638

// two v2i64 vectors which concatenated are the 4 population counts. We can

28639

// then use PACKUSWB to shrink and concatenate them into a v4i32 again.

28640

SDValue Zeros = DAG.getConstant(0, DL, VT);

28641

SDValue V32 = DAG.getBitcast(VT, V);

28642

SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);

28643

SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);

28644

28645

// Do the horizontal sums into two v2i64s.

28646

Zeros = DAG.getConstant(0, DL, ByteVecVT);

28647

MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);

28648

Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,

28649

DAG.getBitcast(ByteVecVT, Low), Zeros);

28650

High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,

28651

DAG.getBitcast(ByteVecVT, High), Zeros);

28652

28653

// Merge them together.

28654

MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);

28655

V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,

28656

DAG.getBitcast(ShortVecVT, Low),

28657

DAG.getBitcast(ShortVecVT, High));

28658

28659

return DAG.getBitcast(VT, V);

28660

}

28661

28662

// The only element type left is i16.

28663

assert(EltVT == MVT::i16 && "Unknown how to handle type")((EltVT == MVT::i16 && "Unknown how to handle type") ?
static_cast<void> (0) : __assert_fail ("EltVT == MVT::i16 && \"Unknown how to handle type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28663, __PRETTY_FUNCTION__));

28664

28665

// To obtain pop count for each i16 element starting from the pop count for

28666

// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s

28667

// right by 8. It is important to shift as i16s as i8 vector shift isn't

28668

// directly supported.

28669

SDValue ShifterV = DAG.getConstant(8, DL, VT);

28670

SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);

28671

V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),

28672

DAG.getBitcast(ByteVecVT, V));

28673

return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);

28674

}

28675

28676

/// Lower a vXi8 population count using an in-register lookup table indexed
/// with PSHUFB. \p Op is the byte vector whose per-byte pop counts are
/// computed; the result has the same type.
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  assert(VT.getVectorElementType() == MVT::i8 &&
         "Only vXi8 vector CTPOP lowering supported.");

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // Each nibble of an input byte is an index into a pre-computed, in-register
  // pop count table. The input is split into two vectors: (1) the high nibble
  // of each byte, shifted right, and (2) the low nibble of each byte with the
  // high nibble masked out. PSHUFB looks each of them up in the table and the
  // two lookups are added, giving an i8 vector holding the pop count of every
  // input byte.
  const int NibblePopCount[16] = {
      0, 1, 1, 2, // 0x0 - 0x3
      1, 2, 2, 3, // 0x4 - 0x7
      1, 2, 2, 3, // 0x8 - 0xb
      2, 3, 3, 4  // 0xc - 0xf
  };

  // Repeat the 16-entry table across every byte of the vector.
  const int NumBytes = VT.getVectorNumElements();
  SmallVector<SDValue, 64> TableElts;
  for (int Idx = 0; Idx != NumBytes; ++Idx)
    TableElts.push_back(
        DAG.getConstant(NibblePopCount[Idx & 15], DL, MVT::i8));
  SDValue Table = DAG.getBuildVector(VT, DL, TableElts);
  SDValue NibbleMask = DAG.getConstant(0x0F, DL, VT);

  // High nibbles
  SDValue ShiftBy4 = DAG.getConstant(4, DL, VT);
  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, ShiftBy4);

  // Low nibbles
  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, NibbleMask);

  // The nibble vectors act as the shuffle masks that index into the table.
  // Adding the two lookups yields the final pop count per i8 element.
  SDValue HiCount = DAG.getNode(X86ISD::PSHUFB, DL, VT, Table, HiNibbles);
  SDValue LoCount = DAG.getNode(X86ISD::PSHUFB, DL, VT, Table, LoNibbles);
  return DAG.getNode(ISD::ADD, DL, VT, HiCount, LoCount);
}

28720

28721

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in

28722

// updated cost models in X86TTIImpl::getIntrinsicInstrCost.

28723

static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,

28724

SelectionDAG &DAG) {

28725

MVT VT = Op.getSimpleValueType();

28726

assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&(((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector
()) && "Unknown CTPOP type to handle") ? static_cast<
void> (0) : __assert_fail ("(VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && \"Unknown CTPOP type to handle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28727, __PRETTY_FUNCTION__))

28727

"Unknown CTPOP type to handle")(((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector
()) && "Unknown CTPOP type to handle") ? static_cast<
void> (0) : __assert_fail ("(VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && \"Unknown CTPOP type to handle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28727, __PRETTY_FUNCTION__));

28728

SDLoc DL(Op.getNode());

28729

SDValue Op0 = Op.getOperand(0);

28730

28731

// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.

28732

if (Subtarget.hasVPOPCNTDQ()) {

28733

unsigned NumElems = VT.getVectorNumElements();

28734

assert((VT.getVectorElementType() == MVT::i8 ||(((VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType
() == MVT::i16) && "Unexpected type") ? static_cast<
void> (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && \"Unexpected type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28735, __PRETTY_FUNCTION__))

28735

VT.getVectorElementType() == MVT::i16) && "Unexpected type")(((VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType
() == MVT::i16) && "Unexpected type") ? static_cast<
void> (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && \"Unexpected type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28735, __PRETTY_FUNCTION__));

28736

if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {

28737

MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);

28738

Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);

28739

Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);

28740

return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);

28741

}

28742

}

28743

28744

// Decompose 256-bit ops into smaller 128-bit ops.

28745

if (VT.is256BitVector() && !Subtarget.hasInt256())

28746

return splitVectorIntUnary(Op, DAG);

28747

28748

// Decompose 512-bit ops into smaller 256-bit ops.

28749

if (VT.is512BitVector() && !Subtarget.hasBWI())

28750

return splitVectorIntUnary(Op, DAG);

28751

28752

// For element types greater than i8, do vXi8 pop counts and a bytesum.

28753

if (VT.getScalarType() != MVT::i8) {

28754

MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

28755

SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);

28756

SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);

28757

return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);

28758

}

28759

28760

// We can't use the fast LUT approach, so fall back on LegalizeDAG.

28761

if (!Subtarget.hasSSSE3())

28762

return SDValue();

28763

28764

return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);

28765

}

28766

28767

/// Entry point for custom CTPOP lowering. Only vector types are expected here
/// (asserted); the work is delegated to LowerVectorCTPOP.
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}

28773

28774

/// Lower BITREVERSE using the XOP VPPERM instruction.
///
/// Scalars are moved into a 128-bit vector, reversed there and extracted
/// again. 256-bit vectors are split into halves. 128-bit vectors are lowered
/// to a single VPPERM whose mask both reverses the bits of every byte and
/// reverses the byte order within each element.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return splitVectorIntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    // Walk the bytes of each element from most to least significant so the
    // byte order inside the element is reversed as well.
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      // Byte indices 16..31 select from the second source operand.
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

28818

28819

/// Custom-lower BITREVERSE.
///
/// With XOP (and a type that is not 512 bits wide) the VPPERM-based lowering
/// is used. Otherwise SSSE3 is required (asserted) and only byte vectors are
/// handled: each byte is split into its two nibbles, each nibble is looked up
/// in a 16-entry PSHUFB table holding its bit-reversed value already placed in
/// the opposite nibble, and the two results are OR'ed together. v64i8 without
/// BWI and 256-bit vectors without Int256 are split into halves first.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
  if (VT == MVT::v64i8 && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  // Bit-reversed low nibble, placed in the high nibble of the result.
  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  // Bit-reversed high nibble, placed in the low nibble of the result.
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  // Repeat the 16-entry tables across the full vector width.
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}

28873

28874

/// Custom-lower a scalar PARITY node.
///
/// The value is folded by XOR down to 8 bits; the parity flag produced by the
/// final 8-bit flag-setting operation is then read with SETNP and
/// zero-extended to the result type. If the type is i8, or all bits from bit 8
/// upward are known to be zero, a single 8-bit compare against zero is used
/// instead of the XOR folding.
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  MVT VT = Op.getSimpleValueType();

  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
  if (VT == MVT::i8 ||
      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
                                DAG.getConstant(0, DL, MVT::i8));
    // Copy the inverse of the parity flag into a register with setcc.
    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
    // Extend to the original type.
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
  }

  if (VT == MVT::i64) {
    // Xor the high and low 32-bits together using a 32-bit operation.
    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
                                         DAG.getConstant(32, DL, MVT::i8)));
    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
  }

  if (VT != MVT::i16) {
    // Xor the high and low 16-bits together using a 32-bit operation.
    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
                               DAG.getConstant(16, DL, MVT::i8));
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
  } else {
    // If the input is 16-bits, we need to extend to use an i32 shift below.
    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
  }

  // Finally xor the low 2 bytes together and use an 8-bit flag setting xor.
  // This should allow an h-reg to be used to save a shift.
  SDValue Hi = DAG.getNode(
      ISD::TRUNCATE, DL, MVT::i8,
      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
  // Result 1 of X86ISD::XOR is the flags value.
  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);

  // Copy the inverse of the parity flag into a register with setcc.
  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
  // Extend to the original type.
  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
}

28925

28926

/// Rewrite an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the corresponding
/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node.
///
/// The new node takes the first three operands of \p N, reuses its memory
/// operand, and produces an i32 value followed by a chain. Any other opcode is
/// unreachable.
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

28956

28957

/// Lower atomic_load_ops into LOCK-prefixed operations.

28958

static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,

28959

const X86Subtarget &Subtarget) {

28960

AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());

28961

SDValue Chain = N->getOperand(0);

28962

SDValue LHS = N->getOperand(1);

28963

SDValue RHS = N->getOperand(2);

28964

unsigned Opc = N->getOpcode();

28965

MVT VT = N->getSimpleValueType(0);

28966

SDLoc DL(N);

28967

28968

// We can lower atomic_load_add into LXADD. However, any other atomicrmw op

28969

// can only be lowered when the result is unused. They should have already

28970

// been transformed into a cmpxchg loop in AtomicExpand.

28971

if (N->hasAnyUseOfValue(0)) {

28972

// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to

28973

// select LXADD if LOCK_SUB can't be selected.

28974

if (Opc == ISD::ATOMIC_LOAD_SUB) {

28975

RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);

28976

return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,

28977

RHS, AN->getMemOperand());

28978

}

28979

assert(Opc == ISD::ATOMIC_LOAD_ADD &&((Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"
) ? static_cast<void> (0) : __assert_fail ("Opc == ISD::ATOMIC_LOAD_ADD && \"Used AtomicRMW ops other than Add should have been expanded!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28980, __PRETTY_FUNCTION__))

28980

"Used AtomicRMW ops other than Add should have been expanded!")((Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"
) ? static_cast<void> (0) : __assert_fail ("Opc == ISD::ATOMIC_LOAD_ADD && \"Used AtomicRMW ops other than Add should have been expanded!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28980, __PRETTY_FUNCTION__));

28981

return N;

28982

}

28983

28984

// Specialized lowering for the canonical form of an idemptotent atomicrmw.

28985

// The core idea here is that since the memory location isn't actually

28986

// changing, all we need is a lowering for the *ordering* impacts of the

28987

// atomicrmw. As such, we can chose a different operation and memory

28988

// location to minimize impact on other code.

28989

if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {

28990

// On X86, the only ordering which actually requires an instruction is

28991

// seq_cst which isn't SingleThread, everything just needs to be preserved

28992

// during codegen and then dropped. Note that we expect (but don't assume),

28993

// that orderings other than seq_cst and acq_rel have been canonicalized to

28994

// a store or load.

28995

if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&

28996

AN->getSyncScopeID() == SyncScope::System) {

28997

// Prefer a locked operation against a stack location to minimize cache

28998

// traffic. This assumes that stack locations are very likely to be

28999

// accessed only by the owning thread.

29000

SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);

29001

assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29001, __PRETTY_FUNCTION__));

29002

// NOTE: The getUNDEF is needed to give something for the unused result 0.

29003

return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),

29004

DAG.getUNDEF(VT), NewChain);

29005

}

29006

// MEMBARRIER is a compiler barrier; it codegens to a no-op.

29007

SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);

29008

assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29008, __PRETTY_FUNCTION__));

29009

// NOTE: The getUNDEF is needed to give something for the unused result 0.

29010

return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),

29011

DAG.getUNDEF(VT), NewChain);

29012

}

29013

29014

SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);

29015

// RAUW the chain, but don't worry about the result, as it's unused.

29016

assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29016, __PRETTY_FUNCTION__));

29017

// NOTE: The getUNDEF is needed to give something for the unused result 0.

29018

return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),

29019

DAG.getUNDEF(VT), LockOp.getValue(1));

29020

}

29021

29022

/// Custom-lower an ATOMIC_STORE node.
///
///  - A store that is not seq_cst and whose memory type is legal is returned
///    unchanged.
///  - An i64 store whose type is not legal is emitted as a single 64-bit
///    vector store (SSE1 or later) or as an x87 FILD/FIST pair through a stack
///    temporary, unless soft-float or the noimplicitfloat attribute forbids
///    it. A locked stack operation follows if the store is seq_cst.
///  - Everything else becomes an ATOMIC_SWAP whose loaded value is ignored.
///
/// \returns the original node or the chain of the replacement sequence.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  auto *Node = cast<AtomicSDNode>(Op.getNode());
  SDLoc dl(Node);
  EVT VT = Node->getMemoryVT();

  bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
  bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);

  // If this store is not sequentially consistent and the type is legal
  // we can just keep it.
  if (!IsSeqCst && IsTypeLegal)
    return Op;

  if (VT == MVT::i64 && !IsTypeLegal) {
    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
    // is enabled.
    bool NoImplicitFloatOps =
        DAG.getMachineFunction().getFunction().hasFnAttribute(
            Attribute::NoImplicitFloat);
    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
      // Stays null if neither SSE1 nor x87 is available.
      SDValue Chain;
      if (Subtarget.hasSSE1()) {
        SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                       Node->getOperand(2));
        MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
        SclToVec = DAG.getBitcast(StVT, SclToVec);
        SDVTList Tys = DAG.getVTList(MVT::Other);
        SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
        Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
                                        MVT::i64, Node->getMemOperand());
      } else if (Subtarget.hasX87()) {
        // First load this into an 80-bit X87 register using a stack temporary.
        // This will put the whole integer into the significand.
        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
        MachinePointerInfo MPI =
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
        Chain =
            DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
                         MPI, MaybeAlign(), MachineMemOperand::MOStore);
        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
        SDValue LdOps[] = {Chain, StackPtr};
        SDValue Value =
            DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
                                    /*Align*/ None, MachineMemOperand::MOLoad);
        Chain = Value.getValue(1);

        // Now use an FIST to do the atomic store.
        SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
        Chain =
            DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                    StoreOps, MVT::i64, Node->getMemOperand());
      }

      if (Chain) {
        // If this is a sequentially consistent store, also emit an appropriate
        // barrier.
        if (IsSeqCst)
          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);

        return Chain;
      }
    }
  }

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                               Node->getMemoryVT(),
                               Node->getOperand(0),
                               Node->getOperand(1), Node->getOperand(2),
                               Node->getMemOperand());
  // Only the chain result of the swap is needed.
  return Swap.getValue(1);
}

29098

29099

/// Custom-lower ADDCARRY / SUBCARRY to X86ISD::ADC / X86ISD::SBB.
///
/// The incoming carry operand is first turned into a flags value by adding
/// all-ones to it (which carries out whenever the operand is non-zero). The
/// ADC/SBB then consumes those flags, and the outgoing carry is read back with
/// SETB, truncated to i1 if that is the node's second result type.
///
/// \returns an empty SDValue if the result type is not legal yet, otherwise a
/// MERGE_VALUES of the sum and the carry-out.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getAllOnesConstant(DL, CarryVT));

  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  // Result 1 of the X86ISD::ADD above is the flags value fed to ADC/SBB.
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

29126

29127

/// Lower FSINCOS to a call to the SINCOS_STRET libcall (64-bit Darwin only,
/// asserted).
///
/// For f64 the call is declared to return a { double, double } struct and the
/// call result is returned directly. For f32 the call is declared to return a
/// 4-element float vector; elements 0 and 1 are extracted as sin and cos and
/// merged into the two results.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = TLI.getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)FixedVectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

29180

29181

/// Widen a vector input to a vector of NVT. The

29182

/// input vector must have the same element type as NVT.

29183

static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,

29184

bool FillWithZeroes = false) {

29185

// Check if InOp already has the right width.

29186

MVT InVT = InOp.getSimpleValueType();

29187

if (InVT == NVT)

29188

return InOp;

29189

29190

if (InOp.isUndef())

29191

return DAG.getUNDEF(NVT);

29192

29193

assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&((InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match") ? static_cast<
void> (0) : __assert_fail ("InVT.getVectorElementType() == NVT.getVectorElementType() && \"input and widen element type must match\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29194, __PRETTY_FUNCTION__))

29194

"input and widen element type must match")((InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match") ? static_cast<
void> (0) : __assert_fail ("InVT.getVectorElementType() == NVT.getVectorElementType() && \"input and widen element type must match\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29194, __PRETTY_FUNCTION__));

29195

29196

unsigned InNumElts = InVT.getVectorNumElements();

29197

unsigned WidenNumElts = NVT.getVectorNumElements();

29198

assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&((WidenNumElts > InNumElts && WidenNumElts % InNumElts
== 0 && "Unexpected request for vector widening") ? static_cast
<void> (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29199, __PRETTY_FUNCTION__))

29199

"Unexpected request for vector widening")((WidenNumElts > InNumElts && WidenNumElts % InNumElts
== 0 && "Unexpected request for vector widening") ? static_cast
<void> (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29199, __PRETTY_FUNCTION__));

29200

29201

SDLoc dl(InOp);

29202

if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&

29203

InOp.getNumOperands() == 2) {

29204

SDValue N1 = InOp.getOperand(1);

29205

if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||

29206

N1.isUndef()) {

29207

InOp = InOp.getOperand(0);

29208

InVT = InOp.getSimpleValueType();

29209

InNumElts = InVT.getVectorNumElements();

29210

}

29211

}

29212

if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||

29213

ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {

29214

SmallVector<SDValue, 16> Ops;

29215

for (unsigned i = 0; i < InNumElts; ++i)

29216

Ops.push_back(InOp.getOperand(i));

29217

29218

EVT EltVT = InOp.getOperand(0).getValueType();

29219

29220

SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :

29221

DAG.getUNDEF(EltVT);

29222

for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)

29223

Ops.push_back(FillVal);

29224

return DAG.getBuildVector(NVT, dl, Ops);

29225

}

29226

SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :

29227

DAG.getUNDEF(NVT);

29228

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,

29229

InOp, DAG.getIntPtrConstant(0, dl));

29230

}

29231

29232

/// Custom-lower a masked scatter to X86ISD::MSCATTER (AVX-512 only, asserted).
///
///  - v2f32/v2i32 data is handled only with a v2i64 index and VLX: the data is
///    concatenated with undef to the type it is legalized to. Otherwise an
///    empty SDValue is returned.
///  - A v2i32 index returns an empty SDValue so default handling applies.
///  - Without VLX, if neither data nor index is 512 bits wide, data, index and
///    mask are widened by a common factor until one of them is; the added
///    mask elements are zero.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Scale = N->getScale();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();

  if (VT == MVT::v2f32 || VT == MVT::v2i32) {
    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
    // If the index is v2i64 and we have VLX we can use xmm for data and index.
    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
      SDVTList VTs = DAG.getVTList(MVT::Other);
      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
      return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
                                     N->getMemoryVT(), N->getMemOperand());
    }
    return SDValue();
  }

  MVT IndexVT = Index.getSimpleValueType();

  // If the index is v2i32, we're being called by type legalization and we
  // should just let the default handling take care of it.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  // If we don't have VLX and neither the data nor the index is 512-bits, we
  // need to widen until one is.
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !IndexVT.is512BitVector()) {
    // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512/VT.getSizeInBits(),
                               512/IndexVT.getSizeInBits());
    unsigned NumElts = VT.getVectorNumElements() * Factor;

    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

    Src = ExtendToType(Src, VT, DAG);
    Index = ExtendToType(Index, IndexVT, DAG);
    // Zero-fill the mask so the added lanes are disabled.
    Mask = ExtendToType(Mask, MaskVT, DAG, true);
  }

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
                                 N->getMemoryVT(), N->getMemOperand());
}

29294

29295

/// Custom-lower a masked load (ISD::MLOAD).
///
/// Two situations are handled:
///  * The mask element type is not i1 (the AVX-style masked load). The node
///    is returned untouched when the pass-through is undef or all zeros;
///    otherwise the load is re-emitted with a zero pass-through and the real
///    pass-through is blended in with a VSELECT.
///  * The mask element type is i1, the target has AVX-512 without VLX and the
///    data type is not 512 bits wide (all asserted below). Pass-through and
///    mask are widened so the data vector is 512 bits, the wide load is
///    emitted and the original-width value is extracted from element 0.
///
/// \param Op         the masked-load node to lower.
/// \param Subtarget  queried for AVX-512 / VLX / BWI availability.
/// \param DAG        the DAG in which replacement nodes are created.
/// \returns \p Op itself, or a merge of the new value and result 1 of the
///          newly created load.
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  SDLoc DL(Op);
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDValue Passthrough = N->getPassThru();

  // AVX masked loads can only merge with zero, so any other pass-through has
  // to be blended in explicitly after the load.
  const bool MaskIsVectorOfI1 =
      Mask.getSimpleValueType().getVectorElementType() == MVT::i1;
  if (!MaskIsVectorOfI1) {
    // Undef is accepted as well; the isel pattern allows it.
    if (Passthrough.isUndef() ||
        ISD::isBuildVectorAllZeros(Passthrough.getNode()))
      return Op;

    SDValue ZeroPassthrough = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue ZeroMergedLoad = DAG.getMaskedLoad(
        VT, DL, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
        ZeroPassthrough, N->getMemoryVT(), N->getMemOperand(),
        N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad());

    // Blend the loaded lanes with the caller's pass-through.
    SDValue Blend =
        DAG.getNode(ISD::VSELECT, DL, VT, Mask, ZeroMergedLoad, Passthrough);
    SDValue BlendResults[] = {Blend, ZeroMergedLoad.getValue(1)};
    return DAG.getMergeValues(BlendResults, DL);
  }

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // With VLX the operation is legal as-is; without it the data vector has to
  // be widened to 512 bits.
  unsigned WideNumElts = 512 / VT.getScalarSizeInBits();
  MVT WideVT = MVT::getVectorVT(ScalarVT, WideNumElts);
  Passthrough = ExtendToType(Passthrough, WideVT, DAG);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  // NOTE(review): the trailing `true` presumably asks ExtendToType to fill
  // the added mask lanes with zeroes - confirm against its definition.
  MVT WideMaskTy = MVT::getVectorVT(MVT::i1, WideNumElts);
  Mask = ExtendToType(Mask, WideMaskTy, DAG, true);

  SDValue WideLoad = DAG.getMaskedLoad(
      WideVT, DL, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
      Passthrough, N->getMemoryVT(), N->getMemOperand(),
      N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad());

  // Narrow the wide result back to the type the original node produced.
  SDValue NarrowValue =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideLoad.getValue(0),
                  DAG.getIntPtrConstant(0, DL));
  return DAG.getMergeValues({NarrowValue, WideLoad.getValue(1)}, DL);
}

29360

29361

/// Custom-lower a masked store (ISD::MSTORE) on AVX-512 targets without VLX.
///
/// The asserts below restrict this routine to AVX-512 subtargets that lack
/// VLX and to data types that are not already 512 bits wide. The stored value
/// and the i1 mask are widened so the data vector is 512 bits, and a new
/// masked store is emitted that reuses the original chain, base pointer,
/// offset, memory VT, memory operand, addressing mode and the
/// truncating/compressing flags.
///
/// \param Op         the masked-store node to lower.
/// \param Subtarget  queried for AVX-512 / VLX / BWI availability.
/// \param DAG        the DAG in which the replacement store is created.
/// \returns the widened masked store node.
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  // These diagnostics used to be verbatim copies of the masked-load ones and
  // talked about an "expanding masked load"; they now name the operation that
  // is actually being checked.
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  // NOTE(review): the trailing `true` presumably asks ExtendToType to fill
  // the added mask lanes with zeroes so they never store - confirm against
  // its definition.
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            N->getOffset(), Mask, N->getMemoryVT(),
                            N->getMemOperand(), N->getAddressingMode(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

29402

29403

/// Custom-lower a masked gather (ISD::MGATHER) into X86ISD::MGATHER.
///
/// A v2i32 index yields an empty SDValue (the call then comes from type
/// legalization). On AVX-512 without VLX, when neither the data nor the
/// index vector is 512 bits wide, pass-through, index and mask are widened
/// by a common factor until one of them reaches 512 bits. The value returned
/// to the caller is always extracted back to the original result type.
///
/// \param Op         the masked-gather node to lower.
/// \param Subtarget  queried for AVX2 / AVX-512 / VLX availability.
/// \param DAG        the DAG in which replacement nodes are created.
/// \returns an empty SDValue, or a merge of the extracted value and result 1
///          of the new gather node.
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  SDLoc DL(Op);
  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Indices = N->getIndex();
  SDValue Predicate = N->getMask();
  SDValue Passthrough = N->getPassThru();
  MVT IndicesVT = Indices.getSimpleValueType();

  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // A v2i32 index means type legalization is calling us; decline.
  if (IndicesVT == MVT::v2i32)
    return SDValue();

  // Remember the type the caller expects before VT is possibly widened.
  const MVT ResultVT = VT;

  // Without VLX at least one of data/index must be a 512-bit vector.
  const bool MustWiden =
      Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
      !(VT.is512BitVector() || IndicesVT.is512BitVector());
  if (MustWiden) {
    // The smaller of the two ratios is the factor that brings the larger
    // operand to exactly 512 bits.
    unsigned WidenBy = std::min(512 / VT.getSizeInBits(),
                                512 / IndicesVT.getSizeInBits());
    unsigned WideNumElts = VT.getVectorNumElements() * WidenBy;

    VT = MVT::getVectorVT(VT.getVectorElementType(), WideNumElts);
    IndicesVT = MVT::getVectorVT(IndicesVT.getVectorElementType(), WideNumElts);
    MVT WidePredicateVT = MVT::getVectorVT(MVT::i1, WideNumElts);

    Passthrough = ExtendToType(Passthrough, VT, DAG);
    Indices = ExtendToType(Indices, IndicesVT, DAG);
    // NOTE(review): the trailing `true` presumably requests zero-filled extra
    // mask lanes - confirm against ExtendToType.
    Predicate = ExtendToType(Predicate, WidePredicateVT, DAG, true);
  }

  SDValue GatherOps[] = {N->getChain(),   Passthrough, Predicate,
                         N->getBasePtr(), Indices,     N->getScale()};
  SDVTList GatherVTs = DAG.getVTList(VT, MVT::Other);
  SDValue Gather =
      DAG.getMemIntrinsicNode(X86ISD::MGATHER, DL, GatherVTs, GatherOps,
                              N->getMemoryVT(), N->getMemOperand());

  // Hand back a value of the original type, starting at element 0.
  SDValue Narrowed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Gather,
                                 DAG.getIntPtrConstant(0, DL));
  SDValue Results[] = {Narrowed, Gather.getValue(1)};
  return DAG.getMergeValues(Results, DL);
}

29451

29452

/// Custom-lower ISD::ADDRSPACECAST as an integer extension or truncation.
///
///  * i64 destination, source address space X86AS::PTR32_UPTR: zero-extend.
///  * i64 destination, any other source address space: sign-extend.
///  * i32 destination: truncate.
///  * any other destination type: fatal error.
///
/// \param Op   the addrspacecast node; operand 0 is the value being cast.
/// \param DAG  the DAG in which the replacement node is created.
/// \returns the extension/truncation node that replaces \p Op.
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
  AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
  unsigned SrcAS = N->getSrcAddressSpace();

  assert(SrcAS != N->getDestAddressSpace() &&
         "addrspacecast must be between different address spaces");

  SDLoc DL(Op);
  SDValue Pointer = Op.getOperand(0);
  MVT ResultVT = Op.getSimpleValueType();

  // Widening: only the unsigned 32-bit pointer space is zero-extended.
  if (ResultVT == MVT::i64) {
    unsigned ExtendOpc =
        SrcAS == X86AS::PTR32_UPTR ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
    return DAG.getNode(ExtendOpc, DL, ResultVT, Pointer);
  }

  // Narrowing.
  if (ResultVT == MVT::i32)
    return DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Pointer);

  report_fatal_error("Bad address space in addrspacecast");
}

29474

29475

/// Lower a GC transition node to an X86::NOOP machine node.
///
/// The NOOP receives operand 0 of \p Op and, when \p Op has a glued node,
/// also the last operand of \p Op. It produces an MVT::Other and an
/// MVT::Glue result; result 0 is returned.
///
/// \param Op   the GC transition node being lowered.
/// \param DAG  the DAG in which the NOOP machine node is created.
/// \returns result 0 of the new NOOP machine node.
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  // NOTE(review): operand 0 is presumably the incoming chain - confirm.
  Ops.push_back(Op.getOperand(0));
  // Forward the trailing operand only when the node is glued to another one.
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Use the named location instead of building a second, identical SDLoc
  // temporary at the call site (the local was previously left unused).
  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

  return NOOP;
}

29495

29496

/// Lower an operation to a runtime library call that returns f128.
///
/// For strict FP opcodes operand 0 is passed to the libcall as the chain and
/// is left out of the argument list; the call's value and its second result
/// are then returned together as merged values. For non-strict opcodes every
/// operand is an argument and only the call's value is returned.
///
/// \param Op    the node being replaced by a libcall.
/// \param DAG   the DAG in which the call is built.
/// \param Call  the runtime library routine to invoke.
/// \returns the libcall result (merged with the second call result for
///          strict opcodes).
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                         RTLIB::Libcall Call) const {
  SDLoc DL(Op);
  const bool IsStrict = Op->isStrictFPOpcode();

  // Strict nodes carry one leading operand that is not a call argument.
  const unsigned FirstArgIdx = IsStrict ? 1 : 0;
  SmallVector<SDValue, 2> CallArgs(Op->op_begin() + FirstArgIdx, Op->op_end());

  // Stays an empty SDValue for non-strict nodes.
  SDValue InChain;
  if (IsStrict)
    InChain = Op.getOperand(0);

  MakeLibCallOptions Options;
  SDValue CallValue, CallChain;
  std::tie(CallValue, CallChain) =
      makeLibCall(DAG, Call, MVT::f128, CallArgs, Options, DL, InChain);

  if (!IsStrict)
    return CallValue;

  return DAG.getMergeValues({CallValue, CallChain}, DL);
}

29514

29515

// Custom split CVTPS2PH with wide types.

29516

static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {

29517

SDLoc dl(Op);

29518

EVT VT = Op.getValueType();

29519

SDValue Lo, Hi;

29520

std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);

29521

EVT LoVT, HiVT;

29522

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

29523

SDValue RC = Op.getOperand(1);

29524

Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);

29525

Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);

29526

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

29527

}

29528

29529

/// Provide custom lowering hooks for some operations.

29530

SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {

29531

switch (Op.getOpcode()) {

29532

default: llvm_unreachable("Should not custom lower this!")::llvm::llvm_unreachable_internal("Should not custom lower this!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29532);

29533

case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);

29534

case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:

29535

return LowerCMP_SWAP(Op, Subtarget, DAG);

29536

case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);

29537

case ISD::ATOMIC_LOAD_ADD:

29538

case ISD::ATOMIC_LOAD_SUB:

29539

case ISD::ATOMIC_LOAD_OR:

29540

case ISD::ATOMIC_LOAD_XOR:

29541

case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);

29542

case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);

29543

case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);

29544

case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);

29545

case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);

29546

case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);

29547

case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);

29548

case ISD::VSELECT: return LowerVSELECT(Op, DAG);

29549

case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);

29550

case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);

29551

case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);

29552

case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);

29553

case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);

29554

case ISD::ConstantPool: return LowerConstantPool(Op, DAG);

29555

case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);

29556

case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);

29557

case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);

29558

case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);

29559

case ISD::SHL_PARTS:

29560

case ISD::SRA_PARTS:

29561

case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);

29562

case ISD::FSHL:

29563

case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);

29564

case ISD::STRICT_SINT_TO_FP:

29565

case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);

29566

case ISD::STRICT_UINT_TO_FP:

29567

case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);

29568

case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);

29569

case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);

29570

case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);

29571

case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);

29572

case ISD::ZERO_EXTEND_VECTOR_INREG:

29573

case ISD::SIGN_EXTEND_VECTOR_INREG:

29574

return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);

29575

case ISD::FP_TO_SINT:

29576

case ISD::STRICT_FP_TO_SINT:

29577

case ISD::FP_TO_UINT:

29578

case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);

29579

case ISD::FP_EXTEND:

29580

case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);

29581

case ISD::FP_ROUND:

29582

case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);

29583

case ISD::FP16_TO_FP:

29584

case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);

29585

case ISD::FP_TO_FP16:

29586

case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);

29587

case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);

29588

case ISD::STORE: return LowerStore(Op, Subtarget, DAG);

29589

case ISD::FADD:

29590

case ISD::FSUB: return lowerFaddFsub(Op, DAG);

29591

case ISD::FROUND: return LowerFROUND(Op, DAG);

29592

case ISD::FABS:

29593

case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);

29594

case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);

29595

case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);

29596

case ISD::LRINT:

29597

case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);

29598

case ISD::SETCC:

29599

case ISD::STRICT_FSETCC:

29600

case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);

29601

case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);

29602

case ISD::SELECT: return LowerSELECT(Op, DAG);

29603

case ISD::BRCOND: return LowerBRCOND(Op, DAG);

29604

case ISD::JumpTable: return LowerJumpTable(Op, DAG);

29605

case ISD::VASTART: return LowerVASTART(Op, DAG);

29606

case ISD::VAARG: return LowerVAARG(Op, DAG);

29607

case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);

29608

case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);

29609

case ISD::INTRINSIC_VOID:

29610

case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);

29611

case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);

29612

case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);

29613

case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);

29614

case ISD::FRAME_TO_ARGS_OFFSET:

29615

return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);

29616

case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);

29617

case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);

29618

case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);

29619

case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);

29620

case ISD::EH_SJLJ_SETUP_DISPATCH:

29621

return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);

29622

case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);

29623

case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);

29624

case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);

29625

case ISD::CTLZ:

29626

case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);

29627

case ISD::CTTZ:

29628

case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);

29629

case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);

29630

case ISD::MULHS:

29631

case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);

29632

case ISD::ROTL:

29633

case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);

29634

case ISD::SRA:

29635

case ISD::SRL:

29636

case ISD::SHL: return LowerShift(Op, Subtarget, DAG);

29637

case ISD::SADDO:

29638

case ISD::UADDO:

29639

case ISD::SSUBO:

29640

case ISD::USUBO:

29641

case ISD::SMULO:

29642

case ISD::UMULO: return LowerXALUO(Op, DAG);

29643

case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);

29644

case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);

29645

case ISD::ADDCARRY:

29646

case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);

29647

case ISD::ADD:

29648

case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);

29649

case ISD::UADDSAT:

29650

case ISD::SADDSAT:

29651

case ISD::USUBSAT:

29652

case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);

29653

case ISD::SMAX:

29654

case ISD::SMIN:

29655

case ISD::UMAX:

29656

case ISD::UMIN: return LowerMINMAX(Op, DAG);

29657

case ISD::ABS: return LowerABS(Op, Subtarget, DAG);

29658

case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);

29659

case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);

29660

case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);

29661

case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);

29662

case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);

29663

case ISD::GC_TRANSITION_START:

29664

case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);

29665

case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);

29666

case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);

29667

}

29668

}

29669

29670

/// Places new result values for the node in Results (their number

29671

/// and types must exactly match those of the original return values of

29672

/// the node), or leaves Results empty, which indicates that the node is not

29673

/// to be custom lowered after all.

29674

void X86TargetLowering::LowerOperationWrapper(SDNode *N,

29675

SmallVectorImpl<SDValue> &Results,

29676

SelectionDAG &DAG) const {

29677

SDValue Res = LowerOperation(SDValue(N, 0), DAG);

29678

29679

if (!Res.getNode())

29680

return;

29681

29682

// If the original node has one result, take the return value from

29683

// LowerOperation as is. It might not be result number 0.

29684

if (N->getNumValues() == 1) {

29685

Results.push_back(Res);

29686

return;

29687

}

29688

29689

// If the original node has multiple results, then the return node should

29690

// have the same number of results.

29691

assert((N->getNumValues() == Res->getNumValues()) &&(((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!") ? static_cast
<void> (0) : __assert_fail ("(N->getNumValues() == Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29692, __PRETTY_FUNCTION__))

29692

"Lowering returned the wrong number of results!")(((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!") ? static_cast
<void> (0) : __assert_fail ("(N->getNumValues() == Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29692, __PRETTY_FUNCTION__));

29693

29694

// Places new result values base on N result number.

29695

for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)

29696

Results.push_back(Res.getValue(I));

29697

}

29698

29699

/// Replace a node with an illegal result type with a new node built out of

29700

/// custom code.

29701

void X86TargetLowering::ReplaceNodeResults(SDNode *N,

29702

SmallVectorImpl<SDValue>&Results,

29703

SelectionDAG &DAG) const {

29704

SDLoc dl(N);

29705

switch (N->getOpcode()) {

29706

default:

29707

#ifndef NDEBUG

29708

dbgs() << "ReplaceNodeResults: ";

29709

N->dump(&DAG);

29710

#endif

29711

llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29711);

29712

case X86ISD::CVTPH2PS: {

29713

EVT VT = N->getValueType(0);

29714

SDValue Lo, Hi;

29715

std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);

29716

EVT LoVT, HiVT;

29717

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

29718

Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);

29719

Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);

29720

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

29721

Results.push_back(Res);

29722

return;

29723

}

29724

case X86ISD::STRICT_CVTPH2PS: {

29725

EVT VT = N->getValueType(0);

29726

SDValue Lo, Hi;

29727

std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);

29728

EVT LoVT, HiVT;

29729

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

29730

Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},

29731

{N->getOperand(0), Lo});

29732

Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},

29733

{N->getOperand(0), Hi});

29734

SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,

29735

Lo.getValue(1), Hi.getValue(1));

29736

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

29737

Results.push_back(Res);

29738

Results.push_back(Chain);

29739

return;

29740

}

29741

case ISD::CTPOP: {

29742

assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!")((N->getValueType(0) == MVT::i64 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::i64 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29742, __PRETTY_FUNCTION__));

29743

// Use a v2i64 if possible.

29744

bool NoImplicitFloatOps =

29745

DAG.getMachineFunction().getFunction().hasFnAttribute(

29746

Attribute::NoImplicitFloat);

29747

if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {

29748

SDValue Wide =

29749

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));

29750

Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);

29751

// Bit count should fit in 32-bits, extract it as that and then zero

29752

// extend to i64. Otherwise we end up extracting bits 63:32 separately.

29753

Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);

29754

Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,

29755

DAG.getIntPtrConstant(0, dl));

29756

Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);

29757

Results.push_back(Wide);

29758

}

29759

return;

29760

}

29761

case ISD::MUL: {

29762

EVT VT = N->getValueType(0);

29763

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT.getVectorElementType() == MVT::i8 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29764, __PRETTY_FUNCTION__))

29764

VT.getVectorElementType() == MVT::i8 && "Unexpected VT!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT.getVectorElementType() == MVT::i8 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29764, __PRETTY_FUNCTION__));

29765

// Pre-promote these to vXi16 to avoid op legalization thinking all 16

29766

// elements are needed.

29767

MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());

29768

SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));

29769

SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));

29770

SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);

29771

Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

29772

unsigned NumConcats = 16 / VT.getVectorNumElements();

29773

SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));

29774

ConcatOps[0] = Res;

29775

Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);

29776

Results.push_back(Res);

29777

return;

29778

}

29779

case X86ISD::VPMADDWD:

29780

case X86ISD::AVG: {

29781

// Legalize types for X86ISD::AVG/VPMADDWD by widening.

29782

assert(Subtarget.hasSSE2() && "Requires at least SSE2!")((Subtarget.hasSSE2() && "Requires at least SSE2!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Requires at least SSE2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29782, __PRETTY_FUNCTION__));

29783

29784

EVT VT = N->getValueType(0);

29785

EVT InVT = N->getOperand(0).getValueType();

29786

assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&((VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits
() == 0 && "Expected a VT that divides into 128 bits."
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && \"Expected a VT that divides into 128 bits.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29787, __PRETTY_FUNCTION__))

29787

"Expected a VT that divides into 128 bits.")((VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits
() == 0 && "Expected a VT that divides into 128 bits."
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && \"Expected a VT that divides into 128 bits.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29787, __PRETTY_FUNCTION__));

29788

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29789, __PRETTY_FUNCTION__))

29789

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29789, __PRETTY_FUNCTION__));

29790

unsigned NumConcat = 128 / InVT.getSizeInBits();

29791

29792

EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),

29793

InVT.getVectorElementType(),

29794

NumConcat * InVT.getVectorNumElements());

29795

EVT WideVT = EVT::getVectorVT(*DAG.getContext(),

29796

VT.getVectorElementType(),

29797

NumConcat * VT.getVectorNumElements());

29798

29799

SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));

29800

Ops[0] = N->getOperand(0);

29801

SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);

29802

Ops[0] = N->getOperand(1);

29803

SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);

29804

29805

SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);

29806

Results.push_back(Res);

29807

return;

29808

}

29809

// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.

29810

case X86ISD::FMINC:

29811

case X86ISD::FMIN:

29812

case X86ISD::FMAXC:

29813

case X86ISD::FMAX: {

29814

EVT VT = N->getValueType(0);

29815

assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.")((VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."
) ? static_cast<void> (0) : __assert_fail ("VT == MVT::v2f32 && \"Unexpected type (!= v2f32) on FMIN/FMAX.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29815, __PRETTY_FUNCTION__));

29816

SDValue UNDEF = DAG.getUNDEF(VT);

29817

SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,

29818

N->getOperand(0), UNDEF);

29819

SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,

29820

N->getOperand(1), UNDEF);

29821

Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));

29822

return;

29823

}

29824

case ISD::SDIV:

29825

case ISD::UDIV:

29826

case ISD::SREM:

29827

case ISD::UREM: {

29828

EVT VT = N->getValueType(0);

29829

if (VT.isVector()) {

29830

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29831, __PRETTY_FUNCTION__))

29831

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29831, __PRETTY_FUNCTION__));

29832

// If this RHS is a constant splat vector we can widen this and let

29833

// division/remainder by constant optimize it.

29834

// TODO: Can we do something for non-splat?

29835

APInt SplatVal;

29836

if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {

29837

unsigned NumConcats = 128 / VT.getSizeInBits();

29838

SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));

29839

Ops0[0] = N->getOperand(0);

29840

EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);

29841

SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);

29842

SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);

29843

SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);

29844

Results.push_back(Res);

29845

}

29846

return;

29847

}

29848

29849

SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);

29850

Results.push_back(V);

29851

return;

29852

}

29853

case ISD::TRUNCATE: {

29854

MVT VT = N->getSimpleValueType(0);

29855

if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)

29856

return;

29857

29858

// The generic legalizer will try to widen the input type to the same

29859

// number of elements as the widened result type. But this isn't always

29860

// the best thing so do some custom legalization to avoid some cases.

29861

MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();

29862

SDValue In = N->getOperand(0);

29863

EVT InVT = In.getValueType();

29864

29865

unsigned InBits = InVT.getSizeInBits();

29866

if (128 % InBits == 0) {

29867

// 128 bit and smaller inputs should avoid truncate altogether and

29868

// just use a build_vector that will become a shuffle.

29869

// TODO: Widen and use a shuffle directly?

29870

MVT InEltVT = InVT.getSimpleVT().getVectorElementType();

29871

EVT EltVT = VT.getVectorElementType();

29872

unsigned WidenNumElts = WidenVT.getVectorNumElements();

29873

SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));

29874

// Use the original element count so we don't do more scalar opts than

29875

// necessary.

29876

unsigned MinElts = VT.getVectorNumElements();

29877

for (unsigned i=0; i < MinElts; ++i) {

29878

SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,

29879

DAG.getIntPtrConstant(i, dl));

29880

Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);

29881

}

29882

Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));

29883

return;

29884

}

29885

// With AVX512 there are some cases that can use a target specific

29886

// truncate node to go from 256/512 to less than 128 with zeros in the

29887

// upper elements of the 128 bit result.

29888

if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {

29889

// We can use VTRUNC directly if for 256 bits with VLX or for any 512.

29890

if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {

29891

Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));

29892

return;

29893

}

29894

// There's one case we can widen to 512 bits and use VTRUNC.

29895

if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {

29896

In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,

29897

DAG.getUNDEF(MVT::v4i64));

29898

Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));

29899

return;

29900

}

29901

}

29902

if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&

29903

getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&

29904

isTypeLegal(MVT::v4i64)) {

29905

// Input needs to be split and output needs to widened. Let's use two

29906

// VTRUNCs, and shuffle their results together into the wider type.

29907

SDValue Lo, Hi;

29908

std::tie(Lo, Hi) = DAG.SplitVector(In, dl);

29909

29910

Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);

29911

Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);

29912

SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,

29913

{ 0, 1, 2, 3, 16, 17, 18, 19,

29914

-1, -1, -1, -1, -1, -1, -1, -1 });

29915

Results.push_back(Res);

29916

return;

29917

}

29918

29919

return;

29920

}

29921

case ISD::ANY_EXTEND:

29922

// Right now, only MVT::v8i8 has Custom action for an illegal type.

29923

// It's intended to custom handle the input type.

29924

assert(N->getValueType(0) == MVT::v8i8 &&((N->getValueType(0) == MVT::v8i8 && "Do not know how to legalize this Node"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::v8i8 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29925, __PRETTY_FUNCTION__))

29925

"Do not know how to legalize this Node")((N->getValueType(0) == MVT::v8i8 && "Do not know how to legalize this Node"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::v8i8 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29925, __PRETTY_FUNCTION__));

29926

return;

29927

case ISD::SIGN_EXTEND:

29928

case ISD::ZERO_EXTEND: {

29929

EVT VT = N->getValueType(0);

29930

SDValue In = N->getOperand(0);

29931

EVT InVT = In.getValueType();

29932

if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&

29933

(InVT == MVT::v4i16 || InVT == MVT::v4i8)){

29934

assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29935, __PRETTY_FUNCTION__))

29935

"Unexpected type action!")((getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29935, __PRETTY_FUNCTION__));

29936

assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode")((N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"
) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == ISD::SIGN_EXTEND && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29936, __PRETTY_FUNCTION__));

29937

// Custom split this so we can extend i8/i16->i32 invec. This is better

29938

// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using

29939

// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting

29940

// we allow the sra from the extend to i32 to be shared by the split.

29941

In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);

29942

29943

// Fill a vector with sign bits for each element.

29944

SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);

29945

SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);

29946

29947

// Create an unpackl and unpackh to interleave the sign bits then bitcast

29948

// to v2i64.

29949

SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,

29950

{0, 4, 1, 5});

29951

Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);

29952

SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,

29953

{2, 6, 3, 7});

29954

Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);

29955

29956

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

29957

Results.push_back(Res);

29958

return;

29959

}

29960

29961

if (VT == MVT::v16i32 || VT == MVT::v8i64) {

29962

if (!InVT.is128BitVector()) {

29963

// Not a 128 bit vector, but maybe type legalization will promote

29964

// it to 128 bits.

29965

if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)

29966

return;

29967

InVT = getTypeToTransformTo(*DAG.getContext(), InVT);

29968

if (!InVT.is128BitVector())

29969

return;

29970

29971

// Promote the input to 128 bits. Type legalization will turn this into

29972

// zext_inreg/sext_inreg.

29973

In = DAG.getNode(N->getOpcode(), dl, InVT, In);

29974

}

29975

29976

// Perform custom splitting instead of the two stage extend we would get

29977

// by default.

29978

EVT LoVT, HiVT;

29979

std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

29980

assert(isTypeLegal(LoVT) && "Split VT not legal?")((isTypeLegal(LoVT) && "Split VT not legal?") ? static_cast
<void> (0) : __assert_fail ("isTypeLegal(LoVT) && \"Split VT not legal?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29980, __PRETTY_FUNCTION__));

29981

29982

SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);

29983

29984

// We need to shift the input over by half the number of elements.

29985

unsigned NumElts = InVT.getVectorNumElements();

29986

unsigned HalfNumElts = NumElts / 2;

29987

SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);

29988

for (unsigned i = 0; i != HalfNumElts; ++i)

29989

ShufMask[i] = i + HalfNumElts;

29990

29991

SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);

29992

Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);

29993

29994

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);

29995

Results.push_back(Res);

29996

}

29997

return;

29998

}

29999

case ISD::FP_TO_SINT:

30000

case ISD::STRICT_FP_TO_SINT:

30001

case ISD::FP_TO_UINT:

30002

case ISD::STRICT_FP_TO_UINT: {

30003

bool IsStrict = N->isStrictFPOpcode();

30004

bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||

30005

N->getOpcode() == ISD::STRICT_FP_TO_SINT;

30006

EVT VT = N->getValueType(0);

30007

SDValue Src = N->getOperand(IsStrict ? 1 : 0);

30008

EVT SrcVT = Src.getValueType();

30009

30010

if (VT.isVector() && VT.getScalarSizeInBits() < 32) {

30011

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30012, __PRETTY_FUNCTION__))

30012

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30012, __PRETTY_FUNCTION__));

30013

30014

// Try to create a 128 bit vector, but don't exceed a 32 bit element.

30015

unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);

30016

MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),

30017

VT.getVectorNumElements());

30018

SDValue Res;

30019

SDValue Chain;

30020

if (IsStrict) {

30021

Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},

30022

{N->getOperand(0), Src});

30023

Chain = Res.getValue(1);

30024

} else

30025

Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);

30026

30027

// Preserve what we know about the size of the original result. Except

30028

// when the result is v2i32 since we can't widen the assert.

30029

if (PromoteVT != MVT::v2i32)

30030

Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,

30031

dl, PromoteVT, Res,

30032

DAG.getValueType(VT.getVectorElementType()));

30033

30034

// Truncate back to the original width.

30035

Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

30036

30037

// Now widen to 128 bits.

30038

unsigned NumConcats = 128 / VT.getSizeInBits();

30039

MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),

30040

VT.getVectorNumElements() * NumConcats);

30041

SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));

30042

ConcatOps[0] = Res;

30043

Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);

30044

Results.push_back(Res);

30045

if (IsStrict)

30046

Results.push_back(Chain);

30047

return;

30048

}

30049

30050

30051

if (VT == MVT::v2i32) {

30052

assert((IsSigned || Subtarget.hasAVX512()) &&(((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"
) ? static_cast<void> (0) : __assert_fail ("(IsSigned || Subtarget.hasAVX512()) && \"Can only handle signed conversion without AVX512\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30053, __PRETTY_FUNCTION__))

30053

"Can only handle signed conversion without AVX512")(((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"
) ? static_cast<void> (0) : __assert_fail ("(IsSigned || Subtarget.hasAVX512()) && \"Can only handle signed conversion without AVX512\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30053, __PRETTY_FUNCTION__));

30054

assert(Subtarget.hasSSE2() && "Requires at least SSE2!")((Subtarget.hasSSE2() && "Requires at least SSE2!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Requires at least SSE2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30054, __PRETTY_FUNCTION__));

30055

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30056, __PRETTY_FUNCTION__))

30056

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30056, __PRETTY_FUNCTION__));

30057

if (Src.getValueType() == MVT::v2f64) {

30058

unsigned Opc;

30059

if (IsStrict)

30060

Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;

30061

else

30062

Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

30063

30064

// If we have VLX we can emit a target specific FP_TO_UINT node.

30065

if (!IsSigned && !Subtarget.hasVLX()) {

30066

// Otherwise we can defer to the generic legalizer which will widen

30067

// the input as well. This will be further widened during op

30068

// legalization to v8i32<-v8f64.

30069

// For strict nodes we'll need to widen ourselves.

30070

// FIXME: Fix the type legalizer to safely widen strict nodes?

30071

if (!IsStrict)

30072

return;

30073

Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,

30074

DAG.getConstantFP(0.0, dl, MVT::v2f64));

30075

Opc = N->getOpcode();

30076

}

30077

SDValue Res;

30078

SDValue Chain;

30079

if (IsStrict) {

30080

Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},

30081

{N->getOperand(0), Src});

30082

Chain = Res.getValue(1);

30083

} else {

30084

Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);

30085

}

30086

Results.push_back(Res);

30087

if (IsStrict)

30088

Results.push_back(Chain);

30089

return;

30090

}

30091

30092

// Custom widen strict v2f32->v2i32 by padding with zeros.

30093

// FIXME: Should generic type legalizer do this?

30094

if (Src.getValueType() == MVT::v2f32 && IsStrict) {

30095

Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,

30096

DAG.getConstantFP(0.0, dl, MVT::v2f32));

30097

SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},

30098

{N->getOperand(0), Src});

30099

Results.push_back(Res);

30100

Results.push_back(Res.getValue(1));

30101

return;

30102

}

30103

30104

// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,

30105

// so early out here.

30106

return;

30107

}

30108

30109

assert(!VT.isVector() && "Vectors should have been handled above!")((!VT.isVector() && "Vectors should have been handled above!"
) ? static_cast<void> (0) : __assert_fail ("!VT.isVector() && \"Vectors should have been handled above!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30109, __PRETTY_FUNCTION__));

30110

30111

if (Subtarget.hasDQI() && VT == MVT::i64 &&

30112

(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {

30113

assert(!Subtarget.is64Bit() && "i64 should be legal")((!Subtarget.is64Bit() && "i64 should be legal") ? static_cast
<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"i64 should be legal\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30113, __PRETTY_FUNCTION__));

30114

unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;

30115

// If we use a 128-bit result we might need to use a target specific node.

30116

unsigned SrcElts =

30117

std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());

30118

MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);

30119

MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);

30120

unsigned Opc = N->getOpcode();

30121

if (NumElts != SrcElts) {

30122

if (IsStrict)

30123

Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;

30124

else

30125

Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

30126

}

30127

30128

SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

30129

SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,

30130

DAG.getConstantFP(0.0, dl, VecInVT), Src,

30131

ZeroIdx);

30132

SDValue Chain;

30133

if (IsStrict) {

30134

SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);

30135

Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);

30136

Chain = Res.getValue(1);

30137

} else

30138

Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);

30139

Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);

30140

Results.push_back(Res);

30141

if (IsStrict)

30142

Results.push_back(Chain);

30143

return;

30144

}

30145

30146

SDValue Chain;

30147

if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {

30148

Results.push_back(V);

30149

if (IsStrict)

30150

Results.push_back(Chain);

30151

}

30152

return;

30153

}

30154

case ISD::LRINT:

30155

case ISD::LLRINT: {

30156

if (SDValue V = LRINT_LLRINTHelper(N, DAG))

30157

Results.push_back(V);

30158

return;

30159

}

30160

30161

case ISD::SINT_TO_FP:

30162

case ISD::STRICT_SINT_TO_FP:

30163

case ISD::UINT_TO_FP:

30164

case ISD::STRICT_UINT_TO_FP: {

30165

bool IsStrict = N->isStrictFPOpcode();

30166

bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||

30167

N->getOpcode() == ISD::STRICT_SINT_TO_FP;

30168

EVT VT = N->getValueType(0);

30169

if (VT != MVT::v2f32)

30170

return;

30171

SDValue Src = N->getOperand(IsStrict ? 1 : 0);

30172

EVT SrcVT = Src.getValueType();

30173

if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {

30174

if (IsStrict) {

30175

unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P

30176

: X86ISD::STRICT_CVTUI2P;

30177

SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},

30178

{N->getOperand(0), Src});

30179

Results.push_back(Res);

30180

Results.push_back(Res.getValue(1));

30181

} else {

30182

unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;

30183

Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));

30184

}

30185

return;

30186

}

30187

if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&

30188

Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {

30189

SDValue Zero = DAG.getConstant(0, dl, SrcVT);

30190

SDValue One = DAG.getConstant(1, dl, SrcVT);

30191

SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,

30192

DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),

30193

DAG.getNode(ISD::AND, dl, SrcVT, Src, One));

30194

SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);

30195

SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);

30196

SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));

30197

for (int i = 0; i != 2; ++i) {

30198

SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,

30199

SignSrc, DAG.getIntPtrConstant(i, dl));

30200

if (IsStrict)

30201

SignCvts[i] =

30202

DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},

30203

{N->getOperand(0), Elt});

30204

else

30205

SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);

30206

};

30207

SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);

30208

SDValue Slow, Chain;

30209

if (IsStrict) {

30210

Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,

30211

SignCvts[0].getValue(1), SignCvts[1].getValue(1));

30212

Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},

30213

{Chain, SignCvt, SignCvt});

30214

Chain = Slow.getValue(1);

30215

} else {

30216

Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);

30217

}

30218

IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);

30219

IsNeg =

30220

DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});

30221

SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);

30222

Results.push_back(Cvt);

30223

if (IsStrict)

30224

Results.push_back(Chain);

30225

return;

30226

}

30227

30228

if (SrcVT != MVT::v2i32)

30229

return;

30230

30231

if (IsSigned || Subtarget.hasAVX512()) {

30232

if (!IsStrict)

30233

return;

30234

30235

// Custom widen strict v2i32->v2f32 to avoid scalarization.

30236

// FIXME: Should generic type legalizer do this?

30237

Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,

30238

DAG.getConstant(0, dl, MVT::v2i32));

30239

SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},

30240

{N->getOperand(0), Src});

30241

Results.push_back(Res);

30242

Results.push_back(Res.getValue(1));

30243

return;

30244

}

30245

30246

assert(Subtarget.hasSSE2() && "Requires at least SSE2!")((Subtarget.hasSSE2() && "Requires at least SSE2!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Requires at least SSE2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30246, __PRETTY_FUNCTION__));

30247

SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);

30248

SDValue VBias =

30249

DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);

30250

SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,

30251

DAG.getBitcast(MVT::v2i64, VBias));

30252

Or = DAG.getBitcast(MVT::v2f64, Or);

30253

if (IsStrict) {

30254

SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},

30255

{N->getOperand(0), Or, VBias});

30256

SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,

30257

{MVT::v4f32, MVT::Other},

30258

{Sub.getValue(1), Sub});

30259

Results.push_back(Res);

30260

Results.push_back(Res.getValue(1));

30261

} else {

30262

// TODO: Are there any fast-math-flags to propagate here?

30263

SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);

30264

Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));

30265

}

30266

return;

30267

}

30268

case ISD::STRICT_FP_ROUND:

30269

case ISD::FP_ROUND: {

30270

bool IsStrict = N->isStrictFPOpcode();

30271

SDValue Src = N->getOperand(IsStrict ? 1 : 0);

30272

if (!isTypeLegal(Src.getValueType()))

30273

return;

30274

SDValue V;

30275

if (IsStrict)

30276

V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},

30277

{N->getOperand(0), N->getOperand(1)});

30278

else

30279

V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));

30280

Results.push_back(V);

30281

if (IsStrict)

30282

Results.push_back(V.getValue(1));

30283

return;

30284

}

30285

case ISD::FP_EXTEND:

30286

case ISD::STRICT_FP_EXTEND: {

30287

// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.

30288

// No other ValueType for FP_EXTEND should reach this point.

30289

assert(N->getValueType(0) == MVT::v2f32 &&((N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::v2f32 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30290, __PRETTY_FUNCTION__))

30290

"Do not know how to legalize this Node")((N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::v2f32 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30290, __PRETTY_FUNCTION__));

30291

return;

30292

}

30293

case ISD::INTRINSIC_W_CHAIN: {

30294

unsigned IntNo = N->getConstantOperandVal(1);

30295

switch (IntNo) {

30296

default : llvm_unreachable("Do not know how to custom type "::llvm::llvm_unreachable_internal("Do not know how to custom type "
"legalize this intrinsic operation!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30297)

30297

"legalize this intrinsic operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type "
"legalize this intrinsic operation!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30297);

30298

case Intrinsic::x86_rdtsc:

30299

return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,

30300

Results);

30301

case Intrinsic::x86_rdtscp:

30302

return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,

30303

Results);

30304

case Intrinsic::x86_rdpmc:

30305

expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,

30306

Results);

30307

return;

30308

case Intrinsic::x86_xgetbv:

30309

expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,

30310

Results);

30311

return;

30312

}

30313

}

30314

case ISD::READCYCLECOUNTER: {

30315

return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);

30316

}

30317

case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {

30318

EVT T = N->getValueType(0);

30319

assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair")(((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"
) ? static_cast<void> (0) : __assert_fail ("(T == MVT::i64 || T == MVT::i128) && \"can only expand cmpxchg pair\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30319, __PRETTY_FUNCTION__));

30320

bool Regs64bit = T == MVT::i128;

30321

assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&(((!Regs64bit || Subtarget.hasCmpxchg16b()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"
) ? static_cast<void> (0) : __assert_fail ("(!Regs64bit || Subtarget.hasCmpxchg16b()) && \"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30322, __PRETTY_FUNCTION__))

30322

"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B")(((!Regs64bit || Subtarget.hasCmpxchg16b()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"
) ? static_cast<void> (0) : __assert_fail ("(!Regs64bit || Subtarget.hasCmpxchg16b()) && \"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30322, __PRETTY_FUNCTION__));

30323

MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;

30324

SDValue cpInL, cpInH;

30325

cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),

30326

DAG.getConstant(0, dl, HalfT));

30327

cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),

30328

DAG.getConstant(1, dl, HalfT));

30329

cpInL = DAG.getCopyToReg(N->getOperand(0), dl,

30330

Regs64bit ? X86::RAX : X86::EAX,

30331

cpInL, SDValue());

30332

cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,

30333

Regs64bit ? X86::RDX : X86::EDX,

30334

cpInH, cpInL.getValue(1));

30335

SDValue swapInL, swapInH;

30336

swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),

30337

DAG.getConstant(0, dl, HalfT));

30338

swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),

30339

DAG.getConstant(1, dl, HalfT));

30340

swapInH =

30341

DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,

30342

swapInH, cpInH.getValue(1));

30343

// If the current function needs the base pointer, RBX,

30344

// we shouldn't use cmpxchg directly.

30345

// Indeed the lowering of that instruction will clobber

30346

// that register and since RBX will be a reserved register

30347

// the register allocator will not make sure its value will

30348

// be properly saved and restored around this live-range.

30349

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

30350

SDValue Result;

30351

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

30352

Register BasePtr = TRI->getBaseRegister();

30353

MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();

30354

if (TRI->hasBasePointer(DAG.getMachineFunction()) &&

30355

(BasePtr == X86::RBX || BasePtr == X86::EBX)) {

30356

// ISel prefers the LCMPXCHG64 variant.

30357

// If that assert breaks, that means it is not the case anymore,

30358

// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,

30359

// not just EBX. This is a matter of accepting i64 input for that

30360

// pseudo, and restoring into the register of the right wide

30361

// in expand pseudo. Everything else should just work.

30362

assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&((((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX
) && "Saving only half of the RBX") ? static_cast<
void> (0) : __assert_fail ("((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && \"Saving only half of the RBX\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30363, __PRETTY_FUNCTION__))

30363

"Saving only half of the RBX")((((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX
) && "Saving only half of the RBX") ? static_cast<
void> (0) : __assert_fail ("((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && \"Saving only half of the RBX\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30363, __PRETTY_FUNCTION__));

30364

unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG

30365

: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;

30366

SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,

30367

Regs64bit ? X86::RBX : X86::EBX,

30368

HalfT, swapInH.getValue(1));

30369

SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,

30370

RBXSave,

30371

/*Glue*/ RBXSave.getValue(2)};

30372

Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);

30373

} else {

30374

unsigned Opcode =

30375

Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;

30376

swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,

30377

Regs64bit ? X86::RBX : X86::EBX, swapInL,

30378

swapInH.getValue(1));

30379

SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),

30380

swapInL.getValue(1)};

30381

Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);

30382

}

30383

SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,

30384

Regs64bit ? X86::RAX : X86::EAX,

30385

HalfT, Result.getValue(1));

30386

SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,

30387

Regs64bit ? X86::RDX : X86::EDX,

30388

HalfT, cpOutL.getValue(2));

30389

SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

30390

30391

SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,

30392

MVT::i32, cpOutH.getValue(2));

30393

SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);

30394

Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

30395

30396

Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));

30397

Results.push_back(Success);

30398

Results.push_back(EFLAGS.getValue(1));

30399

return;

30400

}

30401

case ISD::ATOMIC_LOAD: {

30402

assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!")((N->getValueType(0) == MVT::i64 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::i64 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30402, __PRETTY_FUNCTION__));

30403

bool NoImplicitFloatOps =

30404

DAG.getMachineFunction().getFunction().hasFnAttribute(

30405

Attribute::NoImplicitFloat);

30406

if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {

30407

auto *Node = cast<AtomicSDNode>(N);

30408

if (Subtarget.hasSSE1()) {

30409

// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.

30410

// Then extract the lower 64-bits.

30411

MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;

30412

SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);

30413

SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };

30414

SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,

30415

MVT::i64, Node->getMemOperand());

30416

if (Subtarget.hasSSE2()) {

30417

SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,

30418

DAG.getIntPtrConstant(0, dl));

30419

Results.push_back(Res);

30420

Results.push_back(Ld.getValue(1));

30421

return;

30422

}

30423

// We use an alternative sequence for SSE1 that extracts as v2f32 and

30424

// then casts to i64. This avoids a 128-bit stack temporary being

30425

// created by type legalization if we were to cast v4f32->v2i64.

30426

SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,

30427

DAG.getIntPtrConstant(0, dl));

30428

Res = DAG.getBitcast(MVT::i64, Res);

30429

Results.push_back(Res);

30430

Results.push_back(Ld.getValue(1));

30431

return;

30432

}

30433

if (Subtarget.hasX87()) {

30434

// First load this into an 80-bit X87 register. This will put the whole

30435

// integer into the significand.

30436

SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);

30437

SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };

30438

SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,

30439

dl, Tys, Ops, MVT::i64,

30440

Node->getMemOperand());

30441

SDValue Chain = Result.getValue(1);

30442

30443

// Now store the X87 register to a stack temporary and convert to i64.

30444

// This store is not atomic and doesn't need to be.

30445

// FIXME: We don't need a stack temporary if the result of the load

30446

// is already being stored. We could just directly store there.

30447

SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);

30448

int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();

30449

MachinePointerInfo MPI =

30450

MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

30451

SDValue StoreOps[] = { Chain, Result, StackPtr };

30452

Chain = DAG.getMemIntrinsicNode(

30453

X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,

30454

MPI, None /*Align*/, MachineMemOperand::MOStore);

30455

30456

// Finally load the value back from the stack temporary and return it.

30457

// This load is not atomic and doesn't need to be.

30458

// This load will be further type legalized.

30459

Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);

30460

Results.push_back(Result);

30461

Results.push_back(Result.getValue(1));

30462

return;

30463

}

30464

}

30465

// TODO: Use MOVLPS when SSE1 is available?

30466

// Delegate to generic TypeLegalization. Situations we can really handle

30467

// should have already been dealt with by AtomicExpandPass.cpp.

30468

break;

30469

}

30470

case ISD::ATOMIC_SWAP:

30471

case ISD::ATOMIC_LOAD_ADD:

30472

case ISD::ATOMIC_LOAD_SUB:

30473

case ISD::ATOMIC_LOAD_AND:

30474

case ISD::ATOMIC_LOAD_OR:

30475

case ISD::ATOMIC_LOAD_XOR:

30476

case ISD::ATOMIC_LOAD_NAND:

30477

case ISD::ATOMIC_LOAD_MIN:

30478

case ISD::ATOMIC_LOAD_MAX:

30479

case ISD::ATOMIC_LOAD_UMIN:

30480

case ISD::ATOMIC_LOAD_UMAX:

30481

// Delegate to generic TypeLegalization. Situations we can really handle

30482

// should have already been dealt with by AtomicExpandPass.cpp.

30483

break;

30484

30485

case ISD::BITCAST: {

30486

assert(Subtarget.hasSSE2() && "Requires at least SSE2!")((Subtarget.hasSSE2() && "Requires at least SSE2!") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Requires at least SSE2!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30486, __PRETTY_FUNCTION__));

30487

EVT DstVT = N->getValueType(0);

30488

EVT SrcVT = N->getOperand(0).getValueType();

30489

30490

// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target

30491

// we can split using the k-register rather than memory.

30492

if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {

30493

assert(!Subtarget.is64Bit() && "Expected 32-bit mode")((!Subtarget.is64Bit() && "Expected 32-bit mode") ? static_cast
<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Expected 32-bit mode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30493, __PRETTY_FUNCTION__));

30494

SDValue Lo, Hi;

30495

std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);

30496

Lo = DAG.getBitcast(MVT::i32, Lo);

30497

Hi = DAG.getBitcast(MVT::i32, Hi);

30498

SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);

30499

Results.push_back(Res);

30500

return;

30501

}

30502

30503

if (DstVT.isVector() && SrcVT == MVT::x86mmx) {

30504

// FIXME: Use v4f32 for SSE1?

30505

assert(Subtarget.hasSSE2() && "Requires SSE2")((Subtarget.hasSSE2() && "Requires SSE2") ? static_cast
<void> (0) : __assert_fail ("Subtarget.hasSSE2() && \"Requires SSE2\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30505, __PRETTY_FUNCTION__));

30506

assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30507, __PRETTY_FUNCTION__))

30507

"Unexpected type action!")((getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30507, __PRETTY_FUNCTION__));

30508

EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);

30509

SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,

30510

N->getOperand(0));

30511

Res = DAG.getBitcast(WideVT, Res);

30512

Results.push_back(Res);

30513

return;

30514

}

30515

30516

return;

30517

}

30518

case ISD::MGATHER: {

30519

EVT VT = N->getValueType(0);

30520

if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&

30521

(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {

30522

auto *Gather = cast<MaskedGatherSDNode>(N);

30523

SDValue Index = Gather->getIndex();

30524

if (Index.getValueType() != MVT::v2i64)

30525

return;

30526

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30527, __PRETTY_FUNCTION__))

30527

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30527, __PRETTY_FUNCTION__));

30528

EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);

30529

SDValue Mask = Gather->getMask();

30530

assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type")((Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"
) ? static_cast<void> (0) : __assert_fail ("Mask.getValueType() == MVT::v2i1 && \"Unexpected mask type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30530, __PRETTY_FUNCTION__));

30531

SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,

30532

Gather->getPassThru(),

30533

DAG.getUNDEF(VT));

30534

if (!Subtarget.hasVLX()) {

30535

// We need to widen the mask, but the instruction will only use 2

30536

// of its elements. So we can use undef.

30537

Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,

30538

DAG.getUNDEF(MVT::v2i1));

30539

Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);

30540

}

30541

SDValue Ops[] = { Gather->getChain(), PassThru, Mask,

30542

Gather->getBasePtr(), Index, Gather->getScale() };

30543

SDValue Res = DAG.getMemIntrinsicNode(

30544

X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,

30545

Gather->getMemoryVT(), Gather->getMemOperand());

30546

Results.push_back(Res);

30547

Results.push_back(Res.getValue(1));

30548

return;

30549

}

30550

return;

30551

}

30552

case ISD::LOAD: {

30553

// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This

30554

// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp

30555

// cast since type legalization will try to use an i64 load.

30556

MVT VT = N->getSimpleValueType(0);

30557

assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT")((VT.isVector() && VT.getSizeInBits() == 64 &&
"Unexpected VT") ? static_cast<void> (0) : __assert_fail
("VT.isVector() && VT.getSizeInBits() == 64 && \"Unexpected VT\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30557, __PRETTY_FUNCTION__));

30558

assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30559, __PRETTY_FUNCTION__))

30559

"Unexpected type action!")((getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!") ? static_cast<void> (0) : __assert_fail
("getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && \"Unexpected type action!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30559, __PRETTY_FUNCTION__));

30560

if (!ISD::isNON_EXTLoad(N))

30561

return;

30562

auto *Ld = cast<LoadSDNode>(N);

30563

if (Subtarget.hasSSE2()) {

30564

MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;

30565

SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),

30566

Ld->getPointerInfo(), Ld->getOriginalAlign(),

30567

Ld->getMemOperand()->getFlags());

30568

SDValue Chain = Res.getValue(1);

30569

MVT VecVT = MVT::getVectorVT(LdVT, 2);

30570

Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);

30571

EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);

30572

Res = DAG.getBitcast(WideVT, Res);

30573

Results.push_back(Res);

30574

Results.push_back(Chain);

30575

return;

30576

}

30577

assert(Subtarget.hasSSE1() && "Expected SSE")((Subtarget.hasSSE1() && "Expected SSE") ? static_cast
<void> (0) : __assert_fail ("Subtarget.hasSSE1() && \"Expected SSE\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 30577, __PRETTY_FUNCTION__));

30578

SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);

30579

SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};

30580

SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,

30581

MVT::i64, Ld->getMemOperand());

30582

Results.push_back(Res);

30583

Results.push_back(Res.getValue(1));

30584

return;

30585

}

30586

case ISD::ADDRSPACECAST: {

30587

SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);

30588

Results.push_back(V);

30589

return;

30590

}

30591

}

30592

}

30593

30594

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {

30595

switch ((X86ISD::NodeType)Opcode) {

30596

case X86ISD::FIRST_NUMBER: break;

30597

#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;

30598

NODE_NAME_CASE(BSF)

30599

NODE_NAME_CASE(BSR)

30600

NODE_NAME_CASE(FSHL)

30601

NODE_NAME_CASE(FSHR)

30602

NODE_NAME_CASE(FAND)

30603

NODE_NAME_CASE(FANDN)

30604

NODE_NAME_CASE(FOR)

30605

NODE_NAME_CASE(FXOR)

30606

NODE_NAME_CASE(FILD)

30607

NODE_NAME_CASE(FIST)

30608

NODE_NAME_CASE(FP_TO_INT_IN_MEM)

30609

NODE_NAME_CASE(FLD)

30610

NODE_NAME_CASE(FST)

30611

NODE_NAME_CASE(CALL)

30612

NODE_NAME_CASE(BT)

30613

NODE_NAME_CASE(CMP)

30614

NODE_NAME_CASE(FCMP)

30615

NODE_NAME_CASE(STRICT_FCMP)

30616

NODE_NAME_CASE(STRICT_FCMPS)

30617

NODE_NAME_CASE(COMI)

30618

NODE_NAME_CASE(UCOMI)

30619

NODE_NAME_CASE(CMPM)

30620

NODE_NAME_CASE(CMPMM)

30621

NODE_NAME_CASE(STRICT_CMPM)

30622

NODE_NAME_CASE(CMPMM_SAE)

30623

NODE_NAME_CASE(SETCC)

30624

NODE_NAME_CASE(SETCC_CARRY)

30625

NODE_NAME_CASE(FSETCC)

30626

NODE_NAME_CASE(FSETCCM)

30627

NODE_NAME_CASE(FSETCCM_SAE)

30628

NODE_NAME_CASE(CMOV)

30629

NODE_NAME_CASE(BRCOND)

30630

NODE_NAME_CASE(RET_FLAG)

30631

NODE_NAME_CASE(IRET)

30632

NODE_NAME_CASE(REP_STOS)

30633

NODE_NAME_CASE(REP_MOVS)

30634

NODE_NAME_CASE(GlobalBaseReg)

30635

NODE_NAME_CASE(Wrapper)

30636

NODE_NAME_CASE(WrapperRIP)

30637

NODE_NAME_CASE(MOVQ2DQ)

30638

NODE_NAME_CASE(MOVDQ2Q)

30639

NODE_NAME_CASE(MMX_MOVD2W)

30640

NODE_NAME_CASE(MMX_MOVW2D)

30641

NODE_NAME_CASE(PEXTRB)

30642

NODE_NAME_CASE(PEXTRW)

30643

NODE_NAME_CASE(INSERTPS)

30644

NODE_NAME_CASE(PINSRB)

30645

NODE_NAME_CASE(PINSRW)

30646

NODE_NAME_CASE(PSHUFB)

30647

NODE_NAME_CASE(ANDNP)

30648

NODE_NAME_CASE(BLENDI)

30649

NODE_NAME_CASE(BLENDV)

30650

NODE_NAME_CASE(HADD)

30651

NODE_NAME_CASE(HSUB)

30652

NODE_NAME_CASE(FHADD)

30653

NODE_NAME_CASE(FHSUB)

30654

NODE_NAME_CASE(CONFLICT)

30655

NODE_NAME_CASE(FMAX)

30656

NODE_NAME_CASE(FMAXS)

30657

NODE_NAME_CASE(FMAX_SAE)

30658

NODE_NAME_CASE(FMAXS_SAE)

30659

NODE_NAME_CASE(FMIN)

30660

NODE_NAME_CASE(FMINS)

30661

NODE_NAME_CASE(FMIN_SAE)

30662

NODE_NAME_CASE(FMINS_SAE)

30663

NODE_NAME_CASE(FMAXC)

30664

NODE_NAME_CASE(FMINC)

30665

NODE_NAME_CASE(FRSQRT)

30666

NODE_NAME_CASE(FRCP)

30667

NODE_NAME_CASE(EXTRQI)

30668

NODE_NAME_CASE(INSERTQI)

30669

NODE_NAME_CASE(TLSADDR)

30670

NODE_NAME_CASE(TLSBASEADDR)

30671

NODE_NAME_CASE(TLSCALL)

30672

NODE_NAME_CASE(EH_SJLJ_SETJMP)

30673

NODE_NAME_CASE(EH_SJLJ_LONGJMP)

30674

NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)

30675

NODE_NAME_CASE(EH_RETURN)

30676

NODE_NAME_CASE(TC_RETURN)

30677

NODE_NAME_CASE(FNSTCW16m)

30678

NODE_NAME_CASE(LCMPXCHG_DAG)

30679

NODE_NAME_CASE(LCMPXCHG8_DAG)

30680

NODE_NAME_CASE(LCMPXCHG16_DAG)

30681

NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)

30682

NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)

30683

NODE_NAME_CASE(MWAITX_DAG)

30684

NODE_NAME_CASE(LADD)

30685

NODE_NAME_CASE(LSUB)

30686

NODE_NAME_CASE(LOR)

30687

NODE_NAME_CASE(LXOR)

30688

NODE_NAME_CASE(LAND)

30689

NODE_NAME_CASE(VZEXT_MOVL)

30690

NODE_NAME_CASE(VZEXT_LOAD)

30691

NODE_NAME_CASE(VEXTRACT_STORE)

30692

NODE_NAME_CASE(VTRUNC)

30693

NODE_NAME_CASE(VTRUNCS)

30694

NODE_NAME_CASE(VTRUNCUS)

30695

NODE_NAME_CASE(VMTRUNC)

30696

NODE_NAME_CASE(VMTRUNCS)

30697

NODE_NAME_CASE(VMTRUNCUS)

30698

NODE_NAME_CASE(VTRUNCSTORES)

30699

NODE_NAME_CASE(VTRUNCSTOREUS)

30700

NODE_NAME_CASE(VMTRUNCSTORES)

30701

NODE_NAME_CASE(VMTRUNCSTOREUS)

30702

NODE_NAME_CASE(VFPEXT)

30703

NODE_NAME_CASE(STRICT_VFPEXT)

30704

NODE_NAME_CASE(VFPEXT_SAE)

30705

NODE_NAME_CASE(VFPEXTS)

30706

NODE_NAME_CASE(VFPEXTS_SAE)

30707

NODE_NAME_CASE(VFPROUND)

30708

NODE_NAME_CASE(STRICT_VFPROUND)

30709

NODE_NAME_CASE(VMFPROUND)

30710

NODE_NAME_CASE(VFPROUND_RND)

30711

NODE_NAME_CASE(VFPROUNDS)

30712

NODE_NAME_CASE(VFPROUNDS_RND)

30713

NODE_NAME_CASE(VSHLDQ)

30714

NODE_NAME_CASE(VSRLDQ)

30715

NODE_NAME_CASE(VSHL)

30716

NODE_NAME_CASE(VSRL)

30717

NODE_NAME_CASE(VSRA)

30718

NODE_NAME_CASE(VSHLI)

30719

NODE_NAME_CASE(VSRLI)

30720

NODE_NAME_CASE(VSRAI)

30721

NODE_NAME_CASE(VSHLV)

30722

NODE_NAME_CASE(VSRLV)

30723

NODE_NAME_CASE(VSRAV)

30724

NODE_NAME_CASE(VROTLI)

30725

NODE_NAME_CASE(VROTRI)

30726

NODE_NAME_CASE(VPPERM)

30727

NODE_NAME_CASE(CMPP)

30728

NODE_NAME_CASE(STRICT_CMPP)

30729

NODE_NAME_CASE(PCMPEQ)

30730

NODE_NAME_CASE(PCMPGT)

30731

NODE_NAME_CASE(PHMINPOS)

30732

NODE_NAME_CASE(ADD)

30733

NODE_NAME_CASE(SUB)

30734

NODE_NAME_CASE(ADC)

30735

NODE_NAME_CASE(SBB)

30736

NODE_NAME_CASE(SMUL)

30737

NODE_NAME_CASE(UMUL)

30738

NODE_NAME_CASE(OR)

30739

NODE_NAME_CASE(XOR)

30740

NODE_NAME_CASE(AND)

30741

NODE_NAME_CASE(BEXTR)

30742

NODE_NAME_CASE(BZHI)

30743

NODE_NAME_CASE(PDEP)

30744

NODE_NAME_CASE(PEXT)

30745

NODE_NAME_CASE(MUL_IMM)

30746

NODE_NAME_CASE(MOVMSK)

30747

NODE_NAME_CASE(PTEST)

30748

NODE_NAME_CASE(TESTP)

30749

NODE_NAME_CASE(KORTEST)

30750

NODE_NAME_CASE(KTEST)

30751

NODE_NAME_CASE(KADD)

30752

NODE_NAME_CASE(KSHIFTL)

30753

NODE_NAME_CASE(KSHIFTR)

30754

NODE_NAME_CASE(PACKSS)

30755

NODE_NAME_CASE(PACKUS)

30756

NODE_NAME_CASE(PALIGNR)

30757

NODE_NAME_CASE(VALIGN)

30758

NODE_NAME_CASE(VSHLD)

30759

NODE_NAME_CASE(VSHRD)

30760

NODE_NAME_CASE(VSHLDV)

30761

NODE_NAME_CASE(VSHRDV)

30762

NODE_NAME_CASE(PSHUFD)

30763

NODE_NAME_CASE(PSHUFHW)

30764

NODE_NAME_CASE(PSHUFLW)

30765

NODE_NAME_CASE(SHUFP)

30766

NODE_NAME_CASE(SHUF128)

30767

NODE_NAME_CASE(MOVLHPS)

30768

NODE_NAME_CASE(MOVHLPS)

30769

NODE_NAME_CASE(MOVDDUP)

30770

NODE_NAME_CASE(MOVSHDUP)

30771

NODE_NAME_CASE(MOVSLDUP)

30772

NODE_NAME_CASE(MOVSD)

30773

NODE_NAME_CASE(MOVSS)

30774

NODE_NAME_CASE(UNPCKL)

30775

NODE_NAME_CASE(UNPCKH)

30776

NODE_NAME_CASE(VBROADCAST)

30777

NODE_NAME_CASE(VBROADCAST_LOAD)

30778

NODE_NAME_CASE(VBROADCASTM)

30779

NODE_NAME_CASE(SUBV_BROADCAST)

30780

NODE_NAME_CASE(VPERMILPV)

30781

NODE_NAME_CASE(VPERMILPI)

30782

NODE_NAME_CASE(VPERM2X128)

30783

NODE_NAME_CASE(VPERMV)

30784

NODE_NAME_CASE(VPERMV3)

30785

NODE_NAME_CASE(VPERMI)

30786

NODE_NAME_CASE(VPTERNLOG)

30787

NODE_NAME_CASE(VFIXUPIMM)

30788

NODE_NAME_CASE(VFIXUPIMM_SAE)

30789

NODE_NAME_CASE(VFIXUPIMMS)

30790

NODE_NAME_CASE(VFIXUPIMMS_SAE)

30791

NODE_NAME_CASE(VRANGE)

30792

NODE_NAME_CASE(VRANGE_SAE)

30793

NODE_NAME_CASE(VRANGES)

30794

NODE_NAME_CASE(VRANGES_SAE)

30795

NODE_NAME_CASE(PMULUDQ)

30796

NODE_NAME_CASE(PMULDQ)

30797

NODE_NAME_CASE(PSADBW)

30798

NODE_NAME_CASE(DBPSADBW)

30799

NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)

30800

NODE_NAME_CASE(VAARG_64)

30801

NODE_NAME_CASE(WIN_ALLOCA)

30802

NODE_NAME_CASE(MEMBARRIER)

30803

NODE_NAME_CASE(MFENCE)

30804

NODE_NAME_CASE(SEG_ALLOCA)

30805

NODE_NAME_CASE(PROBED_ALLOCA)

30806

NODE_NAME_CASE(RDRAND)

30807

NODE_NAME_CASE(RDSEED)

30808

NODE_NAME_CASE(RDPKRU)

30809

NODE_NAME_CASE(WRPKRU)

30810

NODE_NAME_CASE(VPMADDUBSW)

30811

NODE_NAME_CASE(VPMADDWD)

30812

NODE_NAME_CASE(VPSHA)

30813

NODE_NAME_CASE(VPSHL)

30814

NODE_NAME_CASE(VPCOM)

30815

NODE_NAME_CASE(VPCOMU)

30816

NODE_NAME_CASE(VPERMIL2)

30817

NODE_NAME_CASE(FMSUB)

30818

NODE_NAME_CASE(STRICT_FMSUB)

30819

NODE_NAME_CASE(FNMADD)

30820

NODE_NAME_CASE(STRICT_FNMADD)

30821

NODE_NAME_CASE(FNMSUB)

30822

NODE_NAME_CASE(STRICT_FNMSUB)

30823

NODE_NAME_CASE(FMADDSUB)

30824

NODE_NAME_CASE(FMSUBADD)

30825

NODE_NAME_CASE(FMADD_RND)

30826

NODE_NAME_CASE(FNMADD_RND)

30827

NODE_NAME_CASE(FMSUB_RND)

30828

NODE_NAME_CASE(FNMSUB_RND)

30829

NODE_NAME_CASE(FMADDSUB_RND)

30830

NODE_NAME_CASE(FMSUBADD_RND)

30831

NODE_NAME_CASE(VPMADD52H)

30832

NODE_NAME_CASE(VPMADD52L)

30833

NODE_NAME_CASE(VRNDSCALE)

30834

NODE_NAME_CASE(STRICT_VRNDSCALE)

30835

NODE_NAME_CASE(VRNDSCALE_SAE)

30836

NODE_NAME_CASE(VRNDSCALES)

30837

NODE_NAME_CASE(VRNDSCALES_SAE)

30838

NODE_NAME_CASE(VREDUCE)

30839

NODE_NAME_CASE(VREDUCE_SAE)

30840

NODE_NAME_CASE(VREDUCES)

30841

NODE_NAME_CASE(VREDUCES_SAE)

30842

NODE_NAME_CASE(VGETMANT)

30843

NODE_NAME_CASE(VGETMANT_SAE)

30844

NODE_NAME_CASE(VGETMANTS)

30845

NODE_NAME_CASE(VGETMANTS_SAE)

30846

NODE_NAME_CASE(PCMPESTR)

30847

NODE_NAME_CASE(PCMPISTR)

30848

NODE_NAME_CASE(XTEST)

30849

NODE_NAME_CASE(COMPRESS)

30850

NODE_NAME_CASE(EXPAND)

30851

NODE_NAME_CASE(SELECTS)

30852

NODE_NAME_CASE(ADDSUB)

30853

NODE_NAME_CASE(RCP14)

30854

NODE_NAME_CASE(RCP14S)

30855

NODE_NAME_CASE(RCP28)

30856

NODE_NAME_CASE(RCP28_SAE)

30857

NODE_NAME_CASE(RCP28S)

30858

NODE_NAME_CASE(RCP28S_SAE)

30859

NODE_NAME_CASE(EXP2)

30860

NODE_NAME_CASE(EXP2_SAE)

30861

NODE_NAME_CASE(RSQRT14)

30862

NODE_NAME_CASE(RSQRT14S)

30863

NODE_NAME_CASE(RSQRT28)

30864

NODE_NAME_CASE(RSQRT28_SAE)

30865

NODE_NAME_CASE(RSQRT28S)

30866

NODE_NAME_CASE(RSQRT28S_SAE)

30867

NODE_NAME_CASE(FADD_RND)

30868

NODE_NAME_CASE(FADDS)

30869

NODE_NAME_CASE(FADDS_RND)

30870

NODE_NAME_CASE(FSUB_RND)

30871

NODE_NAME_CASE(FSUBS)

30872

NODE_NAME_CASE(FSUBS_RND)

30873

NODE_NAME_CASE(FMUL_RND)

30874

NODE_NAME_CASE(FMULS)

30875

NODE_NAME_CASE(FMULS_RND)

30876

NODE_NAME_CASE(FDIV_RND)

30877

NODE_NAME_CASE(FDIVS)

30878

NODE_NAME_CASE(FDIVS_RND)

30879

NODE_NAME_CASE(FSQRT_RND)

30880

NODE_NAME_CASE(FSQRTS)

30881

NODE_NAME_CASE(FSQRTS_RND)

30882

NODE_NAME_CASE(FGETEXP)

30883

NODE_NAME_CASE(FGETEXP_SAE)

30884

NODE_NAME_CASE(FGETEXPS)

30885

NODE_NAME_CASE(FGETEXPS_SAE)

30886

NODE_NAME_CASE(SCALEF)

30887

NODE_NAME_CASE(SCALEF_RND)

30888

NODE_NAME_CASE(SCALEFS)

30889

NODE_NAME_CASE(SCALEFS_RND)

30890

NODE_NAME_CASE(AVG)

30891

NODE_NAME_CASE(MULHRS)

30892

NODE_NAME_CASE(SINT_TO_FP_RND)

30893

NODE_NAME_CASE(UINT_TO_FP_RND)

30894

NODE_NAME_CASE(CVTTP2SI)

30895

NODE_NAME_CASE(CVTTP2UI)

30896

NODE_NAME_CASE(STRICT_CVTTP2SI)

30897

NODE_NAME_CASE(STRICT_CVTTP2UI)

30898

NODE_NAME_CASE(MCVTTP2SI)

30899

NODE_NAME_CASE(MCVTTP2UI)

30900

NODE_NAME_CASE(CVTTP2SI_SAE)

30901

NODE_NAME_CASE(CVTTP2UI_SAE)

30902

NODE_NAME_CASE(CVTTS2SI)

30903

NODE_NAME_CASE(CVTTS2UI)

30904

NODE_NAME_CASE(CVTTS2SI_SAE)

30905

NODE_NAME_CASE(CVTTS2UI_SAE)

30906

NODE_NAME_CASE(CVTSI2P)

30907

NODE_NAME_CASE(CVTUI2P)

30908

NODE_NAME_CASE(STRICT_CVTSI2P)

30909

NODE_NAME_CASE(STRICT_CVTUI2P)

30910

NODE_NAME_CASE(MCVTSI2P)

30911

NODE_NAME_CASE(MCVTUI2P)

30912

NODE_NAME_CASE(VFPCLASS)

30913

NODE_NAME_CASE(VFPCLASSS)

30914

NODE_NAME_CASE(MULTISHIFT)

30915

NODE_NAME_CASE(SCALAR_SINT_TO_FP)

30916

NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)

30917

NODE_NAME_CASE(SCALAR_UINT_TO_FP)

30918

NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)

30919

NODE_NAME_CASE(CVTPS2PH)

30920

NODE_NAME_CASE(STRICT_CVTPS2PH)

30921

NODE_NAME_CASE(MCVTPS2PH)

30922

NODE_NAME_CASE(CVTPH2PS)

30923

NODE_NAME_CASE(STRICT_CVTPH2PS)

30924

NODE_NAME_CASE(CVTPH2PS_SAE)

30925

NODE_NAME_CASE(CVTP2SI)

30926

NODE_NAME_CASE(CVTP2UI)

30927

NODE_NAME_CASE(MCVTP2SI)

30928

NODE_NAME_CASE(MCVTP2UI)

30929

NODE_NAME_CASE(CVTP2SI_RND)

30930

NODE_NAME_CASE(CVTP2UI_RND)

30931

NODE_NAME_CASE(CVTS2SI)

30932

NODE_NAME_CASE(CVTS2UI)

30933

NODE_NAME_CASE(CVTS2SI_RND)

30934

NODE_NAME_CASE(CVTS2UI_RND)

30935

NODE_NAME_CASE(CVTNE2PS2BF16)

30936

NODE_NAME_CASE(CVTNEPS2BF16)

30937

NODE_NAME_CASE(MCVTNEPS2BF16)

30938

NODE_NAME_CASE(DPBF16PS)

30939

NODE_NAME_CASE(LWPINS)

30940

NODE_NAME_CASE(MGATHER)

30941

NODE_NAME_CASE(MSCATTER)

30942

NODE_NAME_CASE(VPDPBUSD)

30943

NODE_NAME_CASE(VPDPBUSDS)

30944

NODE_NAME_CASE(VPDPWSSD)

30945

NODE_NAME_CASE(VPDPWSSDS)

30946

NODE_NAME_CASE(VPSHUFBITQMB)

30947

NODE_NAME_CASE(GF2P8MULB)

30948

NODE_NAME_CASE(GF2P8AFFINEQB)

30949

NODE_NAME_CASE(GF2P8AFFINEINVQB)

30950

NODE_NAME_CASE(NT_CALL)

30951

NODE_NAME_CASE(NT_BRIND)

30952

NODE_NAME_CASE(UMWAIT)

30953

NODE_NAME_CASE(TPAUSE)

30954

NODE_NAME_CASE(ENQCMD)

30955

NODE_NAME_CASE(ENQCMDS)

30956

NODE_NAME_CASE(VP2INTERSECT)

30957

}

30958

return nullptr;

30959

#undef NODE_NAME_CASE

30960

}

30961

30962

/// Return true if the addressing mode represented by AM is legal for this

30963

/// target, for a load/store of the specified type.

30964

bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,

30965

const AddrMode &AM, Type *Ty,

30966

unsigned AS,

30967

Instruction *I) const {

30968

// X86 supports extremely general addressing modes.

30969

CodeModel::Model M = getTargetMachine().getCodeModel();

30970

30971

// X86 allows a sign-extended 32-bit immediate field as a displacement.

30972

if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))

30973

return false;

30974

30975

if (AM.BaseGV) {

30976

unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

30977

30978

// If a reference to this global requires an extra load, we can't fold it.

30979

if (isGlobalStubReference(GVFlags))

30980

return false;

30981

30982

// If BaseGV requires a register for the PIC base, we cannot also have a

30983

// BaseReg specified.

30984

if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))

30985

return false;

30986

30987

// If lower 4G is not available, then we must use rip-relative addressing.

30988

if ((M != CodeModel::Small || isPositionIndependent()) &&

30989

Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))

30990

return false;

30991

}

30992

30993

switch (AM.Scale) {

30994

case 0:

30995

case 1:

30996

case 2:

30997

case 4:

30998

case 8:

30999

// These scales always work.

31000

break;

31001

case 3:

31002

case 5:

31003

case 9:

31004

// These scales are formed with basereg+scalereg. Only accept if there is

31005

// no basereg yet.

31006

if (AM.HasBaseReg)

31007

return false;

31008

break;

31009

default: // Other stuff never works.

31010

return false;

31011

}

31012

31013

return true;

31014

}

31015

31016

/// Report whether shifting a vector of type \p Ty by a scalar (uniform)
/// amount is significantly cheaper than shifting by a fully general vector
/// of amounts.
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  switch (Ty->getScalarSizeInBits()) {
  case 8:
    // 8-bit shifts are always expensive, and the scalar-amount versions are
    // not particularly cheaper than the others.
    return false;
  case 16:
    // XOP has variable vector shifts for this width (splitting 256-bit
    // vectors is still preferred), and AVX512BW has shifts such as vpsllvw.
    return !Subtarget.hasXOP() && !Subtarget.hasBWI();
  case 32:
  case 64:
    // XOP covers these widths too, and AVX2 has vpsllv[dq] (and other
    // shifts) that make variable shifts just as cheap as scalar ones.
    return !Subtarget.hasXOP() && !Subtarget.hasAVX2();
  default:
    // Otherwise a scalar amount is significantly cheaper than a fully
    // general vector.
    return true;
  }
}

31043

31044

bool X86TargetLowering::isBinOp(unsigned Opcode) const {

31045

switch (Opcode) {

31046

// These are non-commutative binops.

31047

// TODO: Add more X86ISD opcodes once we have test coverage.

31048

case X86ISD::ANDNP:

31049

case X86ISD::PCMPGT:

31050

case X86ISD::FMAX:

31051

case X86ISD::FMIN:

31052

case X86ISD::FANDN:

31053

return true;

31054

}

31055

31056

return TargetLoweringBase::isBinOp(Opcode);

31057

}

31058

31059

bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {

31060

switch (Opcode) {

31061

// TODO: Add more X86ISD opcodes once we have test coverage.

31062

case X86ISD::PCMPEQ:

31063

case X86ISD::PMULDQ:

31064

case X86ISD::PMULUDQ:

31065

case X86ISD::FMAXC:

31066

case X86ISD::FMINC:

31067

case X86ISD::FAND:

31068

case X86ISD::FOR:

31069

case X86ISD::FXOR:

31070

return true;

31071

}

31072

31073

return TargetLoweringBase::isCommutativeBinOp(Opcode);

31074

}

31075

31076

/// Return true if truncating a value of type \p Ty1 to type \p Ty2 is free.
/// This holds only when both are integer types and \p Ty1 is strictly wider
/// than \p Ty2.
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (Ty1->isIntegerTy() && Ty2->isIntegerTy()) {
    unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
    unsigned DstBits = Ty2->getPrimitiveSizeInBits();
    return SrcBits > DstBits;
  }
  return false;
}

31083

31084

/// Return true if a truncation from \p Ty1 to \p Ty2 may be looked through
/// when forming a tail call. Both types must be integers and \p Ty1 must be
/// a legal type.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

31097

31098

/// Return true if \p Imm can be used as the immediate operand of an integer
/// compare, i.e. if it fits in a signed 32-bit field.
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}

31101

31102

/// Return true if \p Imm can be used as the immediate operand of an add,
/// i.e. if it fits in a signed 32-bit field. Negated immediates can be
/// handled with sub as well.
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}

31106

31107

/// Return true if \p Imm can be stored as an immediate, i.e. if it fits in a
/// signed 32-bit field.
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}

31110

31111

/// Return true if truncating a value of type \p VT1 to type \p VT2 is free.
/// This holds only when both are scalar integer types and \p VT1 is strictly
/// wider than \p VT2.
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isScalarInteger() && VT2.isScalarInteger()) {
    unsigned SrcBits = VT1.getSizeInBits();
    unsigned DstBits = VT2.getSizeInBits();
    return SrcBits > DstBits;
  }
  return false;
}

31118

31119

/// Return true if zero-extending from \p Ty1 to \p Ty2 is free. Only the
/// i32 -> i64 case on a 64-bit subtarget qualifies, because x86-64
/// implicitly zero-extends 32-bit results in 64-bit registers.
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Subtarget.is64Bit())
    return false;
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64);
}

31123

31124

/// Return true if zero-extending from \p VT1 to \p VT2 is free. Only the
/// i32 -> i64 case on a 64-bit subtarget qualifies, because x86-64
/// implicitly zero-extends 32-bit results in 64-bit registers.
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (!Subtarget.is64Bit())
    return false;
  return VT1 == MVT::i32 && VT2 == MVT::i64;
}

31128

31129

/// Return true if zero-extending the value \p Val to type \p VT2 is free.
///
/// Besides the type-only rule of isZExtFree(EVT, EVT), an i8/i16/i32 value
/// produced by a load is accepted when both types are simple integers,
/// since X86 has 8, 16, and 32-bit zero-extending loads.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  const EVT SrcVT = Val.getValueType();
  if (isZExtFree(SrcVT, VT2))
    return true;

  // From here on only loads of simple integer types can qualify.
  const bool IsLoad = Val.getOpcode() == ISD::LOAD;
  const bool BothSimpleInts = SrcVT.isSimple() && SrcVT.isInteger() &&
                              VT2.isSimple() && VT2.isInteger();
  if (!IsLoad || !BothSimpleInts)
    return false;

  const MVT SrcMVT = SrcVT.getSimpleVT();
  return SrcMVT == MVT::i8 || SrcMVT == MVT::i16 || SrcMVT == MVT::i32;
}

31152

31153

bool X86TargetLowering::shouldSinkOperands(Instruction *I,

31154

SmallVectorImpl<Use *> &Ops) const {

31155

// A uniform shift amount in a vector shift or funnel shift may be much

31156

// cheaper than a generic variable vector shift, so make that pattern visible

31157

// to SDAG by sinking the shuffle instruction next to the shift.

31158

int ShiftAmountOpNum = -1;

31159

if (I->isShift())

31160

ShiftAmountOpNum = 1;

31161

else if (auto *II = dyn_cast<IntrinsicInst>(I)) {

31162

if (II->getIntrinsicID() == Intrinsic::fshl ||

31163

II->getIntrinsicID() == Intrinsic::fshr)

31164

ShiftAmountOpNum = 2;

31165

}

31166

31167

if (ShiftAmountOpNum == -1)

31168

return false;

31169

31170

auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));

31171

if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&

31172

isVectorShiftByScalarCheap(I->getType())) {

31173

Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));

31174

return true;

31175

}

31176

31177

return false;

31178

}

31179

31180

/// Return true if a phi of type \p From should be converted to type \p To.
/// Never done on non-64-bit subtargets; otherwise the generic
/// TargetLowering policy decides.
bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
  return Subtarget.is64Bit() &&
         TargetLowering::shouldConvertPhiType(From, To);
}

31185

31186

/// Return true if folding the extension \p ExtVal into the load feeding it
/// is desirable. Masked loads and vXi1 sources are rejected.
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  const SDValue Src = ExtVal.getOperand(0);
  if (isa<MaskedLoadSDNode>(Src))
    return false;

  // There is no extending load for vXi1.
  return Src.getValueType().getScalarType() != MVT::i1;
}

31198

31199

bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,

31200

EVT VT) const {

31201

if (!Subtarget.hasAnyFMA())

31202

return false;

31203

31204

VT = VT.getScalarType();

31205

31206

if (!VT.isSimple())

31207

return false;

31208

31209

switch (VT.getSimpleVT().SimpleTy) {

31210

case MVT::f32:

31211

case MVT::f64:

31212

return true;

31213

default:

31214

break;

31215

}

31216

31217

return false;

31218

}

31219

31220

/// Return true if narrowing an operation from \p VT1 to \p VT2 is
/// profitable. Only i32 -> i16 is rejected: i16 instructions are longer
/// (0x66 prefix) and potentially slower.
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  return VT1 != MVT::i32 || VT2 != MVT::i16;
}

31224

31225

/// Targets can use this to indicate that they only support *some*

31226

/// VECTOR_SHUFFLE operations, those with specific masks.

31227

/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values

31228

/// are assumed to be legal.

31229

bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {

31230

if (!VT.isSimple())

31231

return false;

31232

31233

// Not for i1 vectors

31234

if (VT.getSimpleVT().getScalarType() == MVT::i1)

31235

return false;

31236

31237

// Very little shuffling can be done for 64-bit vectors right now.

31238

if (VT.getSimpleVT().getSizeInBits() == 64)

31239

return false;

31240

31241

// We only care that the types being shuffled are legal. The lowering can

31242

// handle any possible shuffle mask that results.

31243

return isTypeLegal(VT.getSimpleVT());

31244

}

31245

31246

bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,

31247

EVT VT) const {

31248

// Don't convert an 'and' into a shuffle that we don't directly support.

31249

// vpblendw and vpshufb for 256-bit vectors are not available on AVX1.

31250

if (!Subtarget.hasAVX2())

31251

if (VT == MVT::v32i8 || VT == MVT::v16i16)

31252

return false;

31253

31254

// Just delegate to the generic legality, clear masks aren't special.

31255

return isShuffleMaskLegal(Mask, VT);

31256

}

31257

31258

bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {

31259

// If the subtarget is using thunks, we need to not generate jump tables.

31260

if (Subtarget.useIndirectThunkBranches())

31261

return false;

31262

31263

// Otherwise, fallback on the generic logic.

31264

return TargetLowering::areJTsAllowed(Fn);

31265

}

31266

31267

//===----------------------------------------------------------------------===//

31268

// X86 Scheduler Hooks

31269

//===----------------------------------------------------------------------===//

31270

31271

// Returns true if EFLAG is consumed after this iterator in the rest of the

31272

// basic block or any successors of the basic block.

31273

static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,

31274

MachineBasicBlock *BB) {

31275

// Scan forward through BB for a use/def of EFLAGS.

31276

for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();

31277

miI != miE; ++miI) {

31278

const MachineInstr& mi = *miI;

31279

if (mi.readsRegister(X86::EFLAGS))

31280

return true;

31281

// If we found a def, we can stop searching.

31282

if (mi.definesRegister(X86::EFLAGS))

31283

return false;

31284

}

31285

31286

// If we hit the end of the block, check whether EFLAGS is live into a

31287

// successor.

31288

for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),

31289

sEnd = BB->succ_end();

31290

sItr != sEnd; ++sItr) {

31291

MachineBasicBlock* succ = *sItr;

31292

if (succ->isLiveIn(X86::EFLAGS))

31293

return true;

31294

}

31295

31296

return false;

31297

}

31298

31299

/// Utility function to emit xbegin specifying the start of an RTM region.

31300

static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,

31301

const TargetInstrInfo *TII) {

31302

const DebugLoc &DL = MI.getDebugLoc();

31303

31304

const BasicBlock *BB = MBB->getBasicBlock();

31305

MachineFunction::iterator I = ++MBB->getIterator();

31306

31307

// For the v = xbegin(), we generate

31308

//

31309

// thisMBB:

31310

// xbegin sinkMBB

31311

//

31312

// mainMBB:

31313

// s0 = -1

31314

//

31315

// fallBB:

31316

// eax = # XABORT_DEF

31317

// s1 = eax

31318

//

31319

// sinkMBB:

31320

// v = phi(s0/mainBB, s1/fallBB)

31321

31322

MachineBasicBlock *thisMBB = MBB;

31323

MachineFunction *MF = MBB->getParent();

31324

MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);

31325

MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);

31326

MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);

31327

MF->insert(I, mainMBB);

31328

MF->insert(I, fallMBB);

31329

MF->insert(I, sinkMBB);

31330

31331

if (isEFLAGSLiveAfter(MI, MBB)) {

31332

mainMBB->addLiveIn(X86::EFLAGS);

31333

fallMBB->addLiveIn(X86::EFLAGS);

31334

sinkMBB->addLiveIn(X86::EFLAGS);

31335

}

31336

31337

// Transfer the remainder of BB and its successor edges to sinkMBB.

31338

sinkMBB->splice(sinkMBB->begin(), MBB,

31339

std::next(MachineBasicBlock::iterator(MI)), MBB->end());

31340

sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

31341

31342

MachineRegisterInfo &MRI = MF->getRegInfo();

31343

Register DstReg = MI.getOperand(0).getReg();

31344

const TargetRegisterClass *RC = MRI.getRegClass(DstReg);

31345

Register mainDstReg = MRI.createVirtualRegister(RC);

31346

Register fallDstReg = MRI.createVirtualRegister(RC);

31347

31348

// thisMBB:

31349

// xbegin fallMBB

31350

// # fallthrough to mainMBB

31351

// # abortion to fallMBB

31352

BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);

31353

thisMBB->addSuccessor(mainMBB);

31354

thisMBB->addSuccessor(fallMBB);

31355

31356

// mainMBB:

31357

// mainDstReg := -1

31358

BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);

31359

BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);

31360

mainMBB->addSuccessor(sinkMBB);

31361

31362

// fallMBB:

31363

// ; pseudo instruction to model hardware's definition from XABORT

31364

// EAX := XABORT_DEF

31365

// fallDstReg := EAX

31366

BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));

31367

BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)

31368

.addReg(X86::EAX);

31369

fallMBB->addSuccessor(sinkMBB);

31370

31371

// sinkMBB:

31372

// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)

31373

BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)

31374

.addReg(mainDstReg).addMBB(mainMBB)

31375

.addReg(fallDstReg).addMBB(fallMBB);

31376

31377

MI.eraseFromParent();

31378

return sinkMBB;

31379

}

31380

31381

31382

31383

/// Expand the VAARG_64 pseudo-instruction \p MI into the machine code that
/// implements va_arg on X86-64.
///
/// Operands of the pseudo-instruction:
///   0  ) Output  : destination address (reg)
///   1-5) Input   : va_list address (addr, i64mem)
///   6  ) ArgSize : size (in bytes) of the vararg type
///   7  ) ArgMode : 0 = overflow only, 1 = use gp_offset, 2 = use fp_offset
///   8  ) Align   : alignment of the type
///   9  ) EFLAGS (implicit-def)
///
/// Layout of the va_list being accessed (sizeof = 24, alignment = 8):
///   struct va_list {
///     i32 gp_offset
///     i32 fp_offset
///     i64 overflow_area (address)
///     i64 reg_save_area (address)
///   }
///
/// \returns the block in which the code following \p MI ends up: \p MBB
/// itself for ArgMode 0 (no control flow is created), otherwise the newly
/// created join block.
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  Register DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  Align Alignment = Align(MI.getOperand(8).getImm());

  MachineFunction *MF = MBB->getParent();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");

  MachineMemOperand *OldMMO = MI.memoperands().front();

  // Clone the MMO into two separate MMOs for loading and storing
  MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
  MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Alignment > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  // Virtual registers are tracked as Register (default-constructed to "no
  // register") rather than as raw unsigned values.
  Register OffsetDestReg;   // Argument address computed by offsetMBB
  Register OverflowDestReg; // Argument address computed by overflowMBB
  Register OffsetReg;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow. OffsetDestReg stays unset.
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(LoadOnlyMMO);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
        .addMBB(overflowMBB).addImm(X86::COND_AE);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg.isValid() && "offset register must have been created");

    // Read the reg_save_area address.
    Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(LoadOnlyMMO);

    // Zero-extend the offset
    Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Compute the offset for the next argument
    Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(StoreOnlyMMO);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(LoadOnlyMMO);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
    const uint64_t AlignMask = Alignment.value() - 1;

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(AlignMask);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~AlignMask);
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(StoreOnlyMMO);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}

31641

31642

/// Expand the pseudo-instruction \p MI that saves the XMM argument registers
/// to the stack for va_start.
///
/// The ABI says that the number of registers to save is given in %al, so it
/// is theoretically possible to do an indirect jump trick to avoid saving
/// all of them. This code takes a simpler approach and just executes all of
/// the stores if %al is non-zero. It's less code, it's probably easier on
/// the hardware branch predictor, and stores aren't all that expensive
/// anyway.
///
/// Operands read from \p MI: 0 = count register, 1 = register-save frame
/// index, 2 = varargs FP offset, 3.. = registers to store. The final operand
/// is not stored (the assert below expects it to be EFLAGS).
///
/// \returns the block that receives the code following \p MI.
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();

  Register CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;

  // In the XMM save block, save all the XMM argument registers: operands 3
  // up to, but not including, the last one. The bound is a '<' comparison
  // phrased without subtraction so that a pseudo with three or fewer
  // operands (which the assert above tolerates) stores nothing instead of
  // indexing past the operand list.
  const unsigned NumOperands = MI.getNumOperands();
  for (unsigned OpIdx = 3; OpIdx + 1 < NumOperands; ++OpIdx) {
    int64_t Offset = static_cast<int64_t>(OpIdx - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, Align(16));
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(OpIdx).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}

31716

31717

// The EFLAGS operand of SelectItr might be missing a kill marker

31718

// because there were multiple uses of EFLAGS, and ISel didn't know

31719

// which to mark. Figure out whether SelectItr should have had a

31720

// kill marker, and set it if it should. Returns the correct kill

31721

// marker value.

31722

static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,

31723

MachineBasicBlock* BB,

31724

const TargetRegisterInfo* TRI) {

31725

if (isEFLAGSLiveAfter(SelectItr, BB))

31726

return false;

31727

31728

// We found a def, or hit the end of the basic block and EFLAGS wasn't live

31729

// out. SelectMI should have a kill flag on EFLAGS.

31730

SelectItr->addRegisterKilled(X86::EFLAGS, TRI);

31731

return true;

31732

}

31733

31734

// Return true if it is OK for this CMOV pseudo-opcode to be cascaded

31735

// together with other CMOV pseudo-opcodes into a single basic-block with

31736

// conditional jump around it.

31737

static bool isCMOVPseudo(MachineInstr &MI) {

31738

switch (MI.getOpcode()) {

31739

case X86::CMOV_FR32:

31740

case X86::CMOV_FR32X:

31741

case X86::CMOV_FR64:

31742

case X86::CMOV_FR64X:

31743

case X86::CMOV_GR8:

31744

case X86::CMOV_GR16:

31745

case X86::CMOV_GR32:

31746

case X86::CMOV_RFP32:

31747

case X86::CMOV_RFP64:

31748

case X86::CMOV_RFP80:

31749

case X86::CMOV_VR64:

31750

case X86::CMOV_VR128:

31751

case X86::CMOV_VR128X:

31752

case X86::CMOV_VR256:

31753

case X86::CMOV_VR256X:

31754

case X86::CMOV_VR512:

31755

case X86::CMOV_VK1:

31756

case X86::CMOV_VK2:

31757

case X86::CMOV_VK4:

31758

case X86::CMOV_VK8:

31759

case X86::CMOV_VK16:

31760

case X86::CMOV_VK32:

31761

case X86::CMOV_VK64:

31762

return true;

31763

31764

default:

31765

return false;

31766

}

31767

}

31768

31769

// Helper function, which inserts PHI functions into SinkMBB:

31770

// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],

31771

// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs

31772

// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for

31773

// the last PHI function inserted.

31774

static MachineInstrBuilder createPHIsForCMOVsInSinkBB(

31775

MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,

31776

MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,

31777

MachineBasicBlock *SinkMBB) {

31778

MachineFunction *MF = TrueMBB->getParent();

31779

const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

31780

DebugLoc DL = MIItBegin->getDebugLoc();

31781

31782

X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());

31783

X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

31784

31785

MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

31786

31787

// As we are creating the PHIs, we have to be careful if there is more than

31788

// one. Later CMOVs may reference the results of earlier CMOVs, but later

31789

// PHIs have to reference the individual true/false inputs from earlier PHIs.

31790

// That also means that PHI construction must work forward from earlier to

31791

// later, and that the code must maintain a mapping from earlier PHI's

31792

// destination registers, and the registers that went into the PHI.

31793

DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

31794

MachineInstrBuilder MIB;

31795

31796

for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {

31797

Register DestReg = MIIt->getOperand(0).getReg();

31798

Register Op1Reg = MIIt->getOperand(1).getReg();

31799

Register Op2Reg = MIIt->getOperand(2).getReg();

31800

31801

// If this CMOV we are generating is the opposite condition from

31802

// the jump we generated, then we have to swap the operands for the

31803

// PHI that is going to be generated.

31804

if (MIIt->getOperand(3).getImm() == OppCC)

31805

std::swap(Op1Reg, Op2Reg);

31806

31807

if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())

31808

Op1Reg = RegRewriteTable[Op1Reg].first;

31809

31810

if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())

31811

Op2Reg = RegRewriteTable[Op2Reg].second;

31812

31813

MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)

31814

.addReg(Op1Reg)

31815

.addMBB(FalseMBB)

31816

.addReg(Op2Reg)

31817

.addMBB(TrueMBB);

31818

31819

// Add this PHI to the rewrite table.

31820

RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);

31821

}

31822

31823

return MIB;

31824

}

31825

31826

// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).

31827

MachineBasicBlock *

31828

X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,

31829

MachineInstr &SecondCascadedCMOV,

31830

MachineBasicBlock *ThisMBB) const {

31831

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

31832

DebugLoc DL = FirstCMOV.getDebugLoc();

31833

31834

// We lower cascaded CMOVs such as

31835

//

31836

// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)

31837

//

31838

// to two successive branches.

31839

//

31840

// Without this, we would add a PHI between the two jumps, which ends up

31841

// creating a few copies all around. For instance, for

31842

//

31843

// (sitofp (zext (fcmp une)))

31844

//

31845

// we would generate:

31846

//

31847

// ucomiss %xmm1, %xmm0

31848

// movss <1.0f>, %xmm0

31849

// movaps %xmm0, %xmm1

31850

// jne .LBB5_2

31851

// xorps %xmm1, %xmm1

31852

// .LBB5_2:

31853

// jp .LBB5_4

31854

// movaps %xmm1, %xmm0

31855

// .LBB5_4:

31856

// retq

31857

//

31858

// because this custom-inserter would have generated:

31859

//

31860

// A

31861

// | \

31862

// | B

31863

// | /

31864

// C

31865

// | \

31866

// | D

31867

// | /

31868

// E

31869

//

31870

// A: X = ...; Y = ...

31871

// B: empty

31872

// C: Z = PHI [X, A], [Y, B]

31873

// D: empty

31874

// E: PHI [X, C], [Z, D]

31875

//

31876

// If we lower both CMOVs in a single step, we can instead generate:

31877

//

31878

// A

31879

// | \

31880

// | C

31881

// | /|

31882

// |/ |

31883

// | |

31884

// | D

31885

// | /

31886

// E

31887

//

31888

// A: X = ...; Y = ...

31889

// D: empty

31890

// E: PHI [X, A], [X, C], [Y, D]

31891

//

31892

// Which, in our sitofp/fcmp example, gives us something like:

31893

//

31894

// ucomiss %xmm1, %xmm0

31895

// movss <1.0f>, %xmm0

31896

// jne .LBB5_4

31897

// jp .LBB5_4

31898

// xorps %xmm0, %xmm0

31899

// .LBB5_4:

31900

// retq

31901

//

31902

31903

// We lower cascaded CMOV into two successive branches to the same block.

31904

// EFLAGS is used by both, so mark it as live in the second.

31905

const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();

31906

MachineFunction *F = ThisMBB->getParent();

31907

MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);

31908

MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);

31909

MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

31910

31911

MachineFunction::iterator It = ++ThisMBB->getIterator();

31912

F->insert(It, FirstInsertedMBB);

31913

F->insert(It, SecondInsertedMBB);

31914

F->insert(It, SinkMBB);

31915

31916

// For a cascaded CMOV, we lower it to two successive branches to

31917

// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in

31918

// the FirstInsertedMBB.

31919

FirstInsertedMBB->addLiveIn(X86::EFLAGS);

31920

31921

// If the EFLAGS register isn't dead in the terminator, then claim that it's

31922

// live into the sink and copy blocks.

31923

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

31924

if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&

31925

!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {

31926

SecondInsertedMBB->addLiveIn(X86::EFLAGS);

31927

SinkMBB->addLiveIn(X86::EFLAGS);

31928

}

31929

31930

// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.

31931

SinkMBB->splice(SinkMBB->begin(), ThisMBB,

31932

std::next(MachineBasicBlock::iterator(FirstCMOV)),

31933

ThisMBB->end());

31934

SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

31935

31936

// Fallthrough block for ThisMBB.

31937

ThisMBB->addSuccessor(FirstInsertedMBB);

31938

// The true block target of the first branch is always SinkMBB.

31939

ThisMBB->addSuccessor(SinkMBB);

31940

// Fallthrough block for FirstInsertedMBB.

31941

FirstInsertedMBB->addSuccessor(SecondInsertedMBB);

31942

// The true block for the branch of FirstInsertedMBB.

31943

FirstInsertedMBB->addSuccessor(SinkMBB);

31944

// This is fallthrough.

31945

SecondInsertedMBB->addSuccessor(SinkMBB);

31946

31947

// Create the conditional branch instructions.

31948

X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());

31949

BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);

31950

31951

X86::CondCode SecondCC =

31952

X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());

31953

BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);

31954

31955

// SinkMBB:

31956

// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]

31957

Register DestReg = FirstCMOV.getOperand(0).getReg();

31958

Register Op1Reg = FirstCMOV.getOperand(1).getReg();

31959

Register Op2Reg = FirstCMOV.getOperand(2).getReg();

31960

MachineInstrBuilder MIB =

31961

BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)

31962

.addReg(Op1Reg)

31963

.addMBB(SecondInsertedMBB)

31964

.addReg(Op2Reg)

31965

.addMBB(ThisMBB);

31966

31967

// The second SecondInsertedMBB provides the same incoming value as the

31968

// FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).

31969

MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);

31970

// Copy the PHI result to the register defined by the second CMOV.

31971

BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,

31972

TII->get(TargetOpcode::COPY),

31973

SecondCascadedCMOV.getOperand(0).getReg())

31974

.addReg(FirstCMOV.getOperand(0).getReg());

31975

31976

// Now remove the CMOVs.

31977

FirstCMOV.eraseFromParent();

31978

SecondCascadedCMOV.eraseFromParent();

31979

31980

return SinkMBB;

31981

}

31982

31983

/// Lower a select/CMOV pseudo (and any directly following pseudos that can
/// share its branch) into explicit control flow plus PHIs.
///
/// \param MI      the first select pseudo.
/// \param ThisMBB the block that contains it.
/// \returns the join block that receives the remainder of \p ThisMBB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();

  // "Inserting" a SELECT_CC really means inserting a diamond control-flow
  // pattern. The pseudo supplies the destination vreg, the condition code to
  // branch on, and the true/false values to choose from.
  //
  //  ThisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> FalseMBB
  //
  // All pseudo-CMOVs go through here. Normally one block is inserted and a
  // PHI at the join point picks between the true and false operands.
  //
  // Two shapes of several CMOVs in a row get special treatment.
  //
  // Case 1:
  // A run of CMOVs that all test the same condition (or its exact opposite).
  // One inserted block is enough; the join point gets one PHI per CMOV. The
  // only subtlety shows up in a sequence such as
  //
  //   t2 = CMOV cond1 t1, f1
  //   t3 = CMOV cond1 t2, f2
  //
  // A PHI operand cannot name the result of an earlier PHI of the same block,
  // so the naive translation
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t2(BB1), f2(BB2)
  //
  // is wrong: t2 is not defined in BB1. Along the BB1 path t2 is just a copy
  // of t1, so the operand is renamed and the correct result is
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t1(BB1), f2(BB2)
  //
  // Case 2:
  // CMOV ((CMOV F, T, cc1), T, cc2) is detected here and delegated to
  // EmitLoweredCascadedSelect.

  const X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  const X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineInstr *LastCMOV = &MI;
  MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);

  // True while the iterator sits on a CMOV pseudo whose condition is CC or
  // its opposite.
  auto ExtendsRun = [&](MachineBasicBlock::iterator It) {
    if (It == ThisMBB->end() || !isCMOVPseudo(*It))
      return false;
    return It->getOperand(3).getImm() == CC ||
           It->getOperand(3).getImm() == OppCC;
  };

  // Case 1 is looked for first because it saves the most jumps. Debug
  // instructions sitting between the CMOVs are stepped over.
  if (isCMOVPseudo(MI)) {
    while (ExtendsRun(NextMIIt)) {
      LastCMOV = &*NextMIIt;
      NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
    }
  }

  // Case 2 is only considered when case 1 found nothing beyond MI itself,
  // i.e. LastCMOV still points at MI.
  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
      NextMIIt->getOpcode() == MI.getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
      NextMIIt->getOperand(1).isKill()) {
    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
  }

  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
  MachineFunction *F = ThisMBB->getParent();
  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

  // Place the new blocks right behind ThisMBB.
  MachineFunction::iterator InsertPos = ++ThisMBB->getIterator();
  F->insert(InsertPos, FalseMBB);
  F->insert(InsertPos, SinkMBB);

  // When EFLAGS is not dead after the last CMOV, it is live into both the
  // fallthrough block and the join block.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Debug instructions interleaved with the CMOV run move into the join
  // block. The iterator is advanced before the instruction is unlinked.
  for (MachineBasicBlock::iterator Cur(MI), Stop(LastCMOV); Cur != Stop;) {
    MachineInstr &Candidate = *Cur++;
    if (Candidate.isDebugInstr())
      SinkMBB->push_back(Candidate.removeFromParent());
  }

  // The instructions behind the last CMOV, and the outgoing edges of ThisMBB,
  // move to the join block as well.
  SinkMBB->splice(SinkMBB->end(), ThisMBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)),
                  ThisMBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

  // ThisMBB: falls through to FalseMBB, branches to SinkMBB on CC.
  ThisMBB->addSuccessor(FalseMBB);
  ThisMBB->addSuccessor(SinkMBB);
  // FalseMBB: plain fallthrough into SinkMBB.
  FalseMBB->addSuccessor(SinkMBB);

  // The conditional branch around FalseMBB.
  BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  //  SinkMBB:
  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
  //  ...
  const MachineBasicBlock::iterator RunBegin = MachineBasicBlock::iterator(MI);
  const MachineBasicBlock::iterator RunEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  createPHIsForCMOVsInSinkBB(RunBegin, RunEnd, ThisMBB, FalseMBB, SinkMBB);

  // The CMOV pseudo(s) are now dead.
  ThisMBB->erase(RunBegin, RunEnd);

  return SinkMBB;
}

32123

32124

static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {

32125

if (IsLP64) {

32126

if (isInt<8>(Imm))

32127

return X86::SUB64ri8;

32128

return X86::SUB64ri32;

32129

} else {

32130

if (isInt<8>(Imm))

32131

return X86::SUB32ri8;

32132

return X86::SUB32ri;

32133

}

32134

}

32135

32136

/// Expand the probed dynamic-alloca pseudo \p MI into an explicit probing
/// loop.
///
/// Operand 1 of \p MI is the allocation size; operand 0 receives the final
/// stack pointer value (current SP minus the size). The expansion creates
/// three blocks behind \p MBB:
///
///   testMBB:  leave the loop once SP has reached (or passed) the final value
///   blockMBB: touch the memory at SP, lower SP by one probe size, loop
///   tailMBB:  define the result and continue with the rest of \p MBB
///
/// \returns tailMBB, which holds everything that followed \p MI.
MachineBasicBlock *
X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
                                           MachineBasicBlock *MBB) const {
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
  const DebugLoc &DL = MI.getDebugLoc();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();

  const unsigned ProbeSize = getStackProbeSize(*MF);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  // Layout order: MBB, testMBB, blockMBB, tailMBB.
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MF->insert(MBBIter, testMBB);
  MF->insert(MBBIter, blockMBB);
  MF->insert(MBBIter, tailMBB);

  Register sizeVReg = MI.getOperand(1).getReg();

  Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;

  Register TmpStackPtr = MRI.createVirtualRegister(
      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
  Register FinalStackPtr = MRI.createVirtualRegister(
      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);

  // FinalStackPtr = SP - size, computed in front of the pseudo.
  BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
      .addReg(physSPReg);
  {
    const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
        .addReg(TmpStackPtr)
        .addReg(sizeVReg);
  }

  // Loop test. The compare sets the flags from FinalStackPtr - SP, so the
  // allocation is complete exactly when FinalStackPtr >= SP. Exiting on
  // COND_L (FinalStackPtr < SP) would leave the loop while allocation is
  // still pending, i.e. the probing block would never run.
  BuildMI(testMBB, DL,
          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
      .addReg(FinalStackPtr)
      .addReg(physSPReg);

  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
      .addMBB(tailMBB)
      .addImm(X86::COND_GE);
  testMBB->addSuccessor(blockMBB);
  testMBB->addSuccessor(tailMBB);

  // Touch the block then extend it. This is done on the opposite side of
  // static probe where we allocate then touch, to avoid the need of probing
  // the tail of the static alloca. Possible scenarios are:
  //
  //       + ---- <- ------------ <- ------------- <- ------------ +
  //       |                                                       |
  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
  //                                                               |                                                               |
  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
  //
  // The property we want to enforce is to never have more than [page alloc]
  // between two probes.

  // The touch happens before SP is lowered, so the word at SP may still hold
  // live data. XOR-ing it with 0 accesses the page without changing the
  // stored value, whereas storing 0 would overwrite it.
  const unsigned XORMIOpc =
      TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
  addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
      .addImm(0);

  BuildMI(blockMBB, DL,
          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
      .addReg(physSPReg)
      .addImm(ProbeSize);

  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
  blockMBB->addSuccessor(testMBB);

  // Replace original instruction by the expected stack ptr.
  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(FinalStackPtr);

  // Everything behind the pseudo, and MBB's outgoing edges, move to tailMBB.
  tailMBB->splice(tailMBB->end(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(testMBB);

  // Delete the original pseudo instruction.
  MI.eraseFromParent();

  // And we're done.
  return tailMBB;
}

32229

32230

/// Expand the segmented-stack alloca pseudo \p MI.
///
/// Operand 1 of \p MI is the requested size and operand 0 receives the
/// address of the allocation. The block is split so that the allocation is
/// either carved out of the current stacklet or obtained by calling
/// __morestack_allocate_stack_space.
///
/// \returns the block holding everything that followed \p MI.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget.is64Bit();
  const bool IsLP64 = Subtarget.isTarget64BitLP64();

  // Segment register and offset of the memory word SP is compared against.
  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  unsigned TlsOffset = 0x30;
  if (IsLP64)
    TlsOffset = 0x70;
  else if (Is64Bit)
    TlsOffset = 0x40;

  // Resulting shape:
  //
  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
      getRegClassFor(getPointerTy(MF->getDataLayout()));

  // The virtual registers are created in this fixed order.
  const Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass);
  const Register bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass);
  const Register tmpSPVReg = MRI.createVirtualRegister(AddrRegClass);
  const Register SPLimitVReg = MRI.createVirtualRegister(AddrRegClass);
  const Register sizeVReg = MI.getOperand(1).getReg();
  const Register physSPReg =
      IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;

  // Layout order: BB, bumpMBB, mallocMBB, continueMBB.
  MachineFunction::iterator InsertPos = ++BB->getIterator();
  MF->insert(InsertPos, bumpMBB);
  MF->insert(InsertPos, mallocMBB);
  MF->insert(InsertPos, continueMBB);

  // Everything behind the pseudo, and BB's outgoing edges, move to
  // continueMBB.
  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // BB: compute SP - size, compare it with the stack limit and branch to
  // mallocMBB when the limit is hit; otherwise fall through to bumpMBB.
  const unsigned SubOpc = IsLP64 ? X86::SUB64rr : X86::SUB32rr;
  const unsigned CmpOpc = IsLP64 ? X86::CMP64mr : X86::CMP32mr;
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(SubOpc), SPLimitVReg)
      .addReg(tmpSPVReg)
      .addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(CmpOpc))
      .addReg(0)
      .addImm(1)
      .addReg(0)
      .addImm(TlsOffset)
      .addReg(TlsReg)
      .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);

  // bumpMBB: the current stacklet is large enough, so just lower SP.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // mallocMBB: call the libgcc routine that takes the space from the heap.
  const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64 || Is64Bit) {
    // The two 64-bit flavours differ only in the width of the registers that
    // carry the argument and the result.
    const unsigned MovOpc = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
    const Register ArgReg = IsLP64 ? X86::RDI : X86::EDI;
    const Register RetReg = IsLP64 ? X86::RAX : X86::EAX;
    BuildMI(mallocMBB, DL, TII->get(MovOpc), ArgReg).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
        .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
        .addReg(ArgReg, RegState::Implicit)
        .addReg(RetReg, RegState::ImplicitDefine);
  } else {
    // The size is pushed on the stack; 12 bytes are reserved first and the
    // resulting 16 bytes are released again after the call.
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg)
        .addReg(physSPReg)
        .addImm(12);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
        .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
        .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg)
        .addReg(physSPReg)
        .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
      .addReg(IsLP64 ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Wire up the CFG.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // The result is whichever pointer the taken path produced.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);

  // The pseudo has been fully replaced.
  MI.eraseFromParent();

  return continueMBB;
}

32363

32364

/// Custom-insert hook for the catchret instruction \p MI.
///
/// Nothing changes unless the subtarget is 32-bit. In that case a new EH-pad
/// block is placed behind \p BB, \p MI is redirected to it, and the new
/// block jumps on to the original destination.
///
/// \returns \p BB in every case.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  // Remember the real destination before the operand is retargeted below.
  MachineBasicBlock *OrigDestMBB = MI.getOperand(0).getMBB();

  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");

  // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit()) {
    return BB;
  }

  // C++ EH creates a new target block to hold the restore code, and wires up
  // the new block to the return destination with a normal JMP_4.
  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  assert(BB->succ_size() == 1);
  MF->insert(std::next(BB->getIterator()), RestoreMBB);

  // BB's outgoing edge moves to the new block; BB now leads only to it.
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(RestoreMBB);
  MI.getOperand(0).setMBB(RestoreMBB);

  // Marking this as an EH pad but not a funclet entry block causes PEI to
  // restore stack pointers in the block.
  RestoreMBB->setIsEHPad(true);

  BuildMI(*RestoreMBB, RestoreMBB->begin(), DL, TII.get(X86::JMP_4))
      .addMBB(OrigDestMBB);

  return BB;
}

32398

32399

/// Bracket the TLSADDR instruction \p MI with call-frame markers:
///
///   adjust_stackdown -> TLSADDR -> adjust_stackup
///
/// TLSADDR is only turned into a call inside MC, so without the two markers
/// shrink-wrapping may move the prologue/epilogue past it.
///
/// \returns \p BB; \p MI itself is kept in place.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction &MF = *BB->getParent();
  const MachineBasicBlock::iterator TLSAddrPos(MI);

  // CALLSEQ_START goes immediately in front of the instruction.
  const unsigned FrameSetupOpc = TII.getCallFrameSetupOpcode();
  MachineInstrBuilder FrameSetup =
      BuildMI(MF, DL, TII.get(FrameSetupOpc)).addImm(0).addImm(0).addImm(0);
  BB->insert(TLSAddrPos, FrameSetup);

  // CALLSEQ_END goes immediately behind it. The original instruction is not
  // erased because it has to stay around.
  const unsigned FrameDestroyOpc = TII.getCallFrameDestroyOpcode();
  MachineInstrBuilder FrameDestroy =
      BuildMI(MF, DL, TII.get(FrameDestroyOpc)).addImm(0).addImm(0);
  BB->insertAfter(TLSAddrPos, FrameDestroy);

  return BB;
}

32427

32428

/// Expand a Darwin TLS call pseudo into a load of the TLS descriptor address
/// followed by an indirect call through it.
///
/// The value loaded from the relocation is placed in RDI (x86-64) or EAX
/// (32-bit) and called indirectly; the result is left in the normal return
/// register.
///
/// \param MI The pseudo instruction; operand 3 must be a global address.
/// \param BB The block that contains \p MI.
/// \return \p BB. \p MI is erased.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  const DebugLoc &DL = MI.getDebugLoc();

  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  const bool Is64Bit = Subtarget.is64Bit();
  const MachineOperand &TLSSym = MI.getOperand(3);

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask = nullptr;
  if (Is64Bit)
    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
  else
    RegMask =
        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

  // The three flavours (64-bit, 32-bit static, 32-bit PIC) only differ in the
  // opcodes, the base register of the load, and the registers involved.
  unsigned LoadOpc, CallOpc, PtrReg, RetReg;
  unsigned BaseReg = 0;
  if (Is64Bit) {
    LoadOpc = X86::MOV64rm;
    CallOpc = X86::CALL64m;
    BaseReg = X86::RIP;
    PtrReg = X86::RDI;
    RetReg = X86::RAX;
  } else {
    LoadOpc = X86::MOV32rm;
    CallOpc = X86::CALL32m;
    // Only position-independent code addresses the symbol off the global
    // base register; otherwise there is no base register at all.
    if (isPositionIndependent())
      BaseReg = TII->getGlobalBaseReg(F);
    PtrReg = X86::EAX;
    RetReg = X86::EAX;
  }

  // PtrReg = load [BaseReg + TLSSym]
  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
      .addReg(BaseReg)
      .addImm(0)
      .addReg(0)
      .addGlobalAddress(TLSSym.getGlobal(), 0, TLSSym.getTargetFlags())
      .addReg(0);

  // call *[PtrReg]
  MachineInstrBuilder Call = BuildMI(*BB, MI, DL, TII->get(CallOpc));
  addDirectMem(Call, PtrReg);
  Call.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

32490

32491

static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {

32492

switch (RPOpc) {

32493

case X86::INDIRECT_THUNK_CALL32:

32494

return X86::CALLpcrel32;

32495

case X86::INDIRECT_THUNK_CALL64:

32496

return X86::CALL64pcrel32;

32497

case X86::INDIRECT_THUNK_TCRETURN32:

32498

return X86::TCRETURNdi;

32499

case X86::INDIRECT_THUNK_TCRETURN64:

32500

return X86::TCRETURNdi64;

32501

}

32502

llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32502);

32503

}

32504

32505

static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,

32506

unsigned Reg) {

32507

if (Subtarget.useRetpolineExternalThunk()) {

32508

// When using an external thunk for retpolines, we pick names that match the

32509

// names GCC happens to use as well. This helps simplify the implementation

32510

// of the thunks for kernels where they have no easy ability to create

32511

// aliases and are doing non-trivial configuration of the thunk's body. For

32512

// example, the Linux kernel will do boot-time hot patching of the thunk

32513

// bodies and cannot easily export aliases of these to loaded modules.

32514

//

32515

// Note that at any point in the future, we may need to change the semantics

32516

// of how we implement retpolines and at that time will likely change the

32517

// name of the called thunk. Essentially, there is no hard guarantee that

32518

// LLVM will generate calls to specific thunks, we merely make a best-effort

32519

// attempt to help out kernels and other systems where duplicating the

32520

// thunks is costly.

32521

switch (Reg) {

32522

case X86::EAX:

32523

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32523, __PRETTY_FUNCTION__));

32524

return "__x86_indirect_thunk_eax";

32525

case X86::ECX:

32526

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32526, __PRETTY_FUNCTION__));

32527

return "__x86_indirect_thunk_ecx";

32528

case X86::EDX:

32529

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32529, __PRETTY_FUNCTION__));

32530

return "__x86_indirect_thunk_edx";

32531

case X86::EDI:

32532

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32532, __PRETTY_FUNCTION__));

32533

return "__x86_indirect_thunk_edi";

32534

case X86::R11:

32535

assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.is64Bit() && \"Should not be using a 64-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32535, __PRETTY_FUNCTION__));

32536

return "__x86_indirect_thunk_r11";

32537

}

32538

llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32538);

32539

}

32540

32541

if (Subtarget.useRetpolineIndirectCalls() ||

32542

Subtarget.useRetpolineIndirectBranches()) {

32543

// When targeting an internal COMDAT thunk use an LLVM-specific name.

32544

switch (Reg) {

32545

case X86::EAX:

32546

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32546, __PRETTY_FUNCTION__));

32547

return "__llvm_retpoline_eax";

32548

case X86::ECX:

32549

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32549, __PRETTY_FUNCTION__));

32550

return "__llvm_retpoline_ecx";

32551

case X86::EDX:

32552

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32552, __PRETTY_FUNCTION__));

32553

return "__llvm_retpoline_edx";

32554

case X86::EDI:

32555

assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("!Subtarget.is64Bit() && \"Should not be using a 32-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32555, __PRETTY_FUNCTION__));

32556

return "__llvm_retpoline_edi";

32557

case X86::R11:

32558

assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.is64Bit() && \"Should not be using a 64-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32558, __PRETTY_FUNCTION__));

32559

return "__llvm_retpoline_r11";

32560

}

32561

llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32561);

32562

}

32563

32564

if (Subtarget.useLVIControlFlowIntegrity()) {

32565

assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.is64Bit() && \"Should not be using a 64-bit thunk!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32565, __PRETTY_FUNCTION__));

32566

return "__llvm_lvi_thunk_r11";

32567

}

32568

llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32568);

32569

}

32570

32571

/// Rewrite an INDIRECT_THUNK_* pseudo into a direct call (or tail call) to
/// the matching indirect-branch thunk.
///
/// The callee, held in a virtual register, is copied into a free scratch
/// physical register (R11 on 64-bit; EAX, ECX, EDX or EDI on 32-bit). \p MI is
/// then mutated in place: operand 0 becomes the thunk's external symbol, the
/// opcode becomes the direct form, and the scratch register is appended as an
/// implicit, killed use.
///
/// \param MI The INDIRECT_THUNK_* pseudo; operand 0 is the callee register.
/// \param BB The block that contains \p MI.
/// \return \p BB. Reports a fatal error if no scratch register is free.
MachineBasicBlock *
X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
                                            MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  Register CalleeVReg = MI.getOperand(0).getReg();
  unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());

  // Find an available scratch register to hold the callee. On 64-bit, we can
  // just use R11, but we scan for uses anyway to ensure we don't generate
  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
  // already a register use operand to the call to hold the callee. If none
  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
  // register and ESI is the base pointer to realigned stack frames with VLAs.
  //
  // Up to four candidates are stored, so give the vector an inline capacity of
  // four to keep the 32-bit path off the heap.
  SmallVector<unsigned, 4> AvailableRegs;
  if (Subtarget.is64Bit())
    AvailableRegs.push_back(X86::R11);
  else
    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

  // Zero out any registers that are already used.
  for (const auto &MO : MI.operands()) {
    if (MO.isReg() && MO.isUse())
      for (unsigned &Reg : AvailableRegs)
        if (Reg == MO.getReg())
          Reg = 0;
  }

  // Choose the first remaining non-zero available register.
  unsigned AvailableReg = 0;
  for (unsigned MaybeReg : AvailableRegs) {
    if (MaybeReg) {
      AvailableReg = MaybeReg;
      break;
    }
  }
  if (!AvailableReg)
    report_fatal_error("calling convention incompatible with retpoline, no "
                       "available registers");

  const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);

  // Materialize the callee in the scratch register right before the call.
  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
      .addReg(CalleeVReg);
  // Turn the pseudo into a direct call of the thunk symbol and record that it
  // reads (and kills) the scratch register.
  MI.getOperand(0).ChangeToES(Symbol);
  MI.setDesc(TII->get(Opc));
  MachineInstrBuilder(*BB->getParent(), &MI)
      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
  return BB;
}

32623

32624

/// SetJmp implies future control flow change upon calling the corresponding

32625

/// LongJmp.

32626

/// Instead of using the 'return' instruction, the long jump fixes the stack and

32627

/// performs an indirect branch. To do so it uses the registers that were stored

32628

/// in the jump buffer (when calling SetJmp).

32629

/// In case the shadow stack is enabled we need to fix it as well, because some

32630

/// return addresses will be skipped.

32631

/// The function will save the SSP for future fixing in the function

32632

/// emitLongJmpShadowStackFix.

32633

/// \sa emitLongJmpShadowStackFix

32634

/// \param [in] MI The temporary Machine Instruction for the builtin.

32635

/// \param [in] MBB The Machine Basic Block that will be modified.

32636

void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,

32637

MachineBasicBlock *MBB) const {

32638

const DebugLoc &DL = MI.getDebugLoc();

32639

MachineFunction *MF = MBB->getParent();

32640

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

32641

MachineRegisterInfo &MRI = MF->getRegInfo();

32642

MachineInstrBuilder MIB;

32643

32644

// Memory Reference.

32645

SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),

32646

MI.memoperands_end());

32647

32648

// Initialize a register with zero.

32649

MVT PVT = getPointerTy(MF->getDataLayout());

32650

const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

32651

Register ZReg = MRI.createVirtualRegister(PtrRC);

32652

unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;

32653

BuildMI(*MBB, MI, DL, TII->get(XorRROpc))

32654

.addDef(ZReg)

32655

.addReg(ZReg, RegState::Undef)

32656

.addReg(ZReg, RegState::Undef);

32657

32658

// Read the current SSP Register value to the zeroed register.

32659

Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);

32660

unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;

32661

BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);

32662

32663

// Write the SSP register value to offset 3 in input memory buffer.

32664

unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

32665

MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));

32666

const int64_t SSPOffset = 3 * PVT.getStoreSize();

32667

const unsigned MemOpndSlot = 1;

32668

for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {

32669

if (i == X86::AddrDisp)

32670

MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);

32671

else

32672

MIB.add(MI.getOperand(MemOpndSlot + i));

32673

}

32674

MIB.addReg(SSPCopyReg);

32675

MIB.setMemRefs(MMOs);

32676

}

32677

32678

/// Expand the SjLj setjmp pseudo into explicit control flow.
///
/// For v = setjmp(buf) the following is generated:
///
///   thisMBB:
///     buf[LabelOffset] = restoreMBB   <-- takes the address of restoreMBB
///     SjLjSetup restoreMBB
///   mainMBB:
///     v_main = 0
///   sinkMBB:
///     v = phi(main, restore)
///   restoreMBB:
///     if a base pointer is being used, load it from the frame
///     v_restore = 1
///
/// \param MI The pseudo; operand 0 is the i32 result, the buffer address
///        operands follow.
/// \param MBB The block that contains \p MI.
/// \return The sink block holding everything that followed \p MI. \p MI is
///         erased.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator InsertPos = ++MBB->getIterator();

  // Memory operands of the builtin, reused for the label store.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  // Operand 0 is the destination; the buffer address starts right after it.
  const unsigned MemOpndSlot = 1;
  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  (void)TRI;
  Register MainDstReg = MRI.createVirtualRegister(RC);
  Register RestoreDstReg = MRI.createVirtualRegister(RC);

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  const bool IsPtr64 = (PVT == MVT::i64);

  // Create the new blocks. mainMBB and sinkMBB follow the current block;
  // restoreMBB is appended to the function and has its address taken.
  MachineBasicBlock *ThisMBB = MBB;
  MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(InsertPos, MainMBB);
  MF->insert(InsertPos, SinkMBB);
  MF->push_back(RestoreMBB);
  RestoreMBB->setHasAddressTaken();

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB: store the resume address into the label slot of the buffer.
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const bool UseImmLabel =
      (MF->getTarget().getCodeModel() == CodeModel::Small) &&
      !isPositionIndependent();

  // Prepare IP either in reg or imm.
  unsigned PtrStoreOpc = 0;
  Register LabelReg;
  if (UseImmLabel) {
    PtrStoreOpc = IsPtr64 ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    PtrStoreOpc = IsPtr64 ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget.is64Bit()) {
      // RIP-relative address of restoreMBB.
      BuildMI(*ThisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
          .addReg(X86::RIP)
          .addImm(0)
          .addReg(0)
          .addMBB(RestoreMBB)
          .addReg(0);
    } else {
      // Address of restoreMBB relative to the global base register.
      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
      BuildMI(*ThisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
          .addReg(XII->getGlobalBaseReg(MF))
          .addImm(0)
          .addReg(0)
          .addMBB(RestoreMBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
    }
  }

  // Store IP.
  MachineInstrBuilder MIB =
      BuildMI(*ThisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
    const MachineOperand &AddrOp = MI.getOperand(MemOpndSlot + Idx);
    if (Idx == X86::AddrDisp)
      MIB.addDisp(AddrOp, LabelOffset);
    else
      MIB.add(AddrOp);
  }
  if (UseImmLabel)
    MIB.addMBB(RestoreMBB);
  else
    MIB.addReg(LabelReg);
  MIB.setMemRefs(MMOs);

  // With the "cf-protection-return" module flag set, also save the SSP.
  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return"))
    emitSetJmpShadowStackFix(MI, ThisMBB);

  // Setup
  MIB = BuildMI(*ThisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
            .addMBB(RestoreMBB);

  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  ThisMBB->addSuccessor(MainMBB);
  ThisMBB->addSuccessor(RestoreMBB);

  // mainMBB: the direct path yields 0.
  BuildMI(MainMBB, DL, TII->get(X86::MOV32r0), MainDstReg);
  MainMBB->addSuccessor(SinkMBB);

  // sinkMBB: merge the results of both paths.
  BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
      .addReg(MainDstReg)
      .addMBB(MainMBB)
      .addReg(RestoreDstReg)
      .addMBB(RestoreMBB);

  // restoreMBB: reload the base pointer if one is in use, then yield 1.
  if (RegInfo->hasBasePointer(*MF)) {
    const bool Uses64BitFramePtr =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(MF);
    Register FramePtr = RegInfo->getFrameRegister(*MF);
    Register BasePtr = RegInfo->getBaseRegister();
    unsigned LoadOpc = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(RestoreMBB, DL, TII->get(LoadOpc), BasePtr),
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(RestoreMBB, DL, TII->get(X86::MOV32ri), RestoreDstReg).addImm(1);
  BuildMI(RestoreMBB, DL, TII->get(X86::JMP_1)).addMBB(SinkMBB);
  RestoreMBB->addSuccessor(SinkMBB);

  MI.eraseFromParent();
  return SinkMBB;
}

32832

32833

/// Fix the shadow stack using the previously saved SSP pointer.

32834

/// \sa emitSetJmpShadowStackFix

32835

/// \param [in] MI The temporary Machine Instruction for the builtin.

32836

/// \param [in] MBB The Machine Basic Block that will be modified.

32837

/// \return The sink MBB that will perform the future indirect branch.

32838

MachineBasicBlock *

32839

X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,

32840

MachineBasicBlock *MBB) const {

32841

const DebugLoc &DL = MI.getDebugLoc();

32842

MachineFunction *MF = MBB->getParent();

32843

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

32844

MachineRegisterInfo &MRI = MF->getRegInfo();

32845

32846

// Memory Reference

32847

SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),

32848

MI.memoperands_end());

32849

32850

MVT PVT = getPointerTy(MF->getDataLayout());

32851

const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

32852

32853

// checkSspMBB:

32854

// xor vreg1, vreg1

32855

// rdssp vreg1

32856

// test vreg1, vreg1

32857

// je sinkMBB # Jump if Shadow Stack is not supported

32858

// fallMBB:

32859

// mov buf+24/12(%rip), vreg2

32860

// sub vreg1, vreg2

32861

// jbe sinkMBB # No need to fix the Shadow Stack

32862

// fixShadowMBB:

32863

// shr 3/2, vreg2

32864

// incssp vreg2 # fix the SSP according to the lower 8 bits

32865

// shr 8, vreg2

32866

// je sinkMBB

32867

// fixShadowLoopPrepareMBB:

32868

// shl vreg2

32869

// mov 128, vreg3

32870

// fixShadowLoopMBB:

32871

// incssp vreg3

32872

// dec vreg2

32873

// jne fixShadowLoopMBB # Iterate until you finish fixing

32874

// # the Shadow Stack

32875

// sinkMBB:

32876

32877

MachineFunction::iterator I = ++MBB->getIterator();

32878

const BasicBlock *BB = MBB->getBasicBlock();

32879

32880

MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);

32881

MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);

32882

MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);

32883

MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);

32884

MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);

32885

MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);

32886

MF->insert(I, checkSspMBB);

32887

MF->insert(I, fallMBB);

32888

MF->insert(I, fixShadowMBB);

32889

MF->insert(I, fixShadowLoopPrepareMBB);

32890

MF->insert(I, fixShadowLoopMBB);

32891

MF->insert(I, sinkMBB);

32892

32893

// Transfer the remainder of BB and its successor edges to sinkMBB.

32894

sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),

32895

MBB->end());

32896

sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

32897

32898

MBB->addSuccessor(checkSspMBB);

32899

32900

// Initialize a register with zero.

32901

Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);

32902

BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);

32903

32904

if (PVT == MVT::i64) {

32905

Register TmpZReg = MRI.createVirtualRegister(PtrRC);

32906

BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)

32907

.addImm(0)

32908

.addReg(ZReg)

32909

.addImm(X86::sub_32bit);

32910

ZReg = TmpZReg;

32911

}

32912

32913

// Read the current SSP Register value to the zeroed register.

32914

Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);

32915

unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;

32916

BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);

32917

32918

// Check whether the result of the SSP register is zero and jump directly

32919

// to the sink.

32920

unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;

32921

BuildMI(checkSspMBB, DL, TII->get(TestRROpc))

32922

.addReg(SSPCopyReg)

32923

.addReg(SSPCopyReg);

32924

BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);

32925

checkSspMBB->addSuccessor(sinkMBB);

32926

checkSspMBB->addSuccessor(fallMBB);

32927

32928

// Reload the previously saved SSP register value.

32929

Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);

32930

unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;

32931

const int64_t SPPOffset = 3 * PVT.getStoreSize();

32932

MachineInstrBuilder MIB =

32933

BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);

32934

for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {

32935

const MachineOperand &MO = MI.getOperand(i);

32936

if (i == X86::AddrDisp)

32937

MIB.addDisp(MO, SPPOffset);

32938

else if (MO.isReg()) // Don't add the whole operand, we don't want to

32939

// preserve kill flags.

32940

MIB.addReg(MO.getReg());

32941

else

32942

MIB.add(MO);

32943

}

32944

MIB.setMemRefs(MMOs);

32945

32946

// Subtract the current SSP from the previous SSP.

32947

Register SspSubReg = MRI.createVirtualRegister(PtrRC);

32948

unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;

32949

BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)

32950

.addReg(PrevSSPReg)

32951

.addReg(SSPCopyReg);

32952

32953

// Jump to sink in case PrevSSPReg <= SSPCopyReg.

32954

BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);

32955

fallMBB->addSuccessor(sinkMBB);

32956

fallMBB->addSuccessor(fixShadowMBB);

32957

32958

// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.

32959

unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;

32960

unsigned Offset = (PVT == MVT::i64) ? 3 : 2;

32961

Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);

32962

BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)

32963

.addReg(SspSubReg)

32964

.addImm(Offset);

32965

32966

// Increase SSP when looking only on the lower 8 bits of the delta.

32967

unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;

32968

BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);

32969

32970

// Reset the lower 8 bits.

32971

Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);

32972

BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)

32973

.addReg(SspFirstShrReg)

32974

.addImm(8);

32975

32976

// Jump if the result of the shift is zero.

32977

BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);

32978

fixShadowMBB->addSuccessor(sinkMBB);

32979

fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);

32980

32981

// Do a single shift left.

32982

unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;

32983

Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);

32984

BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)

32985

.addReg(SspSecondShrReg);

32986

32987

// Save the value 128 to a register (will be used next with incssp).

32988

Register Value128InReg = MRI.createVirtualRegister(PtrRC);

32989

unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;

32990

BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)

32991

.addImm(128);

32992

fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);

32993

32994

// Since incssp only looks at the lower 8 bits, we might need to do several

32995

// iterations of incssp until we finish fixing the shadow stack.

32996

Register DecReg = MRI.createVirtualRegister(PtrRC);

32997

Register CounterReg = MRI.createVirtualRegister(PtrRC);

32998

BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)

32999

.addReg(SspAfterShlReg)

33000

.addMBB(fixShadowLoopPrepareMBB)

33001

.addReg(DecReg)

33002

.addMBB(fixShadowLoopMBB);

33003

33004

// Every iteration we increase the SSP by 128.

33005

BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);

33006

33007

// Every iteration we decrement the counter by 1.

33008

unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;

33009

BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);

33010

33011

// Jump if the counter is not zero yet.

33012

BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);

33013

fixShadowLoopMBB->addSuccessor(sinkMBB);

33014

fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);

33015

33016

return sinkMBB;

33017

}

33018

33019

/// Expand the EH_SjLj_LongJmp pseudo instruction.
///
/// Three pointer-sized loads are emitted through the address described by the
/// first X86::AddrNumOperands operands of \p MI: the frame pointer from
/// displacement +0, the jump destination from +1 pointer size, and the stack
/// pointer from +2 pointer sizes. An indirect jump through the loaded
/// destination follows, and \p MI is erased.
///
/// \param MI  The pseudo instruction to expand.
/// \param MBB The block that currently contains \p MI.
/// \returns the block the expansion ended up in: \p MBB itself, or whatever
/// emitLongJmpShadowStackFix returned when the "cf-protection-return" module
/// flag is present.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory operands of the pseudo; attached to every load emitted below.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  const bool PtrIs64 = (PVT == MVT::i64);

  // Virtual register that receives the jump destination.
  const TargetRegisterClass *RC =
      PtrIs64 ? &X86::GR64RegClass : &X86::GR32RegClass;
  Register Tmp = MRI.createVirtualRegister(RC);

  // FP is only written here, never read, so it is handled like a plain GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  Register FP = PtrIs64 ? X86::RBP : X86::EBP;
  Register SP = RegInfo->getStackRegister();

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  const unsigned PtrLoadOpc = PtrIs64 ? X86::MOV64rm : X86::MOV32rm;
  const unsigned IJmpOpc = PtrIs64 ? X86::JMP64r : X86::JMP32r;

  MachineBasicBlock *thisMBB = MBB;

  // With CET shadow stacks enabled the shadow stack must be fixed up first;
  // the remainder of the expansion goes into the block that helper returns.
  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return"))
    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);

  // Emit one pointer-sized load into Dest just before MI, reusing MI's
  // address operands. A non-zero DispAdjust is added to the displacement
  // operand; zero leaves the displacement operand as it is. When
  // StripKillFlags is set, register operands are re-added by register number
  // only so that kill flags from MI are not carried over.
  auto EmitReload = [&](Register Dest, int64_t DispAdjust,
                        bool StripKillFlags) {
    MachineInstrBuilder Load =
        BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Dest);
    for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
      const MachineOperand &MO = MI.getOperand(Idx);
      if (Idx == X86::AddrDisp && DispAdjust != 0)
        Load.addDisp(MO, DispAdjust);
      else if (StripKillFlags && MO.isReg())
        Load.addReg(MO.getReg());
      else
        Load.add(MO);
    }
    Load.setMemRefs(MMOs);
  };

  // Reload FP
  EmitReload(FP, /*DispAdjust=*/0, /*StripKillFlags=*/true);

  // Reload IP
  EmitReload(Tmp, LabelOffset, /*StripKillFlags=*/true);

  // Reload SP. This is the last use of the address operands in the
  // expansion, so their kill flags may be preserved.
  EmitReload(SP, SPOffset, /*StripKillFlags=*/false);

  // Jump
  BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI.eraseFromParent();
  return thisMBB;
}

33101

33102

void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,

33103

MachineBasicBlock *MBB,

33104

MachineBasicBlock *DispatchBB,

33105

int FI) const {

33106

const DebugLoc &DL = MI.getDebugLoc();

33107

MachineFunction *MF = MBB->getParent();

33108

MachineRegisterInfo *MRI = &MF->getRegInfo();

33109

const X86InstrInfo *TII = Subtarget.getInstrInfo();

33110

33111

MVT PVT = getPointerTy(MF->getDataLayout());

33112

assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!")(((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"
) ? static_cast<void> (0) : __assert_fail ("(PVT == MVT::i64 || PVT == MVT::i32) && \"Invalid Pointer Size!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33112, __PRETTY_FUNCTION__));

33113

33114

unsigned Op = 0;

33115

unsigned VR = 0;

33116

33117

bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&

33118

!isPositionIndependent();

33119

33120

if (UseImmLabel) {

33121

Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;

33122

} else {

33123

const TargetRegisterClass *TRC =

33124

(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;

33125

VR = MRI->createVirtualRegister(TRC);

33126

Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

33127

33128

if (Subtarget.is64Bit())

33129

BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)

33130

.addReg(X86::RIP)

33131

.addImm(1)

33132

.addReg(0)

33133

.addMBB(DispatchBB)

33134

.addReg(0);

33135

else

33136

BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)

33137

.addReg(0) /* TII->getGlobalBaseReg(MF) */

33138

.addImm(1)

33139

.addReg(0)

33140

.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())

33141

.addReg(0);

33142

}

33143

33144

MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));

33145

addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);

33146

if (UseImmLabel)

33147

MIB.addMBB(DispatchBB);

33148

else

33149

MIB.addReg(VR);

33150

}

33151

33152

/// Expand the SjLj "setup dispatch" pseudo instruction.
///
/// The expansion proceeds in these steps:
///  1. Collect every EH pad of the function that has call-site numbers and
///     order the pads by call-site number.
///  2. Append three new blocks to the function: a dispatch block (marked as
///     an EH pad), a continuation block holding the indirect jump, and a
///     trap block.
///  3. Call SetupEntryBlockForSjLj so that the dispatch block's address is
///     stored into the function-context frame object.
///  4. In the dispatch block, load a 32-bit index from the function context,
///     branch to the trap block when it is >= the number of landing pads,
///     and otherwise jump through a jump table of the landing pads.
///  5. Redirect the EH-pad successors of every predecessor of a landing pad
///     to the dispatch block, and add implicit dead defs of the callee-saved
///     registers to the last call in each such predecessor.
///  6. Clear the EH-pad flag on the removed successors and erase \p MI.
///
/// \param MI The pseudo instruction; erased on return.
/// \param BB The block containing \p MI.
/// \returns \p BB.
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  int FI = MF->getFrameInfo().getFunctionContextIndex();

  // Map each call-site number to the landing pads associated with it, and
  // remember the largest call-site number encountered.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (MachineBasicBlock &Pad : *MF) {
    if (!Pad.isEHPad())
      continue;

    // The first non-debug instruction of an EH pad must be its EH_LABEL.
    MCSymbol *PadSym = nullptr;
    for (const MachineInstr &PadMI : Pad) {
      if (PadMI.isDebugInstr())
        continue;

      assert(PadMI.isEHLabel() && "expected EH_LABEL");
      PadSym = PadMI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(PadSym))
      continue;

    for (unsigned CSI : MF->getCallSiteLandingPad(PadSym)) {
      CallSiteNumToLPad[CSI].push_back(&Pad);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }

  // Build the jump-table contents in call-site order, collecting every
  // predecessor of a landing pad along the way.
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());

  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
    for (MachineBasicBlock *LPad : CallSiteNumToLPad[CSI]) {
      LPadList.push_back(LPad);
      InvokeBBs.insert(LPad->pred_begin(), LPad->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the MBBs for the dispatch code.

  // The dispatch block's address is what ends up in the return slot of the
  // function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad(true);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Append the new blocks to the function.
  MF->push_back(DispatchBB);
  MF->push_back(DispContBB);
  MF->push_back(TrapBB);

  // Emit the code (before MI, in BB) that records the dispatch block's
  // address in the function context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

  // Create the jump table and associated information.
  unsigned JTE = getJumpTableEncoding();
  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  const X86RegisterInfo &RI = TII->getRegisterInfo();
  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered. When a base pointer is in use, the
  // mask rides on the instruction that reloads it; otherwise on a NOOP.
  if (RI.hasBasePointer(*MF)) {
    const bool FPIs64Bit =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
    MFI->setRestoreBasePointer(MF);

    Register FP = RI.getFrameRegister(*MF);
    Register BP = RI.getBaseRegister();
    unsigned ReloadOpc = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(DispatchBB, DL, TII->get(ReloadOpc), BP), FP, true,
                 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }

  // IReg is used as an index in a memory operand and therefore can't be SP.
  Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    Subtarget.is64Bit() ? 8 : 4);
  // Out-of-range indices (unsigned >= table size) go to the trap block.
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, DL, TII->get(X86::JCC_1))
      .addMBB(TrapBB)
      .addImm(X86::COND_AE);

  if (Subtarget.is64Bit()) {
    Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
    Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

    // leaq .LJTI0_0(%rip), BReg
    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
        .addReg(X86::RIP)
        .addImm(1)
        .addReg(0)
        .addJumpTableIndex(MJTI)
        .addReg(0);
    // movzx IReg64, IReg
    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
        .addImm(0)
        .addReg(IReg)
        .addImm(X86::sub_32bit);

    switch (JTE) {
    case MachineJumpTableInfo::EK_BlockAddress:
      // jmpq *(BReg,IReg64,8)
      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
          .addReg(BReg)
          .addImm(8)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
      break;
    case MachineJumpTableInfo::EK_LabelDifference32: {
      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
      Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
      Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

      // movl (BReg,IReg64,4), OReg
      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
          .addReg(BReg)
          .addImm(4)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
      // movsx OReg64, OReg
      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
      // addq BReg, OReg64, TReg
      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
          .addReg(OReg64)
          .addReg(BReg);
      // jmpq *TReg
      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
      break;
    }
    default:
      llvm_unreachable("Unexpected jump table encoding");
    }
  } else {
    // jmpl *.LJTI0_0(,IReg,4)
    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
        .addReg(0)
        .addImm(4)
        .addReg(IReg)
        .addJumpTableIndex(MJTI)
        .addReg(0);
  }

  // Every distinct jump-table entry becomes a successor of the continuation
  // block exactly once.
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
  for (MachineBasicBlock *LPad : LPadList) {
    if (SeenMBBs.insert(LPad).second)
      DispContBB->addSuccessor(LPad);
  }

  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();

  // Append an implicit dead def to Call for every callee-saved register that
  // Call does not already mention as a register operand.
  auto MarkCalleeSavedAsDefined = [&](MachineInstr &Call) {
    DenseMap<unsigned, bool> DefRegs;
    for (auto &MOp : Call.operands()) {
      if (MOp.isReg())
        DefRegs[MOp.getReg()] = true;
    }

    MachineInstrBuilder MIB(*MF, &Call);
    // SavedRegs is terminated by a zero entry.
    for (const MCPhysReg *CSR = SavedRegs; *CSR; ++CSR) {
      unsigned Reg = *CSR;
      if (!DefRegs[Reg])
        MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
    }
  };

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
  for (MachineBasicBlock *InvokeBB : InvokeBBs) {
    // Replace the landing-pad successors of the invoke block with the new
    // dispatch block. Iterate over a copy of the successor list because the
    // list itself is modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(InvokeBB->succ_rbegin(),
                                                   InvokeBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (MachineBasicBlock *Succ : Successors) {
      if (!Succ->isEHPad())
        continue;
      InvokeBB->removeSuccessor(Succ);
      MBBLPads.push_back(Succ);
    }

    InvokeBB->addSuccessor(DispatchBB);

    // Find the invoke call (the last call in the block) and mark all of the
    // callee-saved registers as 'implicit defined' so that they're spilled.
    // This prevents code from moving instructions to before the EH block,
    // where they will never be executed.
    for (MachineInstr &Candidate : reverse(*InvokeBB)) {
      if (Candidate.isCall()) {
        MarkCalleeSavedAsDefined(Candidate);
        break;
      }
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (MachineBasicBlock *FormerPad : MBBLPads)
    FormerPad->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
  return BB;
}

33379

33380

MachineBasicBlock *

33381

X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,

33382

MachineBasicBlock *BB) const {

33383

MachineFunction *MF = BB->getParent();

33384

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

33385

const DebugLoc &DL = MI.getDebugLoc();

33386

33387

auto TMMImmToTMMReg = [](unsigned Imm) {

33388

assert (Imm < 8 && "Illegal tmm index")((Imm < 8 && "Illegal tmm index") ? static_cast<
void> (0) : __assert_fail ("Imm < 8 && \"Illegal tmm index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33388, __PRETTY_FUNCTION__));

33389

return X86::TMM0 + Imm;

33390

};

33391

switch (MI.getOpcode()) {

33392

default: llvm_unreachable("Unexpected instr type to insert")::llvm::llvm_unreachable_internal("Unexpected instr type to insert"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33392);

33393

case X86::TLS_addr32:

33394

case X86::TLS_addr64:

33395

case X86::TLS_base_addr32:

33396

case X86::TLS_base_addr64:

33397

return EmitLoweredTLSAddr(MI, BB);

33398

case X86::INDIRECT_THUNK_CALL32:

33399

case X86::INDIRECT_THUNK_CALL64:

33400

case X86::INDIRECT_THUNK_TCRETURN32:

33401

case X86::INDIRECT_THUNK_TCRETURN64:

33402

return EmitLoweredIndirectThunk(MI, BB);

33403

case X86::CATCHRET:

33404

return EmitLoweredCatchRet(MI, BB);

33405

case X86::SEG_ALLOCA_32:

33406

case X86::SEG_ALLOCA_64:

33407

return EmitLoweredSegAlloca(MI, BB);

33408

case X86::PROBED_ALLOCA_32:

33409

case X86::PROBED_ALLOCA_64:

33410

return EmitLoweredProbedAlloca(MI, BB);

33411

case X86::TLSCall_32:

33412

case X86::TLSCall_64:

33413

return EmitLoweredTLSCall(MI, BB);

33414

case X86::CMOV_FR32:

33415

case X86::CMOV_FR32X:

33416

case X86::CMOV_FR64:

33417

case X86::CMOV_FR64X:

33418

case X86::CMOV_GR8:

33419

case X86::CMOV_GR16:

33420

case X86::CMOV_GR32:

33421

case X86::CMOV_RFP32:

33422

case X86::CMOV_RFP64:

33423

case X86::CMOV_RFP80:

33424

case X86::CMOV_VR64:

33425

case X86::CMOV_VR128:

33426

case X86::CMOV_VR128X:

33427

case X86::CMOV_VR256:

33428

case X86::CMOV_VR256X:

33429

case X86::CMOV_VR512:

33430

case X86::CMOV_VK1:

33431

case X86::CMOV_VK2:

33432

case X86::CMOV_VK4:

33433

case X86::CMOV_VK8:

33434

case X86::CMOV_VK16:

33435

case X86::CMOV_VK32:

33436

case X86::CMOV_VK64:

33437

return EmitLoweredSelect(MI, BB);

33438

33439

case X86::RDFLAGS32:

33440

case X86::RDFLAGS64: {

33441

unsigned PushF =

33442

MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;

33443

unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;

33444

MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));

33445

// Permit reads of the EFLAGS and DF registers without them being defined.

33446

// This intrinsic exists to read external processor state in flags, such as

33447

// the trap flag, interrupt flag, and direction flag, none of which are

33448

// modeled by the backend.

33449

assert(Push->getOperand(2).getReg() == X86::EFLAGS &&((Push->getOperand(2).getReg() == X86::EFLAGS && "Unexpected register in operand!"
) ? static_cast<void> (0) : __assert_fail ("Push->getOperand(2).getReg() == X86::EFLAGS && \"Unexpected register in operand!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33450, __PRETTY_FUNCTION__))

33450

"Unexpected register in operand!")((Push->getOperand(2).getReg() == X86::EFLAGS && "Unexpected register in operand!"
) ? static_cast<void> (0) : __assert_fail ("Push->getOperand(2).getReg() == X86::EFLAGS && \"Unexpected register in operand!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33450, __PRETTY_FUNCTION__));

33451

Push->getOperand(2).setIsUndef();

33452

assert(Push->getOperand(3).getReg() == X86::DF &&((Push->getOperand(3).getReg() == X86::DF && "Unexpected register in operand!"
) ? static_cast<void> (0) : __assert_fail ("Push->getOperand(3).getReg() == X86::DF && \"Unexpected register in operand!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33453, __PRETTY_FUNCTION__))

33453

"Unexpected register in operand!")((Push->getOperand(3).getReg() == X86::DF && "Unexpected register in operand!"
) ? static_cast<void> (0) : __assert_fail ("Push->getOperand(3).getReg() == X86::DF && \"Unexpected register in operand!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33453, __PRETTY_FUNCTION__));

33454

Push->getOperand(3).setIsUndef();

33455

BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

33456

33457

MI.eraseFromParent(); // The pseudo is gone now.

33458

return BB;

33459

}

33460

33461

case X86::WRFLAGS32:

33462

case X86::WRFLAGS64: {

33463

unsigned Push =

33464

MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;

33465

unsigned PopF =

33466

MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;

33467

BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());

33468

BuildMI(*BB, MI, DL, TII->get(PopF));

33469

33470

MI.eraseFromParent(); // The pseudo is gone now.

33471

return BB;

33472

}

33473

33474

case X86::FP32_TO_INT16_IN_MEM:

33475

case X86::FP32_TO_INT32_IN_MEM:

33476

case X86::FP32_TO_INT64_IN_MEM:

33477

case X86::FP64_TO_INT16_IN_MEM:

33478

case X86::FP64_TO_INT32_IN_MEM:

33479

case X86::FP64_TO_INT64_IN_MEM:

33480

case X86::FP80_TO_INT16_IN_MEM:

33481

case X86::FP80_TO_INT32_IN_MEM:

33482

case X86::FP80_TO_INT64_IN_MEM: {

33483

// Change the floating point control register to use "round towards zero"

33484

// mode when truncating to an integer value.

33485

int OrigCWFrameIdx =

33486

MF->getFrameInfo().CreateStackObject(2, Align(2), false);

33487

addFrameReference(BuildMI(*BB, MI, DL,

33488

TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);

33489

33490

// Load the old value of the control word...

33491

Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);

33492

addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),

33493

OrigCWFrameIdx);

33494

33495

// OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.

33496

Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);

33497

BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)

33498

.addReg(OldCW, RegState::Kill).addImm(0xC00);

33499

33500

// Extract to 16 bits.

33501

Register NewCW16 =

33502

MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);

33503

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)

33504

.addReg(NewCW, RegState::Kill, X86::sub_16bit);

33505

33506

// Prepare memory for FLDCW.

33507

int NewCWFrameIdx =

33508

MF->getFrameInfo().CreateStackObject(2, Align(2), false);

33509

addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),

33510

NewCWFrameIdx)

33511

.addReg(NewCW16, RegState::Kill);

33512

33513

// Reload the modified control word now...

33514

addFrameReference(BuildMI(*BB, MI, DL,

33515

TII->get(X86::FLDCW16m)), NewCWFrameIdx);

33516

33517

// Get the X86 opcode to use.

33518

unsigned Opc;

33519

switch (MI.getOpcode()) {

33520

default: llvm_unreachable("illegal opcode!")::llvm::llvm_unreachable_internal("illegal opcode!", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33520);

33521

case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;

33522

case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;

33523

case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;

33524

case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;

33525

case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;

33526

case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;

33527

case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;

33528

case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;

33529

case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;

33530

}

33531

33532

X86AddressMode AM = getAddressFromInstr(&MI, 0);

33533

addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)

33534

.addReg(MI.getOperand(X86::AddrNumOperands).getReg());

33535

33536

// Reload the original control word now.

33537

addFrameReference(BuildMI(*BB, MI, DL,

33538

TII->get(X86::FLDCW16m)), OrigCWFrameIdx);

33539

33540

MI.eraseFromParent(); // The pseudo instruction is gone now.

33541

return BB;

33542

}

33543

33544

// xbegin

33545

case X86::XBEGIN:

33546

return emitXBegin(MI, BB, Subtarget.getInstrInfo());

33547

33548

case X86::VASTART_SAVE_XMM_REGS:

33549

return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

33550

33551

case X86::VAARG_64:

33552

return EmitVAARG64WithCustomInserter(MI, BB);

33553

33554

case X86::EH_SjLj_SetJmp32:

33555

case X86::EH_SjLj_SetJmp64:

33556

return emitEHSjLjSetJmp(MI, BB);

33557

33558

case X86::EH_SjLj_LongJmp32:

33559

case X86::EH_SjLj_LongJmp64:

33560

return emitEHSjLjLongJmp(MI, BB);

33561

33562

case X86::Int_eh_sjlj_setup_dispatch:

33563

return EmitSjLjDispatchBlock(MI, BB);

33564

33565

case TargetOpcode::STATEPOINT:

33566

// As an implementation detail, STATEPOINT shares the STACKMAP format at

33567

// this point in the process. We diverge later.

33568

return emitPatchPoint(MI, BB);

33569

33570

case TargetOpcode::STACKMAP:

33571

case TargetOpcode::PATCHPOINT:

33572

return emitPatchPoint(MI, BB);

33573

33574

case TargetOpcode::PATCHABLE_EVENT_CALL:

33575

return emitXRayCustomEvent(MI, BB);

33576

33577

case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:

33578

return emitXRayTypedEvent(MI, BB);

33579

33580

case X86::LCMPXCHG8B: {

33581

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

33582

// In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B

33583

// requires a memory operand. If it happens that current architecture is

33584

// i686 and for current function we need a base pointer

33585

// - which is ESI for i686 - register allocator would not be able to

33586

// allocate registers for an address in form of X(%reg, %reg, Y)

33587

// - there never would be enough unreserved registers during regalloc

33588

// (without the need for base ptr the only option would be X(%edi, %esi, Y).

33589

// We are giving a hand to register allocator by precomputing the address in

33590

// a new vreg using LEA.

33591

33592

// If it is not i686 or there is no base pointer - nothing to do here.

33593

if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))

33594

return BB;

33595

33596

// Even though this code does not necessarily needs the base pointer to

33597

// be ESI, we check for that. The reason: if this assert fails, there are

33598

// some changes happened in the compiler base pointer handling, which most

33599

// probably have to be addressed somehow here.

33600

assert(TRI->getBaseRegister() == X86::ESI &&((TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? static_cast<void> (0) : __assert_fail
("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33602, __PRETTY_FUNCTION__))

33601

"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "((TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? static_cast<void> (0) : __assert_fail
("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33602, __PRETTY_FUNCTION__))

33602

"base pointer in mind")((TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind") ? static_cast<void> (0) : __assert_fail
("TRI->getBaseRegister() == X86::ESI && \"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a \" \"base pointer in mind\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33602, __PRETTY_FUNCTION__));

33603

33604

MachineRegisterInfo &MRI = MF->getRegInfo();

33605

MVT SPTy = getPointerTy(MF->getDataLayout());

33606

const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);

33607

Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

33608

33609

X86AddressMode AM = getAddressFromInstr(&MI, 0);

33610

// Regalloc does not need any help when the memory operand of CMPXCHG8B

33611

// does not use index register.

33612

if (AM.IndexReg == X86::NoRegister)

33613

return BB;

33614

33615

// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its

33616

// four operand definitions that are E[ABCD] registers. We skip them and

33617

// then insert the LEA.

33618

MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());

33619

while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||

33620

RMBBI->definesRegister(X86::EBX) ||

33621

RMBBI->definesRegister(X86::ECX) ||

33622

RMBBI->definesRegister(X86::EDX))) {

33623

++RMBBI;

33624

}

33625

MachineBasicBlock::iterator MBBI(RMBBI);

33626

addFullAddress(

33627

BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

33628

33629

setDirectAddressInInstr(&MI, 0, computedAddrVReg);

33630

33631

return BB;

33632

}

33633

case X86::LCMPXCHG16B:

33634

return BB;

33635

case X86::LCMPXCHG8B_SAVE_EBX:

33636

case X86::LCMPXCHG16B_SAVE_RBX: {

33637

unsigned BasePtr =

33638

MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;

33639

if (!BB->isLiveIn(BasePtr))

33640

BB->addLiveIn(BasePtr);

33641

return BB;

33642

}

33643

case X86::MWAITX: {

33644

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

33645

Register BasePtr = TRI->getBaseRegister();

33646

bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);

33647

// If no need to save the base pointer, we generate MWAITXrrr,

33648

// else we generate pseudo MWAITX_SAVE_RBX/EBX.

33649

if (!IsRBX || !TRI->hasBasePointer(*MF)) {

33650

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)

33651

.addReg(MI.getOperand(0).getReg());

33652

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)

33653

.addReg(MI.getOperand(1).getReg());

33654

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)

33655

.addReg(MI.getOperand(2).getReg());

33656

BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));

33657

MI.eraseFromParent();

33658

} else {

33659

if (!BB->isLiveIn(BasePtr)) {

33660

BB->addLiveIn(BasePtr);

33661

}

33662

// Parameters can be copied into ECX and EAX but not EBX yet.

33663

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)

33664

.addReg(MI.getOperand(0).getReg());

33665

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)

33666

.addReg(MI.getOperand(1).getReg());

33667

const TargetRegisterClass *RegClass =

33668

BasePtr == X86::EBX ? &X86::GR32RegClass : &X86::GR64RegClass;

33669

// Save RBX (or EBX) into a virtual register.

33670

Register SaveRBX = MF->getRegInfo().createVirtualRegister(RegClass);

33671

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)

33672

.addReg(BasePtr);

33673

// Generate mwaitx pseudo.

33674

unsigned Opcode =

33675

BasePtr == X86::RBX ? X86::MWAITX_SAVE_RBX : X86::MWAITX_SAVE_EBX;

33676

Register Dst = MF->getRegInfo().createVirtualRegister(RegClass);

33677

BuildMI(*BB, MI, DL, TII->get(Opcode))

33678

.addDef(Dst) // Destination tied in with SaveRBX.

33679

.addReg(MI.getOperand(2).getReg()) // input value of EBX.

33680

.addUse(SaveRBX); // Save of base pointer.

33681

MI.eraseFromParent();

33682

}

33683

return BB;

33684

}

33685

case TargetOpcode::PREALLOCATED_SETUP: {

33686

assert(Subtarget.is32Bit() && "preallocated only used in 32-bit")((Subtarget.is32Bit() && "preallocated only used in 32-bit"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.is32Bit() && \"preallocated only used in 32-bit\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33686, __PRETTY_FUNCTION__));

33687

auto MFI = MF->getInfo<X86MachineFunctionInfo>();

33688

MFI->setHasPreallocatedCall(true);

33689

int64_t PreallocatedId = MI.getOperand(0).getImm();

33690

size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);

33691

assert(StackAdjustment != 0 && "0 stack adjustment")((StackAdjustment != 0 && "0 stack adjustment") ? static_cast
<void> (0) : __assert_fail ("StackAdjustment != 0 && \"0 stack adjustment\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33691, __PRETTY_FUNCTION__));

33692

LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("x86-isel")) { dbgs() << "PREALLOCATED_SETUP stack adjustment "
<< StackAdjustment << "\n"; } } while (false)

33693

<< StackAdjustment << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("x86-isel")) { dbgs() << "PREALLOCATED_SETUP stack adjustment "
<< StackAdjustment << "\n"; } } while (false);

33694

BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)

33695

.addReg(X86::ESP)

33696

.addImm(StackAdjustment);

33697

MI.eraseFromParent();

33698

return BB;

33699

}

33700

case TargetOpcode::PREALLOCATED_ARG: {

33701

assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit")((Subtarget.is32Bit() && "preallocated calls only used in 32-bit"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.is32Bit() && \"preallocated calls only used in 32-bit\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33701, __PRETTY_FUNCTION__));

33702

int64_t PreallocatedId = MI.getOperand(1).getImm();

33703

int64_t ArgIdx = MI.getOperand(2).getImm();

33704

auto MFI = MF->getInfo<X86MachineFunctionInfo>();

33705

size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];

33706

LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdxdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("x86-isel")) { dbgs() << "PREALLOCATED_ARG arg index "
<< ArgIdx << ", arg offset " << ArgOffset <<
"\n"; } } while (false)

33707

<< ", arg offset " << ArgOffset << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("x86-isel")) { dbgs() << "PREALLOCATED_ARG arg index "
<< ArgIdx << ", arg offset " << ArgOffset <<
"\n"; } } while (false);

33708

// stack pointer + offset

33709

addRegOffset(

33710

BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),

33711

X86::ESP, false, ArgOffset);

33712

MI.eraseFromParent();

33713

return BB;

33714

}

33715

case X86::PTDPBSSD:

33716

case X86::PTDPBSUD:

33717

case X86::PTDPBUSD:

33718

case X86::PTDPBUUD:

33719

case X86::PTDPBF16PS: {

33720

unsigned Opc;

33721

switch (MI.getOpcode()) {

33722

case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;

33723

case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;

33724

case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;

33725

case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;

33726

case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;

33727

}

33728

33729

MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));

33730

MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);

33731

MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);

33732

MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);

33733

MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);

33734

33735

MI.eraseFromParent(); // The pseudo is gone now.

33736

return BB;

33737

}

33738

case X86::PTILEZERO: {

33739

unsigned Imm = MI.getOperand(0).getImm();

33740

BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));

33741

MI.eraseFromParent(); // The pseudo is gone now.

33742

return BB;

33743

}

33744

case X86::PTILELOADD:

33745

case X86::PTILELOADDT1:

33746

case X86::PTILESTORED: {

33747

unsigned Opc;

33748

switch (MI.getOpcode()) {

33749

case X86::PTILELOADD: Opc = X86::TILELOADD; break;

33750

case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;

33751

case X86::PTILESTORED: Opc = X86::TILESTORED; break;

33752

}

33753

33754

MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));

33755

unsigned CurOp = 0;

33756

if (Opc != X86::TILESTORED)

33757

MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),

33758

RegState::Define);

33759

33760

MIB.add(MI.getOperand(CurOp++)); // base

33761

MIB.add(MI.getOperand(CurOp++)); // scale

33762

MIB.add(MI.getOperand(CurOp++)); // index -- stride

33763

MIB.add(MI.getOperand(CurOp++)); // displacement

33764

MIB.add(MI.getOperand(CurOp++)); // segment

33765

33766

if (Opc == X86::TILESTORED)

33767

MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),

33768

RegState::Undef);

33769

33770

MI.eraseFromParent(); // The pseudo is gone now.

33771

return BB;

33772

}

33773

}

33774

}

33775

33776

//===----------------------------------------------------------------------===//

33777

// X86 Optimization Hooks

33778

//===----------------------------------------------------------------------===//

33779

33780

bool

33781

X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,

33782

const APInt &DemandedBits,

33783

const APInt &DemandedElts,

33784

TargetLoweringOpt &TLO) const {

33785

EVT VT = Op.getValueType();

33786

unsigned Opcode = Op.getOpcode();

33787

unsigned EltSize = VT.getScalarSizeInBits();

33788

33789

if (VT.isVector()) {

33790

// If the constant is only all signbits in the active bits, then we should

33791

// extend it to the entire constant to allow it act as a boolean constant

33792

// vector.

33793

auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {

33794

if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))

33795

return false;

33796

for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {

33797

if (!DemandedElts[i] || V.getOperand(i).isUndef())

33798

continue;

33799

const APInt &Val = V.getConstantOperandAPInt(i);

33800

if (Val.getBitWidth() > Val.getNumSignBits() &&

33801

Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)

33802

return true;

33803

}

33804

return false;

33805

};

33806

// For vectors - if we have a constant, then try to sign extend.

33807

// TODO: Handle AND/ANDN cases.

33808

unsigned ActiveBits = DemandedBits.getActiveBits();

33809

if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&

33810

(Opcode == ISD::OR || Opcode == ISD::XOR) &&

33811

NeedsSignExtension(Op.getOperand(1), ActiveBits)) {

33812

EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);

33813

EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,

33814

VT.getVectorNumElements());

33815

SDValue NewC =

33816

TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,

33817

Op.getOperand(1), TLO.DAG.getValueType(ExtVT));

33818

SDValue NewOp =

33819

TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);

33820

return TLO.CombineTo(Op, NewOp);

33821

}

33822

return false;

33823

}

33824

33825

// Only optimize Ands to prevent shrinking a constant that could be

33826

// matched by movzx.

33827

if (Opcode != ISD::AND)

33828

return false;

33829

33830

// Make sure the RHS really is a constant.

33831

ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));

33832

if (!C)

33833

return false;

33834

33835

const APInt &Mask = C->getAPIntValue();

33836

33837

// Clear all non-demanded bits initially.

33838

APInt ShrunkMask = Mask & DemandedBits;

33839

33840

// Find the width of the shrunk mask.

33841

unsigned Width = ShrunkMask.getActiveBits();

33842

33843

// If the mask is all 0s there's nothing to do here.

33844

if (Width == 0)

33845

return false;

33846

33847

// Find the next power of 2 width, rounding up to a byte.

33848

Width = PowerOf2Ceil(std::max(Width, 8U));

33849

// Truncate the width to size to handle illegal types.

33850

Width = std::min(Width, EltSize);

33851

33852

// Calculate a possible zero extend mask for this constant.

33853

APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);

33854

33855

// If we aren't changing the mask, just return true to keep it and prevent

33856

// the caller from optimizing.

33857

if (ZeroExtendMask == Mask)

33858

return true;

33859

33860

// Make sure the new mask can be represented by a combination of mask bits

33861

// and non-demanded bits.

33862

if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))

33863

return false;

33864

33865

// Replace the constant with the zero extend mask.

33866

SDLoc DL(Op);

33867

SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);

33868

SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);

33869

return TLO.CombineTo(Op, NewOp);

33870

}

33871

33872

void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,

33873

KnownBits &Known,

33874

const APInt &DemandedElts,

33875

const SelectionDAG &DAG,

33876

unsigned Depth) const {

33877

unsigned BitWidth = Known.getBitWidth();

33878

unsigned NumElts = DemandedElts.getBitWidth();

33879

unsigned Opc = Op.getOpcode();

33880

EVT VT = Op.getValueType();

33881

assert((Opc >= ISD::BUILTIN_OP_END ||(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__))

33882

Opc == ISD::INTRINSIC_WO_CHAIN ||(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__))

33883

Opc == ISD::INTRINSIC_W_CHAIN ||(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__))

33884

Opc == ISD::INTRINSIC_VOID) &&(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__))

33885

"Should use MaskedValueIsZero if you don't know whether Op"(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__))

33886

" is a target node!")(((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN
|| Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID
) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? static_cast<void> (0) : __assert_fail
("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33886, __PRETTY_FUNCTION__));

33887

33888

Known.resetAll();

33889

switch (Opc) {

33890

default: break;

33891

case X86ISD::SETCC:

33892

Known.Zero.setBitsFrom(1);

33893

break;

33894

case X86ISD::MOVMSK: {

33895

unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();

33896

Known.Zero.setBitsFrom(NumLoBits);

33897

break;

33898

}

33899

case X86ISD::PEXTRB:

33900

case X86ISD::PEXTRW: {

33901

SDValue Src = Op.getOperand(0);

33902

EVT SrcVT = Src.getValueType();

33903

APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),

33904

Op.getConstantOperandVal(1));

33905

Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);

33906

Known = Known.anyextOrTrunc(BitWidth);

33907

Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());

33908

break;

33909

}

33910

case X86ISD::VSRAI:

33911

case X86ISD::VSHLI:

33912

case X86ISD::VSRLI: {

33913

unsigned ShAmt = Op.getConstantOperandVal(1);

33914

if (ShAmt >= VT.getScalarSizeInBits()) {

33915

Known.setAllZero();

33916

break;

33917

}

33918

33919

Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

33920

if (Opc == X86ISD::VSHLI) {

33921

Known.Zero <<= ShAmt;

33922

Known.One <<= ShAmt;

33923

// Low bits are known zero.

33924

Known.Zero.setLowBits(ShAmt);

33925

} else if (Opc == X86ISD::VSRLI) {

33926

Known.Zero.lshrInPlace(ShAmt);

33927

Known.One.lshrInPlace(ShAmt);

33928

// High bits are known zero.

33929

Known.Zero.setHighBits(ShAmt);

33930

} else {

33931

Known.Zero.ashrInPlace(ShAmt);

33932

Known.One.ashrInPlace(ShAmt);

33933

}

33934

break;

33935

}

33936

case X86ISD::PACKUS: {

33937

// PACKUS is just a truncation if the upper half is zero.

33938

APInt DemandedLHS, DemandedRHS;

33939

getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

33940

33941

Known.One = APInt::getAllOnesValue(BitWidth * 2);

33942

Known.Zero = APInt::getAllOnesValue(BitWidth * 2);

33943

33944

KnownBits Known2;

33945

if (!!DemandedLHS) {

33946

Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);

33947

Known.One &= Known2.One;

33948

Known.Zero &= Known2.Zero;

33949

}

33950

if (!!DemandedRHS) {

33951

Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);

33952

Known.One &= Known2.One;

33953

Known.Zero &= Known2.Zero;

33954

}

33955

33956

if (Known.countMinLeadingZeros() < BitWidth)

33957

Known.resetAll();

33958

Known = Known.trunc(BitWidth);

33959

break;

33960

}

33961

case X86ISD::ANDNP: {

33962

KnownBits Known2;

33963

Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

33964

Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

33965

33966

// ANDNP = (~X & Y);

33967

Known.One &= Known2.Zero;

33968

Known.Zero |= Known2.One;

33969

break;

33970

}

33971

case X86ISD::FOR: {

33972

KnownBits Known2;

33973

Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

33974

Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

33975

33976

Known |= Known2;

33977

break;

33978

}

33979

case X86ISD::PSADBW: {

33980

assert(VT.getScalarType() == MVT::i64 &&((VT.getScalarType() == MVT::i64 && Op.getOperand(0).
getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33982, __PRETTY_FUNCTION__))

33981

Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&((VT.getScalarType() == MVT::i64 && Op.getOperand(0).
getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33982, __PRETTY_FUNCTION__))

33982

"Unexpected PSADBW types")((VT.getScalarType() == MVT::i64 && Op.getOperand(0).
getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33982, __PRETTY_FUNCTION__));

33983

33984

// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.

33985

Known.Zero.setBitsFrom(16);

33986

break;

33987

}

33988

case X86ISD::CMOV: {

33989

Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);

33990

// If we don't know any bits, early out.

33991

if (Known.isUnknown())

33992

break;

33993

KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

33994

33995

// Only known if known in both the LHS and RHS.

33996

Known.One &= Known2.One;

33997

Known.Zero &= Known2.Zero;

33998

break;

33999

}

34000

case X86ISD::BEXTR: {

34001

SDValue Op0 = Op.getOperand(0);

34002

SDValue Op1 = Op.getOperand(1);

34003

34004

if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {

34005

unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);

34006

unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);

34007

34008

// If the length is 0, the result is 0.

34009

if (Length == 0) {

34010

Known.setAllZero();

34011

break;

34012

}

34013

34014

if ((Shift + Length) <= BitWidth) {

34015

Known = DAG.computeKnownBits(Op0, Depth + 1);

34016

Known = Known.extractBits(Length, Shift);

34017

Known = Known.zextOrTrunc(BitWidth);

34018

}

34019

}

34020

break;

34021

}

34022

case X86ISD::VTRUNC:

34023

case X86ISD::VTRUNCS:

34024

case X86ISD::VTRUNCUS:

34025

case X86ISD::CVTSI2P:

34026

case X86ISD::CVTUI2P:

34027

case X86ISD::CVTP2SI:

34028

case X86ISD::CVTP2UI:

34029

case X86ISD::MCVTP2SI:

34030

case X86ISD::MCVTP2UI:

34031

case X86ISD::CVTTP2SI:

34032

case X86ISD::CVTTP2UI:

34033

case X86ISD::MCVTTP2SI:

34034

case X86ISD::MCVTTP2UI:

34035

case X86ISD::MCVTSI2P:

34036

case X86ISD::MCVTUI2P:

34037

case X86ISD::VFPROUND:

34038

case X86ISD::VMFPROUND:

34039

case X86ISD::CVTPS2PH:

34040

case X86ISD::MCVTPS2PH: {

34041

// Truncations/Conversions - upper elements are known zero.

34042

EVT SrcVT = Op.getOperand(0).getValueType();

34043

if (SrcVT.isVector()) {

34044

unsigned NumSrcElts = SrcVT.getVectorNumElements();

34045

if (NumElts > NumSrcElts &&

34046

DemandedElts.countTrailingZeros() >= NumSrcElts)

34047

Known.setAllZero();

34048

}

34049

break;

34050

}

34051

case X86ISD::STRICT_CVTTP2SI:

34052

case X86ISD::STRICT_CVTTP2UI:

34053

case X86ISD::STRICT_CVTSI2P:

34054

case X86ISD::STRICT_CVTUI2P:

34055

case X86ISD::STRICT_VFPROUND:

34056

case X86ISD::STRICT_CVTPS2PH: {

34057

// Strict Conversions - upper elements are known zero.

34058

EVT SrcVT = Op.getOperand(1).getValueType();

34059

if (SrcVT.isVector()) {

34060

unsigned NumSrcElts = SrcVT.getVectorNumElements();

34061

if (NumElts > NumSrcElts &&

34062

DemandedElts.countTrailingZeros() >= NumSrcElts)

34063

Known.setAllZero();

34064

}

34065

break;

34066

}

34067

case X86ISD::MOVQ2DQ: {

34068

// Move from MMX to XMM. Upper half of XMM should be 0.

34069

if (DemandedElts.countTrailingZeros() >= (NumElts / 2))

34070

Known.setAllZero();

34071

break;

34072

}

34073

}

34074

34075

// Handle target shuffles.

34076

// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.

34077

if (isTargetShuffle(Opc)) {

34078

bool IsUnary;

34079

SmallVector<int, 64> Mask;

34080

SmallVector<SDValue, 2> Ops;

34081

if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,

34082

IsUnary)) {

34083

unsigned NumOps = Ops.size();

34084

unsigned NumElts = VT.getVectorNumElements();

34085

if (Mask.size() == NumElts) {

34086

SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));

34087

Known.Zero.setAllBits(); Known.One.setAllBits();

34088

for (unsigned i = 0; i != NumElts; ++i) {

34089

if (!DemandedElts[i])

34090

continue;

34091

int M = Mask[i];

34092

if (M == SM_SentinelUndef) {

34093

// For UNDEF elements, we don't know anything about the common state

34094

// of the shuffle result.

34095

Known.resetAll();

34096

break;

34097

} else if (M == SM_SentinelZero) {

34098

Known.One.clearAllBits();

34099

continue;

34100

}

34101

assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&((0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range") ? static_cast<void> (0) :
__assert_fail ("0 <= M && (unsigned)M < (NumOps * NumElts) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34102, __PRETTY_FUNCTION__))

34102

"Shuffle index out of range")((0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range") ? static_cast<void> (0) :
__assert_fail ("0 <= M && (unsigned)M < (NumOps * NumElts) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34102, __PRETTY_FUNCTION__));

34103

34104

unsigned OpIdx = (unsigned)M / NumElts;

34105

unsigned EltIdx = (unsigned)M % NumElts;

34106

if (Ops[OpIdx].getValueType() != VT) {

34107

// TODO - handle target shuffle ops with different value types.

34108

Known.resetAll();

34109

break;

34110

}

34111

DemandedOps[OpIdx].setBit(EltIdx);

34112

}

34113

// Known bits are the values that are shared by every demanded element.

34114

for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {

34115

if (!DemandedOps[i])

34116

continue;

34117

KnownBits Known2 =

34118

DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);

34119

Known.One &= Known2.One;

34120

Known.Zero &= Known2.Zero;

34121

}

34122

}

34123

}

34124

}

34125

}

34126

34127

unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(

34128

SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,

34129

unsigned Depth) const {

34130

EVT VT = Op.getValueType();

34131

unsigned VTBits = VT.getScalarSizeInBits();

34132

unsigned Opcode = Op.getOpcode();

34133

switch (Opcode) {

34134

case X86ISD::SETCC_CARRY:

34135

// SETCC_CARRY sets the dest to ~0 for true or 0 for false.

34136

return VTBits;

34137

34138

case X86ISD::VTRUNC: {

34139

SDValue Src = Op.getOperand(0);

34140

MVT SrcVT = Src.getSimpleValueType();

34141

unsigned NumSrcBits = SrcVT.getScalarSizeInBits();

34142

assert(VTBits < NumSrcBits && "Illegal truncation input type")((VTBits < NumSrcBits && "Illegal truncation input type"
) ? static_cast<void> (0) : __assert_fail ("VTBits < NumSrcBits && \"Illegal truncation input type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34142, __PRETTY_FUNCTION__));

34143

APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());

34144

unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);

34145

if (Tmp > (NumSrcBits - VTBits))

34146

return Tmp - (NumSrcBits - VTBits);

34147

return 1;

34148

}

34149

34150

case X86ISD::PACKSS: {

34151

// PACKSS is just a truncation if the sign bits extend to the packed size.

34152

APInt DemandedLHS, DemandedRHS;

34153

getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,

34154

DemandedRHS);

34155

34156

unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();

34157

unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;

34158

if (!!DemandedLHS)

34159

Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);

34160

if (!!DemandedRHS)

34161

Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);

34162

unsigned Tmp = std::min(Tmp0, Tmp1);

34163

if (Tmp > (SrcBits - VTBits))

34164

return Tmp - (SrcBits - VTBits);

34165

return 1;

34166

}

34167

34168

case X86ISD::VSHLI: {

34169

SDValue Src = Op.getOperand(0);

34170

const APInt &ShiftVal = Op.getConstantOperandAPInt(1);

34171

if (ShiftVal.uge(VTBits))

34172

return VTBits; // Shifted all bits out --> zero.

34173

unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);

34174

if (ShiftVal.uge(Tmp))

34175

return 1; // Shifted all sign bits out --> unknown.

34176

return Tmp - ShiftVal.getZExtValue();

34177

}

34178

34179

case X86ISD::VSRAI: {

34180

SDValue Src = Op.getOperand(0);

34181

APInt ShiftVal = Op.getConstantOperandAPInt(1);

34182

if (ShiftVal.uge(VTBits - 1))

34183

return VTBits; // Sign splat.

34184

unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);

34185

ShiftVal += Tmp;

34186

return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();

34187

}

34188

34189

case X86ISD::PCMPGT:

34190

case X86ISD::PCMPEQ:

34191

case X86ISD::CMPP:

34192

case X86ISD::VPCOM:

34193

case X86ISD::VPCOMU:

34194

// Vector compares return zero/all-bits result values.

34195

return VTBits;

34196

34197

case X86ISD::ANDNP: {

34198

unsigned Tmp0 =

34199

DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);

34200

if (Tmp0 == 1) return 1; // Early out.

34201

unsigned Tmp1 =

34202

DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);

34203

return std::min(Tmp0, Tmp1);

34204

}

34205

34206

case X86ISD::CMOV: {

34207

unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);

34208

if (Tmp0 == 1) return 1; // Early out.

34209

unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);

34210

return std::min(Tmp0, Tmp1);

34211

}

34212

}

34213

34214

// Handle target shuffles.

34215

// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.

34216

if (isTargetShuffle(Opcode)) {

34217

bool IsUnary;

34218

SmallVector<int, 64> Mask;

34219

SmallVector<SDValue, 2> Ops;

34220

if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,

34221

IsUnary)) {

34222

unsigned NumOps = Ops.size();

34223

unsigned NumElts = VT.getVectorNumElements();

34224

if (Mask.size() == NumElts) {

34225

SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));

34226

for (unsigned i = 0; i != NumElts; ++i) {

34227

if (!DemandedElts[i])

34228

continue;

34229

int M = Mask[i];

34230

if (M == SM_SentinelUndef) {

34231

// For UNDEF elements, we don't know anything about the common state

34232

// of the shuffle result.

34233

return 1;

34234

} else if (M == SM_SentinelZero) {

34235

// Zero = all sign bits.

34236

continue;

34237

}

34238

assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&((0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range") ? static_cast<void> (0) :
__assert_fail ("0 <= M && (unsigned)M < (NumOps * NumElts) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34239, __PRETTY_FUNCTION__))

34239

"Shuffle index out of range")((0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range") ? static_cast<void> (0) :
__assert_fail ("0 <= M && (unsigned)M < (NumOps * NumElts) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34239, __PRETTY_FUNCTION__));

34240

34241

unsigned OpIdx = (unsigned)M / NumElts;

34242

unsigned EltIdx = (unsigned)M % NumElts;

34243

if (Ops[OpIdx].getValueType() != VT) {

34244

// TODO - handle target shuffle ops with different value types.

34245

return 1;

34246

}

34247

DemandedOps[OpIdx].setBit(EltIdx);

34248

}

34249

unsigned Tmp0 = VTBits;

34250

for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {

34251

if (!DemandedOps[i])

34252

continue;

34253

unsigned Tmp1 =

34254

DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);

34255

Tmp0 = std::min(Tmp0, Tmp1);

34256

}

34257

return Tmp0;

34258

}

34259

}

34260

}

34261

34262

// Fallback case.

34263

return 1;

34264

}

34265

34266

/// If \p N is an X86ISD::Wrapper or X86ISD::WrapperRIP node, return the
/// operand it wraps; otherwise return \p N unchanged.
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  switch (N->getOpcode()) {
  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    return N->getOperand(0);
  default:
    return N;
  }
}

34271

34272

// Helper to look for a normal load that can be narrowed into a vzload with the

34273

// specified VT and memory VT. Returns SDValue() on failure.

34274

static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,

34275

SelectionDAG &DAG) {

34276

// Can't if the load is volatile or atomic.

34277

if (!LN->isSimple())

34278

return SDValue();

34279

34280

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

34281

SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};

34282

return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,

34283

LN->getPointerInfo(), LN->getOriginalAlign(),

34284

LN->getMemOperand()->getFlags());

34285

}

34286

34287

// Attempt to match a combined shuffle mask against supported unary shuffle

34288

// instructions.

34289

// TODO: Investigate sharing more of this with shuffle lowering.

34290

static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,

34291

bool AllowFloatDomain, bool AllowIntDomain,

34292

SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,

34293

const X86Subtarget &Subtarget, unsigned &Shuffle,

34294

MVT &SrcVT, MVT &DstVT) {

34295

unsigned NumMaskElts = Mask.size();

34296

unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

34297

34298

// Match against a VZEXT_MOVL vXi32 zero-extending instruction.

34299

if (MaskEltSize == 32 && Mask[0] == 0) {

34300

if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {

34301

Shuffle = X86ISD::VZEXT_MOVL;

34302

SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;

34303

return true;

34304

}

34305

if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&

34306

isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {

34307

Shuffle = X86ISD::VZEXT_MOVL;

34308

SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;

34309

return true;

34310

}

34311

}

34312

34313

// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.

34314

// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).

34315

if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||

34316

(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {

34317

unsigned MaxScale = 64 / MaskEltSize;

34318

for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {

34319

bool MatchAny = true;

34320

bool MatchZero = true;

34321

unsigned NumDstElts = NumMaskElts / Scale;

34322

for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {

34323

if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {

34324

MatchAny = MatchZero = false;

34325

break;

34326

}

34327

MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);

34328

MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);

34329

}

34330

if (MatchAny || MatchZero) {

34331

assert(MatchZero && "Failed to match zext but matched aext?")((MatchZero && "Failed to match zext but matched aext?"
) ? static_cast<void> (0) : __assert_fail ("MatchZero && \"Failed to match zext but matched aext?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34331, __PRETTY_FUNCTION__));

34332

unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);

34333

MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :

34334

MVT::getIntegerVT(MaskEltSize);

34335

SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

34336

34337

if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())

34338

V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

34339

34340

Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);

34341

if (SrcVT.getVectorNumElements() != NumDstElts)

34342

Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

34343

34344

DstVT = MVT::getIntegerVT(Scale * MaskEltSize);

34345

DstVT = MVT::getVectorVT(DstVT, NumDstElts);

34346

return true;

34347

}

34348

}

34349

}

34350

34351

// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).

34352

if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&

34353

isUndefOrEqual(Mask[0], 0) &&

34354

isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {

34355

Shuffle = X86ISD::VZEXT_MOVL;

34356

SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;

34357

return true;

34358

}

34359

34360

// Check if we have SSE3 which will let us use MOVDDUP etc. The

34361

// instructions are no slower than UNPCKLPD but has the option to

34362

// fold the input operand into even an unaligned memory load.

34363

if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {

34364

if (isTargetShuffleEquivalent(Mask, {0, 0}, V1)) {

34365

Shuffle = X86ISD::MOVDDUP;

34366

SrcVT = DstVT = MVT::v2f64;

34367

return true;

34368

}

34369

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) {

34370

Shuffle = X86ISD::MOVSLDUP;

34371

SrcVT = DstVT = MVT::v4f32;

34372

return true;

34373

}

34374

if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}, V1)) {

34375

Shuffle = X86ISD::MOVSHDUP;

34376

SrcVT = DstVT = MVT::v4f32;

34377

return true;

34378

}

34379

}

34380

34381

if (MaskVT.is256BitVector() && AllowFloatDomain) {

34382

assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles")((Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX() && \"AVX required for 256-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34382, __PRETTY_FUNCTION__));

34383

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) {

34384

Shuffle = X86ISD::MOVDDUP;

34385

SrcVT = DstVT = MVT::v4f64;

34386

return true;

34387

}

34388

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {

34389

Shuffle = X86ISD::MOVSLDUP;

34390

SrcVT = DstVT = MVT::v8f32;

34391

return true;

34392

}

34393

if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {

34394

Shuffle = X86ISD::MOVSHDUP;

34395

SrcVT = DstVT = MVT::v8f32;

34396

return true;

34397

}

34398

}

34399

34400

if (MaskVT.is512BitVector() && AllowFloatDomain) {

34401

assert(Subtarget.hasAVX512() &&((Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"AVX512 required for 512-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34402, __PRETTY_FUNCTION__))

34402

"AVX512 required for 512-bit vector shuffles")((Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"AVX512 required for 512-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34402, __PRETTY_FUNCTION__));

34403

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {

34404

Shuffle = X86ISD::MOVDDUP;

34405

SrcVT = DstVT = MVT::v8f64;

34406

return true;

34407

}

34408

if (isTargetShuffleEquivalent(

34409

Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {

34410

Shuffle = X86ISD::MOVSLDUP;

34411

SrcVT = DstVT = MVT::v16f32;

34412

return true;

34413

}

34414

if (isTargetShuffleEquivalent(

34415

Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {

34416

Shuffle = X86ISD::MOVSHDUP;

34417

SrcVT = DstVT = MVT::v16f32;

34418

return true;

34419

}

34420

}

34421

34422

return false;

34423

}

34424

34425

// Attempt to match a combined shuffle mask against supported unary immediate

34426

// permute instructions.

34427

// TODO: Investigate sharing more of this with shuffle lowering.

34428

static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,

34429

const APInt &Zeroable,

34430

bool AllowFloatDomain, bool AllowIntDomain,

34431

const X86Subtarget &Subtarget,

34432

unsigned &Shuffle, MVT &ShuffleVT,

34433

unsigned &PermuteImm) {

34434

unsigned NumMaskElts = Mask.size();

34435

unsigned InputSizeInBits = MaskVT.getSizeInBits();

34436

unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;

34437

MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

34438

bool ContainsZeros = isAnyZero(Mask);

34439

34440

// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.

34441

if (!ContainsZeros && MaskScalarSizeInBits == 64) {

34442

// Check for lane crossing permutes.

34443

if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {

34444

// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).

34445

if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {

34446

Shuffle = X86ISD::VPERMI;

34447

ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);

34448

PermuteImm = getV4X86ShuffleImm(Mask);

34449

return true;

34450

}

34451

if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {

34452

SmallVector<int, 4> RepeatedMask;

34453

if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {

34454

Shuffle = X86ISD::VPERMI;

34455

ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);

34456

PermuteImm = getV4X86ShuffleImm(RepeatedMask);

34457

return true;

34458

}

34459

}

34460

} else if (AllowFloatDomain && Subtarget.hasAVX()) {

34461

// VPERMILPD can permute with a non-repeating shuffle.

34462

Shuffle = X86ISD::VPERMILPI;

34463

ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());

34464

PermuteImm = 0;

34465

for (int i = 0, e = Mask.size(); i != e; ++i) {

34466

int M = Mask[i];

34467

if (M == SM_SentinelUndef)

34468

continue;

34469

assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index")((((M / 2) == (i / 2)) && "Out of range shuffle mask index"
) ? static_cast<void> (0) : __assert_fail ("((M / 2) == (i / 2)) && \"Out of range shuffle mask index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34469, __PRETTY_FUNCTION__));

34470

PermuteImm |= (M & 1) << i;

34471

}

34472

return true;

34473

}

34474

}

34475

34476

// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.

34477

// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we

34478

// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).

34479

if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&

34480

!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {

34481

SmallVector<int, 4> RepeatedMask;

34482

if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {

34483

// Narrow the repeated mask to create 32-bit element permutes.

34484

SmallVector<int, 4> WordMask = RepeatedMask;

34485

if (MaskScalarSizeInBits == 64)

34486

narrowShuffleMaskElts(2, RepeatedMask, WordMask);

34487

34488

Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);

34489

ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);

34490

ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);

34491

PermuteImm = getV4X86ShuffleImm(WordMask);

34492

return true;

34493

}

34494

}

34495

34496

// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.

34497

if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&

34498

((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

34499

(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||

34500

(MaskVT.is512BitVector() && Subtarget.hasBWI()))) {

34501

SmallVector<int, 4> RepeatedMask;

34502

if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {

34503

ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);

34504

ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

34505

34506

// PSHUFLW: permute lower 4 elements only.

34507

if (isUndefOrInRange(LoMask, 0, 4) &&

34508

isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {

34509

Shuffle = X86ISD::PSHUFLW;

34510

ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);

34511

PermuteImm = getV4X86ShuffleImm(LoMask);

34512

return true;

34513

}

34514

34515

// PSHUFHW: permute upper 4 elements only.

34516

if (isUndefOrInRange(HiMask, 4, 8) &&

34517

isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {

34518

// Offset the HiMask so that we can create the shuffle immediate.

34519

int OffsetHiMask[4];

34520

for (int i = 0; i != 4; ++i)

34521

OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

34522

34523

Shuffle = X86ISD::PSHUFHW;

34524

ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);

34525

PermuteImm = getV4X86ShuffleImm(OffsetHiMask);

34526

return true;

34527

}

34528

}

34529

}

34530

34531

// Attempt to match against byte/bit shifts.

34532

if (AllowIntDomain &&

34533

((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

34534

(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||

34535

(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {

34536

int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,

34537

Mask, 0, Zeroable, Subtarget);

34538

if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||

34539

32 <= ShuffleVT.getScalarSizeInBits())) {

34540

PermuteImm = (unsigned)ShiftAmt;

34541

return true;

34542

}

34543

}

34544

34545

// Attempt to match against bit rotates.

34546

if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&

34547

((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||

34548

Subtarget.hasAVX512())) {

34549

int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,

34550

Subtarget, Mask);

34551

if (0 < RotateAmt) {

34552

Shuffle = X86ISD::VROTLI;

34553

PermuteImm = (unsigned)RotateAmt;

34554

return true;

34555

}

34556

}

34557

34558

return false;

34559

}

34560

34561

// Attempt to match a combined unary shuffle mask against supported binary

34562

// shuffle instructions.

34563

// TODO: Investigate sharing more of this with shuffle lowering.

34564

static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,

34565

bool AllowFloatDomain, bool AllowIntDomain,

34566

SDValue &V1, SDValue &V2, const SDLoc &DL,

34567

SelectionDAG &DAG, const X86Subtarget &Subtarget,

34568

unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,

34569

bool IsUnary) {

34570

unsigned NumMaskElts = Mask.size();

34571

unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

34572

34573

if (MaskVT.is128BitVector()) {

34574

if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {

34575

V2 = V1;

34576

V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);

34577

Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;

34578

SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;

34579

return true;

34580

}

34581

if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {

34582

V2 = V1;

34583

Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;

34584

SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;

34585

return true;

34586

}

34587

if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&

34588

(AllowFloatDomain || !Subtarget.hasSSE41())) {

34589

std::swap(V1, V2);

34590

Shuffle = X86ISD::MOVSD;

34591

SrcVT = DstVT = MVT::v2f64;

34592

return true;

34593

}

34594

if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&

34595

(AllowFloatDomain || !Subtarget.hasSSE41())) {

34596

Shuffle = X86ISD::MOVSS;

34597

SrcVT = DstVT = MVT::v4f32;

34598

return true;

34599

}

34600

}

34601

34602

// Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.

34603

if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||

34604

((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||

34605

((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {

34606

if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,

34607

Subtarget)) {

34608

DstVT = MaskVT;

34609

return true;

34610

}

34611

}

34612

34613

// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.

34614

if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||

34615

(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

34616

(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||

34617

(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||

34618

(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {

34619

if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,

34620

Subtarget)) {

34621

SrcVT = DstVT = MaskVT;

34622

if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())

34623

SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);

34624

return true;

34625

}

34626

}

34627

34628

// Attempt to match against a OR if we're performing a blend shuffle and the

34629

// non-blended source element is zero in each case.

34630

if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&

34631

(EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {

34632

bool IsBlend = true;

34633

unsigned NumV1Elts = V1.getValueType().getVectorNumElements();

34634

unsigned NumV2Elts = V2.getValueType().getVectorNumElements();

34635

unsigned Scale1 = NumV1Elts / NumMaskElts;

34636

unsigned Scale2 = NumV2Elts / NumMaskElts;

34637

APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);

34638

APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);

34639

for (unsigned i = 0; i != NumMaskElts; ++i) {

34640

int M = Mask[i];

34641

if (M == SM_SentinelUndef)

34642

continue;

34643

if (M == SM_SentinelZero) {

34644

DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);

34645

DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);

34646

continue;

34647

}

34648

if (M == (int)i) {

34649

DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);

34650

continue;

34651

}

34652

if (M == (int)(i + NumMaskElts)) {

34653

DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);

34654

continue;

34655

}

34656

IsBlend = false;

34657

break;

34658

}

34659

if (IsBlend &&

34660

DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&

34661

DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {

34662

Shuffle = ISD::OR;

34663

SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT();

34664

return true;

34665

}

34666

}

34667

34668

return false;

34669

}

34670

34671

/// Try to match a combined shuffle mask against one of the supported binary
/// shuffle instructions that take an immediate operand (VALIGN, PALIGNR,
/// BLENDI, INSERTPS, SHUFPD/SHUFPS).
///
/// On a match, returns true and reports the opcode through \p Shuffle, the
/// type to perform it in through \p ShuffleVT and the immediate through
/// \p PermuteImm; \p V1 and \p V2 may be rewritten to the operands the
/// matched instruction should use. Returns false if no pattern applies.
static bool matchBinaryPermuteShuffle(
    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
  const unsigned EltBits = MaskVT.getScalarSizeInBits();
  const unsigned NumElts = Mask.size();
  const unsigned VecBits = MaskVT.getSizeInBits();

  // Replace the inputs a matcher asked to be forced to zero.
  auto ApplyForcedZeros = [&](bool ZeroV1, bool ZeroV2) {
    if (ZeroV1)
      V1 = getZeroVector(MaskVT, Subtarget, DAG, DL);
    if (ZeroV2)
      V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
  };

  // Both INSERTPS attempts below share these preconditions and this matcher.
  const bool InsertPSLegal = AllowFloatDomain && EltBits == 32 &&
                             Subtarget.hasSSE41() && MaskVT.is128BitVector();
  auto TryInsertPS = [&]() {
    if (!matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG))
      return false;
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  };

  // Attempt to match against VALIGND/VALIGNQ rotate.
  const bool ValignLegal =
      ((MaskVT.is128BitVector() || MaskVT.is256BitVector()) &&
       Subtarget.hasVLX()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
  if (AllowIntDomain && (EltBits == 64 || EltBits == 32) && ValignLegal &&
      !isAnyZero(Mask)) {
    const int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
    if (Rotation > 0) {
      const MVT RotEltVT = (EltBits == 64 ? MVT::i64 : MVT::i32);
      Shuffle = X86ISD::VALIGN;
      ShuffleVT = MVT::getVectorVT(RotEltVT, VecBits / EltBits);
      PermuteImm = Rotation;
      return true;
    }
  }

  // Attempt to match against PALIGNR byte rotate.
  const bool PalignrLegal =
      (MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasBWI());
  if (AllowIntDomain && PalignrLegal) {
    const int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (ByteRotation > 0) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, VecBits / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  const bool BlendLegal =
      (NumElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                        (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2());
  if (BlendLegal) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
                            ForceV2Zero, BlendMask)) {
      bool HaveImm = true;
      unsigned Imm = 0;
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        HaveImm =
            isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask);
        if (HaveImm) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          // Set an immediate bit for each repeated-mask element >= 8.
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              Imm |= 1 << i;
        }
      } else {
        Imm = (unsigned)BlendMask;
      }

      if (HaveImm) {
        ApplyForcedZeros(ForceV1Zero, ForceV2Zero);
        PermuteImm = Imm;
        Shuffle = X86ISD::BLENDI;
        ShuffleVT = MaskVT;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS, but only if it has elements that need to
  // be set to zero.
  if (InsertPSLegal && isAnyZero(Mask) && TryInsertPS())
    return true;

  // Attempt to combine to SHUFPD.
  const bool ShufPDLegal =
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
  if (AllowFloatDomain && EltBits == 64 && ShufPDLegal) {
    bool ForceV1Zero = false, ForceV2Zero = false;
    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
                               PermuteImm, Mask, Zeroable)) {
      ApplyForcedZeros(ForceV1Zero, ForceV2Zero);
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, VecBits / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  const bool ShufPSLegal =
      (MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
  if (AllowFloatDomain && EltBits == 32 && ShufPSLegal) {
    SmallVector<int, 4> LaneMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, LaneMask)) {
      // Match one half (two elements) of the repeated mask: it has to be
      // entirely undef, zeroable, or reference just one of the two vectors.
      // Returns the operand to use for that half (a null SDValue if none
      // fits) and stores the two element selectors in S0/S1.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) -> SDValue {
        const int M0 = LaneMask[Offset];
        const int M1 = LaneMask[Offset + 1];
        const bool Undef0 = (M0 == SM_SentinelUndef);
        const bool Undef1 = (M1 == SM_SentinelUndef);

        if (isUndefInRange(LaneMask, Offset, 2))
          return DAG.getUNDEF(MaskVT);

        if (isUndefOrZeroInRange(LaneMask, Offset, 2)) {
          S0 = Undef0 ? -1 : 0;
          S1 = Undef1 ? -1 : 1;
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        }

        const bool FromV1 =
            isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4);
        const bool FromV2 =
            isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8);
        if (!FromV1 && !FromV2)
          return SDValue();

        S0 = Undef0 ? -1 : (M0 & 3);
        S1 = Undef1 ? -1 : (M1 & 3);
        return FromV1 ? V1 : V2;
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue LoSrc = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue HiSrc = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (LoSrc && HiSrc) {
        V1 = LoSrc;
        V2 = HiSrc;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, VecBits / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
  if (InsertPSLegal && TryInsertPS())
    return true;

  return false;
}

34832

34833

// Forward declaration only (no body here); presumably defined further down
// the file - TODO confirm. The parameter list matches that of
// combineX86ShuffleChain.
static SDValue combineX86ShuffleChainWithExtract(

34834

ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,

34835

bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,

34836

const X86Subtarget &Subtarget);

34837

34838

/// Combine an arbitrary chain of shuffles into a single instruction if

34839

/// possible.

34840

///

34841

/// This is the leaf of the recursive combine below. When we have found some

34842

/// chain of single-use x86 shuffle instructions and accumulated the combined

34843

/// shuffle mask represented by them, this will try to pattern match that mask

34844

/// into either a single instruction if there is a special purpose instruction

34845

/// for this operation, or into a PSHUFB instruction which is a fully general

34846

/// instruction but should only be used to replace chains over a certain depth.

34847

static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

34848

ArrayRef<int> BaseMask, int Depth,

34849

bool HasVariableMask,

34850

bool AllowVariableMask, SelectionDAG &DAG,

34851

const X86Subtarget &Subtarget) {

34852

assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!")((!BaseMask.empty() && "Cannot combine an empty shuffle mask!"
) ? static_cast<void> (0) : __assert_fail ("!BaseMask.empty() && \"Cannot combine an empty shuffle mask!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34852, __PRETTY_FUNCTION__));

34853

assert((Inputs.size() == 1 || Inputs.size() == 2) &&(((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"
) ? static_cast<void> (0) : __assert_fail ("(Inputs.size() == 1 || Inputs.size() == 2) && \"Unexpected number of shuffle inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34854, __PRETTY_FUNCTION__))

34854

"Unexpected number of shuffle inputs!")(((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"
) ? static_cast<void> (0) : __assert_fail ("(Inputs.size() == 1 || Inputs.size() == 2) && \"Unexpected number of shuffle inputs!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34854, __PRETTY_FUNCTION__));

34855

34856

MVT RootVT = Root.getSimpleValueType();

34857

unsigned RootSizeInBits = RootVT.getSizeInBits();

34858

unsigned NumRootElts = RootVT.getVectorNumElements();

34859

34860

// Find the inputs that enter the chain. Note that multiple uses are OK

34861

// here, we're not going to remove the operands we find.

34862

bool UnaryShuffle = (Inputs.size() == 1);

34863

SDValue V1 = peekThroughBitcasts(Inputs[0]);

34864

SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())

34865

: peekThroughBitcasts(Inputs[1]));

34866

34867

MVT VT1 = V1.getSimpleValueType();

34868

MVT VT2 = V2.getSimpleValueType();

34869

assert(VT1.getSizeInBits() == RootSizeInBits &&((VT1.getSizeInBits() == RootSizeInBits && VT2.getSizeInBits
() == RootSizeInBits && "Vector size mismatch") ? static_cast
<void> (0) : __assert_fail ("VT1.getSizeInBits() == RootSizeInBits && VT2.getSizeInBits() == RootSizeInBits && \"Vector size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34870, __PRETTY_FUNCTION__))

34870

VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch")((VT1.getSizeInBits() == RootSizeInBits && VT2.getSizeInBits
() == RootSizeInBits && "Vector size mismatch") ? static_cast
<void> (0) : __assert_fail ("VT1.getSizeInBits() == RootSizeInBits && VT2.getSizeInBits() == RootSizeInBits && \"Vector size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34870, __PRETTY_FUNCTION__));

34871

34872

SDLoc DL(Root);

34873

SDValue Res;

34874

34875

unsigned NumBaseMaskElts = BaseMask.size();

34876

if (NumBaseMaskElts == 1) {

34877

assert(BaseMask[0] == 0 && "Invalid shuffle index found!")((BaseMask[0] == 0 && "Invalid shuffle index found!")
? static_cast<void> (0) : __assert_fail ("BaseMask[0] == 0 && \"Invalid shuffle index found!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34877, __PRETTY_FUNCTION__));

34878

return DAG.getBitcast(RootVT, V1);

34879

}

34880

34881

bool OptForSize = DAG.shouldOptForSize();

34882

unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;

34883

bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||

34884

(RootVT.isFloatingPoint() && Depth >= 1) ||

34885

(RootVT.is256BitVector() && !Subtarget.hasAVX2());

34886

34887

// Don't combine if we are a AVX512/EVEX target and the mask element size

34888

// is different from the root element size - this would prevent writemasks

34889

// from being reused.

34890

bool IsMaskedShuffle = false;

34891

if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {

34892

if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&

34893

Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {

34894

IsMaskedShuffle = true;

34895

}

34896

}

34897

34898

// If we are shuffling a broadcast (and not introducing zeros) then

34899

// we can just use the broadcast directly. This works for smaller broadcast

34900

// elements as well as they already repeat across each mask element

34901

if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&

34902

(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {

34903

return DAG.getBitcast(RootVT, V1);

34904

}

34905

34906

// Attempt to match a subvector broadcast.

34907

// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)

34908

if (UnaryShuffle &&

34909

(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {

34910

SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);

34911

if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {

34912

SDValue Src = Inputs[0];

34913

if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&

34914

Src.getOperand(0).isUndef() &&

34915

Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&

34916

MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {

34917

return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,

34918

Src.getValueType(),

34919

Src.getOperand(1)));

34920

}

34921

}

34922

}

34923

34924

// Handle 128/256-bit lane shuffles of 512-bit vectors.

34925

if (RootVT.is512BitVector() &&

34926

(NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {

34927

MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);

34928

34929

// If the upper subvectors are zeroable, then an extract+insert is more

34930

// optimal than using X86ISD::SHUF128. The insertion is free, even if it has

34931

// to zero the upper subvectors.

34932

if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {

34933

if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)

34934

return SDValue(); // Nothing to do!

34935

assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&((isInRange(BaseMask[0], 0, NumBaseMaskElts) && "Unexpected lane shuffle"
) ? static_cast<void> (0) : __assert_fail ("isInRange(BaseMask[0], 0, NumBaseMaskElts) && \"Unexpected lane shuffle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34936, __PRETTY_FUNCTION__))

34936

"Unexpected lane shuffle")((isInRange(BaseMask[0], 0, NumBaseMaskElts) && "Unexpected lane shuffle"
) ? static_cast<void> (0) : __assert_fail ("isInRange(BaseMask[0], 0, NumBaseMaskElts) && \"Unexpected lane shuffle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34936, __PRETTY_FUNCTION__));

34937

Res = DAG.getBitcast(ShuffleVT, V1);

34938

unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);

34939

bool UseZero = isAnyZero(BaseMask);

34940

Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);

34941

Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);

34942

return DAG.getBitcast(RootVT, Res);

34943

}

34944

34945

// Narrow shuffle mask to v4x128.

34946

SmallVector<int, 4> Mask;

34947

assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size")(((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"
) ? static_cast<void> (0) : __assert_fail ("(BaseMaskEltSizeInBits % 128) == 0 && \"Illegal mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34947, __PRETTY_FUNCTION__));

34948

narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);

34949

34950

// Try to lower to vshuf64x2/vshuf32x4.

34951

auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,

34952

SDValue V1, SDValue V2, SelectionDAG &DAG) {

34953

unsigned PermMask = 0;

34954

// Ensure elements came from the same Op.

34955

SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};

34956

for (int i = 0; i < 4; ++i) {

34957

assert(Mask[i] >= -1 && "Illegal shuffle sentinel value")((Mask[i] >= -1 && "Illegal shuffle sentinel value"
) ? static_cast<void> (0) : __assert_fail ("Mask[i] >= -1 && \"Illegal shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 34957, __PRETTY_FUNCTION__));

34958

if (Mask[i] < 0)

34959

continue;

34960

34961

SDValue Op = Mask[i] >= 4 ? V2 : V1;

34962

unsigned OpIndex = i / 2;

34963

if (Ops[OpIndex].isUndef())

34964

Ops[OpIndex] = Op;

34965

else if (Ops[OpIndex] != Op)

34966

return SDValue();

34967

34968

// Convert the 128-bit shuffle mask selection values into 128-bit

34969

// selection bits defined by a vshuf64x2 instruction's immediate control

34970

// byte.

34971

PermMask |= (Mask[i] % 4) << (i * 2);

34972

}

34973

34974

return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,

34975

DAG.getBitcast(ShuffleVT, Ops[0]),

34976

DAG.getBitcast(ShuffleVT, Ops[1]),

34977

DAG.getTargetConstant(PermMask, DL, MVT::i8));

34978

};

34979

34980

// FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask

34981

// doesn't work because our mask is for 128 bits and we don't have an MVT

34982

// to match that.

34983

bool PreferPERMQ =

34984

UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&

34985

isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&

34986

isUndefOrInRange(Mask[3], 2, 4) &&

34987

(Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&

34988

(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));

34989

34990

if (!isAnyZero(Mask) && !PreferPERMQ) {

34991

if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)

34992

return SDValue(); // Nothing to do!

34993

if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))

34994

return DAG.getBitcast(RootVT, V);

34995

}

34996

}

34997

34998

// Handle 128-bit lane shuffles of 256-bit vectors.

34999

if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {

35000

MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);

35001

35002

// If the upper half is zeroable, then an extract+insert is more optimal

35003

// than using X86ISD::VPERM2X128. The insertion is free, even if it has to

35004

// zero the upper half.

35005

if (isUndefOrZero(BaseMask[1])) {

35006

if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)

35007

return SDValue(); // Nothing to do!

35008

assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle")((isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle"
) ? static_cast<void> (0) : __assert_fail ("isInRange(BaseMask[0], 0, 2) && \"Unexpected lane shuffle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35008, __PRETTY_FUNCTION__));

35009

Res = DAG.getBitcast(ShuffleVT, V1);

35010

Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);

35011

Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,

35012

DL, 256);

35013

return DAG.getBitcast(RootVT, Res);

35014

}

35015

35016

if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)

35017

return SDValue(); // Nothing to do!

35018

35019

// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless

35020

// we need to use the zeroing feature.

35021

// Prefer blends for sequential shuffles unless we are optimizing for size.

35022

if (UnaryShuffle &&

35023

!(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&

35024

(OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {

35025

unsigned PermMask = 0;

35026

PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);

35027

PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

35028

35029

Res = DAG.getBitcast(ShuffleVT, V1);

35030

Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,

35031

DAG.getUNDEF(ShuffleVT),

35032

DAG.getTargetConstant(PermMask, DL, MVT::i8));

35033

return DAG.getBitcast(RootVT, Res);

35034

}

35035

35036

if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)

35037

return SDValue(); // Nothing to do!

35038

35039

// TODO - handle AVX512VL cases with X86ISD::SHUF128.

35040

if (!UnaryShuffle && !IsMaskedShuffle) {

35041

assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&((llvm::all_of(BaseMask, [](int M) { return 0 <= M &&
M < 4; }) && "Unexpected shuffle sentinel value")
? static_cast<void> (0) : __assert_fail ("llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) && \"Unexpected shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35042, __PRETTY_FUNCTION__))

35042

"Unexpected shuffle sentinel value")((llvm::all_of(BaseMask, [](int M) { return 0 <= M &&
M < 4; }) && "Unexpected shuffle sentinel value")
? static_cast<void> (0) : __assert_fail ("llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) && \"Unexpected shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35042, __PRETTY_FUNCTION__));

35043

// Prefer blends to X86ISD::VPERM2X128.

35044

if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||

35045

(BaseMask[0] == 2 && BaseMask[1] == 1))) {

35046

unsigned PermMask = 0;

35047

PermMask |= ((BaseMask[0] & 3) << 0);

35048

PermMask |= ((BaseMask[1] & 3) << 4);

35049

35050

Res = DAG.getNode(

35051

X86ISD::VPERM2X128, DL, ShuffleVT,

35052

DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),

35053

DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),

35054

DAG.getTargetConstant(PermMask, DL, MVT::i8));

35055

return DAG.getBitcast(RootVT, Res);

35056

}

35057

}

35058

}

35059

35060

// For masks that have been widened to 128-bit elements or more,

35061

// narrow back down to 64-bit elements.

35062

SmallVector<int, 64> Mask;

35063

if (BaseMaskEltSizeInBits > 64) {

35064

assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size")(((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"
) ? static_cast<void> (0) : __assert_fail ("(BaseMaskEltSizeInBits % 64) == 0 && \"Illegal mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35064, __PRETTY_FUNCTION__));

35065

int MaskScale = BaseMaskEltSizeInBits / 64;

35066

narrowShuffleMaskElts(MaskScale, BaseMask, Mask);

35067

} else {

35068

Mask.assign(BaseMask.begin(), BaseMask.end());

35069

}

35070

35071

// For masked shuffles, we're trying to match the root width for better

35072

// writemask folding, attempt to scale the mask.

35073

// TODO - variable shuffles might need this to be widened again.

35074

if (IsMaskedShuffle && NumRootElts > Mask.size()) {

35075

assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size")(((NumRootElts % Mask.size()) == 0 && "Illegal mask size"
) ? static_cast<void> (0) : __assert_fail ("(NumRootElts % Mask.size()) == 0 && \"Illegal mask size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35075, __PRETTY_FUNCTION__));

35076

int MaskScale = NumRootElts / Mask.size();

35077

SmallVector<int, 64> ScaledMask;

35078

narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);

35079

Mask = std::move(ScaledMask);

35080

}

35081

35082

unsigned NumMaskElts = Mask.size();

35083

unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

35084

35085

// Determine the effective mask value type.

35086

FloatDomain &= (32 <= MaskEltSizeInBits);

35087

MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)

35088

: MVT::getIntegerVT(MaskEltSizeInBits);

35089

MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

35090

35091

// Only allow legal mask types.

35092

if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))

35093

return SDValue();

35094

35095

// Attempt to match the mask against known shuffle patterns.

35096

MVT ShuffleSrcVT, ShuffleVT;

35097

unsigned Shuffle, PermuteImm;

35098

35099

// Which shuffle domains are permitted?

35100

// Permit domain crossing at higher combine depths.

35101

// TODO: Should we indicate which domain is preferred if both are allowed?

35102

bool AllowFloatDomain = FloatDomain || (Depth >= 3);

35103

bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&

35104

(!MaskVT.is256BitVector() || Subtarget.hasAVX2());

35105

35106

// Determine zeroable mask elements.

35107

APInt KnownUndef, KnownZero;

35108

resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);

35109

APInt Zeroable = KnownUndef | KnownZero;

35110

35111

if (UnaryShuffle) {

35112

// Attempt to match against broadcast-from-vector.

35113

// Limit AVX1 to cases where we're loading+broadcasting a scalar element.

35114

if ((Subtarget.hasAVX2() ||

35115

(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&

35116

(!IsMaskedShuffle || NumRootElts == NumMaskElts)) {

35117

SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);

35118

if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {

35119

if (V1.getValueType() == MaskVT &&

35120

V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&

35121

MayFoldLoad(V1.getOperand(0))) {

35122

if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)

35123

return SDValue(); // Nothing to do!

35124

Res = V1.getOperand(0);

35125

Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);

35126

return DAG.getBitcast(RootVT, Res);

35127

}

35128

if (Subtarget.hasAVX2()) {

35129

if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)

35130

return SDValue(); // Nothing to do!

35131

Res = DAG.getBitcast(MaskVT, V1);

35132

Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);

35133

return DAG.getBitcast(RootVT, Res);

35134

}

35135

}

35136

}

35137

35138

SDValue NewV1 = V1; // Save operand in case early exit happens.

35139

if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,

35140

DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,

35141

ShuffleVT) &&

35142

(!IsMaskedShuffle ||

35143

(NumRootElts == ShuffleVT.getVectorNumElements()))) {

35144

if (Depth == 0 && Root.getOpcode() == Shuffle)

35145

return SDValue(); // Nothing to do!

35146

Res = DAG.getBitcast(ShuffleSrcVT, NewV1);

35147

Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);

35148

return DAG.getBitcast(RootVT, Res);

35149

}

35150

35151

if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,

35152

AllowIntDomain, Subtarget, Shuffle, ShuffleVT,

35153

PermuteImm) &&

35154

(!IsMaskedShuffle ||

35155

(NumRootElts == ShuffleVT.getVectorNumElements()))) {

35156

if (Depth == 0 && Root.getOpcode() == Shuffle)

35157

return SDValue(); // Nothing to do!

35158

Res = DAG.getBitcast(ShuffleVT, V1);

35159

Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,

35160

DAG.getTargetConstant(PermuteImm, DL, MVT::i8));

35161

return DAG.getBitcast(RootVT, Res);

35162

}

35163

}

35164

35165

// Attempt to combine to INSERTPS, but only if the inserted element has come

35166

// from a scalar.

35167

// TODO: Handle other insertions here as well?

35168

if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&

35169

Subtarget.hasSSE41() && !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {

35170

if (MaskEltSizeInBits == 32) {

35171

SDValue SrcV1 = V1, SrcV2 = V2;

35172

if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,

35173

DAG) &&

35174

SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {

35175

if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)

35176

return SDValue(); // Nothing to do!

35177

Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,

35178

DAG.getBitcast(MVT::v4f32, SrcV1),

35179

DAG.getBitcast(MVT::v4f32, SrcV2),

35180

DAG.getTargetConstant(PermuteImm, DL, MVT::i8));

35181

return DAG.getBitcast(RootVT, Res);

35182

}

35183

}

35184

if (MaskEltSizeInBits == 64 && isTargetShuffleEquivalent(Mask, {0, 2}) &&

35185

V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&

35186

V2.getScalarValueSizeInBits() <= 32) {

35187

if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)

35188

return SDValue(); // Nothing to do!

35189

PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);

35190

Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,

35191

DAG.getBitcast(MVT::v4f32, V1),

35192

DAG.getBitcast(MVT::v4f32, V2),

35193

DAG.getTargetConstant(PermuteImm, DL, MVT::i8));

35194

return DAG.getBitcast(RootVT, Res);

35195

}

35196

}

35197

35198

SDValue NewV1 = V1; // Save operands in case early exit happens.

35199

SDValue NewV2 = V2;

35200

if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,

35201

NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,

35202

ShuffleVT, UnaryShuffle) &&

35203

(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {

35204

if (Depth == 0 && Root.getOpcode() == Shuffle)

35205

return SDValue(); // Nothing to do!

35206

NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);

35207

NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);

35208

Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);

35209

return DAG.getBitcast(RootVT, Res);

35210

}

35211

35212

NewV1 = V1; // Save operands in case early exit happens.

35213

NewV2 = V2;

35214

if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,

35215

AllowIntDomain, NewV1, NewV2, DL, DAG,

35216

Subtarget, Shuffle, ShuffleVT, PermuteImm) &&

35217

(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {

35218

if (Depth == 0 && Root.getOpcode() == Shuffle)

35219

return SDValue(); // Nothing to do!

35220

NewV1 = DAG.getBitcast(ShuffleVT, NewV1);

35221

NewV2 = DAG.getBitcast(ShuffleVT, NewV2);

35222

Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,

35223

DAG.getTargetConstant(PermuteImm, DL, MVT::i8));

35224

return DAG.getBitcast(RootVT, Res);

35225

}

35226

35227

// Typically from here on, we need an integer version of MaskVT.

35228

MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);

35229

IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

35230

35231

// Annoyingly, SSE4A instructions don't map into the above match helpers.

35232

if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {

35233

uint64_t BitLen, BitIdx;

35234

if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,

35235

Zeroable)) {

35236

if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)

35237

return SDValue(); // Nothing to do!

35238

V1 = DAG.getBitcast(IntMaskVT, V1);

35239

Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,

35240

DAG.getTargetConstant(BitLen, DL, MVT::i8),

35241

DAG.getTargetConstant(BitIdx, DL, MVT::i8));

35242

return DAG.getBitcast(RootVT, Res);

35243

}

35244

35245

if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {

35246

if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)

35247

return SDValue(); // Nothing to do!

35248

V1 = DAG.getBitcast(IntMaskVT, V1);

35249

V2 = DAG.getBitcast(IntMaskVT, V2);

35250

Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,

35251

DAG.getTargetConstant(BitLen, DL, MVT::i8),

35252

DAG.getTargetConstant(BitIdx, DL, MVT::i8));

35253

return DAG.getBitcast(RootVT, Res);

35254

}

35255

}

35256

35257

// Match shuffle against TRUNCATE patterns.

35258

if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {

35259

// Match against a VTRUNC instruction, accounting for src/dst sizes.

35260

if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,

35261

Subtarget)) {

35262

bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==

35263

ShuffleSrcVT.getVectorNumElements();

35264

unsigned Opc =

35265

IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;

35266

if (Depth == 0 && Root.getOpcode() == Opc)

35267

return SDValue(); // Nothing to do!

35268

V1 = DAG.getBitcast(ShuffleSrcVT, V1);

35269

Res = DAG.getNode(Opc, DL, ShuffleVT, V1);

35270

if (ShuffleVT.getSizeInBits() < RootSizeInBits)

35271

Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);

35272

return DAG.getBitcast(RootVT, Res);

35273

}

35274

35275

// Do we need a more general binary truncation pattern?

35276

if (RootSizeInBits < 512 &&

35277

((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||

35278

(RootVT.is128BitVector() && Subtarget.hasVLX())) &&

35279

(MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&

35280

isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {

35281

if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)

35282

return SDValue(); // Nothing to do!

35283

ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);

35284

ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);

35285

V1 = DAG.getBitcast(ShuffleSrcVT, V1);

35286

V2 = DAG.getBitcast(ShuffleSrcVT, V2);

35287

ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);

35288

ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);

35289

Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);

35290

Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);

35291

return DAG.getBitcast(RootVT, Res);

35292

}

35293

}

35294

35295

// Don't try to re-form single instruction chains under any circumstances now

35296

// that we've done encoding canonicalization for them.

35297

if (Depth < 1)

35298

return SDValue();

35299

35300

// Depth threshold above which we can efficiently use variable mask shuffles.

35301

int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;

35302

AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;

35303

35304

bool MaskContainsZeros = isAnyZero(Mask);

35305

35306

if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {

35307

// If we have a single input lane-crossing shuffle then lower to VPERMV.

35308

if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {

35309

if (Subtarget.hasAVX2() &&

35310

(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {

35311

SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);

35312

Res = DAG.getBitcast(MaskVT, V1);

35313

Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);

35314

return DAG.getBitcast(RootVT, Res);

35315

}

35316

// AVX512 variants (non-VLX will pad to 512-bit shuffles).

35317

if ((Subtarget.hasAVX512() &&

35318

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

35319

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||

35320

(Subtarget.hasBWI() &&

35321

(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||

35322

(Subtarget.hasVBMI() &&

35323

(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {

35324

V1 = DAG.getBitcast(MaskVT, V1);

35325

V2 = DAG.getUNDEF(MaskVT);

35326

Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);

35327

return DAG.getBitcast(RootVT, Res);

35328

}

35329

}

35330

35331

// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero

35332

// vector as the second source (non-VLX will pad to 512-bit shuffles).

35333

if (UnaryShuffle && AllowVariableMask &&

35334

((Subtarget.hasAVX512() &&

35335

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

35336

MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||

35337

MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||

35338

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||

35339

(Subtarget.hasBWI() &&

35340

(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||

35341

(Subtarget.hasVBMI() &&

35342

(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {

35343

// Adjust shuffle mask - replace SM_SentinelZero with second source index.

35344

for (unsigned i = 0; i != NumMaskElts; ++i)

35345

if (Mask[i] == SM_SentinelZero)

35346

Mask[i] = NumMaskElts + i;

35347

V1 = DAG.getBitcast(MaskVT, V1);

35348

V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);

35349

Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);

35350

return DAG.getBitcast(RootVT, Res);

35351

}

35352

35353

// If that failed and either input is extracted then try to combine as a

35354

// shuffle with the larger type.

35355

if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(

35356

Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,

35357

DAG, Subtarget))

35358

return WideShuffle;

35359

35360

// If we have a dual input lane-crossing shuffle then lower to VPERMV3,

35361

// (non-VLX will pad to 512-bit shuffles).

35362

if (AllowVariableMask && !MaskContainsZeros &&

35363

((Subtarget.hasAVX512() &&

35364

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

35365

MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||

35366

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||

35367

MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||

35368

(Subtarget.hasBWI() &&

35369

(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||

35370

(Subtarget.hasVBMI() &&

35371

(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {

35372

V1 = DAG.getBitcast(MaskVT, V1);

35373

V2 = DAG.getBitcast(MaskVT, V2);

35374

Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);

35375

return DAG.getBitcast(RootVT, Res);

35376

}

35377

return SDValue();

35378

}

35379

35380

// See if we can combine a single input shuffle with zeros to a bit-mask,

35381

// which is much simpler than any shuffle.

35382

if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&

35383

isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&

35384

DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {

35385

APInt Zero = APInt::getNullValue(MaskEltSizeInBits);

35386

APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);

35387

APInt UndefElts(NumMaskElts, 0);

35388

SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);

35389

for (unsigned i = 0; i != NumMaskElts; ++i) {

35390

int M = Mask[i];

35391

if (M == SM_SentinelUndef) {

35392

UndefElts.setBit(i);

35393

continue;

35394

}

35395

if (M == SM_SentinelZero)

35396

continue;

35397

EltBits[i] = AllOnes;

35398

}

35399

SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);

35400

Res = DAG.getBitcast(MaskVT, V1);

35401

unsigned AndOpcode =

35402

MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);

35403

Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);

35404

return DAG.getBitcast(RootVT, Res);

35405

}

35406

35407

// If we have a single input shuffle with different shuffle patterns in the

35408

// 128-bit lanes use the variable mask to VPERMILPS.

35409

// TODO Combine other mask types at higher depths.

35410

if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&

35411

((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||

35412

(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {

35413

SmallVector<SDValue, 16> VPermIdx;

35414

for (int M : Mask) {

35415

SDValue Idx =

35416

M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);

35417

VPermIdx.push_back(Idx);

35418

}

35419

SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);

35420

Res = DAG.getBitcast(MaskVT, V1);

35421

Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);

35422

return DAG.getBitcast(RootVT, Res);

35423

}

35424

35425

// With XOP, binary shuffles of 128/256-bit floating point vectors can combine

35426

// to VPERMIL2PD/VPERMIL2PS.

35427

if (AllowVariableMask && Subtarget.hasXOP() &&

35428

(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||

35429

MaskVT == MVT::v8f32)) {

35430

// VPERMIL2 Operation.

35431

// Bits[3] - Match Bit.

35432

// Bits[2:1] - (Per Lane) PD Shuffle Mask.

35433

// Bits[2:0] - (Per Lane) PS Shuffle Mask.

35434

unsigned NumLanes = MaskVT.getSizeInBits() / 128;

35435

unsigned NumEltsPerLane = NumMaskElts / NumLanes;

35436

SmallVector<int, 8> VPerm2Idx;

35437

unsigned M2ZImm = 0;

35438

for (int M : Mask) {

35439

if (M == SM_SentinelUndef) {

35440

VPerm2Idx.push_back(-1);

35441

continue;

35442

}

35443

if (M == SM_SentinelZero) {

35444

M2ZImm = 2;

35445

VPerm2Idx.push_back(8);

35446

continue;

35447

}

35448

int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);

35449

Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);

35450

VPerm2Idx.push_back(Index);

35451

}

35452

V1 = DAG.getBitcast(MaskVT, V1);

35453

V2 = DAG.getBitcast(MaskVT, V2);

35454

SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);

35455

Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,

35456

DAG.getTargetConstant(M2ZImm, DL, MVT::i8));

35457

return DAG.getBitcast(RootVT, Res);

35458

}

35459

35460

// If we have 3 or more shuffle instructions or a chain involving a variable

35461

// mask, we can replace them with a single PSHUFB instruction profitably.

35462

// Intel's manuals suggest only using PSHUFB if doing so replaces 5

35463

// instructions, but in practice PSHUFB tends to be *very* fast so we're

35464

// more aggressive.

35465

if (UnaryShuffle && AllowVariableMask &&

35466

((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||

35467

(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||

35468

(RootVT.is512BitVector() && Subtarget.hasBWI()))) {

35469

SmallVector<SDValue, 16> PSHUFBMask;

35470

int NumBytes = RootVT.getSizeInBits() / 8;

35471

int Ratio = NumBytes / NumMaskElts;

35472

for (int i = 0; i < NumBytes; ++i) {

35473

int M = Mask[i / Ratio];

35474

if (M == SM_SentinelUndef) {

35475

PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));

35476

continue;

35477

}

35478

if (M == SM_SentinelZero) {

35479

PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));

35480

continue;

35481

}

35482

M = Ratio * M + i % Ratio;

35483

assert((M / 16) == (i / 16) && "Lane crossing detected")(((M / 16) == (i / 16) && "Lane crossing detected") ?
static_cast<void> (0) : __assert_fail ("(M / 16) == (i / 16) && \"Lane crossing detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35483, __PRETTY_FUNCTION__));

35484

PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));

35485

}

35486

MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);

35487

Res = DAG.getBitcast(ByteVT, V1);

35488

SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);

35489

Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);

35490

return DAG.getBitcast(RootVT, Res);

35491

}

35492

35493

// With XOP, if we have a 128-bit binary input shuffle we can always combine

35494

// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never

35495

// slower than PSHUFB on targets that support both.

35496

if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {

35497

// VPPERM Mask Operation

35498

// Bits[4:0] - Byte Index (0 - 31)

35499

// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)

35500

SmallVector<SDValue, 16> VPPERMMask;

35501

int NumBytes = 16;

35502

int Ratio = NumBytes / NumMaskElts;

35503

for (int i = 0; i < NumBytes; ++i) {

35504

int M = Mask[i / Ratio];

35505

if (M == SM_SentinelUndef) {

35506

VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));

35507

continue;

35508

}

35509

if (M == SM_SentinelZero) {

35510

VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));

35511

continue;

35512

}

35513

M = Ratio * M + i % Ratio;

35514

VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));

35515

}

35516

MVT ByteVT = MVT::v16i8;

35517

V1 = DAG.getBitcast(ByteVT, V1);

35518

V2 = DAG.getBitcast(ByteVT, V2);

35519

SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);

35520

Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);

35521

return DAG.getBitcast(RootVT, Res);

35522

}

35523

35524

// If that failed and either input is extracted then try to combine as a

35525

// shuffle with the larger type.

35526

if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(

35527

Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,

35528

DAG, Subtarget))

35529

return WideShuffle;

35530

35531

// If we have a dual input shuffle then lower to VPERMV3,

35532

// (non-VLX will pad to 512-bit shuffles)

35533

if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&

35534

((Subtarget.hasAVX512() &&

35535

(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||

35536

MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||

35537

MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||

35538

MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||

35539

MaskVT == MVT::v16i32)) ||

35540

(Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||

35541

MaskVT == MVT::v32i16)) ||

35542

(Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||

35543

MaskVT == MVT::v64i8)))) {

35544

V1 = DAG.getBitcast(MaskVT, V1);

35545

V2 = DAG.getBitcast(MaskVT, V2);

35546

Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);

35547

return DAG.getBitcast(RootVT, Res);

35548

}

35549

35550

// Failed to find any combines.

35551

return SDValue();

35552

}

35553

35554

// Combine an arbitrary chain of shuffles + extract_subvectors into a single

35555

// instruction if possible.

35556

//

35557

// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger

35558

// type size to attempt to combine:

35559

// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)

35560

// -->

35561

// extract_subvector(shuffle(x,y,m2),0)

35562

static SDValue combineX86ShuffleChainWithExtract(

35563

ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,

35564

bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,

35565

const X86Subtarget &Subtarget) {

35566

unsigned NumMaskElts = BaseMask.size();

35567

unsigned NumInputs = Inputs.size();

35568

if (NumInputs == 0)

35569

return SDValue();

35570

35571

SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());

35572

SmallVector<unsigned, 4> Offsets(NumInputs, 0);

35573

35574

// Peek through subvectors.

35575

// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?

35576

unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();

35577

for (unsigned i = 0; i != NumInputs; ++i) {

35578

SDValue &Src = WideInputs[i];

35579

unsigned &Offset = Offsets[i];

35580

Src = peekThroughBitcasts(Src);

35581

EVT BaseVT = Src.getValueType();

35582

while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {

35583

Offset += Src.getConstantOperandVal(1);

35584

Src = Src.getOperand(0);

35585

}

35586

WideSizeInBits = std::max(WideSizeInBits,

35587

(unsigned)Src.getValueSizeInBits());

35588

assert((Offset % BaseVT.getVectorNumElements()) == 0 &&(((Offset % BaseVT.getVectorNumElements()) == 0 && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("(Offset % BaseVT.getVectorNumElements()) == 0 && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35589, __PRETTY_FUNCTION__))

35589

"Unexpected subvector extraction")(((Offset % BaseVT.getVectorNumElements()) == 0 && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("(Offset % BaseVT.getVectorNumElements()) == 0 && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35589, __PRETTY_FUNCTION__));

35590

Offset /= BaseVT.getVectorNumElements();

35591

Offset *= NumMaskElts;

35592

}

35593

35594

// Bail if we're always extracting from the lowest subvectors,

35595

// combineX86ShuffleChain should match this for the current width.

35596

if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))

35597

return SDValue();

35598

35599

EVT RootVT = Root.getValueType();

35600

unsigned RootSizeInBits = RootVT.getSizeInBits();

35601

unsigned Scale = WideSizeInBits / RootSizeInBits;

35602

assert((WideSizeInBits % RootSizeInBits) == 0 &&(((WideSizeInBits % RootSizeInBits) == 0 && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("(WideSizeInBits % RootSizeInBits) == 0 && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35603, __PRETTY_FUNCTION__))

35603

"Unexpected subvector extraction")(((WideSizeInBits % RootSizeInBits) == 0 && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("(WideSizeInBits % RootSizeInBits) == 0 && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35603, __PRETTY_FUNCTION__));

35604

35605

// If the src vector types aren't the same, see if we can extend

35606

// them to match each other.

35607

// TODO: Support different scalar types?

35608

EVT WideSVT = WideInputs[0].getValueType().getScalarType();

35609

if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {

35610

return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||

35611

Op.getValueType().getScalarType() != WideSVT;

35612

}))

35613

return SDValue();

35614

35615

for (SDValue &NewInput : WideInputs) {

35616

assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&(((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch") ? static_cast<void> (0
) : __assert_fail ("(WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && \"Shuffle vector size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35617, __PRETTY_FUNCTION__))

35617

"Shuffle vector size mismatch")(((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch") ? static_cast<void> (0
) : __assert_fail ("(WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && \"Shuffle vector size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35617, __PRETTY_FUNCTION__));

35618

if (WideSizeInBits > NewInput.getValueSizeInBits())

35619

NewInput = widenSubVector(NewInput, false, Subtarget, DAG,

35620

SDLoc(NewInput), WideSizeInBits);

35621

assert(WideSizeInBits == NewInput.getValueSizeInBits() &&((WideSizeInBits == NewInput.getValueSizeInBits() && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("WideSizeInBits == NewInput.getValueSizeInBits() && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35622, __PRETTY_FUNCTION__))

35622

"Unexpected subvector extraction")((WideSizeInBits == NewInput.getValueSizeInBits() && "Unexpected subvector extraction"
) ? static_cast<void> (0) : __assert_fail ("WideSizeInBits == NewInput.getValueSizeInBits() && \"Unexpected subvector extraction\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35622, __PRETTY_FUNCTION__));

35623

}

35624

35625

// Create new mask for larger type.

35626

for (unsigned i = 1; i != NumInputs; ++i)

35627

Offsets[i] += i * Scale * NumMaskElts;

35628

35629

SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());

35630

for (int &M : WideMask) {

35631

if (M < 0)

35632

continue;

35633

M = (M % NumMaskElts) + Offsets[M / NumMaskElts];

35634

}

35635

WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);

35636

35637

// Remove unused/repeated shuffle source ops.

35638

resolveTargetShuffleInputsAndMask(WideInputs, WideMask);

35639

assert(!WideInputs.empty() && "Shuffle with no inputs detected")((!WideInputs.empty() && "Shuffle with no inputs detected"
) ? static_cast<void> (0) : __assert_fail ("!WideInputs.empty() && \"Shuffle with no inputs detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35639, __PRETTY_FUNCTION__));

35640

35641

if (WideInputs.size() > 2)

35642

return SDValue();

35643

35644

// Increase depth for every upper subvector we've peeked through.

35645

Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });

35646

35647

// Attempt to combine wider chain.

35648

// TODO: Can we use a better Root?

35649

SDValue WideRoot = WideInputs[0];

35650

if (SDValue WideShuffle = combineX86ShuffleChain(

35651

WideInputs, WideRoot, WideMask, Depth, HasVariableMask,

35652

AllowVariableMask, DAG, Subtarget)) {

35653

WideShuffle =

35654

extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);

35655

return DAG.getBitcast(RootVT, WideShuffle);

35656

}

35657

return SDValue();

35658

}

35659

35660

// Canonicalize the combined shuffle mask chain with horizontal ops.

35661

// NOTE: This may update the Ops and Mask.

35662

static SDValue canonicalizeShuffleMaskWithHorizOp(

35663

MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,

35664

unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,

35665

const X86Subtarget &Subtarget) {

35666

35667

// Combine binary shuffle of 2 similar 'Horizontal' instructions into a

35668

// single instruction. Attempt to match a v2X64 repeating shuffle pattern that

35669

// represents the LHS/RHS inputs for the lower/upper halves.

35670

if (Mask.empty() || Ops.empty() || 2 < Ops.size())

35671

return SDValue();

35672

35673

SDValue BC0 = peekThroughBitcasts(Ops.front());

35674

SDValue BC1 = peekThroughBitcasts(Ops.back());

35675

EVT VT0 = BC0.getValueType();

35676

EVT VT1 = BC1.getValueType();

35677

unsigned Opcode0 = BC0.getOpcode();

35678

unsigned Opcode1 = BC1.getOpcode();

35679

if (Opcode0 != Opcode1 || VT0 != VT1 || VT0.getSizeInBits() != RootSizeInBits)

35680

return SDValue();

35681

35682

bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||

35683

Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);

35684

bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);

35685

if (!isHoriz && !isPack)

35686

return SDValue();

35687

35688

if (Mask.size() == VT0.getVectorNumElements()) {

35689

int NumElts = VT0.getVectorNumElements();

35690

int NumLanes = VT0.getSizeInBits() / 128;

35691

int NumEltsPerLane = NumElts / NumLanes;

35692

int NumHalfEltsPerLane = NumEltsPerLane / 2;

35693

35694

// Canonicalize binary shuffles of horizontal ops that use the

35695

// same sources to an unary shuffle.

35696

// TODO: Try to perform this fold even if the shuffle remains.

35697

if (Ops.size() == 2) {

35698

auto ContainsOps = [](SDValue HOp, SDValue Op) {

35699

return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);

35700

};

35701

// Commute if all BC0's ops are contained in BC1.

35702

if (ContainsOps(BC1, BC0.getOperand(0)) &&

35703

ContainsOps(BC1, BC0.getOperand(1))) {

35704

ShuffleVectorSDNode::commuteMask(Mask);

35705

std::swap(Ops[0], Ops[1]);

35706

std::swap(BC0, BC1);

35707

}

35708

35709

// If BC1 can be represented by BC0, then convert to unary shuffle.

35710

if (ContainsOps(BC0, BC1.getOperand(0)) &&

35711

ContainsOps(BC0, BC1.getOperand(1))) {

35712

for (int &M : Mask) {

35713

if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.

35714

continue;

35715

int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;

35716

M -= NumElts + (SubLane * NumHalfEltsPerLane);

35717

if (BC1.getOperand(SubLane) != BC0.getOperand(0))

35718

M += NumHalfEltsPerLane;

35719

}

35720

}

35721

}

35722

35723

// Canonicalize unary horizontal ops to only refer to lower halves.

35724

for (int i = 0; i != NumElts; ++i) {

35725

int &M = Mask[i];

35726

if (isUndefOrZero(M))

35727

continue;

35728

if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&

35729

(M % NumEltsPerLane) >= NumHalfEltsPerLane)

35730

M -= NumHalfEltsPerLane;

35731

if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&

35732

(M % NumEltsPerLane) >= NumHalfEltsPerLane)

35733

M -= NumHalfEltsPerLane;

35734

}

35735

}

35736

35737

unsigned EltSizeInBits = RootSizeInBits / Mask.size();

35738

SmallVector<int, 16> TargetMask128, WideMask128;

35739

if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&

35740

scaleShuffleElements(TargetMask128, 2, WideMask128)) {

35741

assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle")((isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle"
) ? static_cast<void> (0) : __assert_fail ("isUndefOrZeroOrInRange(WideMask128, 0, 4) && \"Illegal shuffle\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35741, __PRETTY_FUNCTION__));

35742

bool SingleOp = (Ops.size() == 1);

35743

if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {

35744

SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;

35745

SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;

35746

Lo = Lo.getOperand(WideMask128[0] & 1);

35747

Hi = Hi.getOperand(WideMask128[1] & 1);

35748

if (SingleOp) {

35749

MVT SrcVT = BC0.getOperand(0).getSimpleValueType();

35750

SDValue Undef = DAG.getUNDEF(SrcVT);

35751

SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);

35752

Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);

35753

Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);

35754

Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);

35755

Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);

35756

}

35757

return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);

35758

}

35759

}

35760

35761

return SDValue();

35762

}

35763

35764

// Attempt to constant fold all of the constant source ops.

35765

// Returns true if the entire shuffle is folded to a constant.

35766

// TODO: Extend this to merge multiple constant Ops and update the mask.

35767

static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,

35768

ArrayRef<int> Mask, SDValue Root,

35769

bool HasVariableMask,

35770

SelectionDAG &DAG,

35771

const X86Subtarget &Subtarget) {

35772

MVT VT = Root.getSimpleValueType();

35773

35774

unsigned SizeInBits = VT.getSizeInBits();

35775

unsigned NumMaskElts = Mask.size();

35776

unsigned MaskSizeInBits = SizeInBits / NumMaskElts;

35777

unsigned NumOps = Ops.size();

35778

35779

// Extract constant bits from each source op.

35780

bool OneUseConstantOp = false;

35781

SmallVector<APInt, 16> UndefEltsOps(NumOps);

35782

SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);

35783

for (unsigned i = 0; i != NumOps; ++i) {

35784

SDValue SrcOp = Ops[i];

35785

OneUseConstantOp |= SrcOp.hasOneUse();

35786

if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],

35787

RawBitsOps[i]))

35788

return SDValue();

35789

}

35790

35791

// Only fold if at least one of the constants is only used once or

35792

// the combined shuffle has included a variable mask shuffle, this

35793

// is to avoid constant pool bloat.

35794

if (!OneUseConstantOp && !HasVariableMask)

35795

return SDValue();

35796

35797

// Shuffle the constant bits according to the mask.

35798

SDLoc DL(Root);

35799

APInt UndefElts(NumMaskElts, 0);

35800

APInt ZeroElts(NumMaskElts, 0);

35801

APInt ConstantElts(NumMaskElts, 0);

35802

SmallVector<APInt, 8> ConstantBitData(NumMaskElts,

35803

APInt::getNullValue(MaskSizeInBits));

35804

for (unsigned i = 0; i != NumMaskElts; ++i) {

35805

int M = Mask[i];

35806

if (M == SM_SentinelUndef) {

35807

UndefElts.setBit(i);

35808

continue;

35809

} else if (M == SM_SentinelZero) {

35810

ZeroElts.setBit(i);

35811

continue;

35812

}

35813

assert(0 <= M && M < (int)(NumMaskElts * NumOps))((0 <= M && M < (int)(NumMaskElts * NumOps)) ? static_cast
<void> (0) : __assert_fail ("0 <= M && M < (int)(NumMaskElts * NumOps)"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35813, __PRETTY_FUNCTION__));

35814

35815

unsigned SrcOpIdx = (unsigned)M / NumMaskElts;

35816

unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

35817

35818

auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];

35819

if (SrcUndefElts[SrcMaskIdx]) {

35820

UndefElts.setBit(i);

35821

continue;

35822

}

35823

35824

auto &SrcEltBits = RawBitsOps[SrcOpIdx];

35825

APInt &Bits = SrcEltBits[SrcMaskIdx];

35826

if (!Bits) {

35827

ZeroElts.setBit(i);

35828

continue;

35829

}

35830

35831

ConstantElts.setBit(i);

35832

ConstantBitData[i] = Bits;

35833

}

35834

assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue())(((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()) ? static_cast
<void> (0) : __assert_fail ("(UndefElts | ZeroElts | ConstantElts).isAllOnesValue()"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35834, __PRETTY_FUNCTION__));

35835

35836

// Attempt to create a zero vector.

35837

if ((UndefElts | ZeroElts).isAllOnesValue())

35838

return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);

35839

35840

// Create the constant data.

35841

MVT MaskSVT;

35842

if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))

35843

MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);

35844

else

35845

MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

35846

35847

MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

35848

if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))

35849

return SDValue();

35850

35851

SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);

35852

return DAG.getBitcast(VT, CstOp);

35853

}

35854

35855

namespace llvm {
namespace X86 {

// Depth limit constant for the X86 shuffle combines.
// NOTE(review): presumably supplied as the MaxDepth argument of the
// recursive shuffle combine - confirm at the call sites.
enum { MaxShuffleCombineDepth = 8 };

} // namespace X86
} // namespace llvm

35862

35863

/// Fully generic combining of x86 shuffle instructions.

35864

///

35865

/// This should be the last combine run over the x86 shuffle instructions. Once

35866

/// they have been fully optimized, this will recursively consider all chains

35867

/// of single-use shuffle instructions, build a generic model of the cumulative

35868

/// shuffle operation, and check for simpler instructions which implement this

35869

/// operation. We use this primarily for two purposes:

35870

///

35871

/// 1) Collapse generic shuffles to specialized single instructions when

35872

/// equivalent. In most cases, this is just an encoding size win, but

35873

/// sometimes we will collapse multiple generic shuffles into a single

35874

/// special-purpose shuffle.

35875

/// 2) Look for sequences of shuffle instructions with 3 or more total

35876

/// instructions, and replace them with the slightly more expensive SSSE3

35877

/// PSHUFB instruction if available. We do this as the last combining step

35878

/// to ensure we avoid using PSHUFB if we can implement the shuffle with

35879

/// a suitable short sequence of other instructions. The PSHUFB will either

35880

/// use a register or have to read from memory and so is slightly (but only

35881

/// slightly) more expensive than the other shuffle instructions.

35882

///

35883

/// Because this is inherently a quadratic operation (for each shuffle in

35884

/// a chain, we recurse up the chain), the depth is limited to 8 instructions.

35885

/// This should never be an issue in practice as the shuffle lowering doesn't

35886

/// produce sequences of more than 8 instructions.

35887

///

35888

/// FIXME: We will currently miss some cases where the redundant shuffling

35889

/// would simplify under the threshold for PSHUFB formation because of

35890

/// combine-ordering. To fix this, we should do the redundant instruction

35891

/// combining in this recursive walk.

35892

static SDValue combineX86ShufflesRecursively(

35893

ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,

35894

ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,

35895

unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,

35896

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

35897

assert(RootMask.size() > 0 &&((RootMask.size() > 0 && (RootMask.size() > 1 ||
(RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"
) ? static_cast<void> (0) : __assert_fail ("RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && \"Illegal shuffle root mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35899, __PRETTY_FUNCTION__))

35898

(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&((RootMask.size() > 0 && (RootMask.size() > 1 ||
(RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"
) ? static_cast<void> (0) : __assert_fail ("RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && \"Illegal shuffle root mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35899, __PRETTY_FUNCTION__))

35899

"Illegal shuffle root mask")((RootMask.size() > 0 && (RootMask.size() > 1 ||
(RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"
) ? static_cast<void> (0) : __assert_fail ("RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && \"Illegal shuffle root mask\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35899, __PRETTY_FUNCTION__));

35900

assert(Root.getSimpleValueType().isVector() &&((Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"
) ? static_cast<void> (0) : __assert_fail ("Root.getSimpleValueType().isVector() && \"Shuffles operate on vector types!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35901, __PRETTY_FUNCTION__))

35901

"Shuffles operate on vector types!")((Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"
) ? static_cast<void> (0) : __assert_fail ("Root.getSimpleValueType().isVector() && \"Shuffles operate on vector types!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35901, __PRETTY_FUNCTION__));

35902

unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();

35903

35904

// Bound the depth of our recursive combine because this is ultimately

35905

// quadratic in nature.

35906

if (Depth >= MaxDepth)

35907

return SDValue();

35908

35909

// Directly rip through bitcasts to find the underlying operand.

35910

SDValue Op = SrcOps[SrcOpIndex];

35911

Op = peekThroughOneUseBitcasts(Op);

35912

35913

EVT VT = Op.getValueType();

35914

if (!VT.isVector() || !VT.isSimple())

35915

return SDValue(); // Bail if we hit a non-simple non-vector.

35916

35917

assert(VT.getSizeInBits() == RootSizeInBits &&((VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() == RootSizeInBits && \"Can only combine shuffles of the same vector register size.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35918, __PRETTY_FUNCTION__))

35918

"Can only combine shuffles of the same vector register size.")((VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() == RootSizeInBits && \"Can only combine shuffles of the same vector register size.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 35918, __PRETTY_FUNCTION__));

35919

35920

// Extract target shuffle mask and resolve sentinels and inputs.

35921

// TODO - determine Op's demanded elts from RootMask.

35922

SmallVector<int, 64> OpMask;

35923

SmallVector<SDValue, 2> OpInputs;

35924

APInt OpUndef, OpZero;

35925

APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());

35926

bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());

35927

if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,

35928

OpZero, DAG, Depth, false))

35929

return SDValue();

35930

35931

// Shuffle inputs must be the same size as the result, bail on any larger

35932

// inputs and widen any smaller inputs.

35933

if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {

35934

return Op.getValueSizeInBits() > RootSizeInBits;

35935

}))

35936

return SDValue();

35937

35938

for (SDValue &Op : OpInputs)

35939

if (Op.getValueSizeInBits() < RootSizeInBits)

35940

Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,

35941

SDLoc(Op), RootSizeInBits);

35942

35943

SmallVector<int, 64> Mask;

35944

SmallVector<SDValue, 16> Ops;

35945

35946

// We don't need to merge masks if the root is empty.

35947

bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);

35948

if (EmptyRoot

16.1	'EmptyRoot' is true

) {

35949

// Only resolve zeros if it will remove an input, otherwise we might end

35950

// up in an infinite loop.

35951

bool ResolveKnownZeros = true;

35952

if (!OpZero.isNullValue()) {

35953

APInt UsedInputs = APInt::getNullValue(OpInputs.size());

35954

for (int i = 0, e = OpMask.size(); i != e; ++i) {

35955

int M = OpMask[i];

35956

if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))

35957

continue;

35958

UsedInputs.setBit(M / OpMask.size());

35959

if (UsedInputs.isAllOnesValue()) {

35960

ResolveKnownZeros = false;

35961

break;

35962

}

35963

}

35964

}

35965

resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,

35966

ResolveKnownZeros);

35967

35968

Mask = OpMask;

35969

Ops.append(OpInputs.begin(), OpInputs.end());

35970

} else {

35971

resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

35972

35973

// Add the inputs to the Ops list, avoiding duplicates.

35974

Ops.append(SrcOps.begin(), SrcOps.end());

35975

35976

auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {

35977

// Attempt to find an existing match.

35978

SDValue InputBC = peekThroughBitcasts(Input);

35979

for (int i = 0, e = Ops.size(); i < e; ++i)

35980

if (InputBC == peekThroughBitcasts(Ops[i]))

35981

return i;

35982

// Match failed - should we replace an existing Op?

35983

if (InsertionPoint >= 0) {

35984

Ops[InsertionPoint] = Input;

35985

return InsertionPoint;

35986

}

35987

// Add to the end of the Ops list.

35988

Ops.push_back(Input);

35989

return Ops.size() - 1;

35990

};

35991

35992

SmallVector<int, 2> OpInputIdx;

35993

for (SDValue OpInput : OpInputs)

35994

OpInputIdx.push_back(

35995

AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));

35996

35997

assert(((RootMask.size() > OpMask.size() &&((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__))

35998

RootMask.size() % OpMask.size() == 0) ||((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__))

35999

(OpMask.size() > RootMask.size() &&((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__))

36000

OpMask.size() % RootMask.size() == 0) ||((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__))

36001

OpMask.size() == RootMask.size()) &&((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__))

36002

"The smaller number of elements must divide the larger.")((((RootMask.size() > OpMask.size() && RootMask.size
() % OpMask.size() == 0) || (OpMask.size() > RootMask.size
() && OpMask.size() % RootMask.size() == 0) || OpMask
.size() == RootMask.size()) && "The smaller number of elements must divide the larger."
) ? static_cast<void> (0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36002, __PRETTY_FUNCTION__));

36003

36004

// This function can be performance-critical, so we rely on the power-of-2

36005

// knowledge that we have about the mask sizes to replace div/rem ops with

36006

// bit-masks and shifts.

36007

assert(isPowerOf2_32(RootMask.size()) &&((isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(RootMask.size()) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36008, __PRETTY_FUNCTION__))

36008

"Non-power-of-2 shuffle mask sizes")((isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(RootMask.size()) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36008, __PRETTY_FUNCTION__));

36009

assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes")((isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(OpMask.size()) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36009, __PRETTY_FUNCTION__));

36010

unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());

36011

unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

36012

36013

unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());

36014

unsigned RootRatio =

36015

std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);

36016

unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);

36017

assert((RootRatio == 1 || OpRatio == 1) &&(((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"
) ? static_cast<void> (0) : __assert_fail ("(RootRatio == 1 || OpRatio == 1) && \"Must not have a ratio for both incoming and op masks!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36018, __PRETTY_FUNCTION__))

36018

"Must not have a ratio for both incoming and op masks!")(((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"
) ? static_cast<void> (0) : __assert_fail ("(RootRatio == 1 || OpRatio == 1) && \"Must not have a ratio for both incoming and op masks!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36018, __PRETTY_FUNCTION__));

36019

36020

assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes")((isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(MaskWidth) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36020, __PRETTY_FUNCTION__));

36021

assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes")((isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(RootRatio) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36021, __PRETTY_FUNCTION__));

36022

assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes")((isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(OpRatio) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36022, __PRETTY_FUNCTION__));

36023

unsigned RootRatioLog2 = countTrailingZeros(RootRatio);

36024

unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

36025

36026

Mask.resize(MaskWidth, SM_SentinelUndef);

36027

36028

// Merge this shuffle operation's mask into our accumulated mask. Note that

36029

// this shuffle's mask will be the first applied to the input, followed by

36030

// the root mask to get us all the way to the root value arrangement. The

36031

// reason for this order is that we are recursing up the operation chain.

36032

for (unsigned i = 0; i < MaskWidth; ++i) {

36033

unsigned RootIdx = i >> RootRatioLog2;

36034

if (RootMask[RootIdx] < 0) {

36035

// This is a zero or undef lane, we're done.

36036

Mask[i] = RootMask[RootIdx];

36037

continue;

36038

}

36039

36040

unsigned RootMaskedIdx =

36041

RootRatio == 1

36042

? RootMask[RootIdx]

36043

: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

36044

36045

// Just insert the scaled root mask value if it references an input other

36046

// than the SrcOp we're currently inserting.

36047

if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||

36048

(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {

36049

Mask[i] = RootMaskedIdx;

36050

continue;

36051

}

36052

36053

RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);

36054

unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;

36055

if (OpMask[OpIdx] < 0) {

36056

// The incoming lanes are zero or undef, it doesn't matter which ones we

36057

// are using.

36058

Mask[i] = OpMask[OpIdx];

36059

continue;

36060

}

36061

36062

// Ok, we have non-zero lanes, map them through to one of the Op's inputs.

36063

unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]

36064

: (OpMask[OpIdx] << OpRatioLog2) +

36065

(RootMaskedIdx & (OpRatio - 1));

36066

36067

OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);

36068

int InputIdx = OpMask[OpIdx] / (int)OpMask.size();

36069

assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input")((0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"
) ? static_cast<void> (0) : __assert_fail ("0 <= OpInputIdx[InputIdx] && \"Unknown target shuffle input\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36069, __PRETTY_FUNCTION__));

36070

OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;

36071

36072

Mask[i] = OpMaskedIdx;

36073

}

36074

}

36075

36076

// Remove unused/repeated shuffle source ops.

36077

resolveTargetShuffleInputsAndMask(Ops, Mask);

36078

36079

// Handle the all undef/zero cases early.

36080

if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))

36081

return DAG.getUNDEF(Root.getValueType());

36082

36083

// TODO - should we handle the mixed zero/undef case as well? Just returning

36084

// a zero mask will lose information on undef elements possibly reducing

36085

// future combine possibilities.

36086

if (all_of(Mask, [](int Idx) { return Idx < 0; }))

36087

return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,

36088

SDLoc(Root));

36089

36090

assert(!Ops.empty() && "Shuffle with no inputs detected")((!Ops.empty() && "Shuffle with no inputs detected") ?
static_cast<void> (0) : __assert_fail ("!Ops.empty() && \"Shuffle with no inputs detected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36090, __PRETTY_FUNCTION__));

36091

HasVariableMask |= IsOpVariableMask;

36092

36093

// Update the list of shuffle nodes that have been combined so far.

36094

SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),

36095

SrcNodes.end());

36096

CombinedNodes.push_back(Op.getNode());

36097

36098

// See if we can recurse into each shuffle source op (if it's a target

36099

// shuffle). The source op should only be generally combined if it either has

36100

// a single use (i.e. current Op) or all its users have already been combined,

36101

// if not then we can still combine but should prevent generation of variable

36102

// shuffles to avoid constant pool bloat.

36103

// Don't recurse if we already have more source ops than we can combine in

36104

// the remaining recursion depth.

36105

if (Ops.size() < (MaxDepth - Depth)) {

36106

for (int i = 0, e = Ops.size(); i < e; ++i) {

36107

// For empty roots, we need to resolve zeroable elements before combining

36108

// them with other shuffles.

36109

SmallVector<int, 64> ResolvedMask = Mask;

36110

if (EmptyRoot)

36111

resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);

36112

bool AllowVar = false;

36113

if (Ops[i].getNode()->hasOneUse() ||

36114

SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))

36115

AllowVar = AllowVariableMask;

36116

if (SDValue Res = combineX86ShufflesRecursively(

36117

Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,

36118

HasVariableMask, AllowVar, DAG, Subtarget))

36119

return Res;

36120

}

36121

}

36122

36123

// Attempt to constant fold all of the constant source ops.

36124

if (SDValue Cst = combineX86ShufflesConstants(

36125

Ops, Mask, Root, HasVariableMask, DAG, Subtarget))

36126

return Cst;

36127

36128

// Canonicalize the combined shuffle mask chain with horizontal ops.

36129

// NOTE: This will update the Ops and Mask.

36130

if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(

36131

Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))

36132

return DAG.getBitcast(Root.getValueType(), HOp);

36133

36134

// We can only combine unary and binary shuffle mask cases.

36135

if (Ops.size() <= 2) {

36136

// Minor canonicalization of the accumulated shuffle mask to make it easier

36137

// to match below. All this does is detect masks with sequential pairs of

36138

// elements, and shrink them to the half-width mask. It does this in a loop

36139

// so it will reduce the size of the mask to the minimal width mask which

36140

// performs an equivalent shuffle.

36141

SmallVector<int, 64> WidenedMask;

36142

while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {

36143

Mask = std::move(WidenedMask);

36144

}

36145

36146

// Canonicalization of binary shuffle masks to improve pattern matching by

36147

// commuting the inputs.

36148

if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {

36149

ShuffleVectorSDNode::commuteMask(Mask);

36150

std::swap(Ops[0], Ops[1]);

36151

}

36152

36153

// Finally, try to combine into a single shuffle instruction.

36154

return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,

36155

AllowVariableMask, DAG, Subtarget);

36156

}

36157

36158

// If that failed and any input is extracted then try to combine as a

36159

// shuffle with the larger type.

36160

return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,

36161

HasVariableMask, AllowVariableMask,

36162

DAG, Subtarget);

36163

}

36164

36165

/// Helper entry wrapper to combineX86ShufflesRecursively.

36166

static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,

36167

const X86Subtarget &Subtarget) {

36168

return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,

36169

X86::MaxShuffleCombineDepth,

36170

/*HasVarMask*/ false,

36171

/*AllowVarMask*/ true, DAG, Subtarget);

36172

}

36173

36174

/// Get the PSHUF-style mask from PSHUF node.

36175

///

36176

/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4

36177

/// PSHUF-style masks that can be reused with such instructions.

36178

static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {

36179

MVT VT = N.getSimpleValueType();

36180

SmallVector<int, 4> Mask;

36181

SmallVector<SDValue, 2> Ops;

36182

bool IsUnary;

36183

bool HaveMask =

36184

getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);

36185

(void)HaveMask;

36186

assert(HaveMask)((HaveMask) ? static_cast<void> (0) : __assert_fail ("HaveMask"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36186, __PRETTY_FUNCTION__));

36187

36188

// If we have more than 128-bits, only the low 128-bits of shuffle mask

36189

// matter. Check that the upper masks are repeats and remove them.

36190

if (VT.getSizeInBits() > 128) {

36191

int LaneElts = 128 / VT.getScalarSizeInBits();

36192

#ifndef NDEBUG

36193

for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)

36194

for (int j = 0; j < LaneElts; ++j)

36195

assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&((Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!") ? static_cast<
void> (0) : __assert_fail ("Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && \"Mask doesn't repeat in high 128-bit lanes!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36196, __PRETTY_FUNCTION__))

36196

"Mask doesn't repeat in high 128-bit lanes!")((Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!") ? static_cast<
void> (0) : __assert_fail ("Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && \"Mask doesn't repeat in high 128-bit lanes!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36196, __PRETTY_FUNCTION__));

36197

#endif

36198

Mask.resize(LaneElts);

36199

}

36200

36201

switch (N.getOpcode()) {

36202

case X86ISD::PSHUFD:

36203

return Mask;

36204

case X86ISD::PSHUFLW:

36205

Mask.resize(4);

36206

return Mask;

36207

case X86ISD::PSHUFHW:

36208

Mask.erase(Mask.begin(), Mask.begin() + 4);

36209

for (int &M : Mask)

36210

M -= 4;

36211

return Mask;

36212

default:

36213

llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36213);

36214

}

36215

}

36216

36217

/// Search for a combinable shuffle across a chain ending in pshufd.

36218

///

36219

/// We walk up the chain and look for a combinable shuffle, skipping over

36220

/// shuffles that we could hoist this shuffle's transformation past without

36221

/// altering anything.

36222

static SDValue

36223

combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,

36224

SelectionDAG &DAG) {

36225

assert(N.getOpcode() == X86ISD::PSHUFD &&((N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"
) ? static_cast<void> (0) : __assert_fail ("N.getOpcode() == X86ISD::PSHUFD && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36226, __PRETTY_FUNCTION__))

36226

"Called with something other than an x86 128-bit half shuffle!")((N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"
) ? static_cast<void> (0) : __assert_fail ("N.getOpcode() == X86ISD::PSHUFD && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36226, __PRETTY_FUNCTION__));

36227

SDLoc DL(N);

36228

36229

// Walk up a single-use chain looking for a combinable shuffle. Keep a stack

36230

// of the shuffles in the chain so that we can form a fresh chain to replace

36231

// this one.

36232

SmallVector<SDValue, 8> Chain;

36233

SDValue V = N.getOperand(0);

36234

for (; V.hasOneUse(); V = V.getOperand(0)) {

36235

switch (V.getOpcode()) {

36236

default:

36237

return SDValue(); // Nothing combined!

36238

36239

case ISD::BITCAST:

36240

// Skip bitcasts as we always know the type for the target specific

36241

// instructions.

36242

continue;

36243

36244

case X86ISD::PSHUFD:

36245

// Found another dword shuffle.

36246

break;

36247

36248

case X86ISD::PSHUFLW:

36249

// Check that the low words (being shuffled) are the identity in the

36250

// dword shuffle, and the high words are self-contained.

36251

if (Mask[0] != 0 || Mask[1] != 1 ||

36252

!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))

36253

return SDValue();

36254

36255

Chain.push_back(V);

36256

continue;

36257

36258

case X86ISD::PSHUFHW:

36259

// Check that the high words (being shuffled) are the identity in the

36260

// dword shuffle, and the low words are self-contained.

36261

if (Mask[2] != 2 || Mask[3] != 3 ||

36262

!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))

36263

return SDValue();

36264

36265

Chain.push_back(V);

36266

continue;

36267

36268

case X86ISD::UNPCKL:

36269

case X86ISD::UNPCKH:

36270

// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword

36271

// shuffle into a preceding word shuffle.

36272

if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&

36273

V.getSimpleValueType().getVectorElementType() != MVT::i16)

36274

return SDValue();

36275

36276

// Search for a half-shuffle which we can combine with.

36277

unsigned CombineOp =

36278

V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;

36279

if (V.getOperand(0) != V.getOperand(1) ||

36280

!V->isOnlyUserOf(V.getOperand(0).getNode()))

36281

return SDValue();

36282

Chain.push_back(V);

36283

V = V.getOperand(0);

36284

do {

36285

switch (V.getOpcode()) {

36286

default:

36287

return SDValue(); // Nothing to combine.

36288

36289

case X86ISD::PSHUFLW:

36290

case X86ISD::PSHUFHW:

36291

if (V.getOpcode() == CombineOp)

36292

break;

36293

36294

Chain.push_back(V);

36295

36296

LLVM_FALLTHROUGH[[gnu::fallthrough]];

36297

case ISD::BITCAST:

36298

V = V.getOperand(0);

36299

continue;

36300

}

36301

break;

36302

} while (V.hasOneUse());

36303

break;

36304

}

36305

// Break out of the loop if we break out of the switch.

36306

break;

36307

}

36308

36309

if (!V.hasOneUse())

36310

// We fell out of the loop without finding a viable combining instruction.

36311

return SDValue();

36312

36313

// Merge this node's mask and our incoming mask.

36314

SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);

36315

for (int &M : Mask)

36316

M = VMask[M];

36317

V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),

36318

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

36319

36320

// Rebuild the chain around this new shuffle.

36321

while (!Chain.empty()) {

36322

SDValue W = Chain.pop_back_val();

36323

36324

if (V.getValueType() != W.getOperand(0).getValueType())

36325

V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

36326

36327

switch (W.getOpcode()) {

36328

default:

36329

llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36329);

36330

36331

case X86ISD::UNPCKL:

36332

case X86ISD::UNPCKH:

36333

V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);

36334

break;

36335

36336

case X86ISD::PSHUFD:

36337

case X86ISD::PSHUFLW:

36338

case X86ISD::PSHUFHW:

36339

V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));

36340

break;

36341

}

36342

}

36343

if (V.getValueType() != N.getValueType())

36344

V = DAG.getBitcast(N.getValueType(), V);

36345

36346

// Return the new chain to replace N.

36347

return V;

36348

}

36349

36350

// Attempt to commute shufps LHS loads:

36351

// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))

36352

static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,

36353

SelectionDAG &DAG) {

36354

// TODO: Add vXf64 support.

36355

if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)

36356

return SDValue();

36357

36358

// SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.

36359

auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {

36360

if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))

36361

return SDValue();

36362

SDValue N0 = V.getOperand(0);

36363

SDValue N1 = V.getOperand(1);

36364

unsigned Imm = V.getConstantOperandVal(2);

36365

if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||

36366

MayFoldLoad(peekThroughOneUseBitcasts(N1)))

36367

return SDValue();

36368

Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);

36369

return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,

36370

DAG.getTargetConstant(Imm, DL, MVT::i8));

36371

};

36372

36373

switch (N.getOpcode()) {

36374

case X86ISD::VPERMILPI:

36375

if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {

36376

unsigned Imm = N.getConstantOperandVal(1);

36377

return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,

36378

DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));

36379

}

36380

break;

36381

case X86ISD::SHUFP: {

36382

SDValue N0 = N.getOperand(0);

36383

SDValue N1 = N.getOperand(1);

36384

unsigned Imm = N.getConstantOperandVal(2);

36385

if (N0 == N1) {

36386

if (SDValue NewSHUFP = commuteSHUFP(N, N0))

36387

return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,

36388

DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));

36389

} else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {

36390

return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,

36391

DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));

36392

} else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {

36393

return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,

36394

DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));

36395

}

36396

break;

36397

}

36398

}

36399

36400

return SDValue();

36401

}

36402

36403

/// Try to combine x86 target specific shuffles.

36404

static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,

36405

TargetLowering::DAGCombinerInfo &DCI,

36406

const X86Subtarget &Subtarget) {

36407

SDLoc DL(N);

36408

MVT VT = N.getSimpleValueType();

36409

SmallVector<int, 4> Mask;

36410

unsigned Opcode = N.getOpcode();

36411

36412

if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))

36413

return R;

36414

36415

// Canonicalize UNARYSHUFFLE(XOR(X,-1) -> XOR(UNARYSHUFFLE(X),-1) to

36416

// help expose the 'NOT' pattern further up the DAG.

36417

// TODO: This might be beneficial for any binop with a 'splattable' operand.

36418

switch (Opcode) {

36419

case X86ISD::MOVDDUP:

36420

case X86ISD::PSHUFD: {

36421

SDValue Src = N.getOperand(0);

36422

if (Src.hasOneUse() && Src.getValueType() == VT) {

36423

if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {

36424

Not = DAG.getBitcast(VT, Not);

36425

Not = Opcode == X86ISD::MOVDDUP

36426

? DAG.getNode(Opcode, DL, VT, Not)

36427

: DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));

36428

EVT IntVT = Not.getValueType().changeTypeToInteger();

36429

SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);

36430

Not = DAG.getBitcast(IntVT, Not);

36431

Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);

36432

return DAG.getBitcast(VT, Not);

36433

}

36434

}

36435

break;

36436

}

36437

}

36438

36439

// Handle specific target shuffles.

36440

switch (Opcode) {

36441

case X86ISD::MOVDDUP: {

36442

SDValue Src = N.getOperand(0);

36443

// Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.

36444

if (VT == MVT::v2f64 && Src.hasOneUse() &&

36445

ISD::isNormalLoad(Src.getNode())) {

36446

LoadSDNode *LN = cast<LoadSDNode>(Src);

36447

if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {

36448

SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);

36449

DCI.CombineTo(N.getNode(), Movddup);

36450

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));

36451

DCI.recursivelyDeleteUnusedNodes(LN);

36452

return N; // Return N so it doesn't get rechecked!

36453

}

36454

}

36455

36456

return SDValue();

36457

}

36458

case X86ISD::VBROADCAST: {

36459

SDValue Src = N.getOperand(0);

36460

SDValue BC = peekThroughBitcasts(Src);

36461

EVT SrcVT = Src.getValueType();

36462

EVT BCVT = BC.getValueType();

36463

36464

// If broadcasting from another shuffle, attempt to simplify it.

36465

// TODO - we really need a general SimplifyDemandedVectorElts mechanism.

36466

if (isTargetShuffle(BC.getOpcode()) &&

36467

VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {

36468

unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();

36469

SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),

36470

SM_SentinelUndef);

36471

for (unsigned i = 0; i != Scale; ++i)

36472

DemandedMask[i] = i;

36473

if (SDValue Res = combineX86ShufflesRecursively(

36474

{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,

36475

X86::MaxShuffleCombineDepth,

36476

/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))

36477

return DAG.getNode(X86ISD::VBROADCAST, DL, VT,

36478

DAG.getBitcast(SrcVT, Res));

36479

}

36480

36481

// broadcast(bitcast(src)) -> bitcast(broadcast(src))

36482

// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.

36483

if (Src.getOpcode() == ISD::BITCAST &&

36484

SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&

36485

DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {

36486

EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),

36487

VT.getVectorNumElements());

36488

return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));

36489

}

36490

36491

// Reduce broadcast source vector to lowest 128-bits.

36492

if (SrcVT.getSizeInBits() > 128)

36493

return DAG.getNode(X86ISD::VBROADCAST, DL, VT,

36494

extract128BitVector(Src, 0, DAG, DL));

36495

36496

// broadcast(scalar_to_vector(x)) -> broadcast(x).

36497

if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)

36498

return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

36499

36500

// Share broadcast with the longest vector and extract low subvector (free).

36501

for (SDNode *User : Src->uses())

36502

if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&

36503

User->getValueSizeInBits(0) > VT.getSizeInBits()) {

36504

return extractSubVector(SDValue(User, 0), 0, DAG, DL,

36505

VT.getSizeInBits());

36506

}

36507

36508

// vbroadcast(scalarload X) -> vbroadcast_load X

36509

// For float loads, extract other uses of the scalar from the broadcast.

36510

if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&

36511

ISD::isNormalLoad(Src.getNode())) {

36512

LoadSDNode *LN = cast<LoadSDNode>(Src);

36513

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36514

SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };

36515

SDValue BcastLd =

36516

DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,

36517

LN->getMemoryVT(), LN->getMemOperand());

36518

// If the load value is used only by N, replace it via CombineTo N.

36519

bool NoReplaceExtract = Src.hasOneUse();

36520

DCI.CombineTo(N.getNode(), BcastLd);

36521

if (NoReplaceExtract) {

36522

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36523

DCI.recursivelyDeleteUnusedNodes(LN);

36524

} else {

36525

SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,

36526

DAG.getIntPtrConstant(0, DL));

36527

DCI.CombineTo(LN, Scl, BcastLd.getValue(1));

36528

}

36529

return N; // Return N so it doesn't get rechecked!

36530

}

36531

36532

// Due to isTypeDesirableForOp, we won't always shrink a load truncated to

36533

// i16. So shrink it ourselves if we can make a broadcast_load.

36534

if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&

36535

Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {

36536

assert(Subtarget.hasAVX2() && "Expected AVX2")((Subtarget.hasAVX2() && "Expected AVX2") ? static_cast
<void> (0) : __assert_fail ("Subtarget.hasAVX2() && \"Expected AVX2\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36536, __PRETTY_FUNCTION__));

36537

SDValue TruncIn = Src.getOperand(0);

36538

36539

// If this is a truncate of a non extending load we can just narrow it to

36540

// use a broadcast_load.

36541

if (ISD::isNormalLoad(TruncIn.getNode())) {

36542

LoadSDNode *LN = cast<LoadSDNode>(TruncIn);

36543

// Unless it's volatile or atomic.

36544

if (LN->isSimple()) {

36545

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36546

SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };

36547

SDValue BcastLd = DAG.getMemIntrinsicNode(

36548

X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,

36549

LN->getPointerInfo(), LN->getOriginalAlign(),

36550

LN->getMemOperand()->getFlags());

36551

DCI.CombineTo(N.getNode(), BcastLd);

36552

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36553

DCI.recursivelyDeleteUnusedNodes(Src.getNode());

36554

return N; // Return N so it doesn't get rechecked!

36555

}

36556

}

36557

36558

// If this is a truncate of an i16 extload, we can directly replace it.

36559

if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&

36560

ISD::isEXTLoad(Src.getOperand(0).getNode())) {

36561

LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));

36562

if (LN->getMemoryVT().getSizeInBits() == 16) {

36563

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36564

SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };

36565

SDValue BcastLd =

36566

DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,

36567

LN->getMemoryVT(), LN->getMemOperand());

36568

DCI.CombineTo(N.getNode(), BcastLd);

36569

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36570

DCI.recursivelyDeleteUnusedNodes(Src.getNode());

36571

return N; // Return N so it doesn't get rechecked!

36572

}

36573

}

36574

36575

// If this is a truncate of load that has been shifted right, we can

36576

// offset the pointer and use a narrower load.

36577

if (TruncIn.getOpcode() == ISD::SRL &&

36578

TruncIn.getOperand(0).hasOneUse() &&

36579

isa<ConstantSDNode>(TruncIn.getOperand(1)) &&

36580

ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {

36581

LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));

36582

unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);

36583

// Make sure the shift amount and the load size are divisible by 16.

36584

// Don't do this if the load is volatile or atomic.

36585

if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&

36586

LN->isSimple()) {

36587

unsigned Offset = ShiftAmt / 8;

36588

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36589

SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),

36590

TypeSize::Fixed(Offset), DL);

36591

SDValue Ops[] = { LN->getChain(), Ptr };

36592

SDValue BcastLd = DAG.getMemIntrinsicNode(

36593

X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,

36594

LN->getPointerInfo().getWithOffset(Offset),

36595

LN->getOriginalAlign(),

36596

LN->getMemOperand()->getFlags());

36597

DCI.CombineTo(N.getNode(), BcastLd);

36598

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36599

DCI.recursivelyDeleteUnusedNodes(Src.getNode());

36600

return N; // Return N so it doesn't get rechecked!

36601

}

36602

}

36603

}

36604

36605

// vbroadcast(vzload X) -> vbroadcast_load X

36606

if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {

36607

MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);

36608

if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {

36609

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36610

SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };

36611

SDValue BcastLd =

36612

DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,

36613

LN->getMemoryVT(), LN->getMemOperand());

36614

DCI.CombineTo(N.getNode(), BcastLd);

36615

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36616

DCI.recursivelyDeleteUnusedNodes(LN);

36617

return N; // Return N so it doesn't get rechecked!

36618

}

36619

}

36620

36621

// vbroadcast(vector load X) -> vbroadcast_load

36622

if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||

36623

SrcVT == MVT::v4i32) &&

36624

Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {

36625

LoadSDNode *LN = cast<LoadSDNode>(Src);

36626

// Unless the load is volatile or atomic.

36627

if (LN->isSimple()) {

36628

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36629

SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};

36630

SDValue BcastLd = DAG.getMemIntrinsicNode(

36631

X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),

36632

LN->getPointerInfo(), LN->getOriginalAlign(),

36633

LN->getMemOperand()->getFlags());

36634

DCI.CombineTo(N.getNode(), BcastLd);

36635

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));

36636

DCI.recursivelyDeleteUnusedNodes(LN);

36637

return N; // Return N so it doesn't get rechecked!

36638

}

36639

}

36640

36641

return SDValue();

36642

}

36643

case X86ISD::VZEXT_MOVL: {

36644

SDValue N0 = N.getOperand(0);

36645

36646

// If this is a vzmovl of a full vector load, replace it with a vzload, unless

36647

// the load is volatile.

36648

if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {

36649

auto *LN = cast<LoadSDNode>(N0);

36650

if (SDValue VZLoad =

36651

narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {

36652

DCI.CombineTo(N.getNode(), VZLoad);

36653

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));

36654

DCI.recursivelyDeleteUnusedNodes(LN);

36655

return N;

36656

}

36657

}

36658

36659

// If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast

36660

// and can just use a VZEXT_LOAD.

36661

// FIXME: Is there some way to do this with SimplifyDemandedVectorElts?

36662

if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {

36663

auto *LN = cast<MemSDNode>(N0);

36664

if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {

36665

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

36666

SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};

36667

SDValue VZLoad =

36668

DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,

36669

LN->getMemoryVT(), LN->getMemOperand());

36670

DCI.CombineTo(N.getNode(), VZLoad);

36671

DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));

36672

DCI.recursivelyDeleteUnusedNodes(LN);

36673

return N;

36674

}

36675

}

36676

36677

// Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into

36678

// (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))

36679

// if the upper bits of the i64 are zero.

36680

if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&

36681

N0.getOperand(0).hasOneUse() &&

36682

N0.getOperand(0).getValueType() == MVT::i64) {

36683

SDValue In = N0.getOperand(0);

36684

APInt Mask = APInt::getHighBitsSet(64, 32);

36685

if (DAG.MaskedValueIsZero(In, Mask)) {

36686

SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);

36687

MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);

36688

SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);

36689

SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);

36690

return DAG.getBitcast(VT, Movl);

36691

}

36692

}

36693

36694

// Load a scalar integer constant directly to XMM instead of transferring an

36695

// immediate value from GPR.

36696

// vzext_movl (scalar_to_vector C) --> load [C,0...]

36697

if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {

36698

if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {

36699

// Create a vector constant - scalar constant followed by zeros.

36700

EVT ScalarVT = N0.getOperand(0).getValueType();

36701

Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());

36702

unsigned NumElts = VT.getVectorNumElements();

36703

Constant *Zero = ConstantInt::getNullValue(ScalarTy);

36704

SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);

36705

ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());

36706

36707

// Load the vector constant from constant pool.

36708

MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

36709

SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);

36710

MachinePointerInfo MPI =

36711

MachinePointerInfo::getConstantPool(DAG.getMachineFunction());

36712

Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

36713

return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,

36714

MachineMemOperand::MOLoad);

36715

}

36716

}

36717

36718

// Pull subvector inserts into undef through VZEXT_MOVL by making it an

36719

// insert into a zero vector. This helps get VZEXT_MOVL closer to

36720

// scalar_to_vectors where 256/512 are canonicalized to an insert and a

36721

// 128-bit scalar_to_vector. This reduces the number of isel patterns.

36722

if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {

36723

SDValue V = peekThroughOneUseBitcasts(N0);

36724

36725

if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&

36726

isNullConstant(V.getOperand(2))) {

36727

SDValue In = V.getOperand(1);

36728

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),

36729

In.getValueSizeInBits() /

36730

VT.getScalarSizeInBits());

36731

In = DAG.getBitcast(SubVT, In);

36732

SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);

36733

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,

36734

getZeroVector(VT, Subtarget, DAG, DL), Movl,

36735

V.getOperand(2));

36736

}

36737

}

36738

36739

return SDValue();

36740

}

36741

case X86ISD::BLENDI: {

36742

SDValue N0 = N.getOperand(0);

36743

SDValue N1 = N.getOperand(1);

36744

36745

// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.

36746

// TODO: Handle MVT::v16i16 repeated blend mask.

36747

if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&

36748

N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {

36749

MVT SrcVT = N0.getOperand(0).getSimpleValueType();

36750

if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&

36751

SrcVT.getScalarSizeInBits() >= 32) {

36752

unsigned BlendMask = N.getConstantOperandVal(2);

36753

unsigned Size = VT.getVectorNumElements();

36754

unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();

36755

BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);

36756

return DAG.getBitcast(

36757

VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),

36758

N1.getOperand(0),

36759

DAG.getTargetConstant(BlendMask, DL, MVT::i8)));

36760

}

36761

}

36762

return SDValue();

36763

}

36764

case X86ISD::VPERMI: {

36765

// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.

36766

// TODO: Remove when we have preferred domains in combineX86ShuffleChain.

36767

SDValue N0 = N.getOperand(0);

36768

SDValue N1 = N.getOperand(1);

36769

unsigned EltSizeInBits = VT.getScalarSizeInBits();

36770

if (N0.getOpcode() == ISD::BITCAST &&

36771

N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {

36772

SDValue Src = N0.getOperand(0);

36773

EVT SrcVT = Src.getValueType();

36774

SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);

36775

return DAG.getBitcast(VT, Res);

36776

}

36777

return SDValue();

36778

}

36779

case X86ISD::VPERM2X128: {

36780

// If both 128-bit values were inserted into high halves of 256-bit values,

36781

// the shuffle can be reduced to a concatenation of subvectors:

36782

// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y

36783

// Note: We are only looking for the exact high/high shuffle mask because we

36784

// expect to fold other similar patterns before creating this opcode.

36785

SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));

36786

SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));

36787

unsigned Imm = N.getConstantOperandVal(2);

36788

if (!(Imm == 0x31 &&

36789

Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&

36790

Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&

36791

Ins0.getValueType() == Ins1.getValueType()))

36792

return SDValue();

36793

36794

SDValue X = Ins0.getOperand(1);

36795

SDValue Y = Ins1.getOperand(1);

36796

unsigned C1 = Ins0.getConstantOperandVal(2);

36797

unsigned C2 = Ins1.getConstantOperandVal(2);

36798

MVT SrcVT = X.getSimpleValueType();

36799

unsigned SrcElts = SrcVT.getVectorNumElements();

36800

if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||

36801

C1 != SrcElts || C2 != SrcElts)

36802

return SDValue();

36803

36804

return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,

36805

Ins1.getValueType(), X, Y));

36806

}

36807

case X86ISD::PSHUFD:

36808

case X86ISD::PSHUFLW:

36809

case X86ISD::PSHUFHW:

36810

Mask = getPSHUFShuffleMask(N);

36811

assert(Mask.size() == 4)((Mask.size() == 4) ? static_cast<void> (0) : __assert_fail
("Mask.size() == 4", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36811, __PRETTY_FUNCTION__));

36812

break;

36813

case X86ISD::MOVSD:

36814

case X86ISD::MOVSS: {

36815

SDValue N0 = N.getOperand(0);

36816

SDValue N1 = N.getOperand(1);

36817

36818

// Canonicalize scalar FPOps:

36819

// MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))

36820

// If commutable, allow OP(N1[0], N0[0]).

36821

unsigned Opcode1 = N1.getOpcode();

36822

if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||

36823

Opcode1 == ISD::FDIV) {

36824

SDValue N10 = N1.getOperand(0);

36825

SDValue N11 = N1.getOperand(1);

36826

if (N10 == N0 ||

36827

(N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {

36828

if (N10 != N0)

36829

std::swap(N10, N11);

36830

MVT SVT = VT.getVectorElementType();

36831

SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);

36832

N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);

36833

N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);

36834

SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);

36835

SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);

36836

return DAG.getNode(Opcode, DL, VT, N0, SclVec);

36837

}

36838

}

36839

36840

return SDValue();

36841

}

36842

case X86ISD::INSERTPS: {

36843

assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32")((VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"
) ? static_cast<void> (0) : __assert_fail ("VT == MVT::v4f32 && \"INSERTPS ValueType must be MVT::v4f32\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36843, __PRETTY_FUNCTION__));

36844

SDValue Op0 = N.getOperand(0);

36845

SDValue Op1 = N.getOperand(1);

36846

unsigned InsertPSMask = N.getConstantOperandVal(2);

36847

unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;

36848

unsigned DstIdx = (InsertPSMask >> 4) & 0x3;

36849

unsigned ZeroMask = InsertPSMask & 0xF;

36850

36851

// If we zero out all elements from Op0 then we don't need to reference it.

36852

if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())

36853

return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,

36854

DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

36855

36856

// If we zero out the element from Op1 then we don't need to reference it.

36857

if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())

36858

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),

36859

DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

36860

36861

// Attempt to merge insertps Op1 with an inner target shuffle node.

36862

SmallVector<int, 8> TargetMask1;

36863

SmallVector<SDValue, 2> Ops1;

36864

APInt KnownUndef1, KnownZero1;

36865

if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,

36866

KnownZero1)) {

36867

if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {

36868

// Zero/UNDEF insertion - zero out element and remove dependency.

36869

InsertPSMask |= (1u << DstIdx);

36870

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),

36871

DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

36872

}

36873

// Update insertps mask srcidx and reference the source input directly.

36874

int M = TargetMask1[SrcIdx];

36875

assert(0 <= M && M < 8 && "Shuffle index out of range")((0 <= M && M < 8 && "Shuffle index out of range"
) ? static_cast<void> (0) : __assert_fail ("0 <= M && M < 8 && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36875, __PRETTY_FUNCTION__));

36876

InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);

36877

Op1 = Ops1[M < 4 ? 0 : 1];

36878

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,

36879

DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

36880

}

36881

36882

// Attempt to merge insertps Op0 with an inner target shuffle node.

36883

SmallVector<int, 8> TargetMask0;

36884

SmallVector<SDValue, 2> Ops0;

36885

APInt KnownUndef0, KnownZero0;

36886

if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,

36887

KnownZero0)) {

36888

bool Updated = false;

36889

bool UseInput00 = false;

36890

bool UseInput01 = false;

36891

for (int i = 0; i != 4; ++i) {

36892

if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {

36893

// No change if element is already zero or the inserted element.

36894

continue;

36895

} else if (KnownUndef0[i] || KnownZero0[i]) {

36896

// If the target mask is undef/zero then we must zero the element.

36897

InsertPSMask |= (1u << i);

36898

Updated = true;

36899

continue;

36900

}

36901

36902

// The input vector element must be inline.

36903

int M = TargetMask0[i];

36904

if (M != i && M != (i + 4))

36905

return SDValue();

36906

36907

// Determine which inputs of the target shuffle we're using.

36908

UseInput00 |= (0 <= M && M < 4);

36909

UseInput01 |= (4 <= M);

36910

}

36911

36912

// If we're not using both inputs of the target shuffle then use the

36913

// referenced input directly.

36914

if (UseInput00 && !UseInput01) {

36915

Updated = true;

36916

Op0 = Ops0[0];

36917

} else if (!UseInput00 && UseInput01) {

36918

Updated = true;

36919

Op0 = Ops0[1];

36920

}

36921

36922

if (Updated)

36923

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,

36924

DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

36925

}

36926

36927

// If we're inserting an element from a vbroadcast load, fold the

36928

// load into the X86insertps instruction. We need to convert the scalar

36929

// load to a vector and clear the source lane of the INSERTPS control.

36930

if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {

36931

auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);

36932

if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {

36933

SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),

36934

MemIntr->getBasePtr(),

36935

MemIntr->getMemOperand());

36936

SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,

36937

DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,

36938

Load),

36939

DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));

36940

DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));

36941

return Insert;

36942

}

36943

}

36944

36945

return SDValue();

36946

}

36947

default:

36948

return SDValue();

36949

}

36950

36951

// Nuke no-op shuffles that show up after combining.

36952

if (isNoopShuffleMask(Mask))

36953

return N.getOperand(0);

36954

36955

// Look for simplifications involving one or two shuffle instructions.

36956

SDValue V = N.getOperand(0);

36957

switch (N.getOpcode()) {

36958

default:

36959

break;

36960

case X86ISD::PSHUFLW:

36961

case X86ISD::PSHUFHW:

36962

assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!")((VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"
) ? static_cast<void> (0) : __assert_fail ("VT.getVectorElementType() == MVT::i16 && \"Bad word shuffle type!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 36962, __PRETTY_FUNCTION__));

36963

36964

// See if this reduces to a PSHUFD which is no more expensive and can

36965

// combine with more operations. Note that it has to at least flip the

36966

// dwords as otherwise it would have been removed as a no-op.

36967

if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {

36968

int DMask[] = {0, 1, 2, 3};

36969

int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;

36970

DMask[DOffset + 0] = DOffset + 1;

36971

DMask[DOffset + 1] = DOffset + 0;

36972

MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

36973

V = DAG.getBitcast(DVT, V);

36974

V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,

36975

getV4X86ShuffleImm8ForMask(DMask, DL, DAG));

36976

return DAG.getBitcast(VT, V);

36977

}

36978

36979

// Look for shuffle patterns which can be implemented as a single unpack.

36980

// FIXME: This doesn't handle the location of the PSHUFD generically, and

36981

// only works when we have a PSHUFD followed by two half-shuffles.

36982

if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&

36983

(V.getOpcode() == X86ISD::PSHUFLW ||

36984

V.getOpcode() == X86ISD::PSHUFHW) &&

36985

V.getOpcode() != N.getOpcode() &&

36986

V.hasOneUse() && V.getOperand(0).hasOneUse()) {

36987

SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));

36988

if (D.getOpcode() == X86ISD::PSHUFD) {

36989

SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);

36990

SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);

36991

int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;

36992

int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;

36993

int WordMask[8];

36994

for (int i = 0; i < 4; ++i) {

36995

WordMask[i + NOffset] = Mask[i] + NOffset;

36996

WordMask[i + VOffset] = VMask[i] + VOffset;

36997

}

36998

// Map the word mask through the DWord mask.

36999

int MappedMask[8];

37000

for (int i = 0; i < 8; ++i)

37001

MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;

37002

if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||

37003

makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {

37004

// We can replace all three shuffles with an unpack.

37005

V = DAG.getBitcast(VT, D.getOperand(0));

37006

return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL

37007

: X86ISD::UNPCKH,

37008

DL, VT, V, V);

37009

}

37010

}

37011

}

37012

37013

break;

37014

37015

case X86ISD::PSHUFD:

37016

if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))

37017

return NewN;

37018

37019

break;

37020

}

37021

37022

return SDValue();

37023

}

37024

37025

/// Checks if the shuffle mask takes subsequent elements

37026

/// alternately from two vectors.

37027

/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.

37028

static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {

37029

37030

int ParitySrc[2] = {-1, -1};

37031

unsigned Size = Mask.size();

37032

for (unsigned i = 0; i != Size; ++i) {

37033

int M = Mask[i];

37034

if (M < 0)

37035

continue;

37036

37037

// Make sure we are using the matching element from the input.

37038

if ((M % Size) != i)

37039

return false;

37040

37041

// Make sure we use the same input for all elements of the same parity.

37042

int Src = M / Size;

37043

if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)

37044

return false;

37045

ParitySrc[i % 2] = Src;

37046

}

37047

37048

// Make sure each input is used.

37049

if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])

37050

return false;

37051

37052

Op0Even = ParitySrc[0] == 0;

37053

return true;

37054

}

37055

37056

/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)

37057

/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation

37058

/// are written to the parameters \p Opnd0 and \p Opnd1.

37059

///

37060

/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes

37061

/// so it is easier to generically match. We also insert dummy vector shuffle

37062

/// nodes for the operands which explicitly discard the lanes which are unused

37063

/// by this operation to try to flow through the rest of the combiner

37064

/// the fact that they're unused.

37065

static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,

37066

SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,

37067

bool &IsSubAdd) {

37068

37069

EVT VT = N->getValueType(0);

37070

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

37071

if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||

37072

!VT.getSimpleVT().isFloatingPoint())

37073

return false;

37074

37075

// We only handle target-independent shuffles.

37076

// FIXME: It would be easy and harmless to use the target shuffle mask

37077

// extraction tool to support more.

37078

if (N->getOpcode() != ISD::VECTOR_SHUFFLE)

37079

return false;

37080

37081

SDValue V1 = N->getOperand(0);

37082

SDValue V2 = N->getOperand(1);

37083

37084

// Make sure we have an FADD and an FSUB.

37085

if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||

37086

(V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||

37087

V1.getOpcode() == V2.getOpcode())

37088

return false;

37089

37090

// If there are other uses of these operations we can't fold them.

37091

if (!V1->hasOneUse() || !V2->hasOneUse())

37092

return false;

37093

37094

// Ensure that both operations have the same operands. Note that we can

37095

// commute the FADD operands.

37096

SDValue LHS, RHS;

37097

if (V1.getOpcode() == ISD::FSUB) {

37098

LHS = V1->getOperand(0); RHS = V1->getOperand(1);

37099

if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&

37100

(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))

37101

return false;

37102

} else {

37103

assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode")((V2.getOpcode() == ISD::FSUB && "Unexpected opcode")
? static_cast<void> (0) : __assert_fail ("V2.getOpcode() == ISD::FSUB && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37103, __PRETTY_FUNCTION__));

37104

LHS = V2->getOperand(0); RHS = V2->getOperand(1);

37105

if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&

37106

(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))

37107

return false;

37108

}

37109

37110

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();

37111

bool Op0Even;

37112

if (!isAddSubOrSubAddMask(Mask, Op0Even))

37113

return false;

37114

37115

// It's a subadd if the vector in the even parity is an FADD.

37116

IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD

37117

: V2->getOpcode() == ISD::FADD;

37118

37119

Opnd0 = LHS;

37120

Opnd1 = RHS;

37121

return true;

37122

}

37123

37124

/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.

37125

static SDValue combineShuffleToFMAddSub(SDNode *N,

37126

const X86Subtarget &Subtarget,

37127

SelectionDAG &DAG) {

37128

// We only handle target-independent shuffles.

37129

// FIXME: It would be easy and harmless to use the target shuffle mask

37130

// extraction tool to support more.

37131

if (N->getOpcode() != ISD::VECTOR_SHUFFLE)

37132

return SDValue();

37133

37134

MVT VT = N->getSimpleValueType(0);

37135

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

37136

if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))

37137

return SDValue();

37138

37139

// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c).

37140

SDValue Op0 = N->getOperand(0);

37141

SDValue Op1 = N->getOperand(1);

37142

SDValue FMAdd = Op0, FMSub = Op1;

37143

if (FMSub.getOpcode() != X86ISD::FMSUB)

37144

std::swap(FMAdd, FMSub);

37145

37146

if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||

37147

FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||

37148

FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||

37149

FMAdd.getOperand(2) != FMSub.getOperand(2))

37150

return SDValue();

37151

37152

// Check for correct shuffle mask.

37153

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();

37154

bool Op0Even;

37155

if (!isAddSubOrSubAddMask(Mask, Op0Even))

37156

return SDValue();

37157

37158

// FMAddSub takes zeroth operand from FMSub node.

37159

SDLoc DL(N);

37160

bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;

37161

unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;

37162

return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),

37163

FMAdd.getOperand(2));

37164

}

37165

37166

/// Try to combine a shuffle into a target-specific add-sub or

37167

/// mul-add-sub node.

37168

static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,

37169

const X86Subtarget &Subtarget,

37170

SelectionDAG &DAG) {

37171

if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))

37172

return V;

37173

37174

SDValue Opnd0, Opnd1;

37175

bool IsSubAdd;

37176

if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))

37177

return SDValue();

37178

37179

MVT VT = N->getSimpleValueType(0);

37180

SDLoc DL(N);

37181

37182

// Try to generate X86ISD::FMADDSUB node here.

37183

SDValue Opnd2;

37184

if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {

37185

unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;

37186

return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);

37187

}

37188

37189

if (IsSubAdd)

37190

return SDValue();

37191

37192

// Do not generate X86ISD::ADDSUB node for 512-bit types even though

37193

// the ADDSUB idiom has been successfully recognized. There are no known

37194

// X86 targets with 512-bit ADDSUB instructions!

37195

if (VT.is512BitVector())

37196

return SDValue();

37197

37198

return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);

37199

}

37200

37201

// We are looking for a shuffle where both sources are concatenated with undef

37202

// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so

37203

// if we can express this as a single-source shuffle, that's preferable.

37204

static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,

37205

const X86Subtarget &Subtarget) {

37206

if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))

37207

return SDValue();

37208

37209

EVT VT = N->getValueType(0);

37210

37211

// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.

37212

if (!VT.is128BitVector() && !VT.is256BitVector())

37213

return SDValue();

37214

37215

if (VT.getVectorElementType() != MVT::i32 &&

37216

VT.getVectorElementType() != MVT::i64 &&

37217

VT.getVectorElementType() != MVT::f32 &&

37218

VT.getVectorElementType() != MVT::f64)

37219

return SDValue();

37220

37221

SDValue N0 = N->getOperand(0);

37222

SDValue N1 = N->getOperand(1);

37223

37224

// Check that both sources are concats with undef.

37225

if (N0.getOpcode() != ISD::CONCAT_VECTORS ||

37226

N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||

37227

N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||

37228

!N1.getOperand(1).isUndef())

37229

return SDValue();

37230

37231

// Construct the new shuffle mask. Elements from the first source retain their

37232

// index, but elements from the second source no longer need to skip an undef.

37233

SmallVector<int, 8> Mask;

37234

int NumElts = VT.getVectorNumElements();

37235

37236

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

37237

for (int Elt : SVOp->getMask())

37238

Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

37239

37240

SDLoc DL(N);

37241

SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),

37242

N1.getOperand(0));

37243

return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);

37244

}

37245

37246

/// Eliminate a redundant shuffle of a horizontal math op.

37247

static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {

37248

unsigned Opcode = N->getOpcode();

37249

if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)

37250

if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())

37251

return SDValue();

37252

37253

// For a broadcast, peek through an extract element of index 0 to find the

37254

// horizontal op: broadcast (ext_vec_elt HOp, 0)

37255

EVT VT = N->getValueType(0);

37256

if (Opcode == X86ISD::VBROADCAST) {

37257

SDValue SrcOp = N->getOperand(0);

37258

if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

37259

SrcOp.getValueType() == MVT::f64 &&

37260

SrcOp.getOperand(0).getValueType() == VT &&

37261

isNullConstant(SrcOp.getOperand(1)))

37262

N = SrcOp.getNode();

37263

}

37264

37265

SDValue HOp = N->getOperand(0);

37266

if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&

37267

HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)

37268

return SDValue();

37269

37270

// 128-bit horizontal math instructions are defined to operate on adjacent

37271

// lanes of each operand as:

37272

// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]

37273

// ...similarly for v2f64 and v8i16.

37274

if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&

37275

HOp.getOperand(0) != HOp.getOperand(1))

37276

return SDValue();

37277

37278

// The shuffle that we are eliminating may have allowed the horizontal op to

37279

// have an undemanded (undefined) operand. Duplicate the other (defined)

37280

// operand to ensure that the results are defined across all lanes without the

37281

// shuffle.

37282

auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {

37283

SDValue X;

37284

if (HorizOp.getOperand(0).isUndef()) {

37285

assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op")((!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op"
) ? static_cast<void> (0) : __assert_fail ("!HorizOp.getOperand(1).isUndef() && \"Not expecting foldable h-op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37285, __PRETTY_FUNCTION__));

37286

X = HorizOp.getOperand(1);

37287

} else if (HorizOp.getOperand(1).isUndef()) {

37288

assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op")((!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op"
) ? static_cast<void> (0) : __assert_fail ("!HorizOp.getOperand(0).isUndef() && \"Not expecting foldable h-op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37288, __PRETTY_FUNCTION__));

37289

X = HorizOp.getOperand(0);

37290

} else {

37291

return HorizOp;

37292

}

37293

return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),

37294

HorizOp.getValueType(), X, X);

37295

};

37296

37297

// When the operands of a horizontal math op are identical, the low half of

37298

// the result is the same as the high half. If a target shuffle is also

37299

// replicating low and high halves (and without changing the type/length of

37300

// the vector), we don't need the shuffle.

37301

if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {

37302

if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {

37303

// movddup (hadd X, X) --> hadd X, X

37304

// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X

37305

assert((HOp.getValueType() == MVT::v2f64 ||(((HOp.getValueType() == MVT::v2f64 || HOp.getValueType() == MVT
::v4f64) && "Unexpected type for h-op") ? static_cast
<void> (0) : __assert_fail ("(HOp.getValueType() == MVT::v2f64 || HOp.getValueType() == MVT::v4f64) && \"Unexpected type for h-op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37306, __PRETTY_FUNCTION__))

37306

HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op")(((HOp.getValueType() == MVT::v2f64 || HOp.getValueType() == MVT
::v4f64) && "Unexpected type for h-op") ? static_cast
<void> (0) : __assert_fail ("(HOp.getValueType() == MVT::v2f64 || HOp.getValueType() == MVT::v4f64) && \"Unexpected type for h-op\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37306, __PRETTY_FUNCTION__));

37307

return updateHOp(HOp, DAG);

37308

}

37309

return SDValue();

37310

}

37311

37312

// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X

37313

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();

37314

// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,

37315

// but this should be tied to whatever horizontal op matching and shuffle

37316

// canonicalization are producing.

37317

if (HOp.getValueSizeInBits() == 128 &&

37318

(isTargetShuffleEquivalent(Mask, {0, 0}) ||

37319

isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||

37320

isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))

37321

return updateHOp(HOp, DAG);

37322

37323

if (HOp.getValueSizeInBits() == 256 &&

37324

(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||

37325

isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||

37326

isTargetShuffleEquivalent(

37327

Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))

37328

return updateHOp(HOp, DAG);

37329

37330

return SDValue();

37331

}

37332

37333

/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the

37334

/// low half of each source vector and does not set any high half elements in

37335

/// the destination vector, narrow the shuffle to half its original size.

37336

static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {

37337

if (!Shuf->getValueType(0).isSimple())

37338

return SDValue();

37339

MVT VT = Shuf->getSimpleValueType(0);

37340

if (!VT.is256BitVector() && !VT.is512BitVector())

37341

return SDValue();

37342

37343

// See if we can ignore all of the high elements of the shuffle.

37344

ArrayRef<int> Mask = Shuf->getMask();

37345

if (!isUndefUpperHalf(Mask))

37346

return SDValue();

37347

37348

// Check if the shuffle mask accesses only the low half of each input vector

37349

// (half-index output is 0 or 2).

37350

int HalfIdx1, HalfIdx2;

37351

SmallVector<int, 8> HalfMask(Mask.size() / 2);

37352

if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||

37353

(HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))

37354

return SDValue();

37355

37356

// Create a half-width shuffle to replace the unnecessarily wide shuffle.

37357

// The trick is knowing that all of the insert/extract are actually free

37358

// subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle

37359

// of narrow inputs into a narrow output, and that is always cheaper than

37360

// the wide shuffle that we started with.

37361

return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),

37362

Shuf->getOperand(1), HalfMask, HalfIdx1,

37363

HalfIdx2, false, DAG, /*UseConcat*/true);

37364

}

37365

37366

/// Try a sequence of X86-specific combines on the shuffle node \p N and return
/// the first replacement that succeeds.
///
/// The attempts are made in this order: narrowing a wide generic shuffle,
/// ADDSUB/FMADDSUB and horizontal-op folds (legal types only), consecutive
/// load/broadcast formation, shuffle-of-concat-undef merging and, for target
/// shuffle opcodes, the target shuffle combines followed by demanded-element
/// simplification.
///
/// \returns The replacement value, SDValue(N, 0) if
///          SimplifyDemandedVectorElts reported a change, or an empty SDValue
///          if nothing applied.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
    if (SDValue V = narrowShuffle(Shuf, DAG))
      return V;

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isTypeLegal(VT)) {
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

    if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
      return HAddSub;
  }

  // Attempt to combine into a vector load/broadcast.
  if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
                                             Subtarget, true))
    return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;

    // Simplify source operands based on shuffle mask.
    // TODO - merge this into combineX86ShufflesRecursively.
    APInt KnownUndef, KnownZero;
    APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
    if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
                                       DCI))
      return SDValue(N, 0);
  }

  return SDValue();
}

37424

37425

// Simplify variable target shuffle masks based on the demanded elements.

37426

// TODO: Handle DemandedBits in mask indices as well?

37427

bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(

37428

SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,

37429

TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {

37430

// If we're demanding all elements don't bother trying to simplify the mask.

37431

unsigned NumElts = DemandedElts.getBitWidth();

37432

if (DemandedElts.isAllOnesValue())

37433

return false;

37434

37435

SDValue Mask = Op.getOperand(MaskIndex);

37436

if (!Mask.hasOneUse())

37437

return false;

37438

37439

// Attempt to generically simplify the variable shuffle mask.

37440

APInt MaskUndef, MaskZero;

37441

if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,

37442

Depth + 1))

37443

return true;

37444

37445

// Attempt to extract+simplify a (constant pool load) shuffle mask.

37446

// TODO: Support other types from getTargetShuffleMaskIndices?

37447

SDValue BC = peekThroughOneUseBitcasts(Mask);

37448

EVT BCVT = BC.getValueType();

37449

auto *Load = dyn_cast<LoadSDNode>(BC);

37450

if (!Load)

37451

return false;

37452

37453

const Constant *C = getTargetConstantFromNode(Load);

37454

if (!C)

37455

return false;

37456

37457

Type *CTy = C->getType();

37458

if (!CTy->isVectorTy() ||

37459

CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())

37460

return false;

37461

37462

// Handle scaling for i64 elements on 32-bit targets.

37463

unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();

37464

if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))

37465

return false;

37466

unsigned Scale = NumCstElts / NumElts;

37467

37468

// Simplify mask if we have an undemanded element that is not undef.

37469

bool Simplified = false;

37470

SmallVector<Constant *, 32> ConstVecOps;

37471

for (unsigned i = 0; i != NumCstElts; ++i) {

37472

Constant *Elt = C->getAggregateElement(i);

37473

if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {

37474

ConstVecOps.push_back(UndefValue::get(Elt->getType()));

37475

Simplified = true;

37476

continue;

37477

}

37478

ConstVecOps.push_back(Elt);

37479

}

37480

if (!Simplified)

37481

return false;

37482

37483

// Generate new constant pool entry + legalize immediately for the load.

37484

SDLoc DL(Op);

37485

SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);

37486

SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);

37487

SDValue NewMask = TLO.DAG.getLoad(

37488

BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,

37489

MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),

37490

Load->getAlign());

37491

return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));

37492

}

37493

37494

bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(

37495

SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,

37496

TargetLoweringOpt &TLO, unsigned Depth) const {

37497

int NumElts = DemandedElts.getBitWidth();

37498

unsigned Opc = Op.getOpcode();

37499

EVT VT = Op.getValueType();

37500

37501

// Handle special case opcodes.

37502

switch (Opc) {

37503

case X86ISD::PMULDQ:

37504

case X86ISD::PMULUDQ: {

37505

APInt LHSUndef, LHSZero;

37506

APInt RHSUndef, RHSZero;

37507

SDValue LHS = Op.getOperand(0);

37508

SDValue RHS = Op.getOperand(1);

37509

if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,

37510

Depth + 1))

37511

return true;

37512

if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,

37513

Depth + 1))

37514

return true;

37515

// Multiply by zero.

37516

KnownZero = LHSZero | RHSZero;

37517

break;

37518

}

37519

case X86ISD::VSHL:

37520

case X86ISD::VSRL:

37521

case X86ISD::VSRA: {

37522

// We only need the bottom 64-bits of the (128-bit) shift amount.

37523

SDValue Amt = Op.getOperand(1);

37524

MVT AmtVT = Amt.getSimpleValueType();

37525

assert(AmtVT.is128BitVector() && "Unexpected value type")((AmtVT.is128BitVector() && "Unexpected value type") ?
static_cast<void> (0) : __assert_fail ("AmtVT.is128BitVector() && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37525, __PRETTY_FUNCTION__));

37526

37527

// If we reuse the shift amount just for sse shift amounts then we know that

37528

// only the bottom 64-bits are only ever used.

37529

bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {

37530

unsigned UseOpc = Use->getOpcode();

37531

return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||

37532

UseOpc == X86ISD::VSRA) &&

37533

Use->getOperand(0) != Amt;

37534

});

37535

37536

APInt AmtUndef, AmtZero;

37537

unsigned NumAmtElts = AmtVT.getVectorNumElements();

37538

APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);

37539

if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,

37540

Depth + 1, AssumeSingleUse))

37541

return true;

37542

LLVM_FALLTHROUGH[[gnu::fallthrough]];

37543

}

37544

case X86ISD::VSHLI:

37545

case X86ISD::VSRLI:

37546

case X86ISD::VSRAI: {

37547

SDValue Src = Op.getOperand(0);

37548

APInt SrcUndef;

37549

if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,

37550

Depth + 1))

37551

return true;

37552

37553

// Aggressively peek through ops to get at the demanded elts.

37554

if (!DemandedElts.isAllOnesValue())

37555

if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(

37556

Src, DemandedElts, TLO.DAG, Depth + 1))

37557

return TLO.CombineTo(

37558

Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));

37559

break;

37560

}

37561

case X86ISD::KSHIFTL: {

37562

SDValue Src = Op.getOperand(0);

37563

auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));

37564

assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount")((Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"
) ? static_cast<void> (0) : __assert_fail ("Amt->getAPIntValue().ult(NumElts) && \"Out of range shift amount\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37564, __PRETTY_FUNCTION__));

37565

unsigned ShiftAmt = Amt->getZExtValue();

37566

37567

if (ShiftAmt == 0)

37568

return TLO.CombineTo(Op, Src);

37569

37570

// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a

37571

// single shift. We can do this if the bottom bits (which are shifted

37572

// out) are never demanded.

37573

if (Src.getOpcode() == X86ISD::KSHIFTR) {

37574

if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {

37575

unsigned C1 = Src.getConstantOperandVal(1);

37576

unsigned NewOpc = X86ISD::KSHIFTL;

37577

int Diff = ShiftAmt - C1;

37578

if (Diff < 0) {

37579

Diff = -Diff;

37580

NewOpc = X86ISD::KSHIFTR;

37581

}

37582

37583

SDLoc dl(Op);

37584

SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);

37585

return TLO.CombineTo(

37586

Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));

37587

}

37588

}

37589

37590

APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);

37591

if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,

37592

Depth + 1))

37593

return true;

37594

37595

KnownUndef <<= ShiftAmt;

37596

KnownZero <<= ShiftAmt;

37597

KnownZero.setLowBits(ShiftAmt);

37598

break;

37599

}

37600

case X86ISD::KSHIFTR: {

37601

SDValue Src = Op.getOperand(0);

37602

auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));

37603

assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount")((Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"
) ? static_cast<void> (0) : __assert_fail ("Amt->getAPIntValue().ult(NumElts) && \"Out of range shift amount\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37603, __PRETTY_FUNCTION__));

37604

unsigned ShiftAmt = Amt->getZExtValue();

37605

37606

if (ShiftAmt == 0)

37607

return TLO.CombineTo(Op, Src);

37608

37609

// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a

37610

// single shift. We can do this if the top bits (which are shifted

37611

// out) are never demanded.

37612

if (Src.getOpcode() == X86ISD::KSHIFTL) {

37613

if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {

37614

unsigned C1 = Src.getConstantOperandVal(1);

37615

unsigned NewOpc = X86ISD::KSHIFTR;

37616

int Diff = ShiftAmt - C1;

37617

if (Diff < 0) {

37618

Diff = -Diff;

37619

NewOpc = X86ISD::KSHIFTL;

37620

}

37621

37622

SDLoc dl(Op);

37623

SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);

37624

return TLO.CombineTo(

37625

Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));

37626

}

37627

}

37628

37629

APInt DemandedSrc = DemandedElts.shl(ShiftAmt);

37630

if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,

37631

Depth + 1))

37632

return true;

37633

37634

KnownUndef.lshrInPlace(ShiftAmt);

37635

KnownZero.lshrInPlace(ShiftAmt);

37636

KnownZero.setHighBits(ShiftAmt);

37637

break;

37638

}

37639

case X86ISD::CVTSI2P:

37640

case X86ISD::CVTUI2P: {

37641

SDValue Src = Op.getOperand(0);

37642

MVT SrcVT = Src.getSimpleValueType();

37643

APInt SrcUndef, SrcZero;

37644

APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());

37645

if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,

37646

Depth + 1))

37647

return true;

37648

break;

37649

}

37650

case X86ISD::PACKSS:

37651

case X86ISD::PACKUS: {

37652

SDValue N0 = Op.getOperand(0);

37653

SDValue N1 = Op.getOperand(1);

37654

37655

APInt DemandedLHS, DemandedRHS;

37656

getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

37657

37658

APInt SrcUndef, SrcZero;

37659

if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,

37660

Depth + 1))

37661

return true;

37662

if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,

37663

Depth + 1))

37664

return true;

37665

37666

// Aggressively peek through ops to get at the demanded elts.

37667

// TODO - we should do this for all target/faux shuffles ops.

37668

if (!DemandedElts.isAllOnesValue()) {

37669

SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,

37670

TLO.DAG, Depth + 1);

37671

SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,

37672

TLO.DAG, Depth + 1);

37673

if (NewN0 || NewN1) {

37674

NewN0 = NewN0 ? NewN0 : N0;

37675

NewN1 = NewN1 ? NewN1 : N1;

37676

return TLO.CombineTo(Op,

37677

TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));

37678

}

37679

}

37680

break;

37681

}

37682

case X86ISD::HADD:

37683

case X86ISD::HSUB:

37684

case X86ISD::FHADD:

37685

case X86ISD::FHSUB: {

37686

APInt DemandedLHS, DemandedRHS;

37687

getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

37688

37689

APInt LHSUndef, LHSZero;

37690

if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,

37691

LHSZero, TLO, Depth + 1))

37692

return true;

37693

APInt RHSUndef, RHSZero;

37694

if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,

37695

RHSZero, TLO, Depth + 1))

37696

return true;

37697

break;

37698

}

37699

case X86ISD::VTRUNC:

37700

case X86ISD::VTRUNCS:

37701

case X86ISD::VTRUNCUS: {

37702

SDValue Src = Op.getOperand(0);

37703

MVT SrcVT = Src.getSimpleValueType();

37704

APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());

37705

APInt SrcUndef, SrcZero;

37706

if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,

37707

Depth + 1))

37708

return true;

37709

KnownZero = SrcZero.zextOrTrunc(NumElts);

37710

KnownUndef = SrcUndef.zextOrTrunc(NumElts);

37711

break;

37712

}

37713

case X86ISD::BLENDV: {

37714

APInt SelUndef, SelZero;

37715

if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,

37716

SelZero, TLO, Depth + 1))

37717

return true;

37718

37719

// TODO: Use SelZero to adjust LHS/RHS DemandedElts.

37720

APInt LHSUndef, LHSZero;

37721

if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,

37722

LHSZero, TLO, Depth + 1))

37723

return true;

37724

37725

APInt RHSUndef, RHSZero;

37726

if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,

37727

RHSZero, TLO, Depth + 1))

37728

return true;

37729

37730

KnownZero = LHSZero & RHSZero;

37731

KnownUndef = LHSUndef & RHSUndef;

37732

break;

37733

}

37734

case X86ISD::VZEXT_MOVL: {

37735

// If upper demanded elements are already zero then we have nothing to do.

37736

SDValue Src = Op.getOperand(0);

37737

APInt DemandedUpperElts = DemandedElts;

37738

DemandedUpperElts.clearLowBits(1);

37739

if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())

37740

return TLO.CombineTo(Op, Src);

37741

break;

37742

}

37743

case X86ISD::VBROADCAST: {

37744

SDValue Src = Op.getOperand(0);

37745

MVT SrcVT = Src.getSimpleValueType();

37746

if (!SrcVT.isVector())

37747

return false;

37748

// Don't bother broadcasting if we just need the 0'th element.

37749

if (DemandedElts == 1) {

37750

if (Src.getValueType() != VT)

37751

Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,

37752

SDLoc(Op));

37753

return TLO.CombineTo(Op, Src);

37754

}

37755

APInt SrcUndef, SrcZero;

37756

APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);

37757

if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,

37758

Depth + 1))

37759

return true;

37760

// Aggressively peek through src to get at the demanded elt.

37761

// TODO - we should do this for all target/faux shuffles ops.

37762

if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(

37763

Src, SrcElts, TLO.DAG, Depth + 1))

37764

return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));

37765

break;

37766

}

37767

case X86ISD::VPERMV:

37768

if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,

37769

Depth))

37770

return true;

37771

break;

37772

case X86ISD::PSHUFB:

37773

case X86ISD::VPERMV3:

37774

case X86ISD::VPERMILPV:

37775

if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,

37776

Depth))

37777

return true;

37778

break;

37779

case X86ISD::VPPERM:

37780

case X86ISD::VPERMIL2:

37781

if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,

37782

Depth))

37783

return true;

37784

break;

37785

}

37786

37787

// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not

37788

// demand any of the high elements, then narrow the op to 128/256-bits: e.g.

37789

// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0

37790

if ((VT.is256BitVector() || VT.is512BitVector()) &&

37791

DemandedElts.lshr(NumElts / 2) == 0) {

37792

unsigned SizeInBits = VT.getSizeInBits();

37793

unsigned ExtSizeInBits = SizeInBits / 2;

37794

37795

// See if 512-bit ops only use the bottom 128-bits.

37796

if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)

37797

ExtSizeInBits = SizeInBits / 4;

37798

37799

switch (Opc) {

37800

// Subvector broadcast.

37801

case X86ISD::SUBV_BROADCAST: {

37802

SDLoc DL(Op);

37803

SDValue Src = Op.getOperand(0);

37804

if (Src.getValueSizeInBits() > ExtSizeInBits)

37805

Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);

37806

else if (Src.getValueSizeInBits() < ExtSizeInBits) {

37807

MVT SrcSVT = Src.getSimpleValueType().getScalarType();

37808

MVT SrcVT =

37809

MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());

37810

Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);

37811

}

37812

return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,

37813

TLO.DAG, DL, ExtSizeInBits));

37814

}

37815

// Byte shifts by immediate.

37816

case X86ISD::VSHLDQ:

37817

case X86ISD::VSRLDQ:

37818

// Shift by uniform.

37819

case X86ISD::VSHL:

37820

case X86ISD::VSRL:

37821

case X86ISD::VSRA:

37822

// Shift by immediate.

37823

case X86ISD::VSHLI:

37824

case X86ISD::VSRLI:

37825

case X86ISD::VSRAI: {

37826

SDLoc DL(Op);

37827

SDValue Ext0 =

37828

extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);

37829

SDValue ExtOp =

37830

TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));

37831

SDValue UndefVec = TLO.DAG.getUNDEF(VT);

37832

SDValue Insert =

37833

insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);

37834

return TLO.CombineTo(Op, Insert);

37835

}

37836

case X86ISD::VPERMI: {

37837

// Simplify PERMPD/PERMQ to extract_subvector.

37838

// TODO: This should be done in shuffle combining.

37839

if (VT == MVT::v4f64 || VT == MVT::v4i64) {

37840

SmallVector<int, 4> Mask;

37841

DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);

37842

if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {

37843

SDLoc DL(Op);

37844

SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);

37845

SDValue UndefVec = TLO.DAG.getUNDEF(VT);

37846

SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);

37847

return TLO.CombineTo(Op, Insert);

37848

}

37849

}

37850

break;

37851

}

37852

// Zero upper elements.

37853

case X86ISD::VZEXT_MOVL:

37854

// Target unary shuffles by immediate:

37855

case X86ISD::PSHUFD:

37856

case X86ISD::PSHUFLW:

37857

case X86ISD::PSHUFHW:

37858

case X86ISD::VPERMILPI:

37859

// (Non-Lane Crossing) Target Shuffles.

37860

case X86ISD::VPERMILPV:

37861

case X86ISD::VPERMIL2:

37862

case X86ISD::PSHUFB:

37863

case X86ISD::UNPCKL:

37864

case X86ISD::UNPCKH:

37865

case X86ISD::BLENDI:

37866

// Integer ops.

37867

case X86ISD::AVG:

37868

case X86ISD::PACKSS:

37869

case X86ISD::PACKUS:

37870

// Horizontal Ops.

37871

case X86ISD::HADD:

37872

case X86ISD::HSUB:

37873

case X86ISD::FHADD:

37874

case X86ISD::FHSUB: {

37875

SDLoc DL(Op);

37876

SmallVector<SDValue, 4> Ops;

37877

for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {

37878

SDValue SrcOp = Op.getOperand(i);

37879

EVT SrcVT = SrcOp.getValueType();

37880

assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&(((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
"Unsupported vector size") ? static_cast<void> (0) : __assert_fail
("(!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && \"Unsupported vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37881, __PRETTY_FUNCTION__))

37881

"Unsupported vector size")(((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
"Unsupported vector size") ? static_cast<void> (0) : __assert_fail
("(!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && \"Unsupported vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37881, __PRETTY_FUNCTION__));

37882

Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,

37883

ExtSizeInBits)

37884

: SrcOp);

37885

}

37886

MVT ExtVT = VT.getSimpleVT();

37887

ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),

37888

ExtSizeInBits / ExtVT.getScalarSizeInBits());

37889

SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);

37890

SDValue UndefVec = TLO.DAG.getUNDEF(VT);

37891

SDValue Insert =

37892

insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);

37893

return TLO.CombineTo(Op, Insert);

37894

}

37895

}

37896

}

37897

37898

// Get target/faux shuffle mask.

37899

APInt OpUndef, OpZero;

37900

SmallVector<int, 64> OpMask;

37901

SmallVector<SDValue, 2> OpInputs;

37902

if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,

37903

OpZero, TLO.DAG, Depth, false))

37904

return false;

37905

37906

// Shuffle inputs must be the same size as the result.

37907

if (OpMask.size() != (unsigned)NumElts ||

37908

llvm::any_of(OpInputs, [VT](SDValue V) {

37909

return VT.getSizeInBits() != V.getValueSizeInBits() ||

37910

!V.getValueType().isVector();

37911

}))

37912

return false;

37913

37914

KnownZero = OpZero;

37915

KnownUndef = OpUndef;

37916

37917

// Check if shuffle mask can be simplified to undef/zero/identity.

37918

int NumSrcs = OpInputs.size();

37919

for (int i = 0; i != NumElts; ++i)

37920

if (!DemandedElts[i])

37921

OpMask[i] = SM_SentinelUndef;

37922

37923

if (isUndefInRange(OpMask, 0, NumElts)) {

37924

KnownUndef.setAllBits();

37925

return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));

37926

}

37927

if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {

37928

KnownZero.setAllBits();

37929

return TLO.CombineTo(

37930

Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));

37931

}

37932

for (int Src = 0; Src != NumSrcs; ++Src)

37933

if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))

37934

return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));

37935

37936

// Attempt to simplify inputs.

37937

for (int Src = 0; Src != NumSrcs; ++Src) {

37938

// TODO: Support inputs of different types.

37939

if (OpInputs[Src].getValueType() != VT)

37940

continue;

37941

37942

int Lo = Src * NumElts;

37943

APInt SrcElts = APInt::getNullValue(NumElts);

37944

for (int i = 0; i != NumElts; ++i)

37945

if (DemandedElts[i]) {

37946

int M = OpMask[i] - Lo;

37947

if (0 <= M && M < NumElts)

37948

SrcElts.setBit(M);

37949

}

37950

37951

// TODO - Propagate input undef/zero elts.

37952

APInt SrcUndef, SrcZero;

37953

if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,

37954

TLO, Depth + 1))

37955

return true;

37956

}

37957

37958

// If we don't demand all elements, then attempt to combine to a simpler

37959

// shuffle.

37960

// We need to convert the depth to something combineX86ShufflesRecursively

37961

// can handle - so pretend its Depth == 0 again, and reduce the max depth

37962

// to match. This prevents combineX86ShuffleChain from returning a

37963

// combined shuffle that's the same as the original root, causing an

37964

// infinite loop.

37965

if (!DemandedElts.isAllOnesValue()) {

37966

assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range")((Depth < X86::MaxShuffleCombineDepth && "Depth out of range"
) ? static_cast<void> (0) : __assert_fail ("Depth < X86::MaxShuffleCombineDepth && \"Depth out of range\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37966, __PRETTY_FUNCTION__));

37967

37968

SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);

37969

for (int i = 0; i != NumElts; ++i)

37970

if (DemandedElts[i])

37971

DemandedMask[i] = i;

37972

37973

SDValue NewShuffle = combineX86ShufflesRecursively(

37974

{Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,

37975

/*HasVarMask*/ false,

37976

/*AllowVarMask*/ true, TLO.DAG, Subtarget);

37977

if (NewShuffle)

37978

return TLO.CombineTo(Op, NewShuffle);

37979

}

37980

37981

return false;

37982

}

37983

37984

bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(

37985

SDValue Op, const APInt &OriginalDemandedBits,

37986

const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,

37987

unsigned Depth) const {

37988

EVT VT = Op.getValueType();

37989

unsigned BitWidth = OriginalDemandedBits.getBitWidth();

37990

unsigned Opc = Op.getOpcode();

37991

switch(Opc) {

37992

case X86ISD::VTRUNC: {

37993

KnownBits KnownOp;

37994

SDValue Src = Op.getOperand(0);

37995

MVT SrcVT = Src.getSimpleValueType();

37996

37997

// Simplify the input, using demanded bit information.

37998

APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());

37999

APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());

38000

if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))

38001

return true;

38002

break;

38003

}

38004

case X86ISD::PMULDQ:

38005

case X86ISD::PMULUDQ: {

38006

// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.

38007

KnownBits KnownOp;

38008

SDValue LHS = Op.getOperand(0);

38009

SDValue RHS = Op.getOperand(1);

38010

// FIXME: Can we bound this better?

38011

APInt DemandedMask = APInt::getLowBitsSet(64, 32);

38012

if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,

38013

TLO, Depth + 1))

38014

return true;

38015

if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,

38016

TLO, Depth + 1))

38017

return true;

38018

38019

// Aggressively peek through ops to get at the demanded low bits.

38020

SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(

38021

LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);

38022

SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(

38023

RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);

38024

if (DemandedLHS || DemandedRHS) {

38025

DemandedLHS = DemandedLHS ? DemandedLHS : LHS;

38026

DemandedRHS = DemandedRHS ? DemandedRHS : RHS;

38027

return TLO.CombineTo(

38028

Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));

38029

}

38030

break;

38031

}

38032

case X86ISD::VSHLI: {

38033

SDValue Op0 = Op.getOperand(0);

38034

38035

unsigned ShAmt = Op.getConstantOperandVal(1);

38036

if (ShAmt >= BitWidth)

38037

break;

38038

38039

APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);

38040

38041

// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a

38042

// single shift. We can do this if the bottom bits (which are shifted

38043

// out) are never demanded.

38044

if (Op0.getOpcode() == X86ISD::VSRLI &&

38045

OriginalDemandedBits.countTrailingZeros() >= ShAmt) {

38046

unsigned Shift2Amt = Op0.getConstantOperandVal(1);

38047

if (Shift2Amt < BitWidth) {

38048

int Diff = ShAmt - Shift2Amt;

38049

if (Diff == 0)

38050

return TLO.CombineTo(Op, Op0.getOperand(0));

38051

38052

unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;

38053

SDValue NewShift = TLO.DAG.getNode(

38054

NewOpc, SDLoc(Op), VT, Op0.getOperand(0),

38055

TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));

38056

return TLO.CombineTo(Op, NewShift);

38057

}

38058

}

38059

38060

// If we are only demanding sign bits then we can use the shift source directly.

38061

unsigned NumSignBits =

38062

TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);

38063

unsigned UpperDemandedBits =

38064

BitWidth - OriginalDemandedBits.countTrailingZeros();

38065

if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)

38066

return TLO.CombineTo(Op, Op0);

38067

38068

if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,

38069

TLO, Depth + 1))

38070

return true;

38071

38072

assert(!Known.hasConflict() && "Bits known to be one AND zero?")((!Known.hasConflict() && "Bits known to be one AND zero?"
) ? static_cast<void> (0) : __assert_fail ("!Known.hasConflict() && \"Bits known to be one AND zero?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38072, __PRETTY_FUNCTION__));

38073

Known.Zero <<= ShAmt;

38074

Known.One <<= ShAmt;

38075

38076

// Low bits known zero.

38077

Known.Zero.setLowBits(ShAmt);

38078

break;

38079

}

38080

case X86ISD::VSRLI: {

38081

unsigned ShAmt = Op.getConstantOperandVal(1);

38082

if (ShAmt >= BitWidth)

38083

break;

38084

38085

APInt DemandedMask = OriginalDemandedBits << ShAmt;

38086

38087

if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,

38088

OriginalDemandedElts, Known, TLO, Depth + 1))

38089

return true;

38090

38091

assert(!Known.hasConflict() && "Bits known to be one AND zero?")((!Known.hasConflict() && "Bits known to be one AND zero?"
) ? static_cast<void> (0) : __assert_fail ("!Known.hasConflict() && \"Bits known to be one AND zero?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38091, __PRETTY_FUNCTION__));

38092

Known.Zero.lshrInPlace(ShAmt);

38093

Known.One.lshrInPlace(ShAmt);

38094

38095

// High bits known zero.

38096

Known.Zero.setHighBits(ShAmt);

38097

break;

38098

}

38099

case X86ISD::VSRAI: {

38100

SDValue Op0 = Op.getOperand(0);

38101

SDValue Op1 = Op.getOperand(1);

38102

38103

unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();

38104

if (ShAmt >= BitWidth)

38105

break;

38106

38107

APInt DemandedMask = OriginalDemandedBits << ShAmt;

38108

38109

// If we just want the sign bit then we don't need to shift it.

38110

if (OriginalDemandedBits.isSignMask())

38111

return TLO.CombineTo(Op, Op0);

38112

38113

// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1

38114

if (Op0.getOpcode() == X86ISD::VSHLI &&

38115

Op.getOperand(1) == Op0.getOperand(1)) {

38116

SDValue Op00 = Op0.getOperand(0);

38117

unsigned NumSignBits =

38118

TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);

38119

if (ShAmt < NumSignBits)

38120

return TLO.CombineTo(Op, Op00);

38121

}

38122

38123

// If any of the demanded bits are produced by the sign extension, we also

38124

// demand the input sign bit.

38125

if (OriginalDemandedBits.countLeadingZeros() < ShAmt)

38126

DemandedMask.setSignBit();

38127

38128

if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,

38129

TLO, Depth + 1))

38130

return true;

38131

38132

assert(!Known.hasConflict() && "Bits known to be one AND zero?")((!Known.hasConflict() && "Bits known to be one AND zero?"
) ? static_cast<void> (0) : __assert_fail ("!Known.hasConflict() && \"Bits known to be one AND zero?\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38132, __PRETTY_FUNCTION__));

38133

Known.Zero.lshrInPlace(ShAmt);

38134

Known.One.lshrInPlace(ShAmt);

38135

38136

// If the input sign bit is known to be zero, or if none of the top bits

38137

// are demanded, turn this into an unsigned shift right.

38138

if (Known.Zero[BitWidth - ShAmt - 1] ||

38139

OriginalDemandedBits.countLeadingZeros() >= ShAmt)

38140

return TLO.CombineTo(

38141

Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));

38142

38143

// High bits are known one.

38144

if (Known.One[BitWidth - ShAmt - 1])

38145

Known.One.setHighBits(ShAmt);

38146

break;

38147

}

38148

case X86ISD::PEXTRB:

38149

case X86ISD::PEXTRW: {

38150

SDValue Vec = Op.getOperand(0);

38151

auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));

38152

MVT VecVT = Vec.getSimpleValueType();

38153

unsigned NumVecElts = VecVT.getVectorNumElements();

38154

38155

if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {

38156

unsigned Idx = CIdx->getZExtValue();

38157

unsigned VecBitWidth = VecVT.getScalarSizeInBits();

38158

38159

// If we demand no bits from the vector then we must have demanded

38160

// bits from the implict zext - simplify to zero.

38161

APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);

38162

if (DemandedVecBits == 0)

38163

return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

38164

38165

APInt KnownUndef, KnownZero;

38166

APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);

38167

if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,

38168

KnownZero, TLO, Depth + 1))

38169

return true;

38170

38171

KnownBits KnownVec;

38172

if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,

38173

KnownVec, TLO, Depth + 1))

38174

return true;

38175

38176

if (SDValue V = SimplifyMultipleUseDemandedBits(

38177

Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))

38178

return TLO.CombineTo(

38179

Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));

38180

38181

Known = KnownVec.zext(BitWidth);

38182

return false;

38183

}

38184

break;

38185

}

38186

case X86ISD::PINSRB:

38187

case X86ISD::PINSRW: {

38188

SDValue Vec = Op.getOperand(0);

38189

SDValue Scl = Op.getOperand(1);

38190

auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));

38191

MVT VecVT = Vec.getSimpleValueType();

38192

38193

if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {

38194

unsigned Idx = CIdx->getZExtValue();

38195

if (!OriginalDemandedElts[Idx])

38196

return TLO.CombineTo(Op, Vec);

38197

38198

KnownBits KnownVec;

38199

APInt DemandedVecElts(OriginalDemandedElts);

38200

DemandedVecElts.clearBit(Idx);

38201

if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,

38202

KnownVec, TLO, Depth + 1))

38203

return true;

38204

38205

KnownBits KnownScl;

38206

unsigned NumSclBits = Scl.getScalarValueSizeInBits();

38207

APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);

38208

if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))

38209

return true;

38210

38211

KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());

38212

Known.One = KnownVec.One & KnownScl.One;

38213

Known.Zero = KnownVec.Zero & KnownScl.Zero;

38214

return false;

38215

}

38216

break;

38217

}

38218

case X86ISD::PACKSS:

38219

// PACKSS saturates to MIN/MAX integer values. So if we just want the

38220

// sign bit then we can just ask for the source operands sign bit.

38221

// TODO - add known bits handling.

38222

if (OriginalDemandedBits.isSignMask()) {

38223

APInt DemandedLHS, DemandedRHS;

38224

getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);

38225

38226

KnownBits KnownLHS, KnownRHS;

38227

APInt SignMask = APInt::getSignMask(BitWidth * 2);

38228

if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,

38229

KnownLHS, TLO, Depth + 1))

38230

return true;

38231

if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,

38232

KnownRHS, TLO, Depth + 1))

38233

return true;

38234

38235

// Attempt to avoid multi-use ops if we don't need anything from them.

38236

SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(

38237

Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);

38238

SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(

38239

Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);

38240

if (DemandedOp0 || DemandedOp1) {

38241

SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);

38242

SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);

38243

return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));

38244

}

38245

}

38246

// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.

38247

break;

38248

case X86ISD::PCMPGT:

38249

// icmp sgt(0, R) == ashr(R, BitWidth-1).

38250

// iff we only need the sign bit then we can use R directly.

38251

if (OriginalDemandedBits.isSignMask() &&

38252

ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))

38253

return TLO.CombineTo(Op, Op.getOperand(1));

38254

break;

38255

case X86ISD::MOVMSK: {

38256

SDValue Src = Op.getOperand(0);

38257

MVT SrcVT = Src.getSimpleValueType();

38258

unsigned SrcBits = SrcVT.getScalarSizeInBits();

38259

unsigned NumElts = SrcVT.getVectorNumElements();

38260

38261

// If we don't need the sign bits at all just return zero.

38262

if (OriginalDemandedBits.countTrailingZeros() >= NumElts)

38263

return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

38264

38265

// Only demand the vector elements of the sign bits we need.

38266

APInt KnownUndef, KnownZero;

38267

APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);

38268

if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,

38269

TLO, Depth + 1))

38270

return true;

38271

38272

Known.Zero = KnownZero.zextOrSelf(BitWidth);

38273

Known.Zero.setHighBits(BitWidth - NumElts);

38274

38275

// MOVMSK only uses the MSB from each vector element.

38276

KnownBits KnownSrc;

38277

APInt DemandedSrcBits = APInt::getSignMask(SrcBits);

38278

if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,

38279

Depth + 1))

38280

return true;

38281

38282

if (KnownSrc.One[SrcBits - 1])

38283

Known.One.setLowBits(NumElts);

38284

else if (KnownSrc.Zero[SrcBits - 1])

38285

Known.Zero.setLowBits(NumElts);

38286

38287

// Attempt to avoid multi-use os if we don't need anything from it.

38288

if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(

38289

Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))

38290

return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));

38291

return false;

38292

}

38293

case X86ISD::BEXTR: {

38294

SDValue Op0 = Op.getOperand(0);

38295

SDValue Op1 = Op.getOperand(1);

38296

38297

// Only bottom 16-bits of the control bits are required.

38298

if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {

38299

// NOTE: SimplifyDemandedBits won't do this for constants.

38300

const APInt &Val1 = Cst1->getAPIntValue();

38301

APInt MaskedVal1 = Val1 & 0xFFFF;

38302

if (MaskedVal1 != Val1) {

38303

SDLoc DL(Op);

38304

return TLO.CombineTo(

38305

Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,

38306

TLO.DAG.getConstant(MaskedVal1, DL, VT)));

38307

}

38308

}

38309

38310

KnownBits Known1;

38311

APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));

38312

if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))

38313

return true;

38314

38315

// If the length is 0, replace with 0.

38316

KnownBits LengthBits = Known1.extractBits(8, 8);

38317

if (LengthBits.isZero())

38318

return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

38319

38320

break;

38321

}

38322

}

38323

38324

return TargetLowering::SimplifyDemandedBitsForTargetNode(

38325

Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);

38326

}

38327

38328

// Tries to find an already-available value that can stand in for Op when only
// DemandedBits of the DemandedElts lanes are needed. Nothing is rewritten in
// place; the replacement value (an operand of Op, undef, a zero vector, or a
// bitcast of a shuffle input) is returned. When no X86-specific shortcut
// matches, the generic TargetLowering implementation is consulted.
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    SelectionDAG &DAG, unsigned Depth) const {
  const int NumElts = DemandedElts.getBitWidth();
  const unsigned Opc = Op.getOpcode();
  const EVT VT = Op.getValueType();

  switch (Opc) {
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    // An insertion into a lane that is not demanded changes nothing we care
    // about, so the vector being inserted into can be used as-is.
    SDValue BaseVec = Op.getOperand(0);
    const unsigned NumBaseElts =
        BaseVec.getSimpleValueType().getVectorNumElements();
    if (auto *InsertIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)))
      if (InsertIdx->getAPIntValue().ult(NumBaseElts) &&
          !DemandedElts[InsertIdx->getZExtValue()])
        return BaseVec;
    break;
  }
  case X86ISD::VSHLI: {
    // When every demanded bit is still a copy of the sign bit after the
    // shift, the unshifted source already provides the same bits.
    SDValue ShiftSrc = Op.getOperand(0);
    const unsigned ShiftAmt = Op.getConstantOperandVal(1);
    const unsigned KnownSignBits =
        DAG.ComputeNumSignBits(ShiftSrc, DemandedElts, Depth + 1);
    // Width of the span running from the lowest demanded bit up to the MSB.
    const unsigned DemandedTopBits =
        DemandedBits.getBitWidth() - DemandedBits.countTrailingZeros();
    if (KnownSignBits > ShiftAmt &&
        KnownSignBits - ShiftAmt >= DemandedTopBits)
      return ShiftSrc;
    break;
  }
  case X86ISD::VSRAI:
    // Only the sign bit is wanted: the shift source can be used directly.
    // TODO: generalize where we only demand extended signbits.
    if (DemandedBits.isSignMask())
      return Op.getOperand(0);
    break;
  case X86ISD::PCMPGT:
    // icmp sgt(0, R) == ashr(R, BitWidth-1), so if only the sign bit is
    // demanded R can be used directly.
    if (!DemandedBits.isSignMask())
      break;
    if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
      return Op.getOperand(1);
    break;
  default:
    break;
  }

  // Generic handling for anything decodable as a target shuffle.
  APInt UndefElts, ZeroElts;
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Inputs;
  const bool IsTargetShuffle =
      getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, UndefElts,
                             ZeroElts, DAG, Depth, false);

  auto HasResultWidth = [&VT](SDValue In) {
    return VT.getSizeInBits() == In.getValueSizeInBits();
  };

  // The per-lane reasoning below needs one mask entry per demanded-element
  // bit and inputs that are exactly as wide as the result.
  if (IsTargetShuffle && Mask.size() == static_cast<unsigned>(NumElts) &&
      llvm::all_of(Inputs, HasResultWidth)) {
    if (DemandedElts.isSubsetOf(UndefElts))
      return DAG.getUNDEF(VT);
    if (DemandedElts.isSubsetOf(UndefElts | ZeroElts))
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));

    // One bit per shuffle input; a bit stays set while every demanded lane
    // seen so far reads that input at its own lane index ('inline').
    const int NumInputs = Inputs.size();
    APInt IdentityOp = APInt::getAllOnesValue(NumInputs);
    for (int Lane = 0; Lane != NumElts; ++Lane) {
      if (!DemandedElts[Lane] || UndefElts[Lane])
        continue;
      const int M = Mask[Lane];
      // Negative (sentinel) entries and lane-crossing entries rule out any
      // identity input.
      if (M < 0 || (M % NumElts) != Lane) {
        IdentityOp.clearAllBits();
        break;
      }
      IdentityOp &= APInt::getOneBitSet(NumInputs, M / NumElts);
      if (IdentityOp == 0)
        break;
    }
    assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
           "Multiple identity shuffles detected");

    if (IdentityOp != 0)
      return DAG.getBitcast(VT, Inputs[IdentityOp.countTrailingZeros()]);
  }

  return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
      Op, DemandedBits, DemandedElts, DAG, Depth);
}

38419

38420

// Helper to peek through bitops/setcc to determine size of source vector.

38421

// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.

38422

static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {

38423

switch (Src.getOpcode()) {

38424

case ISD::SETCC:

38425

return Src.getOperand(0).getValueSizeInBits() == Size;

38426

case ISD::AND:

38427

case ISD::XOR:

38428

case ISD::OR:

38429

return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&

38430

checkBitcastSrcVectorSize(Src.getOperand(1), Size);

38431

}

38432

return false;

38433

}

38434

38435

// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.

38436

static unsigned getAltBitOpcode(unsigned Opcode) {

38437

switch(Opcode) {

38438

case ISD::AND: return X86ISD::FAND;

38439

case ISD::OR: return X86ISD::FOR;

38440

case ISD::XOR: return X86ISD::FXOR;

38441

case X86ISD::ANDNP: return X86ISD::FANDN;

38442

}

38443

llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38443);

38444

}

38445

38446

// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.

38447

static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,

38448

const SDLoc &DL) {

38449

EVT SrcVT = Src.getValueType();

38450

if (SrcVT != MVT::v4i1)

38451

return SDValue();

38452

38453

switch (Src.getOpcode()) {

38454

case ISD::SETCC:

38455

if (Src.getOperand(0).getValueType() == MVT::v4i32 &&

38456

ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&

38457

cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {

38458

SDValue Op0 = Src.getOperand(0);

38459

if (ISD::isNormalLoad(Op0.getNode()))

38460

return DAG.getBitcast(MVT::v4f32, Op0);

38461

if (Op0.getOpcode() == ISD::BITCAST &&

38462

Op0.getOperand(0).getValueType() == MVT::v4f32)

38463

return Op0.getOperand(0);

38464

}

38465

break;

38466

case ISD::AND:

38467

case ISD::XOR:

38468

case ISD::OR: {

38469

SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);

38470

SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);

38471

if (Op0 && Op1)

38472

return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,

38473

Op1);

38474

break;

38475

}

38476

}

38477

return SDValue();

38478

}

38479

38480

// Helper to push sign extension of vXi1 SETCC result through bitops.

38481

static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,

38482

SDValue Src, const SDLoc &DL) {

38483

switch (Src.getOpcode()) {

38484

case ISD::SETCC:

38485

return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

38486

case ISD::AND:

38487

case ISD::XOR:

38488

case ISD::OR:

38489

return DAG.getNode(

38490

Src.getOpcode(), DL, SExtVT,

38491

signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),

38492

signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));

38493

}

38494

llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38494);

38495

}

38496

38497

// Try to match patterns such as

38498

// (i16 bitcast (v16i1 x))

38499

// ->

38500

// (i16 movmsk (16i8 sext (v16i1 x)))

38501

// before the illegal vector is scalarized on subtargets that don't have legal

38502

// vxi1 types.

38503

static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,

38504

const SDLoc &DL,

38505

const X86Subtarget &Subtarget) {

38506

EVT SrcVT = Src.getValueType();

38507

if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)

38508

return SDValue();

38509

38510

// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type

38511

// legalization destroys the v4i32 type.

38512

if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {

38513

if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {

38514

V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,

38515

DAG.getBitcast(MVT::v4f32, V));

38516

return DAG.getZExtOrTrunc(V, DL, VT);

38517

}

38518

}

38519

38520

// If the input is a truncate from v16i8 or v32i8 go ahead and use a

38521

// movmskb even with avx512. This will be better than truncating to vXi1 and

38522

// using a kmov. This can especially help KNL if the input is a v16i8/v32i8

38523

// vpcmpeqb/vpcmpgtb.

38524

bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&

38525

(Src.getOperand(0).getValueType() == MVT::v16i8 ||

38526

Src.getOperand(0).getValueType() == MVT::v32i8 ||

38527

Src.getOperand(0).getValueType() == MVT::v64i8);

38528

38529

// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled

38530

// directly with vpmovmskb/vmovmskps/vmovmskpd.

38531

if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&

38532

cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&

38533

ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {

38534

EVT CmpVT = Src.getOperand(0).getValueType();

38535

EVT EltVT = CmpVT.getVectorElementType();

38536

if (CmpVT.getSizeInBits() <= 256 &&

38537

(EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))

38538

PreferMovMsk = true;

38539

}

38540

38541

// With AVX512 vxi1 types are legal and we prefer using k-regs.

38542

// MOVMSK is supported in SSE2 or later.

38543

if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))

38544

return SDValue();

38545

38546

// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and

38547

// v8f64. So all legal 128-bit and 256-bit vectors are covered except for

38548

// v8i16 and v16i16.

38549

// For these two cases, we can shuffle the upper element bytes to a

38550

// consecutive sequence at the start of the vector and treat the results as

38551

// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,

38552

// for v16i16 this is not the case, because the shuffle is expensive, so we

38553

// avoid sign-extending to this type entirely.

38554

// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:

38555

// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)

38556

MVT SExtVT;

38557

bool PropagateSExt = false;

38558

switch (SrcVT.getSimpleVT().SimpleTy) {

38559

default:

38560

return SDValue();

38561

case MVT::v2i1:

38562

SExtVT = MVT::v2i64;

38563

break;

38564

case MVT::v4i1:

38565

SExtVT = MVT::v4i32;

38566

// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))

38567

// sign-extend to a 256-bit operation to avoid truncation.

38568

if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {

38569

SExtVT = MVT::v4i64;

38570

PropagateSExt = true;

38571

}

38572

break;

38573

case MVT::v8i1:

38574

SExtVT = MVT::v8i16;

38575

// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),

38576

// sign-extend to a 256-bit operation to match the compare.

38577

// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over

38578

// 256-bit because the shuffle is cheaper than sign extending the result of

38579

// the compare.

38580

if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||

38581

checkBitcastSrcVectorSize(Src, 512))) {

38582

SExtVT = MVT::v8i32;

38583

PropagateSExt = true;

38584

}

38585

break;

38586

case MVT::v16i1:

38587

SExtVT = MVT::v16i8;

38588

// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),

38589

// it is not profitable to sign-extend to 256-bit because this will

38590

// require an extra cross-lane shuffle which is more expensive than

38591

// truncating the result of the compare to 128-bits.

38592

break;

38593

case MVT::v32i1:

38594

SExtVT = MVT::v32i8;

38595

break;

38596

case MVT::v64i1:

38597

// If we have AVX512F, but not AVX512BW and the input is truncated from

38598

// v64i8 checked earlier. Then split the input and make two pmovmskbs.

38599

if (Subtarget.hasAVX512()) {

38600

if (Subtarget.hasBWI())

38601

return SDValue();

38602

SExtVT = MVT::v64i8;

38603

break;

38604

}

38605

// Split if this is a <64 x i8> comparison result.

38606

if (checkBitcastSrcVectorSize(Src, 512)) {

38607

SExtVT = MVT::v64i8;

38608

break;

38609

}

38610

return SDValue();

38611

};

38612

38613

SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)

38614

: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

38615

38616

if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {

38617

V = getPMOVMSKB(DL, V, DAG, Subtarget);

38618

} else {

38619

if (SExtVT == MVT::v8i16)

38620

V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,

38621

DAG.getUNDEF(MVT::v8i16));

38622

V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);

38623

}

38624

38625

EVT IntVT =

38626

EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());

38627

V = DAG.getZExtOrTrunc(V, DL, IntVT);

38628

return DAG.getBitcast(VT, V);

38629

}

38630

38631

// Convert a vXi1 constant build vector to the same width scalar integer.

38632

static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {

38633

EVT SrcVT = Op.getValueType();

38634

assert(SrcVT.getVectorElementType() == MVT::i1 &&((SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"
) ? static_cast<void> (0) : __assert_fail ("SrcVT.getVectorElementType() == MVT::i1 && \"Expected a vXi1 vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38635, __PRETTY_FUNCTION__))

38635

"Expected a vXi1 vector")((SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"
) ? static_cast<void> (0) : __assert_fail ("SrcVT.getVectorElementType() == MVT::i1 && \"Expected a vXi1 vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38635, __PRETTY_FUNCTION__));

38636

assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&((ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector") ? static_cast<void>
(0) : __assert_fail ("ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && \"Expected a constant build vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38637, __PRETTY_FUNCTION__))

38637

"Expected a constant build vector")((ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector") ? static_cast<void>
(0) : __assert_fail ("ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && \"Expected a constant build vector\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38637, __PRETTY_FUNCTION__));

38638

38639

APInt Imm(SrcVT.getVectorNumElements(), 0);

38640

for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {

38641

SDValue In = Op.getOperand(Idx);

38642

if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))

38643

Imm.setBit(Idx);

38644

}

38645

EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());

38646

return DAG.getConstant(Imm, SDLoc(Op), IntVT);

38647

}

38648

38649

// For a BITCAST N between a vXi1 mask type and a scalar integer whose operand
// is a single-use AND/OR/XOR, try to perform the logic op in the bitcast's
// destination type instead. This is done when one logic operand is itself a
// single-use bitcast from the destination type, or when the right-hand
// operand is a constant build vector. Only runs before operation
// legalization and only with AVX512. Returns an empty SDValue otherwise.
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

  // Only do this early, and only if we have k-registers.
  if (!DCI.isBeforeLegalizeOps() || !Subtarget.hasAVX512())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  EVT SrcVT = Op.getValueType();

  if (!Op.hasOneUse())
    return SDValue();

  // Look for logic ops.
  const unsigned LogicOpc = Op.getOpcode();
  const bool IsLogicOp =
      LogicOpc == ISD::AND || LogicOpc == ISD::OR || LogicOpc == ISD::XOR;
  if (!IsLogicOp)
    return SDValue();

  // Make sure we have a bitcast between mask registers and a scalar type,
  // in either direction.
  auto IsMaskVT = [](EVT Ty) {
    return Ty.isVector() && Ty.getVectorElementType() == MVT::i1;
  };
  const bool MaskToScalar = IsMaskVT(SrcVT) && DstVT.isScalarInteger();
  const bool ScalarToMask = IsMaskVT(DstVT) && SrcVT.isScalarInteger();
  if (!MaskToScalar && !ScalarToMask)
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc DL(N);

  // True when Operand is a single-use bitcast whose source already has the
  // destination type, so the cast can simply be peeled off.
  auto IsCastFromDstVT = [&DstVT](SDValue Operand) {
    return Operand.hasOneUse() && Operand.getOpcode() == ISD::BITCAST &&
           Operand.getOperand(0).getValueType() == DstVT;
  };

  if (IsCastFromDstVT(LHS)) {
    SDValue CastRHS = DAG.getBitcast(DstVT, RHS);
    return DAG.getNode(LogicOpc, DL, DstVT, LHS.getOperand(0), CastRHS);
  }

  if (IsCastFromDstVT(RHS)) {
    SDValue CastLHS = DAG.getBitcast(DstVT, LHS);
    return DAG.getNode(LogicOpc, DL, DstVT, CastLHS, RHS.getOperand(0));
  }

  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
  // Most of these have to move a constant from the scalar domain anyway.
  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
    SDValue ScalarRHS = combinevXi1ConstantToInteger(RHS, DAG);
    return DAG.getNode(LogicOpc, DL, DstVT, DAG.getBitcast(DstVT, LHS),
                       ScalarRHS);
  }

  return SDValue();
}

38704

38705

// Builds an x86mmx value from the operands of build vector BV.
//
// Each operand is first moved into the low part of its own MMX value. A splat
// on an SSE1-capable subtarget is then broadcast with (PUNPCKLBW +) PSHUFW;
// everything else is merged pairwise with a tree of PUNPCKL intrinsics.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(BV);
  const unsigned NumElts = BV->getNumOperands();
  SDValue Splat = BV->getSplatValue();

  // Build MMX element from integer GPR or SSE float values.
  auto CreateMMXElement = [&](SDValue Elt) {
    if (Elt.isUndef())
      return DAG.getUNDEF(MVT::x86mmx);

    if (!Elt.getValueType().isFloatingPoint()) {
      Elt = DAG.getAnyExtOrTrunc(Elt, DL, MVT::i32);
      return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, Elt);
    }

    // Non-constant floats on SSE1 targets go through an XMM value and
    // MOVDQ2Q.
    if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(Elt)) {
      Elt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Elt);
      Elt = DAG.getBitcast(MVT::v2i64, Elt);
      return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, Elt);
    }

    // Remaining floats are reinterpreted as i32 and moved like integers.
    Elt = DAG.getBitcast(MVT::i32, Elt);
    return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, Elt);
  };

  // Convert build vector ops to MMX data in the bottom elements.
  SmallVector<SDValue, 8> Ops;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Intrinsic IDs are passed as target constants of pointer type.
  const auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  if (!Splat) {
    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
      Ops.push_back(CreateMMXElement(BV->getOperand(Idx)));
  } else {
    // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
    if (Splat.isUndef())
      return DAG.getUNDEF(MVT::x86mmx);

    Splat = CreateMMXElement(Splat);

    if (Subtarget.hasSSE1()) {
      // Unpack v8i8 to splat i8 elements to lowest 16-bits.
      if (NumElts == 8)
        Splat = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
            DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL, PtrVT),
            Splat, Splat);

      // Use PSHUFW to repeat 16-bit elements.
      const unsigned ShufMask = NumElts > 2 ? 0 : 0x44;
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
          DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, PtrVT), Splat,
          DAG.getTargetConstant(ShufMask, DL, MVT::i8));
    }

    Ops.append(NumElts, Splat);
  }

  // Use tree of PUNPCKLs to build up general MMX vector: every round merges
  // neighbouring pairs and halves the number of pending values.
  while (Ops.size() > 1) {
    const unsigned NumOps = Ops.size();

    unsigned IntrinOp;
    switch (NumOps) {
    case 2:
      IntrinOp = Intrinsic::x86_mmx_punpckldq;
      break;
    case 4:
      IntrinOp = Intrinsic::x86_mmx_punpcklwd;
      break;
    default:
      IntrinOp = Intrinsic::x86_mmx_punpcklbw;
      break;
    }

    SDValue Intrin = DAG.getTargetConstant(IntrinOp, DL, PtrVT);
    for (unsigned Lo = 0; Lo != NumOps; Lo += 2)
      Ops[Lo / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
                                Intrin, Ops[Lo], Ops[Lo + 1]);
    Ops.resize(NumOps / 2);
  }

  return Ops[0];
}

38780

38781

// Recursive function that attempts to find if a bool vector node was originally

38782

// a vector/float/double that got truncated/extended/bitcast to/from a scalar

38783

// integer. If so, replace the scalar ops with bool vector equivalents back down

38784

// the chain.

38785

static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,

38786

SelectionDAG &DAG,

38787

const X86Subtarget &Subtarget) {

38788

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

38789

unsigned Opc = V.getOpcode();

38790

switch (Opc) {

38791

case ISD::BITCAST: {

38792

// Bitcast from a vector/float/double, we can cheaply bitcast to VT.

38793

SDValue Src = V.getOperand(0);

38794

EVT SrcVT = Src.getValueType();

38795

if (SrcVT.isVector() || SrcVT.isFloatingPoint())

38796

return DAG.getBitcast(VT, Src);

38797

break;

38798

}

38799

case ISD::TRUNCATE: {

38800

// If we find a suitable source, a truncated scalar becomes a subvector.

38801

SDValue Src = V.getOperand(0);

38802

EVT NewSrcVT =

38803

EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());

38804

if (TLI.isTypeLegal(NewSrcVT))

38805

if (SDValue N0 =

38806

combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))

38807

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,

38808

DAG.getIntPtrConstant(0, DL));

38809

break;

38810

}

38811

case ISD::ANY_EXTEND:

38812

case ISD::ZERO_EXTEND: {

38813

// If we find a suitable source, an extended scalar becomes a subvector.

38814

SDValue Src = V.getOperand(0);

38815

EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,

38816

Src.getScalarValueSizeInBits());

38817

if (TLI.isTypeLegal(NewSrcVT))

38818

if (SDValue N0 =

38819

combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))

38820

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,

38821

Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)

38822

: DAG.getConstant(0, DL, VT),

38823

N0, DAG.getIntPtrConstant(0, DL));

38824

break;

38825

}

38826

case ISD::OR: {

38827

// If we find suitable sources, we can just move an OR to the vector domain.

38828

SDValue Src0 = V.getOperand(0);

38829

SDValue Src1 = V.getOperand(1);

38830

if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))

38831

if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))

38832

return DAG.getNode(Opc, DL, VT, N0, N1);

38833

break;

38834

}

38835

case ISD::SHL: {

38836

// If we find a suitable source, a SHL becomes a KSHIFTL.

38837

SDValue Src0 = V.getOperand(0);

38838

if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||

38839

((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))

38840

break;

38841

38842

if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))

38843

if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))

38844

return DAG.getNode(

38845

X86ISD::KSHIFTL, DL, VT, N0,

38846

DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));

38847

break;

38848

}

38849

}

38850

return SDValue();

38851

}

38852

38853

// Target-specific DAG combine for a bitcast node N. Runs a sequence of
// independent pattern matches (vXi1 <-> scalar integer conversions, broadcast
// load canonicalization, x86mmx sources, MOVMSK sources, and bitcasted integer
// logic ops) and returns the first replacement found, or an empty SDValue when
// none of them applies.
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  if (DCI.isBeforeLegalize()) {
    SDLoc dl(N);

    // Try to match patterns such as
    //   (i16 bitcast (v16i1 x)) -> (i16 movmsk (16i8 sext (v16i1 x)))
    // before the setcc result is scalarized on subtargets that don't have
    // legal vxi1 types.
    if (SDValue MovMsk = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
      return MovMsk;

    const bool HasAVX512 = Subtarget.hasAVX512();

    // Scalar integer -> v4i1/v2i1: widen both sides (any-extend to i8, view as
    // v8i1, extract the low subvector) to avoid a trip through memory.
    if (HasAVX512 && SrcVT.isScalarInteger() &&
        (VT == MVT::v4i1 || VT == MVT::v2i1)) {
      SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
      Wide = DAG.getBitcast(MVT::v8i1, Wide);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Wide,
                         DAG.getIntPtrConstant(0, dl));
    }

    // v4i1/v2i1 -> scalar integer: widen both sides (concatenate up to v8i1,
    // view as i8, truncate) to avoid a trip through memory.
    if (HasAVX512 && VT.isScalarInteger() &&
        (SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1)) {
      // Concatenates Pieces into a v8i1, bitcasts to i8 and truncates to VT.
      auto NarrowFromV8i1 = [&](ArrayRef<SDValue> Pieces) {
        SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Pieces);
        Wide = DAG.getBitcast(MVT::i8, Wide);
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Wide);
      };

      // Use zeros for the widening if we already have some zeroes. This can
      // allow SimplifyDemandedBits to remove scalar ANDs that may be down
      // stream of this.
      // FIXME: It might make sense to detect a concat_vectors with a mix of
      // zeroes and undef and turn it into insert_subvector for i1 vectors as
      // a separate combine. What we can't do is canonicalize the operands of
      // such a concat or we'll get into a loop with SimplifyDemandedBits.
      if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
        SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
        if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
          EVT PieceVT = LastOp.getValueType();
          SmallVector<SDValue, 4> Pieces(N0->op_begin(), N0->op_end());
          Pieces.resize(8 / PieceVT.getVectorNumElements(),
                        DAG.getConstant(0, dl, PieceVT));
          return NarrowFromV8i1(Pieces);
        }
      }

      // Otherwise pad with undef.
      SmallVector<SDValue, 4> Pieces(8 / SrcVT.getVectorNumElements(),
                                     DAG.getUNDEF(SrcVT));
      Pieces[0] = N0;
      return NarrowFromV8i1(Pieces);
    }
  } else if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
             SrcVT.isScalarInteger() &&
             DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    // If we're bitcasting from iX to vXi1, see if the integer originally
    // began as a vXi1 and whether we can remove the bitcast entirely.
    if (SDValue BoolVec =
            combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
      return BoolVec;
  }

  // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
  // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
  // due to insert_subvector legalization on KNL. By promoting the copy to i16
  // we can help with known bits propagation from the vXi1 domain to the
  // scalar domain.
  if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
      !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    SDValue WideMask = N0.getOperand(0);
    if (WideMask.getValueType() == MVT::v16i1 &&
        isNullConstant(N0.getOperand(1)))
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
                         DAG.getBitcast(MVT::i16, WideMask));
  }

  // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
  // and the vbroadcast_load are both integer or both fp. In some cases this
  // will remove the bitcast entirely.
  if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
      VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
    auto *BCast = cast<MemIntrinsicSDNode>(N0);
    unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
    unsigned MemEltBits = BCast->getMemoryVT().getScalarSizeInBits();
    // Don't swap i8/i16 since don't have fp types that size.
    if (MemEltBits >= 32) {
      // Pick the scalar type of the given width in the domain of VT.
      const bool WantFP = VT.isFloatingPoint();
      auto ScalarVTInDstDomain = [WantFP](unsigned Bits) {
        return WantFP ? MVT::getFloatingPointVT(Bits)
                      : MVT::getIntegerVT(Bits);
      };
      MVT MemVT = ScalarVTInDstDomain(MemEltBits);
      MVT LoadVT = MVT::getVectorVT(ScalarVTInDstDomain(SrcEltBits),
                                    SrcVT.getVectorNumElements());

      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
      SDValue Ops[] = {BCast->getChain(), BCast->getBasePtr()};
      SDValue NewBCast =
          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
                                  MemVT, BCast->getMemOperand());
      // Redirect users of the old chain result to the new node's chain.
      DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), NewBCast.getValue(1));
      return DAG.getBitcast(VT, NewBCast);
    }
  }

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.
  if (VT == MVT::x86mmx) {
    // Detect MMX constant vectors.
    APInt UndefElts;
    SmallVector<APInt, 1> EltBits;
    if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
      SDLoc DL(N0);
      const APInt &Bits64 = EltBits[0];
      // Handle zero-extension of i32 with MOVD.
      if (Bits64.countLeadingZeros() >= 32)
        return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
                           DAG.getConstant(Bits64.trunc(32), DL, MVT::i32));
      // Else, bitcast to a double.
      // TODO - investigate supporting sext 32-bit immediates on x86_64.
      APFloat F64(APFloat::IEEEdouble(), Bits64);
      return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
    }

    const bool SrcIsBuildVector = N0.getOpcode() == ISD::BUILD_VECTOR;

    // Detect bitcasts to x86mmx low word.
    if (SrcIsBuildVector &&
        (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
        N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
      // LowUndef: every element after the first in the low half is undef.
      // AllUndefOrZero: every element after the first is undef or zero.
      bool LowUndef = true, AllUndefOrZero = true;
      for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
        SDValue Elt = N0.getOperand(i);
        if (Elt.isUndef())
          continue;
        if (i < e / 2)
          LowUndef = false;
        if (!isNullConstant(Elt))
          AllUndefOrZero = false;
      }
      if (AllUndefOrZero) {
        SDValue Lowest = N0.getOperand(0);
        SDLoc dl(Lowest);
        Lowest = LowUndef ? DAG.getAnyExtOrTrunc(Lowest, dl, MVT::i32)
                          : DAG.getZExtOrTrunc(Lowest, dl, MVT::i32);
        return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, Lowest);
      }
    }

    // Detect bitcasts of 64-bit build vectors and convert to a
    // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
    // lowest element.
    if (SrcIsBuildVector &&
        (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
         SrcVT == MVT::v8i8))
      return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);

    // Detect bitcasts between element or subvector extraction to x86mmx.
    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
        isNullConstant(N0.getOperand(1))) {
      SDValue Vec = N0.getOperand(0);
      if (Vec.getValueType().is128BitVector())
        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(Vec), VT,
                           DAG.getBitcast(MVT::v2i64, Vec));
    }

    // Detect bitcasts from FP_TO_SINT to x86mmx.
    if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
      SDLoc DL(N0);
      SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                                    DAG.getUNDEF(MVT::v2i32));
      return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                         DAG.getBitcast(MVT::v2i64, Widened));
    }
  }

  // Try to remove a bitcast of constant vXi1 vector. We have to legalize
  // most of these to scalar anyway.
  if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() &&
      SrcVT.getVectorElementType() == MVT::i1 &&
      ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))
    return combinevXi1ConstantToInteger(N0, DAG);

  // The next two combines both require a scalar integer -> vXi1 bitcast on an
  // AVX512 subtarget.
  const bool IsIntToMask = Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
                           VT.isVector() &&
                           VT.getVectorElementType() == MVT::i1;

  // Bitcast of an all-ones or all-zeros integer constant.
  if (IsIntToMask) {
    if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
      if (C->isAllOnesValue())
        return DAG.getConstant(1, SDLoc(N0), VT);
      if (C->isNullValue())
        return DAG.getConstant(0, SDLoc(N0), VT);
    }
  }

  // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
  // Turn it into a sign bit compare that produces a k-register. This avoids
  // a trip through a GPR.
  if (IsIntToMask && isPowerOf2_32(VT.getVectorNumElements())) {
    unsigned NumElts = VT.getVectorNumElements();

    // Peek through truncate.
    SDValue Src = N0;
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      Src = N0.getOperand(0);

    if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
      SDValue MovmskIn = Src.getOperand(0);
      MVT MovmskVT = MovmskIn.getSimpleValueType();
      unsigned MovMskElts = MovmskVT.getVectorNumElements();

      // We allow extra bits of the movmsk to be used since they are known zero.
      // We can't convert a VPMOVMSKB without avx512bw.
      if (MovMskElts <= NumElts &&
          (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
        EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
        MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
        SDLoc dl(N);
        MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
        // Sign bit set <=> element is (signed) less than zero.
        SDValue SignCmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
                                       DAG.getConstant(0, dl, IntVT),
                                       ISD::SETLT);
        if (EVT(CmpVT) == VT)
          return SignCmp;

        // Pad with zeroes up to original VT to replace the zeroes that were
        // being used from the MOVMSK.
        SmallVector<SDValue, 4> Parts(NumElts / MovMskElts,
                                      DAG.getConstant(0, dl, CmpVT));
        Parts[0] = SignCmp;
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Parts);
      }
    }
  }

  // Try to remove bitcasts from input and output of mask arithmetic to
  // remove GPR<->K-register crossings.
  if (SDValue MaskArith = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
    return MaskArith;

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
  case ISD::AND:
    FPOpcode = X86ISD::FAND;
    break;
  case ISD::OR:
    FPOpcode = X86ISD::FOR;
    break;
  case ISD::XOR:
    FPOpcode = X86ISD::FXOR;
    break;
  default:
    return SDValue();
  }

  // Only f32 with SSE1 or f64 with SSE2 is handled.
  const bool IsF32WithSSE1 = Subtarget.hasSSE1() && VT == MVT::f32;
  const bool IsF64WithSSE2 = Subtarget.hasSSE2() && VT == MVT::f64;
  if (!IsF32WithSSE1 && !IsF64WithSSE2)
    return SDValue();

  // Both folds below require the logic op to have a single use.
  if (!N0.hasOneUse())
    return SDValue();

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);
  SDLoc DL0(N0);

  // If CastOp is a one-use bitcast of a non-constant value of type VT, build
  // the FP logic op from that value and a bitcast of OtherOp.
  auto FoldThroughCast = [&](SDValue CastOp, SDValue OtherOp) -> SDValue {
    if (CastOp.getOpcode() != ISD::BITCAST || !CastOp.hasOneUse())
      return SDValue();
    SDValue Inner = CastOp.getOperand(0);
    if (Inner.getValueType() != VT || isa<ConstantSDNode>(Inner))
      return SDValue();
    SDValue CastedOther = DAG.getBitcast(VT, OtherOp);
    return DAG.getNode(FPOpcode, DL0, VT, Inner, CastedOther);
  };

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (SDValue Folded = FoldThroughCast(LogicOp0, LogicOp1))
    return Folded;

  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  return FoldThroughCast(LogicOp1, LogicOp0);
}

39130

39131

// Given a ABS node, detect the following pattern:

39132

// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).

39133

// This is useful as it is the input into a SAD pattern.

39134

static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {

39135

SDValue AbsOp1 = Abs->getOperand(0);

39136

if (AbsOp1.getOpcode() != ISD::SUB)

39137

return false;

39138

39139

Op0 = AbsOp1.getOperand(0);

39140

Op1 = AbsOp1.getOperand(1);

39141

39142

// Check if the operands of the sub are zero-extended from vectors of i8.

39143

if (Op0.getOpcode() != ISD::ZERO_EXTEND ||

39144

Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||

39145

Op1.getOpcode() != ISD::ZERO_EXTEND ||

39146

Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)

39147

return false;

39148

39149

return true;

39150

}

39151

39152

// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs

39153

// to these zexts.

39154

static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,

39155

const SDValue &Zext1, const SDLoc &DL,

39156

const X86Subtarget &Subtarget) {

39157

// Find the appropriate width for the PSADBW.

39158

EVT InVT = Zext0.getOperand(0).getValueType();

39159

unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());

39160

39161

// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we

39162

// fill in the missing vector elements with 0.

39163

unsigned NumConcat = RegSize / InVT.getSizeInBits();

39164

SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));

39165

Ops[0] = Zext0.getOperand(0);

39166

MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

39167

SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

39168

Ops[0] = Zext1.getOperand(0);

39169

SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

39170

39171

// Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.

39172

auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

39173

ArrayRef<SDValue> Ops) {

39174

MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);

39175

return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);

39176

};

39177

MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);

39178

return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },

39179

PSADBWBuilder);

39180

}

39181

39182

// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with

39183

// PHMINPOSUW.

39184

static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,

39185

const X86Subtarget &Subtarget) {

39186

// Bail without SSE41.

39187

if (!Subtarget.hasSSE41())

39188

return SDValue();

39189

39190

EVT ExtractVT = Extract->getValueType(0);

39191

if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)

39192

return SDValue();

39193

39194

// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.

39195

ISD::NodeType BinOp;

39196

SDValue Src = DAG.matchBinOpReduction(

39197

Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);

39198

if (!Src)

39199

return SDValue();

39200

39201

EVT SrcVT = Src.getValueType();

39202

EVT SrcSVT = SrcVT.getScalarType();

39203

if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)

39204

return SDValue();

39205

39206

SDLoc DL(Extract);

39207

SDValue MinPos = Src;

39208

39209

// First, reduce the source down to 128-bit, applying BinOp to lo/hi.

39210

while (SrcVT.getSizeInBits() > 128) {

39211

SDValue Lo, Hi;

39212

std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);

39213

SrcVT = Lo.getValueType();

39214

MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);

39215

}

39216

assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||((((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (
SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type") ? static_cast<void> (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39218, __PRETTY_FUNCTION__))

39217

(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&((((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (
SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type") ? static_cast<void> (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39218, __PRETTY_FUNCTION__))

39218

"Unexpected value type")((((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (
SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type") ? static_cast<void> (0) : __assert_fail
("((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39218, __PRETTY_FUNCTION__));

39219

39220

// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask

39221

// to flip the value accordingly.

39222

SDValue Mask;

39223

unsigned MaskEltsBits = ExtractVT.getSizeInBits();

39224

if (BinOp == ISD::SMAX)

39225

Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);

39226

else if (BinOp == ISD::SMIN)

39227

Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);

39228

else if (BinOp == ISD::UMAX)

39229

Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);

39230

39231

if (Mask)

39232

MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

39233

39234

// For v16i8 cases we need to perform UMIN on pairs of byte elements,

39235

// shuffling each upper element down and insert zeros. This means that the

39236

// v16i8 UMIN will leave the upper element as zero, performing zero-extension

39237

// ready for the PHMINPOS.

39238

if (ExtractVT == MVT::i8) {

39239

SDValue Upper = DAG.getVectorShuffle(

39240

SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),

39241

{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});

39242

MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);

39243

}

39244

39245

// Perform the PHMINPOS on a v8i16 vector,

39246

MinPos = DAG.getBitcast(MVT::v8i16, MinPos);

39247

MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);

39248

MinPos = DAG.getBitcast(SrcVT, MinPos);

39249

39250

if (Mask)

39251

MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

39252

39253

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,

39254

DAG.getIntPtrConstant(0, DL));

39255

}

39256

39257

// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.

39258

static SDValue combineHorizontalPredicateResult(SDNode *Extract,

39259

SelectionDAG &DAG,

39260

const X86Subtarget &Subtarget) {

39261

// Bail without SSE2.

39262

if (!Subtarget.hasSSE2())

39263

return SDValue();

39264

39265

EVT ExtractVT = Extract->getValueType(0);

39266

unsigned BitWidth = ExtractVT.getSizeInBits();

39267

if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&

39268

ExtractVT != MVT::i8 && ExtractVT != MVT::i1)

39269

return SDValue();

39270

39271

// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.

39272

ISD::NodeType BinOp;

39273

SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});

39274

if (!Match && ExtractVT == MVT::i1)

39275

Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});

39276

if (!Match)

39277

return SDValue();

39278

39279

// EXTRACT_VECTOR_ELT can require implicit extension of the vector element

39280

// which we can't support here for now.

39281

if (Match.getScalarValueSizeInBits() != BitWidth)

39282

return SDValue();

39283

39284

SDValue Movmsk;

39285

SDLoc DL(Extract);

39286

EVT MatchVT = Match.getValueType();

39287

unsigned NumElts = MatchVT.getVectorNumElements();

39288

unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;

39289

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

39290

39291

if (ExtractVT == MVT::i1) {

39292

// Special case for (pre-legalization) vXi1 reductions.

39293

if (NumElts > 64 || !isPowerOf2_32(NumElts))

39294

return SDValue();

39295

if (TLI.isTypeLegal(MatchVT)) {

39296

// If this is a legal AVX512 predicate type then we can just bitcast.

39297

EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);

39298

Movmsk = DAG.getBitcast(MovmskVT, Match);

39299

} else {

39300

// For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have

39301

// PCMPEQQ (SSE41+), use PCMPEQD instead.

39302

if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&

39303

Match.getOpcode() == ISD::SETCC &&

39304

ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&

39305

cast<CondCodeSDNode>(Match.getOperand(2))->get() ==

39306

ISD::CondCode::SETEQ) {

39307

SDValue Vec = Match.getOperand(0);

39308

if (Vec.getValueType().getScalarType() == MVT::i64 &&

39309

(2 * NumElts) <= MaxElts) {

39310

NumElts *= 2;

39311

EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);

39312

MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);

39313

Match = DAG.getSetCC(

39314

DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),

39315

DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);

39316

}

39317

}

39318

39319

// Use combineBitcastvxi1 to create the MOVMSK.

39320

while (NumElts > MaxElts) {

39321

SDValue Lo, Hi;

39322

std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);

39323

Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);

39324

NumElts /= 2;

39325

}

39326

EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);

39327

Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);

39328

}

39329

if (!Movmsk)

39330

return SDValue();

39331

Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);

39332

} else {

39333

// FIXME: Better handling of k-registers or 512-bit vectors?

39334

unsigned MatchSizeInBits = Match.getValueSizeInBits();

39335

if (!(MatchSizeInBits == 128 ||

39336

(MatchSizeInBits == 256 && Subtarget.hasAVX())))

39337

return SDValue();

39338

39339

// Make sure this isn't a vector of 1 element. The perf win from using

39340

// MOVMSK diminishes with less elements in the reduction, but it is

39341

// generally better to get the comparison over to the GPRs as soon as

39342

// possible to reduce the number of vector ops.

39343

if (Match.getValueType().getVectorNumElements() < 2)

39344

return SDValue();

39345

39346

// Check that we are extracting a reduction of all sign bits.

39347

if (DAG.ComputeNumSignBits(Match) != BitWidth)

39348

return SDValue();

39349

39350

if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {

39351

SDValue Lo, Hi;

39352

std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);

39353

Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);

39354

MatchSizeInBits = Match.getValueSizeInBits();

39355

}

39356

39357

// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.

39358

MVT MaskSrcVT;

39359

if (64 == BitWidth || 32 == BitWidth)

39360

MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),

39361

MatchSizeInBits / BitWidth);

39362

else

39363

MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

39364

39365

SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);

39366

Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);

39367

NumElts = MaskSrcVT.getVectorNumElements();

39368

}

39369

assert((NumElts <= 32 || NumElts == 64) &&(((NumElts <= 32 || NumElts == 64) && "Not expecting more than 64 elements"
) ? static_cast<void> (0) : __assert_fail ("(NumElts <= 32 || NumElts == 64) && \"Not expecting more than 64 elements\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39370, __PRETTY_FUNCTION__))

39370

"Not expecting more than 64 elements")(((NumElts <= 32 || NumElts == 64) && "Not expecting more than 64 elements"
) ? static_cast<void> (0) : __assert_fail ("(NumElts <= 32 || NumElts == 64) && \"Not expecting more than 64 elements\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39370, __PRETTY_FUNCTION__));

39371

39372

MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;

39373

if (BinOp == ISD::XOR) {

39374

// parity -> (PARITY(MOVMSK X))

39375

SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);

39376

return DAG.getZExtOrTrunc(Result, DL, ExtractVT);

39377

}

39378

39379

SDValue CmpC;

39380

ISD::CondCode CondCode;

39381

if (BinOp == ISD::OR) {

39382

// any_of -> MOVMSK != 0

39383

CmpC = DAG.getConstant(0, DL, CmpVT);

39384

CondCode = ISD::CondCode::SETNE;

39385

} else {

39386

// all_of -> MOVMSK == ((1 << NumElts) - 1)

39387

CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),

39388

DL, CmpVT);

39389

CondCode = ISD::CondCode::SETEQ;

39390

}

39391

39392

// The setcc produces an i8 of 0/1, so extend that to the result width and

39393

// negate to get the final 0/-1 mask value.

39394

EVT SetccVT =

39395

TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);

39396

SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);

39397

SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);

39398

SDValue Zero = DAG.getConstant(0, DL, ExtractVT);

39399

return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);

39400

}

39401

39402

// Attempt to replace an add reduction of
//   (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b)))
// with a PSADBW-based sequence. Extract is the node extracting the reduced
// scalar. Returns the replacement scalar, or an empty SDValue if the pattern
// does not match.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Verify the type we're extracting is either i32 or i64.
  // FIXME: Could support other types, but this is what we have coverage for.
  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
    return SDValue();

  // The extracted-from vector must have a power-of-2 element count.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Match shuffle + add pyramid.
  ISD::NodeType BinOp;
  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
  if (!Root)
    return SDValue();

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  switch (Root.getOpcode()) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Root = Root.getOperand(0);
    break;
  default:
    break;
  }

  // We want Root to be the ABS at the root of an abs-diff pattern.
  if (Root.getOpcode() != ISD::ABS)
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the ABS.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  EVT SadVT = SAD.getValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    // Each step adds elements [Half, 2*Half) onto elements [0, Half), with
    // Half going 2^(Stages-4), ..., 2, 1.
    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0; j < Half; ++j)
        Mask[j] = Half + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  // Return the lowest ExtractSizeInBits bits: view the SAD result as a vector
  // of ExtractVT and extract at the original index operand.
  unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
                               SadVT.getSizeInBits() / ExtractSizeInBits);
  SAD = DAG.getBitcast(ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
                     Extract->getOperand(1));
}

39475

39476

// Attempt to peek through a target shuffle and extract the scalar from the

39477

// source.

39478

static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,

39479

TargetLowering::DAGCombinerInfo &DCI,

39480

const X86Subtarget &Subtarget) {

39481

if (DCI.isBeforeLegalizeOps())

39482

return SDValue();

39483

39484

SDLoc dl(N);

39485

SDValue Src = N->getOperand(0);

39486

SDValue Idx = N->getOperand(1);

39487

39488

EVT VT = N->getValueType(0);

39489

EVT SrcVT = Src.getValueType();

39490

EVT SrcSVT = SrcVT.getVectorElementType();

39491

unsigned SrcEltBits = SrcSVT.getSizeInBits();

39492

unsigned NumSrcElts = SrcVT.getVectorNumElements();

39493

39494

// Don't attempt this for boolean mask vectors or unknown extraction indices.

39495

if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))

39496

return SDValue();

39497

39498

const APInt &IdxC = N->getConstantOperandAPInt(1);

39499

if (IdxC.uge(NumSrcElts))

39500

return SDValue();

39501

39502

SDValue SrcBC = peekThroughBitcasts(Src);

39503

39504

// Handle extract(bitcast(broadcast(scalar_value))).

39505

if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {

39506

SDValue SrcOp = SrcBC.getOperand(0);

39507

EVT SrcOpVT = SrcOp.getValueType();

39508

if (SrcOpVT.isScalarInteger() && VT.isInteger() &&

39509

(SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {

39510

unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;

39511

unsigned Offset = IdxC.urem(Scale) * SrcEltBits;

39512

// TODO support non-zero offsets.

39513

if (Offset == 0) {

39514

SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());

39515

SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);

39516

return SrcOp;

39517

}

39518

}

39519

}

39520

39521

// If we're extracting a single element from a broadcast load and there are

39522

// no other users, just create a single load.

39523

if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {

39524

auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);

39525

unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();

39526

if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&

39527

VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {

39528

SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),

39529

MemIntr->getBasePtr(),

39530

MemIntr->getPointerInfo(),

39531

MemIntr->getOriginalAlign(),

39532

MemIntr->getMemOperand()->getFlags());

39533

DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));

39534

return Load;

39535

}

39536

}

39537

39538

// Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.

39539

// TODO: Move to DAGCombine?

39540

if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&

39541

SrcBC.getValueType().isInteger() &&

39542

(SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&

39543

SrcBC.getScalarValueSizeInBits() ==

39544

SrcBC.getOperand(0).getValueSizeInBits()) {

39545

unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;

39546

if (IdxC.ult(Scale)) {

39547

unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();

39548

SDValue Scl = SrcBC.getOperand(0);

39549

EVT SclVT = Scl.getValueType();

39550

if (Offset) {

39551

Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,

39552

DAG.getShiftAmountConstant(Offset, SclVT, dl));

39553

}

39554

Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());

39555

Scl = DAG.getZExtOrTrunc(Scl, dl, VT);

39556

return Scl;

39557

}

39558

}

39559

39560

// Handle extract(truncate(x)) for 0'th index.

39561

// TODO: Treat this as a faux shuffle?

39562

// TODO: When can we use this for general indices?

39563

if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&

39564

(SrcVT.getSizeInBits() % 128) == 0) {

39565

Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);

39566

MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);

39567

return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),

39568

Idx);

39569

}

39570

39571

// Resolve the target shuffle inputs and mask.

39572

SmallVector<int, 16> Mask;

39573

SmallVector<SDValue, 2> Ops;

39574

if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))

39575

return SDValue();

39576

39577

// Shuffle inputs must be the same size as the result.

39578

if (llvm::any_of(Ops, [SrcVT](SDValue Op) {

39579

return SrcVT.getSizeInBits() != Op.getValueSizeInBits();

39580

}))

39581

return SDValue();

39582

39583

// Attempt to narrow/widen the shuffle mask to the correct size.

39584

if (Mask.size() != NumSrcElts) {

39585

if ((NumSrcElts % Mask.size()) == 0) {

39586

SmallVector<int, 16> ScaledMask;

39587

int Scale = NumSrcElts / Mask.size();

39588

narrowShuffleMaskElts(Scale, Mask, ScaledMask);

39589

Mask = std::move(ScaledMask);

39590

} else if ((Mask.size() % NumSrcElts) == 0) {

39591

// Simplify Mask based on demanded element.

39592

int ExtractIdx = (int)N->getConstantOperandVal(1);

39593

int Scale = Mask.size() / NumSrcElts;

39594

int Lo = Scale * ExtractIdx;

39595

int Hi = Scale * (ExtractIdx + 1);

39596

for (int i = 0, e = (int)Mask.size(); i != e; ++i)

39597

if (i < Lo || Hi <= i)

39598

Mask[i] = SM_SentinelUndef;

39599

39600

SmallVector<int, 16> WidenedMask;

39601

while (Mask.size() > NumSrcElts &&

39602

canWidenShuffleElements(Mask, WidenedMask))

39603

Mask = std::move(WidenedMask);

39604

// TODO - investigate support for wider shuffle masks with known upper

39605

// undef/zero elements for implicit zero-extension.

39606

}

39607

}

39608

39609

// Check if narrowing/widening failed.

39610

if (Mask.size() != NumSrcElts)

39611

return SDValue();

39612

39613

int SrcIdx = Mask[IdxC.getZExtValue()];

39614

39615

// If the shuffle source element is undef/zero then we can just accept it.

39616

if (SrcIdx == SM_SentinelUndef)

39617

return DAG.getUNDEF(VT);

39618

39619

if (SrcIdx == SM_SentinelZero)

39620

return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)

39621

: DAG.getConstant(0, dl, VT);

39622

39623

SDValue SrcOp = Ops[SrcIdx / Mask.size()];

39624

SrcIdx = SrcIdx % Mask.size();

39625

39626

// We can only extract other elements from 128-bit vectors and in certain

39627

// circumstances, depending on SSE-level.

39628

// TODO: Investigate using extract_subvector for larger vectors.

39629

// TODO: Investigate float/double extraction if it will be just stored.

39630

if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&

39631

((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {

39632

assert(SrcSVT == VT && "Unexpected extraction type")((SrcSVT == VT && "Unexpected extraction type") ? static_cast
<void> (0) : __assert_fail ("SrcSVT == VT && \"Unexpected extraction type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39632, __PRETTY_FUNCTION__));

39633

SrcOp = DAG.getBitcast(SrcVT, SrcOp);

39634

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,

39635

DAG.getIntPtrConstant(SrcIdx, dl));

39636

}

39637

39638

if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||

39639

(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {

39640

assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type")((VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type"
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() >= SrcEltBits && \"Unexpected extraction type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39640, __PRETTY_FUNCTION__));

39641

unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);

39642

SrcOp = DAG.getBitcast(SrcVT, SrcOp);

39643

SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,

39644

DAG.getIntPtrConstant(SrcIdx, dl));

39645

return DAG.getZExtOrTrunc(ExtOp, dl, VT);

39646

}

39647

39648

return SDValue();

39649

}

39650

39651

/// Extracting a scalar FP value from vector element 0 is free, so extract each

39652

/// operand first, then perform the math as a scalar op.

39653

static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {

39654

assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract")((ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
"Expected extract") ? static_cast<void> (0) : __assert_fail
("ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && \"Expected extract\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39654, __PRETTY_FUNCTION__));

39655

SDValue Vec = ExtElt->getOperand(0);

39656

SDValue Index = ExtElt->getOperand(1);

39657

EVT VT = ExtElt->getValueType(0);

39658

EVT VecVT = Vec.getValueType();

39659

39660

// TODO: If this is a unary/expensive/expand op, allow extraction from a

39661

// non-zero element because the shuffle+scalar op will be cheaper?

39662

if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)

39663

return SDValue();

39664

39665

// Vector FP compares don't fit the pattern of FP math ops (propagate, not

39666

// extract, the condition code), so deal with those as a special-case.

39667

if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {

39668

EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();

39669

if (OpVT != MVT::f32 && OpVT != MVT::f64)

39670

return SDValue();

39671

39672

// extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC

39673

SDLoc DL(ExtElt);

39674

SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,

39675

Vec.getOperand(0), Index);

39676

SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,

39677

Vec.getOperand(1), Index);

39678

return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));

39679

}

39680

39681

if (VT != MVT::f32 && VT != MVT::f64)

39682

return SDValue();

39683

39684

// Vector FP selects don't fit the pattern of FP math ops (because the

39685

// condition has a different type and we have to change the opcode), so deal

39686

// with those here.

39687

// FIXME: This is restricted to pre type legalization by ensuring the setcc

39688

// has i1 elements. If we loosen this we need to convert vector bool to a

39689

// scalar bool.

39690

if (Vec.getOpcode() == ISD::VSELECT &&

39691

Vec.getOperand(0).getOpcode() == ISD::SETCC &&

39692

Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&

39693

Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {

39694

// ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)

39695

SDLoc DL(ExtElt);

39696

SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,

39697

Vec.getOperand(0).getValueType().getScalarType(),

39698

Vec.getOperand(0), Index);

39699

SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,

39700

Vec.getOperand(1), Index);

39701

SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,

39702

Vec.getOperand(2), Index);

39703

return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);

39704

}

39705

39706

// TODO: This switch could include FNEG and the x86-specific FP logic ops

39707

// (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid

39708

// missed load folding and fma+fneg combining.

39709

switch (Vec.getOpcode()) {

39710

case ISD::FMA: // Begin 3 operands

39711

case ISD::FMAD:

39712

case ISD::FADD: // Begin 2 operands

39713

case ISD::FSUB:

39714

case ISD::FMUL:

39715

case ISD::FDIV:

39716

case ISD::FREM:

39717

case ISD::FCOPYSIGN:

39718

case ISD::FMINNUM:

39719

case ISD::FMAXNUM:

39720

case ISD::FMINNUM_IEEE:

39721

case ISD::FMAXNUM_IEEE:

39722

case ISD::FMAXIMUM:

39723

case ISD::FMINIMUM:

39724

case X86ISD::FMAX:

39725

case X86ISD::FMIN:

39726

case ISD::FABS: // Begin 1 operand

39727

case ISD::FSQRT:

39728

case ISD::FRINT:

39729

case ISD::FCEIL:

39730

case ISD::FTRUNC:

39731

case ISD::FNEARBYINT:

39732

case ISD::FROUND:

39733

case ISD::FFLOOR:

39734

case X86ISD::FRCP:

39735

case X86ISD::FRSQRT: {

39736

// extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...

39737

SDLoc DL(ExtElt);

39738

SmallVector<SDValue, 4> ExtOps;

39739

for (SDValue Op : Vec->ops())

39740

ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));

39741

return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);

39742

}

39743

default:

39744

return SDValue();

39745

}

39746

llvm_unreachable("All opcodes should return within switch")::llvm::llvm_unreachable_internal("All opcodes should return within switch"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39746);

39747

}

39748

39749

/// Try to convert a vector reduction sequence composed of binops and shuffles

39750

/// into horizontal ops.

39751

static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,

39752

const X86Subtarget &Subtarget) {

39753

assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller")((ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
"Unexpected caller") ? static_cast<void> (0) : __assert_fail
("ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && \"Unexpected caller\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39753, __PRETTY_FUNCTION__));

39754

39755

// We need at least SSE2 to anything here.

39756

if (!Subtarget.hasSSE2())

39757

return SDValue();

39758

39759

ISD::NodeType Opc;

39760

SDValue Rdx =

39761

DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);

39762

if (!Rdx)

39763

return SDValue();

39764

39765

SDValue Index = ExtElt->getOperand(1);

39766

assert(isNullConstant(Index) &&((isNullConstant(Index) && "Reduction doesn't end in an extract from index 0"
) ? static_cast<void> (0) : __assert_fail ("isNullConstant(Index) && \"Reduction doesn't end in an extract from index 0\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39767, __PRETTY_FUNCTION__))

39767

"Reduction doesn't end in an extract from index 0")((isNullConstant(Index) && "Reduction doesn't end in an extract from index 0"
) ? static_cast<void> (0) : __assert_fail ("isNullConstant(Index) && \"Reduction doesn't end in an extract from index 0\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39767, __PRETTY_FUNCTION__));

39768

39769

EVT VT = ExtElt->getValueType(0);

39770

EVT VecVT = Rdx.getValueType();

39771

if (VecVT.getScalarType() != VT)

39772

return SDValue();

39773

39774

SDLoc DL(ExtElt);

39775

39776

// vXi8 reduction - sub 128-bit vector.

39777

if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {

39778

if (VecVT == MVT::v4i8) {

39779

// Pad with zero.

39780

if (Subtarget.hasSSE41()) {

39781

Rdx = DAG.getBitcast(MVT::i32, Rdx);

39782

Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,

39783

DAG.getConstant(0, DL, MVT::v4i32), Rdx,

39784

DAG.getIntPtrConstant(0, DL));

39785

Rdx = DAG.getBitcast(MVT::v16i8, Rdx);

39786

} else {

39787

Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,

39788

DAG.getConstant(0, DL, VecVT));

39789

}

39790

}

39791

if (Rdx.getValueType() == MVT::v8i8) {

39792

// Pad with undef.

39793

Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,

39794

DAG.getUNDEF(MVT::v8i8));

39795

}

39796

Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,

39797

DAG.getConstant(0, DL, MVT::v16i8));

39798

Rdx = DAG.getBitcast(MVT::v16i8, Rdx);

39799

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);

39800

}

39801

39802

// Must be a >=128-bit vector with pow2 elements.

39803

if ((VecVT.getSizeInBits() % 128) != 0 ||

39804

!isPowerOf2_32(VecVT.getVectorNumElements()))

39805

return SDValue();

39806

39807

// vXi8 reduction - sum lo/hi halves then use PSADBW.

39808

if (VT == MVT::i8) {

39809

while (Rdx.getValueSizeInBits() > 128) {

39810

SDValue Lo, Hi;

39811

std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);

39812

VecVT = Lo.getValueType();

39813

Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);

39814

}

39815

assert(VecVT == MVT::v16i8 && "v16i8 reduction expected")((VecVT == MVT::v16i8 && "v16i8 reduction expected") ?
static_cast<void> (0) : __assert_fail ("VecVT == MVT::v16i8 && \"v16i8 reduction expected\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39815, __PRETTY_FUNCTION__));

39816

39817

SDValue Hi = DAG.getVectorShuffle(

39818

MVT::v16i8, DL, Rdx, Rdx,

39819

{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});

39820

Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);

39821

Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,

39822

getZeroVector(MVT::v16i8, Subtarget, DAG, DL));

39823

Rdx = DAG.getBitcast(MVT::v16i8, Rdx);

39824

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);

39825

}

39826

39827

// Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.

39828

if (!shouldUseHorizontalOp(true, DAG, Subtarget))

39829

return SDValue();

39830

39831

unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;

39832

39833

// 256-bit horizontal instructions operate on 128-bit chunks rather than

39834

// across the whole vector, so we need an extract + hop preliminary stage.

39835

// This is the only step where the operands of the hop are not the same value.

39836

// TODO: We could extend this to handle 512-bit or even longer vectors.

39837

if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||

39838

((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {

39839

unsigned NumElts = VecVT.getVectorNumElements();

39840

SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);

39841

SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);

39842

Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);

39843

VecVT = Rdx.getValueType();

39844

}

39845

if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&

39846

!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))

39847

return SDValue();

39848

39849

// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0

39850

unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());

39851

for (unsigned i = 0; i != ReductionSteps; ++i)

39852

Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

39853

39854

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);

39855

}

39856

39857

/// Detect vector gather/scatter index generation and convert it from being a

39858

/// bunch of shuffles and extracts into a somewhat faster sequence.

39859

/// For i686, the best sequence is apparently storing the value and loading

39860

/// scalars back, while for x64 we should use 64-bit extracts and shifts.

39861

static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,

39862

TargetLowering::DAGCombinerInfo &DCI,

39863

const X86Subtarget &Subtarget) {

39864

if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))

39865

return NewOp;

39866

39867

SDValue InputVector = N->getOperand(0);

39868

SDValue EltIdx = N->getOperand(1);

39869

auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);

39870

39871

EVT SrcVT = InputVector.getValueType();

39872

EVT VT = N->getValueType(0);

39873

SDLoc dl(InputVector);

39874

bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;

39875

unsigned NumSrcElts = SrcVT.getVectorNumElements();

39876

39877

if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))

39878

return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

39879

39880

// Integer Constant Folding.

39881

if (CIdx && VT.isInteger()) {

39882

APInt UndefVecElts;

39883

SmallVector<APInt, 16> EltBits;

39884

unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();

39885

if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,

39886

EltBits, true, false)) {

39887

uint64_t Idx = CIdx->getZExtValue();

39888

if (UndefVecElts[Idx])

39889

return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

39890

return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),

39891

dl, VT);

39892

}

39893

}

39894

39895

if (IsPextr) {

39896

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

39897

if (TLI.SimplifyDemandedBits(

39898

SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))

39899

return SDValue(N, 0);

39900

39901

// PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).

39902

if ((InputVector.getOpcode() == X86ISD::PINSRB ||

39903

InputVector.getOpcode() == X86ISD::PINSRW) &&

39904

InputVector.getOperand(2) == EltIdx) {

39905

assert(SrcVT == InputVector.getOperand(0).getValueType() &&((SrcVT == InputVector.getOperand(0).getValueType() &&
"Vector type mismatch") ? static_cast<void> (0) : __assert_fail
("SrcVT == InputVector.getOperand(0).getValueType() && \"Vector type mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39906, __PRETTY_FUNCTION__))

39906

"Vector type mismatch")((SrcVT == InputVector.getOperand(0).getValueType() &&
"Vector type mismatch") ? static_cast<void> (0) : __assert_fail
("SrcVT == InputVector.getOperand(0).getValueType() && \"Vector type mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39906, __PRETTY_FUNCTION__));

39907

SDValue Scl = InputVector.getOperand(1);

39908

Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);

39909

return DAG.getZExtOrTrunc(Scl, dl, VT);

39910

}

39911

39912

// TODO - Remove this once we can handle the implicit zero-extension of

39913

// X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and

39914

// combineBasicSADPattern.

39915

return SDValue();

39916

}

39917

39918

// Detect mmx extraction of all bits as a i64. It works better as a bitcast.

39919

if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&

39920

VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {

39921

SDValue MMXSrc = InputVector.getOperand(0);

39922

39923

// The bitcast source is a direct mmx result.

39924

if (MMXSrc.getValueType() == MVT::x86mmx)

39925

return DAG.getBitcast(VT, InputVector);

39926

}

39927

39928

// Detect mmx to i32 conversion through a v2i32 elt extract.

39929

if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&

39930

VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {

39931

SDValue MMXSrc = InputVector.getOperand(0);

39932

39933

// The bitcast source is a direct mmx result.

39934

if (MMXSrc.getValueType() == MVT::x86mmx)

39935

return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);

39936

}

39937

39938

// Check whether this extract is the root of a sum of absolute differences

39939

// pattern. This has to be done here because we really want it to happen

39940

// pre-legalization,

39941

if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))

39942

return SAD;

39943

39944

// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.

39945

if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))

39946

return Cmp;

39947

39948

// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.

39949

if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))

39950

return MinMax;

39951

39952

if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))

39953

return V;

39954

39955

if (SDValue V = scalarizeExtEltFP(N, DAG))

39956

return V;

39957

39958

// Attempt to extract a i1 element by using MOVMSK to extract the signbits

39959

// and then testing the relevant element.

39960

//

39961

// Note that we only combine extracts on the *same* result number, i.e.

39962

// t0 = merge_values a0, a1, a2, a3

39963

// i1 = extract_vector_elt t0, Constant:i64<2>

39964

// i1 = extract_vector_elt t0, Constant:i64<3>

39965

// but not

39966

// i1 = extract_vector_elt t0:1, Constant:i64<2>

39967

// since the latter would need its own MOVMSK.

39968

if (CIdx && SrcVT.getScalarType() == MVT::i1) {

39969

SmallVector<SDNode *, 16> BoolExtracts;

39970

unsigned ResNo = InputVector.getResNo();

39971

auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {

39972

if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

39973

isa<ConstantSDNode>(Use->getOperand(1)) &&

39974

Use->getOperand(0).getResNo() == ResNo &&

39975

Use->getValueType(0) == MVT::i1) {

39976

BoolExtracts.push_back(Use);

39977

return true;

39978

}

39979

return false;

39980

};

39981

if (all_of(InputVector->uses(), IsBoolExtract) &&

39982

BoolExtracts.size() > 1) {

39983

EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);

39984

if (SDValue BC =

39985

combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {

39986

for (SDNode *Use : BoolExtracts) {

39987

// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask

39988

unsigned MaskIdx = Use->getConstantOperandVal(1);

39989

APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);

39990

SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);

39991

SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);

39992

Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);

39993

DCI.CombineTo(Use, Res);

39994

}

39995

return SDValue(N, 0);

39996

}

39997

}

39998

}

39999

40000

return SDValue();

40001

}

40002

40003

/// If a vector select has an operand that is -1 or 0, try to simplify the

40004

/// select to a bitwise logic operation.

40005

/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?

40006

static SDValue

40007

combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,

40008

TargetLowering::DAGCombinerInfo &DCI,

40009

const X86Subtarget &Subtarget) {

40010

SDValue Cond = N->getOperand(0);

40011

SDValue LHS = N->getOperand(1);

40012

SDValue RHS = N->getOperand(2);

40013

EVT VT = LHS.getValueType();

40014

EVT CondVT = Cond.getValueType();

40015

SDLoc DL(N);

40016

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

40017

40018

if (N->getOpcode() != ISD::VSELECT)

40019

return SDValue();

40020

40021

assert(CondVT.isVector() && "Vector select expects a vector selector!")((CondVT.isVector() && "Vector select expects a vector selector!"
) ? static_cast<void> (0) : __assert_fail ("CondVT.isVector() && \"Vector select expects a vector selector!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 40021, __PRETTY_FUNCTION__));

40022

40023

// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?

40024

// TODO: Can we assert that both operands are not zeros (because that should

40025

// get simplified at node creation time)?

40026

bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());

40027

bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

40028

40029

// If both inputs are 0/undef, create a complete zero vector.

40030

// FIXME: As noted above this should be handled by DAGCombiner/getNode.

40031

if (TValIsAllZeros && FValIsAllZeros) {

40032

if (VT.isFloatingPoint())

40033

return DAG.getConstantFP(0.0, DL, VT);

40034

return DAG.getConstant(0, DL, VT);

40035

}

40036

40037

// To use the condition operand as a bitwise mask, it must have elements that

40038

// are the same size as the select elements. Ie, the condition operand must

40039

// have already been promoted from the IR select condition type <N x i1>.

40040

// Don't check if the types themselves are equal because that excludes

40041

// vector floating-point selects.

40042

if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())

40043

return SDValue();

40044

40045

// Try to invert the condition if true value is not all 1s and false value is

40046

// not all 0s. Only do this if the condition has one use.

40047

bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());

40048

if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&

40049

// Check if the selector will be produced by CMPP*/PCMP*.

40050

Cond.getOpcode() == ISD::SETCC &&

40051

// Check if SETCC has already been promoted.

40052

TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==

40053

CondVT) {

40054

bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

40055

40056

if (TValIsAllZeros || FValIsAllOnes) {

40057

SDValue CC = Cond.getOperand(2);

40058

ISD::CondCode NewCC = ISD::getSetCCInverse(

40059

cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());

40060

Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),

40061

NewCC);

40062

std::swap(LHS, RHS);

40063

TValIsAllOnes = FValIsAllOnes;

40064

FValIsAllZeros = TValIsAllZeros;

40065

}

40066

}

40067

40068

// Cond value must be 'sign splat' to be converted to a logical op.

40069

if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())

40070

return SDValue();

40071

40072

// vselect Cond, 111..., 000... -> Cond

40073

if (TValIsAllOnes && FValIsAllZeros)

40074

return DAG.getBitcast(VT, Cond);

40075

40076

if (!TLI.isTypeLegal(CondVT))

40077

return SDValue();

40078

40079

// vselect Cond, 111..., X -> or Cond, X

40080

if (TValIsAllOnes) {

40081

SDValue CastRHS = DAG.getBitcast(CondVT, RHS);

40082

SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);

40083

return DAG.getBitcast(VT, Or);

40084

}

40085

40086

// vselect Cond, X, 000... -> and Cond, X

40087

if (FValIsAllZeros) {

40088

SDValue CastLHS = DAG.getBitcast(CondVT, LHS);

40089

SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);

40090

return DAG.getBitcast(VT, And);

40091

}

40092

40093

// vselect Cond, 000..., X -> andn Cond, X

40094

if (TValIsAllZeros) {

40095

SDValue CastRHS = DAG.getBitcast(CondVT, RHS);

40096

SDValue AndN;

40097

// The canonical form differs for i1 vectors - x86andnp is not used

40098

if (CondVT.getScalarType() == MVT::i1)

40099

AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),

40100

CastRHS);

40101

else

40102

AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);

40103

return DAG.getBitcast(VT, AndN);

40104

}

40105

40106

return SDValue();

40107

}

40108

40109

/// If both arms of a vector select are concatenated vectors, split the select,

40110

/// and concatenate the result to eliminate a wide (256-bit) vector instruction:

40111

/// vselect Cond, (concat T0, T1), (concat F0, F1) -->

40112

/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)

40113

static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,

40114

const X86Subtarget &Subtarget) {

40115

unsigned Opcode = N->getOpcode();

40116

if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)

40117

return SDValue();

40118

40119

// TODO: Split 512-bit vectors too?

40120

EVT VT = N->getValueType(0);

40121

if (!VT.is256BitVector())

40122

return SDValue();

40123

40124

// TODO: Split as long as any 2 of the 3 operands are concatenated?

40125

SDValue Cond = N->getOperand(0);

40126

SDValue TVal = N->getOperand(1);

40127

SDValue FVal = N->getOperand(2);

40128

SmallVector<SDValue, 4> CatOpsT, CatOpsF;

40129

if (!TVal.hasOneUse() || !FVal.hasOneUse() ||

40130

!collectConcatOps(TVal.getNode(), CatOpsT) ||

40131

!collectConcatOps(FVal.getNode(), CatOpsF))

40132

return SDValue();

40133

40134

auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,

40135

ArrayRef<SDValue> Ops) {

40136

return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);

40137

};

40138

return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },

40139

makeBlend, /*CheckBWI*/ false);

40140

}

40141

40142

/// Replace a select between two integer constants by arithmetic on the
/// zero-extended condition:
///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
///
/// The fold is applied only when the result type is legal, the condition is
/// an i1, TC - FC does not overflow as a signed subtraction, and |TC - FC| is
/// a power of two (or 3, 5 or 9 for i32/i64 results).
///
/// \param N   The select node (operand 0 = condition, 1 = true value,
///            2 = false value).
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns The replacement value, or an empty SDValue when the fold does not
///          apply.
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  auto *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(2));
  if (!(TrueC && FalseC))
    return SDValue();

  // Don't do this for crazy integer types.
  EVT VT = N->getValueType(0);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // We're going to use the condition bit in math or logic ops. We could allow
  // this with a wider condition value (post-legalization it becomes an i8),
  // but if nothing is creating selects that late, it doesn't matter.
  SDValue Cond = N->getOperand(0);
  if (Cond.getValueType() != MVT::i1)
    return SDValue();

  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
  // 3, 5, or 9 with i32/i64, so those get transformed too.
  // TODO: For constants that overflow or do not differ by power-of-2 or small
  // multiplier, convert to 'and' + 'add'.
  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  bool Overflowed;
  APInt Diff = TrueVal.ssub_ov(FalseVal, Overflowed);
  if (Overflowed)
    return SDValue();

  APInt Scale = Diff.abs();
  const bool IsLEAType = VT == MVT::i32 || VT == MVT::i64;
  const bool IsLEAScale = Scale == 3 || Scale == 5 || Scale == 9;
  if (!Scale.isPowerOf2() && !(IsLEAType && IsLEAScale))
    return SDValue();

  SDLoc DL(N);

  // We need a positive multiplier constant for shift/LEA codegen. The 'not'
  // of the condition can usually be folded into a compare predicate, but even
  // without that, the sequence should be cheaper than a CMOV alternative.
  // After the swap FalseC holds the smaller constant, which becomes the base.
  if (TrueVal.slt(FalseVal)) {
    Cond = DAG.getNOT(DL, Cond, MVT::i1);
    std::swap(TrueC, FalseC);
  }

  // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

  // Multiply condition by the difference if non-one.
  if (!Scale.isOneValue()) {
    SDValue ScaleC = DAG.getConstant(Scale, DL, VT);
    Result = DAG.getNode(ISD::MUL, DL, VT, Result, ScaleC);
  }

  // Add the base if non-zero.
  if (!FalseC->isNullValue()) {
    SDValue Base(FalseC, 0);
    Result = DAG.getNode(ISD::ADD, DL, VT, Result, Base);
  }

  return Result;
}

40204

40205

/// If this is a *dynamic* select (non-constant condition) and we can match

40206

/// this node with one of the variable blend instructions, restructure the

40207

/// condition so that blends can use the high (sign) bit of each element.

40208

/// This function will also call SimplifyDemandedBits on already created

40209

/// BLENDV to perform additional simplifications.

40210

static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,

40211

TargetLowering::DAGCombinerInfo &DCI,

40212

const X86Subtarget &Subtarget) {

40213

SDValue Cond = N->getOperand(0);

40214

if ((N->getOpcode() != ISD::VSELECT &&

40215

N->getOpcode() != X86ISD::BLENDV) ||

40216

ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))

40217

return SDValue();

40218

40219

// Don't optimize before the condition has been transformed to a legal type

40220

// and don't ever optimize vector selects that map to AVX512 mask-registers.

40221

unsigned BitWidth = Cond.getScalarValueSizeInBits();

40222

if (BitWidth < 8 || BitWidth > 64)

40223

return SDValue();

40224

40225

// We can only handle the cases where VSELECT is directly legal on the

40226

// subtarget. We custom lower VSELECT nodes with constant conditions and

40227

// this makes it hard to see whether a dynamic VSELECT will correctly

40228

// lower, so we both check the operation's status and explicitly handle the

40229

// cases where a *dynamic* blend will fail even though a constant-condition

40230

// blend could be custom lowered.

40231

// FIXME: We should find a better way to handle this class of problems.

40232

// Potentially, we should combine constant-condition vselect nodes

40233

// pre-legalization into shuffles and not mark as many types as custom

40234

// lowered.

40235

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

40236

EVT VT = N->getValueType(0);

40237

if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))

40238

return SDValue();

40239

// FIXME: We don't support i16-element blends currently. We could and

40240

// should support them by making *all* the bits in the condition be set

40241

// rather than just the high bit and using an i8-element blend.

40242

if (VT.getVectorElementType() == MVT::i16)

40243

return SDValue();

40244

// Dynamic blending was only available from SSE4.1 onward.

40245

if (VT.is128BitVector() && !Subtarget.hasSSE41())

40246

return SDValue();

40247

// Byte blends are only available in AVX2

40248

if (VT == MVT::v32i8 && !Subtarget.hasAVX2())

40249

return SDValue();

40250

// There are no 512-bit blend instructions that use sign bits.

40251

if (VT.is512BitVector())

40252

return SDValue();

40253

40254

auto OnlyUsedAsSelectCond = [](SDValue Cond) {

40255

for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();

40256

UI != UE; ++UI)

40257

if ((UI->getOpcode() != ISD::VSELECT &&

40258

UI->getOpcode() != X86ISD::BLENDV) ||

40259

UI.getOperandNo() != 0)

40260

return false;

40261

40262

return true;

40263

};

40264

40265

APInt DemandedBits(APInt::getSignMask(BitWidth));

40266

40267

if (OnlyUsedAsSelectCond(Cond)) {

40268

KnownBits Known;

40269

TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),

40270

!DCI.isBeforeLegalizeOps());

40271

if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))

40272

return SDValue();

40273

40274

// If we changed the computation somewhere in the DAG, this change will

40275

// affect all users of Cond. Update all the nodes so that we do not use

40276

// the generic VSELECT anymore. Otherwise, we may perform wrong

40277

// optimizations as we messed with the actual expectation for the vector

40278

// boolean values.

40279

for (SDNode *U : Cond->uses()) {

40280

if (U->getOpcode() == X86ISD::BLENDV)

40281

continue;

40282

40283

SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),

40284

Cond, U->getOperand(1), U->getOperand(2));

40285

DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);

40286

DCI.AddToWorklist(U);

40287

}

40288

DCI.CommitTargetLoweringOpt(TLO);

40289

return SDValue(N, 0);

40290

}

40291

40292

// Otherwise we can still at least try to simplify multiple use bits.

40293

if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))

40294

return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,

40295

N->getOperand(1), N->getOperand(2));

40296

40297

return SDValue();

40298

}

40299

40300

// Try to match:

40301

// (or (and (M, (sub 0, X)), (pandn M, X)))

40302

// which is a special case of:

40303

// (select M, (sub 0, X), X)

40304

// Per:

40305

// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate

40306

// We know that, if fNegate is 0 or 1:

40307

// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)

40308

//

40309

// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:

40310

// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))

40311

// ( M ? -X : X) == ((X ^ M ) + (M & 1))

40312

// This lets us transform our vselect to:

40313

// (add (xor X, M), (and M, 1))

40314

// And further to:

40315

// (sub (xor X, M), M)

40316

static SDValue combineLogicBlendIntoConditionalNegate(

40317

EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,

40318

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

40319

EVT MaskVT = Mask.getValueType();

40320

assert(MaskVT.isInteger() &&((MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) ==
MaskVT.getScalarSizeInBits() && "Mask must be zero/all-bits"
) ? static_cast<void> (0) : __assert_fail ("MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && \"Mask must be zero/all-bits\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 40322, __PRETTY_FUNCTION__))

40321

DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&((MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) ==
MaskVT.getScalarSizeInBits() && "Mask must be zero/all-bits"
) ? static_cast<void> (0) : __assert_fail ("MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && \"Mask must be zero/all-bits\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 40322, __PRETTY_FUNCTION__))

40322

"Mask must be zero/all-bits")((MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) ==
MaskVT.getScalarSizeInBits() && "Mask must be zero/all-bits"
) ? static_cast<void> (0) : __assert_fail ("MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && \"Mask must be zero/all-bits\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 40322, __PRETTY_FUNCTION__));

40323

40324

if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)

40325

return SDValue();

40326

if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))

40327

return SDValue();

40328

40329

auto IsNegV = [](SDNode *N, SDValue V) {

40330

return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&

40331

ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());

40332

};

40333

40334

SDValue V;

40335

if (IsNegV(Y.getNode(), X))

40336

V = X;

40337

else if (IsNegV(X.getNode(), Y))

40338

V = Y;

40339

else

40340

return SDValue();

40341

40342

SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);

40343

SDValue SubOp2 = Mask;

40344

40345

// If the negate was on the false side of the select, then

40346

// the operands of the SUB need to be swapped. PR 27251.

40347

// This is because the pattern being matched above is

40348

// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)

40349

// but if the pattern matched was

40350

// (vselect M, X, (sub (0, X))), that is really negation of the pattern

40351

// above, -(vselect M, (sub 0, X), X), and therefore the replacement

40352

// pattern also needs to be a negation of the replacement pattern above.

40353

// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the

40354

// sub accomplishes the negation of the replacement pattern.

40355

if (V == Y)

40356

std::swap(SubOp1, SubOp2);

40357

40358

SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);

40359

return DAG.getBitcast(VT, Res);

40360

}

40361

40362

/// Do target-specific dag combines on SELECT and VSELECT nodes.

40363

static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,

40364

TargetLowering::DAGCombinerInfo &DCI,

40365

const X86Subtarget &Subtarget) {

40366

SDLoc DL(N);

40367

SDValue Cond = N->getOperand(0);

40368

SDValue LHS = N->getOperand(1);

40369

SDValue RHS = N->getOperand(2);

40370

40371

// Try simplification again because we use this function to optimize

40372

// BLENDV nodes that are not handled by the generic combiner.

40373

if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))

40374

return V;

40375

40376

EVT VT = LHS.getValueType();

40377

EVT CondVT = Cond.getValueType();

40378

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

40379

bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());

40380

40381

// Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).

40382

// Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT

40383

// can't catch, plus vXi8 cases where we'd likely end up with BLENDV.

40384

if (CondVT.isVector() && CondVT.isInteger() &&

40385

CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&

40386

(!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&

40387

DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())

40388

if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,

40389

DL, DAG, Subtarget))

40390

return V;

40391

40392

// Convert vselects with constant condition into shuffles.

40393

if (CondConstantVector && DCI.isBeforeLegalizeOps()) {

40394

SmallVector<int, 64> Mask;

40395

if (createShuffleMaskFromVSELECT(Mask, Cond))

40396

return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);

40397

}

40398

40399

// fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))

40400

// by forcing the unselected elements to zero.

40401

// TODO: Can we handle more shuffles with this?

40402

if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&

40403

LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&

40404

LHS.hasOneUse() && RHS.hasOneUse()) {

40405

MVT SimpleVT = VT.getSimpleVT();

40406

bool LHSUnary, RHSUnary;

40407

SmallVector<SDValue, 1> LHSOps, RHSOps;

40408

SmallVector<int, 64> LHSMask, RHSMask, CondMask;

40409

if (createShuffleMaskFromVSELECT(CondMask, Cond) &&

40410

getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,

40411

LHSUnary) &&

40412

getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,

40413

RHSUnary)) {

40414

int NumElts = VT.getVectorNumElements();

40415

for (int i = 0; i != NumElts; ++i) {

40416

if (CondMask[i] < NumElts)

40417

RHSMask[i] = 0x80;

40418

else

40419

LHSMask[i] = 0x80;

40420

}

40421

LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),

40422

getConstVector(LHSMask, SimpleVT, DAG, DL, true));

40423

RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),

40424

getConstVector(RHSMask, SimpleVT, DAG, DL, true));

40425

return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);

40426

}

40427

}

40428

40429

// If we have SSE[12] support, try to form min/max nodes. SSE min/max

40430

// instructions match the semantics of the common C idiom x<y?x:y but not

40431

// x<=y?x:y, because of how they handle negative zero (which can be

40432

// ignored in unsafe-math mode).

40433

// We also try to create v2f32 min/max nodes, which we later widen to v4f32.

40434

if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&

40435

VT != MVT::f80 && VT != MVT::f128 &&

40436

(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&

40437

(Subtarget.hasSSE2() ||

40438

(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {

40439

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

40440

40441

unsigned Opcode = 0;

40442

// Check for x CC y ? x : y.

40443

if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&

40444

DAG.isEqualTo(RHS, Cond.getOperand(1))) {

40445

switch (CC) {

40446

default: break;

40447

case ISD::SETULT:

40448

// Converting this to a min would handle NaNs incorrectly, and swapping

40449

// the operands would cause it to handle comparisons between positive

40450

// and negative zero incorrectly.

40451

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {

40452

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40453

!(DAG.isKnownNeverZeroFloat(LHS) ||

40454

DAG.isKnownNeverZeroFloat(RHS)))

40455

break;

40456

std::swap(LHS, RHS);

40457

}

40458

Opcode = X86ISD::FMIN;

40459

break;

40460

case ISD::SETOLE:

40461

// Converting this to a min would handle comparisons between positive

40462

// and negative zero incorrectly.

40463

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40464

!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))

40465

break;

40466

Opcode = X86ISD::FMIN;

40467

break;

40468

case ISD::SETULE:

40469

// Converting this to a min would handle both negative zeros and NaNs

40470

// incorrectly, but we can swap the operands to fix both.

40471

std::swap(LHS, RHS);

40472

LLVM_FALLTHROUGH[[gnu::fallthrough]];

40473

case ISD::SETOLT:

40474

case ISD::SETLT:

40475

case ISD::SETLE:

40476

Opcode = X86ISD::FMIN;

40477

break;

40478

40479

case ISD::SETOGE:

40480

// Converting this to a max would handle comparisons between positive

40481

// and negative zero incorrectly.

40482

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40483

!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))

40484

break;

40485

Opcode = X86ISD::FMAX;

40486

break;

40487

case ISD::SETUGT:

40488

// Converting this to a max would handle NaNs incorrectly, and swapping

40489

// the operands would cause it to handle comparisons between positive

40490

// and negative zero incorrectly.

40491

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {

40492

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40493

!(DAG.isKnownNeverZeroFloat(LHS) ||

40494

DAG.isKnownNeverZeroFloat(RHS)))

40495

break;

40496

std::swap(LHS, RHS);

40497

}

40498

Opcode = X86ISD::FMAX;

40499

break;

40500

case ISD::SETUGE:

40501

// Converting this to a max would handle both negative zeros and NaNs

40502

// incorrectly, but we can swap the operands to fix both.

40503

std::swap(LHS, RHS);

40504

LLVM_FALLTHROUGH[[gnu::fallthrough]];

40505

case ISD::SETOGT:

40506

case ISD::SETGT:

40507

case ISD::SETGE:

40508

Opcode = X86ISD::FMAX;

40509

break;

40510

}

40511

// Check for x CC y ? y : x -- a min/max with reversed arms.

40512

} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&

40513

DAG.isEqualTo(RHS, Cond.getOperand(0))) {

40514

switch (CC) {

40515

default: break;

40516

case ISD::SETOGE:

40517

// Converting this to a min would handle comparisons between positive

40518

// and negative zero incorrectly, and swapping the operands would

40519

// cause it to handle NaNs incorrectly.

40520

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40521

!(DAG.isKnownNeverZeroFloat(LHS) ||

40522

DAG.isKnownNeverZeroFloat(RHS))) {

40523

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

40524

break;

40525

std::swap(LHS, RHS);

40526

}

40527

Opcode = X86ISD::FMIN;

40528

break;

40529

case ISD::SETUGT:

40530

// Converting this to a min would handle NaNs incorrectly.

40531

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

40532

break;

40533

Opcode = X86ISD::FMIN;

40534

break;

40535

case ISD::SETUGE:

40536

// Converting this to a min would handle both negative zeros and NaNs

40537

// incorrectly, but we can swap the operands to fix both.

40538

std::swap(LHS, RHS);

40539

LLVM_FALLTHROUGH[[gnu::fallthrough]];

40540

case ISD::SETOGT:

40541

case ISD::SETGT:

40542

case ISD::SETGE:

40543

Opcode = X86ISD::FMIN;

40544

break;

40545

40546

case ISD::SETULT:

40547

// Converting this to a max would handle NaNs incorrectly.

40548

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

40549

break;

40550

Opcode = X86ISD::FMAX;

40551

break;

40552

case ISD::SETOLE:

40553

// Converting this to a max would handle comparisons between positive

40554

// and negative zero incorrectly, and swapping the operands would

40555

// cause it to handle NaNs incorrectly.

40556

if (!DAG.getTarget().Options.NoSignedZerosFPMath &&

40557

!DAG.isKnownNeverZeroFloat(LHS) &&

40558

!DAG.isKnownNeverZeroFloat(RHS)) {

40559

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

40560

break;

40561

std::swap(LHS, RHS);

40562

}

40563

Opcode = X86ISD::FMAX;

40564

break;

40565

case ISD::SETULE:

40566

// Converting this to a max would handle both negative zeros and NaNs

40567

// incorrectly, but we can swap the operands to fix both.

40568

std::swap(LHS, RHS);

40569

LLVM_FALLTHROUGH[[gnu::fallthrough]];

40570

case ISD::SETOLT:

40571

case ISD::SETLT:

40572

case ISD::SETLE:

40573

Opcode = X86ISD::FMAX;

40574

break;

40575

}

40576

}

40577

40578

if (Opcode)

40579

return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);

40580

}

40581

40582

// Some mask scalar intrinsics rely on checking if only one bit is set

40583

// and implement it in C code like this:

40584

// A[0] = (U & 1) ? A[0] : W[0];

40585

// This creates some redundant instructions that break pattern matching.

40586

// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)

40587

if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&

40588

Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {

40589

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

40590

SDValue AndNode = Cond.getOperand(0);

40591

if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&

40592

isNullConstant(Cond.getOperand(1)) &&

40593

isOneConstant(AndNode.getOperand(1))) {

40594

// LHS and RHS swapped due to

40595

// setcc outputting 1 when AND resulted in 0 and vice versa.

40596

AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);

40597

return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);

40598

}

40599

}

40600

40601

// v16i8 (select v16i1, v16i8, v16i8) does not have a proper

40602

// lowering on KNL. In this case we convert it to

40603

// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.

40604

// The same situation all vectors of i8 and i16 without BWI.

40605

// Make sure we extend these even before type legalization gets a chance to

40606

// split wide vectors.

40607

// Since SKX these selects have a proper lowering.

40608

if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&

40609

CondVT.getVectorElementType() == MVT::i1 &&

40610

(VT.getVectorElementType() == MVT::i8 ||

40611

VT.getVectorElementType() == MVT::i16)) {

40612

Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);

40613

return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);

40614

}

40615

40616

// AVX512 - Extend select with zero to merge with target shuffle.

40617

// select(mask, extract_subvector(shuffle(x)), zero) -->

40618

// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))

40619

// TODO - support non target shuffles as well.

40620

if (Subtarget.hasAVX512() && CondVT.isVector() &&

40621

CondVT.getVectorElementType() == MVT::i1) {

40622

auto SelectableOp = [&TLI](SDValue Op) {

40623

return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

40624

isTargetShuffle(Op.getOperand(0).getOpcode()) &&

40625

isNullConstant(Op.getOperand(1)) &&

40626

TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&

40627

Op.hasOneUse() && Op.getOperand(0).hasOneUse();

40628

};

40629

40630

bool SelectableLHS = SelectableOp(LHS);

40631

bool SelectableRHS = SelectableOp(RHS);

40632

bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());

40633

bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());

40634

40635

if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {

40636

EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()

40637

: RHS.getOperand(0).getValueType();

40638

unsigned NumSrcElts = SrcVT.getVectorNumElements();

40639

EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);

40640

LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,

40641

VT.getSizeInBits());

40642

RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,

40643

VT.getSizeInBits());

40644

Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,

40645

DAG.getUNDEF(SrcCondVT), Cond,

40646

DAG.getIntPtrConstant(0, DL));

40647

SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);

40648

return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());

40649

}

40650

}

40651

40652

if (SDValue V = combineSelectOfTwoConstants(N, DAG))

40653

return V;

40654

40655

// Canonicalize min/max:

40656

// (x > 0) ? x : 0 -> (x >= 0) ? x : 0

40657

// (x < -1) ? x : -1 -> (x <= -1) ? x : -1

40658

// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates

40659

// the need for an extra compare

40660

// against zero. e.g.

40661

// (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0

40662

// subl %esi, %edi

40663

// testl %edi, %edi

40664

// movl $0, %eax

40665

// cmovgl %edi, %eax

40666

// =>

40667

// xorl %eax, %eax

40668

// subl %esi, $edi

40669

// cmovsl %eax, %edi

40670

if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&

40671

Cond.hasOneUse() &&

40672

LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {

40673

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

40674

if ((CC == ISD::SETGT && isNullConstant(RHS)) ||

40675

(CC == ISD::SETLT && isAllOnesConstant(RHS))) {

40676

ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;

40677

Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),

40678

Cond.getOperand(0), Cond.getOperand(1), NewCC);

40679

return DAG.getSelect(DL, VT, Cond, LHS, RHS);

40680

}

40681

}

40682

40683

// Match VSELECTs into subs with unsigned saturation.

40684

if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&

40685

// psubus is available in SSE2 for i8 and i16 vectors.

40686

Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&

40687

isPowerOf2_32(VT.getVectorNumElements()) &&

40688

(VT.getVectorElementType() == MVT::i8 ||

40689

VT.getVectorElementType() == MVT::i16)) {

40690

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

40691

40692

// Check if one of the arms of the VSELECT is a zero vector. If it's on the

40693

// left side invert the predicate to simplify logic below.

40694

SDValue Other;

40695

if (ISD::isBuildVectorAllZeros(LHS.getNode())) {

40696

Other = RHS;

40697

CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());

40698

} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {

40699

Other = LHS;

40700

}

40701

40702

if (Other.getNode() && Other->getNumOperands() == 2 &&

40703

Other->getOperand(0) == Cond.getOperand(0)) {

40704

SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);

40705

SDValue CondRHS = Cond->getOperand(1);

40706

40707

// Look for a general sub with unsigned saturation first.

40708

// x >= y ? x-y : 0 --> subus x, y

40709

// x > y ? x-y : 0 --> subus x, y

40710

if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&

40711

Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)

40712

return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);

40713

40714

if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {

40715

if (isa<BuildVectorSDNode>(CondRHS)) {

40716

// If the RHS is a constant we have to reverse the const

40717

// canonicalization.

40718

// x > C-1 ? x+-C : 0 --> subus x, C

40719

auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {

40720

return (!Op && !Cond) ||

40721

(Op && Cond &&

40722

Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));

40723

};

40724

if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&

40725

ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,

40726

/*AllowUndefs*/ true)) {

40727

OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),

40728

OpRHS);

40729

return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);

40730

}

40731

40732

// Another special case: If C was a sign bit, the sub has been

40733

// canonicalized into a xor.

40734

// FIXME: Would it be better to use computeKnownBits to determine

40735

// whether it's safe to decanonicalize the xor?

40736

// x s< 0 ? x^C : 0 --> subus x, C

40737

if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {

40738

if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&

40739

ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&

40740

OpRHSConst->getAPIntValue().isSignMask()) {

40741

// Note that we have to rebuild the RHS constant here to ensure we

40742

// don't rely on particular values of undef lanes.

40743

OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);

40744

return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);

40745

}

40746

}

40747

}

40748

}

40749

}

40750

}

40751

40752

// Match VSELECTs into add with unsigned saturation.

40753

if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&

40754

// paddus is available in SSE2 for i8 and i16 vectors.

40755

Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&

40756

isPowerOf2_32(VT.getVectorNumElements()) &&

40757

(VT.getVectorElementType() == MVT::i8 ||

40758

VT.getVectorElementType() == MVT::i16)) {

40759

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

40760

40761

SDValue CondLHS = Cond->getOperand(0);

40762

SDValue CondRHS = Cond->getOperand(1);

40763

40764

// Check if one of the arms of the VSELECT is vector with all bits set.

40765

// If it's on the left side invert the predicate to simplify logic below.

40766

SDValue Other;

40767

if (ISD::isBuildVectorAllOnes(LHS.getNode())) {

40768

Other = RHS;

40769

CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());

40770

} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {

40771

Other = LHS;

40772

}

40773

40774

if (Other.getNode() && Other.getOpcode() == ISD::ADD) {

40775

SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);

40776

40777

// Canonicalize condition operands.

40778

if (CC == ISD::SETUGE) {

40779

std::swap(CondLHS, CondRHS);

40780

CC = ISD::SETULE;

40781

}

40782

40783

// We can test against either of the addition operands.

40784

// x <= x+y ? x+y : ~0 --> addus x, y

40785

// x+y >= x ? x+y : ~0 --> addus x, y

40786

if (CC == ISD::SETULE && Other == CondRHS &&

40787

(OpLHS == CondLHS || OpRHS == CondLHS))

40788

return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);

40789

40790

if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&

40791

CondLHS == OpLHS) {

40792

// If the RHS is a constant we have to reverse the const

40793

// canonicalization.

40794

// x > ~C ? x+C : ~0 --> addus x, C

40795

auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {

40796

return Cond->getAPIntValue() == ~Op->getAPIntValue();

40797

};

40798

if (CC == ISD::SETULE &&

40799

ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))

40800

return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);

40801

}

40802

}

40803

}

40804

40805

// Check if the first operand is all zeros and Cond type is vXi1.

40806

// If this an avx512 target we can improve the use of zero masking by

40807

// swapping the operands and inverting the condition.

40808

if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&

40809

Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&

40810

ISD::isBuildVectorAllZeros(LHS.getNode()) &&

40811

!ISD::isBuildVectorAllZeros(RHS.getNode())) {

40812

// Invert the cond to not(cond) : xor(op,allones)=not(op)

40813

SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);

40814

// Vselect cond, op1, op2 = Vselect not(cond), op2, op1

40815

return DAG.getSelect(DL, VT, CondNew, RHS, LHS);

40816

}

40817

40818

// Early exit check

40819

if (!TLI.isTypeLegal(VT))

40820

return SDValue();

40821

40822

if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))

40823

return V;

40824

40825

if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))

40826

return V;

40827

40828

if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))

40829

return V;

40830

40831

// select(~Cond, X, Y) -> select(Cond, Y, X)

40832

if (CondVT.getScalarType() != MVT::i1) {

40833

if (SDValue CondNot = IsNOT(Cond, DAG))

40834

return DAG.getNode(N->getOpcode(), DL, VT,

40835

DAG.getBitcast(CondVT, CondNot), RHS, LHS);

40836

// pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.

40837

if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&

40838

ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {

40839

Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,

40840

DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));

40841

return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);

40842

}

40843

}

40844

40845

// Try to optimize vXi1 selects if both operands are either all constants or

40846

// bitcasts from scalar integer type. In that case we can convert the operands

40847

// to integer and use an integer select which will be converted to a CMOV.

40848

// We need to take a little bit of care to avoid creating an i64 type after

40849

// type legalization.

40850

if (N->getOpcode() == ISD::SELECT && VT.isVector() &&

40851

VT.getVectorElementType() == MVT::i1 &&

40852

(DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {

40853

EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());

40854

bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());

40855

bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());

40856

40857

if ((LHSIsConst ||

40858

(LHS.getOpcode() == ISD::BITCAST &&

40859

LHS.getOperand(0).getValueType() == IntVT)) &&

40860

(RHSIsConst ||

40861

(RHS.getOpcode() == ISD::BITCAST &&

40862

RHS.getOperand(0).getValueType() == IntVT))) {

40863

if (LHSIsConst)

40864

LHS = combinevXi1ConstantToInteger(LHS, DAG);

40865

else

40866

LHS = LHS.getOperand(0);

40867

40868

if (RHSIsConst)

40869

RHS = combinevXi1ConstantToInteger(RHS, DAG);

40870

else

40871

RHS = RHS.getOperand(0);

40872

40873

SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);

40874

return DAG.getBitcast(VT, Select);

40875

}

40876

}

40877

40878

// If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of

40879

// single bits, then invert the predicate and swap the select operands.

40880

// This can lower using a vector shift bit-hack rather than mask and compare.

40881

if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&

40882

N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&

40883

Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&

40884

Cond.getOperand(0).getOpcode() == ISD::AND &&

40885

isNullOrNullSplat(Cond.getOperand(1)) &&

40886

cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&

40887

Cond.getOperand(0).getValueType() == VT) {

40888

// The 'and' mask must be composed of power-of-2 constants.

40889

SDValue And = Cond.getOperand(0);

40890

auto *C = isConstOrConstSplat(And.getOperand(1));

40891

if (C && C->getAPIntValue().isPowerOf2()) {

40892

// vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS

40893

SDValue NotCond =

40894

DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);

40895

return DAG.getSelect(DL, VT, NotCond, RHS, LHS);

40896

}

40897

40898

// If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld

40899

// and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.

40900

// 16-bit lacks a proper blendv.

40901

unsigned EltBitWidth = VT.getScalarSizeInBits();

40902

bool CanShiftBlend =

40903

TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||

40904

(Subtarget.hasAVX2() && EltBitWidth == 64) ||

40905

(Subtarget.hasXOP()));

40906

if (CanShiftBlend &&

40907

ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {

40908

return C->getAPIntValue().isPowerOf2();

40909

})) {

40910

// Create a left-shift constant to get the mask bits over to the sign-bit.

40911

SDValue Mask = And.getOperand(1);

40912

SmallVector<int, 32> ShlVals;

40913

for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {

40914

auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));

40915

ShlVals.push_back(EltBitWidth - 1 -

40916

MaskVal->getAPIntValue().exactLogBase2());

40917

}

40918

// vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS

40919

SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);

40920

SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);

40921

SDValue NewCond =

40922

DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);

40923

return DAG.getSelect(DL, VT, NewCond, RHS, LHS);

40924

}

40925

}

40926

40927

return SDValue();

40928

}

40929

40930

/// Combine:

40931

/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)

40932

/// to:

40933

/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)

40934

/// i.e., reusing the EFLAGS produced by the LOCKed instruction.

40935

/// Note that this is only legal for some op/cc combinations.

40936

static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,

40937

SelectionDAG &DAG,

40938

const X86Subtarget &Subtarget) {

40939

// This combine only operates on CMP-like nodes.

40940

if (!(Cmp.getOpcode() == X86ISD::CMP ||

40941

(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))

40942

return SDValue();

40943

40944

// Can't replace the cmp if it has more uses than the one we're looking at.

40945

// FIXME: We would like to be able to handle this, but would need to make sure

40946

// all uses were updated.

40947

if (!Cmp.hasOneUse())

40948

return SDValue();

40949

40950

// This only applies to variations of the common case:

40951

// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)

40952

// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)

40953

// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)

40954

// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)

40955

// Using the proper condcodes (see below), overflow is checked for.

40956

40957

// FIXME: We can generalize both constraints:

40958

// - XOR/OR/AND (if they were made to survive AtomicExpand)

40959

// - LHS != 1

40960

// if the result is compared.

40961

40962

SDValue CmpLHS = Cmp.getOperand(0);

40963

SDValue CmpRHS = Cmp.getOperand(1);

40964

40965

if (!CmpLHS.hasOneUse())

40966

return SDValue();

40967

40968

unsigned Opc = CmpLHS.getOpcode();

40969

if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)

40970

return SDValue();

40971

40972

SDValue OpRHS = CmpLHS.getOperand(2);

40973

auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);

40974

if (!OpRHSC)

40975

return SDValue();

40976

40977

APInt Addend = OpRHSC->getAPIntValue();

40978

if (Opc == ISD::ATOMIC_LOAD_SUB)

40979

Addend = -Addend;

40980

40981

auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);

40982

if (!CmpRHSC)

40983

return SDValue();

40984

40985

APInt Comparison = CmpRHSC->getAPIntValue();

40986

40987

// If the addend is the negation of the comparison value, then we can do

40988

// a full comparison by emitting the atomic arithmetic as a locked sub.

40989

if (Comparison == -Addend) {

40990

// The CC is fine, but we need to rewrite the LHS of the comparison as an

40991

// atomic sub.

40992

auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());

40993

auto AtomicSub = DAG.getAtomic(

40994

ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),

40995

/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),

40996

/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),

40997

AN->getMemOperand());

40998

auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);

40999

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),

41000

DAG.getUNDEF(CmpLHS.getValueType()));

41001

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));

41002

return LockOp;

41003

}

41004

41005

// We can handle comparisons with zero in a number of cases by manipulating

41006

// the CC used.

41007

if (!Comparison.isNullValue())

41008

return SDValue();

41009

41010

if (CC == X86::COND_S && Addend == 1)

41011

CC = X86::COND_LE;

41012

else if (CC == X86::COND_NS && Addend == 1)

41013

CC = X86::COND_G;

41014

else if (CC == X86::COND_G && Addend == -1)

41015

CC = X86::COND_GE;

41016

else if (CC == X86::COND_LE && Addend == -1)

41017

CC = X86::COND_L;

41018

else

41019

return SDValue();

41020

41021

SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);

41022

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),

41023

DAG.getUNDEF(CmpLHS.getValueType()));

41024

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));

41025

return LockOp;

41026

}

41027

41028

// Check whether a boolean test is testing a boolean value generated by

41029

// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition

41030

// code.

41031

//

41032

// Simplify the following patterns:

41033

// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or

41034

// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)

41035

// to (Op EFLAGS Cond)

41036

//

41037

// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or

41038

// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)

41039

// to (Op EFLAGS !Cond)

41040

//

41041

// where Op could be BRCOND or CMOV.

41042

//

41043

static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {

41044

// This combine only operates on CMP-like nodes.

41045

if (!(Cmp.getOpcode() == X86ISD::CMP ||

41046

(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))

41047

return SDValue();

41048

41049

// Quit if not used as a boolean value.

41050

if (CC != X86::COND_E && CC != X86::COND_NE)

41051

return SDValue();

41052

41053

// Check CMP operands. One of them should be 0 or 1 and the other should be

41054

// an SetCC or extended from it.

41055

SDValue Op1 = Cmp.getOperand(0);

41056

SDValue Op2 = Cmp.getOperand(1);

41057

41058

SDValue SetCC;

41059

const ConstantSDNode* C = nullptr;

41060

bool needOppositeCond = (CC == X86::COND_E);

41061

bool checkAgainstTrue = false; // Is it a comparison against 1?

41062

41063

if ((C = dyn_cast<ConstantSDNode>(Op1)))

41064

SetCC = Op2;

41065

else if ((C = dyn_cast<ConstantSDNode>(Op2)))

41066

SetCC = Op1;

41067

else // Quit if all operands are not constants.

41068

return SDValue();

41069

41070

if (C->getZExtValue() == 1) {

41071

needOppositeCond = !needOppositeCond;

41072

checkAgainstTrue = true;

41073

} else if (C->getZExtValue() != 0)

41074

// Quit if the constant is neither 0 or 1.

41075

return SDValue();

41076

41077

bool truncatedToBoolWithAnd = false;

41078

// Skip (zext $x), (trunc $x), or (and $x, 1) node.

41079

while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||

41080

SetCC.getOpcode() == ISD::TRUNCATE ||

41081

SetCC.getOpcode() == ISD::AND) {

41082

if (SetCC.getOpcode() == ISD::AND) {

41083

int OpIdx = -1;

41084

if (isOneConstant(SetCC.getOperand(0)))

41085

OpIdx = 1;

41086

if (isOneConstant(SetCC.getOperand(1)))

41087

OpIdx = 0;

41088

if (OpIdx < 0)

41089

break;

41090

SetCC = SetCC.getOperand(OpIdx);

41091

truncatedToBoolWithAnd = true;

41092

} else

41093

SetCC = SetCC.getOperand(0);

41094

}

41095

41096

switch (SetCC.getOpcode()) {

41097

case X86ISD::SETCC_CARRY:

41098

// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to

41099

// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,

41100

// i.e. it's a comparison against true but the result of SETCC_CARRY is not

41101

// truncated to i1 using 'and'.

41102

if (checkAgainstTrue && !truncatedToBoolWithAnd)

41103

break;

41104

assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&((X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B
&& "Invalid use of SETCC_CARRY!") ? static_cast<void
> (0) : __assert_fail ("X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && \"Invalid use of SETCC_CARRY!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41105, __PRETTY_FUNCTION__))

41105

"Invalid use of SETCC_CARRY!")((X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B
&& "Invalid use of SETCC_CARRY!") ? static_cast<void
> (0) : __assert_fail ("X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && \"Invalid use of SETCC_CARRY!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41105, __PRETTY_FUNCTION__));

41106

LLVM_FALLTHROUGH[[gnu::fallthrough]];

41107

case X86ISD::SETCC:

41108

// Set the condition code or opposite one if necessary.

41109

CC = X86::CondCode(SetCC.getConstantOperandVal(0));

41110

if (needOppositeCond)

41111

CC = X86::GetOppositeBranchCondition(CC);

41112

return SetCC.getOperand(1);

41113

case X86ISD::CMOV: {

41114

// Check whether false/true value has canonical one, i.e. 0 or 1.

41115

ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));

41116

ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));

41117

// Quit if true value is not a constant.

41118

if (!TVal)

41119

return SDValue();

41120

// Quit if false value is not a constant.

41121

if (!FVal) {

41122

SDValue Op = SetCC.getOperand(0);

41123

// Skip 'zext' or 'trunc' node.

41124

if (Op.getOpcode() == ISD::ZERO_EXTEND ||

41125

Op.getOpcode() == ISD::TRUNCATE)

41126

Op = Op.getOperand(0);

41127

// A special case for rdrand/rdseed, where 0 is set if false cond is

41128

// found.

41129

if ((Op.getOpcode() != X86ISD::RDRAND &&

41130

Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)

41131

return SDValue();

41132

}

41133

// Quit if false value is not the constant 0 or 1.

41134

bool FValIsFalse = true;

41135

if (FVal && FVal->getZExtValue() != 0) {

41136

if (FVal->getZExtValue() != 1)

41137

return SDValue();

41138

// If FVal is 1, opposite cond is needed.

41139

needOppositeCond = !needOppositeCond;

41140

FValIsFalse = false;

41141

}

41142

// Quit if TVal is not the constant opposite of FVal.

41143

if (FValIsFalse && TVal->getZExtValue() != 1)

41144

return SDValue();

41145

if (!FValIsFalse && TVal->getZExtValue() != 0)

41146

return SDValue();

41147

CC = X86::CondCode(SetCC.getConstantOperandVal(2));

41148

if (needOppositeCond)

41149

CC = X86::GetOppositeBranchCondition(CC);

41150

return SetCC.getOperand(3);

41151

}

41152

}

41153

41154

return SDValue();

41155

}

41156

41157

/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.

41158

/// Match:

41159

/// (X86or (X86setcc) (X86setcc))

41160

/// (X86cmp (and (X86setcc) (X86setcc)), 0)

41161

static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,

41162

X86::CondCode &CC1, SDValue &Flags,

41163

bool &isAnd) {

41164

if (Cond->getOpcode() == X86ISD::CMP) {

41165

if (!isNullConstant(Cond->getOperand(1)))

41166

return false;

41167

41168

Cond = Cond->getOperand(0);

41169

}

41170

41171

isAnd = false;

41172

41173

SDValue SetCC0, SetCC1;

41174

switch (Cond->getOpcode()) {

41175

default: return false;

41176

case ISD::AND:

41177

case X86ISD::AND:

41178

isAnd = true;

41179

LLVM_FALLTHROUGH[[gnu::fallthrough]];

41180

case ISD::OR:

41181

case X86ISD::OR:

41182

SetCC0 = Cond->getOperand(0);

41183

SetCC1 = Cond->getOperand(1);

41184

break;

41185

};

41186

41187

// Make sure we have SETCC nodes, using the same flags value.

41188

if (SetCC0.getOpcode() != X86ISD::SETCC ||

41189

SetCC1.getOpcode() != X86ISD::SETCC ||

41190

SetCC0->getOperand(1) != SetCC1->getOperand(1))

41191

return false;

41192

41193

CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);

41194

CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);

41195

Flags = SetCC0->getOperand(1);

41196

return true;

41197

}

41198

41199

// When legalizing carry, we create carries via add X, -1

41200

// If that comes from an actual carry, via setcc, we use the

41201

// carry directly.

41202

static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {

41203

if (EFLAGS.getOpcode() == X86ISD::ADD) {

41204

if (isAllOnesConstant(EFLAGS.getOperand(1))) {

41205

SDValue Carry = EFLAGS.getOperand(0);

41206

while (Carry.getOpcode() == ISD::TRUNCATE ||

41207

Carry.getOpcode() == ISD::ZERO_EXTEND ||

41208

Carry.getOpcode() == ISD::SIGN_EXTEND ||

41209

Carry.getOpcode() == ISD::ANY_EXTEND ||

41210

(Carry.getOpcode() == ISD::AND &&

41211

isOneConstant(Carry.getOperand(1))))

41212

Carry = Carry.getOperand(0);

41213

if (Carry.getOpcode() == X86ISD::SETCC ||

41214

Carry.getOpcode() == X86ISD::SETCC_CARRY) {

41215

// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?

41216

uint64_t CarryCC = Carry.getConstantOperandVal(0);

41217

SDValue CarryOp1 = Carry.getOperand(1);

41218

if (CarryCC == X86::COND_B)

41219

return CarryOp1;

41220

if (CarryCC == X86::COND_A) {

41221

// Try to convert COND_A into COND_B in an attempt to facilitate

41222

// materializing "setb reg".

41223

//

41224

// Do not flip "e > c", where "c" is a constant, because Cmp

41225

// instruction cannot take an immediate as its first operand.

41226

//

41227

if (CarryOp1.getOpcode() == X86ISD::SUB &&

41228

CarryOp1.getNode()->hasOneUse() &&

41229

CarryOp1.getValueType().isInteger() &&

41230

!isa<ConstantSDNode>(CarryOp1.getOperand(1))) {

41231

SDValue SubCommute =

41232

DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),

41233

CarryOp1.getOperand(1), CarryOp1.getOperand(0));

41234

return SDValue(SubCommute.getNode(), CarryOp1.getResNo());

41235

}

41236

}

41237

// If this is a check of the z flag of an add with 1, switch to the

41238

// C flag.

41239

if (CarryCC == X86::COND_E &&

41240

CarryOp1.getOpcode() == X86ISD::ADD &&

41241

isOneConstant(CarryOp1.getOperand(1)))

41242

return CarryOp1;

41243

}

41244

}

41245

}

41246

41247

return SDValue();

41248

}

41249

41250

/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC

41251

/// to avoid the inversion.

41252

static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,

41253

SelectionDAG &DAG,

41254

const X86Subtarget &Subtarget) {

41255

// TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.

41256

if (EFLAGS.getOpcode() != X86ISD::PTEST &&

41257

EFLAGS.getOpcode() != X86ISD::TESTP)

41258

return SDValue();

41259

41260

// PTEST/TESTP sets EFLAGS as:

41261

// TESTZ: ZF = (Op0 & Op1) == 0

41262

// TESTC: CF = (~Op0 & Op1) == 0

41263

// TESTNZC: ZF == 0 && CF == 0

41264

EVT VT = EFLAGS.getValueType();

41265

SDValue Op0 = EFLAGS.getOperand(0);

41266

SDValue Op1 = EFLAGS.getOperand(1);

41267

EVT OpVT = Op0.getValueType();

41268

41269

// TEST*(~X,Y) == TEST*(X,Y)

41270

if (SDValue NotOp0 = IsNOT(Op0, DAG)) {

41271

X86::CondCode InvCC;

41272

switch (CC) {

41273

case X86::COND_B:

41274

// testc -> testz.

41275

InvCC = X86::COND_E;

41276

break;

41277

case X86::COND_AE:

41278

// !testc -> !testz.

41279

InvCC = X86::COND_NE;

41280

break;

41281

case X86::COND_E:

41282

// testz -> testc.

41283

InvCC = X86::COND_B;

41284

break;

41285

case X86::COND_NE:

41286

// !testz -> !testc.

41287

InvCC = X86::COND_AE;

41288

break;

41289

case X86::COND_A:

41290

case X86::COND_BE:

41291

// testnzc -> testnzc (no change).

41292

InvCC = CC;

41293

break;

41294

default:

41295

InvCC = X86::COND_INVALID;

41296

break;

41297

}

41298

41299

if (InvCC != X86::COND_INVALID) {

41300

CC = InvCC;

41301

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,

41302

DAG.getBitcast(OpVT, NotOp0), Op1);

41303

}

41304

}

41305

41306

if (CC == X86::COND_E || CC == X86::COND_NE) {

41307

// TESTZ(X,~Y) == TESTC(Y,X)

41308

if (SDValue NotOp1 = IsNOT(Op1, DAG)) {

41309

CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);

41310

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,

41311

DAG.getBitcast(OpVT, NotOp1), Op0);

41312

}

41313

41314

if (Op0 == Op1) {

41315

SDValue BC = peekThroughBitcasts(Op0);

41316

EVT BCVT = BC.getValueType();

41317

assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&((BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal
(BCVT) && "Unexpected vector type") ? static_cast<
void> (0) : __assert_fail ("BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41318, __PRETTY_FUNCTION__))

41318

"Unexpected vector type")((BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal
(BCVT) && "Unexpected vector type") ? static_cast<
void> (0) : __assert_fail ("BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41318, __PRETTY_FUNCTION__));

41319

41320

// TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)

41321

if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {

41322

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,

41323

DAG.getBitcast(OpVT, BC.getOperand(0)),

41324

DAG.getBitcast(OpVT, BC.getOperand(1)));

41325

}

41326

41327

// TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)

41328

if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {

41329

CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);

41330

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,

41331

DAG.getBitcast(OpVT, BC.getOperand(0)),

41332

DAG.getBitcast(OpVT, BC.getOperand(1)));

41333

}

41334

41335

// If every element is an all-sign value, see if we can use MOVMSK to

41336

// more efficiently extract the sign bits and compare that.

41337

// TODO: Handle TESTC with comparison inversion.

41338

// TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on

41339

// MOVMSK combines to make sure its never worse than PTEST?

41340

unsigned EltBits = BCVT.getScalarSizeInBits();

41341

if (DAG.ComputeNumSignBits(BC) == EltBits) {

41342

assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result")((VT == MVT::i32 && "Expected i32 EFLAGS comparison result"
) ? static_cast<void> (0) : __assert_fail ("VT == MVT::i32 && \"Expected i32 EFLAGS comparison result\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41342, __PRETTY_FUNCTION__));

41343

APInt SignMask = APInt::getSignMask(EltBits);

41344

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

41345

if (SDValue Res =

41346

TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {

41347

// For vXi16 cases we need to use pmovmksb and extract every other

41348

// sign bit.

41349

SDLoc DL(EFLAGS);

41350

if (EltBits == 16) {

41351

MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;

41352

Res = DAG.getBitcast(MovmskVT, Res);

41353

Res = getPMOVMSKB(DL, Res, DAG, Subtarget);

41354

Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,

41355

DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));

41356

} else {

41357

Res = getPMOVMSKB(DL, Res, DAG, Subtarget);

41358

}

41359

return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,

41360

DAG.getConstant(0, DL, MVT::i32));

41361

}

41362

}

41363

}

41364

41365

// TESTZ(-1,X) == TESTZ(X,X)

41366

if (ISD::isBuildVectorAllOnes(Op0.getNode()))

41367

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);

41368

41369

// TESTZ(X,-1) == TESTZ(X,X)

41370

if (ISD::isBuildVectorAllOnes(Op1.getNode()))

41371

return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);

41372

}

41373

41374

return SDValue();

41375

}

41376

41377

// Attempt to simplify the MOVMSK input based on the comparison type.

41378

static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,

41379

SelectionDAG &DAG,

41380

const X86Subtarget &Subtarget) {

41381

// Handle eq/ne against zero (any_of).

41382

// Handle eq/ne against -1 (all_of).

41383

if (!(CC == X86::COND_E || CC == X86::COND_NE))

41384

return SDValue();

41385

if (EFLAGS.getValueType() != MVT::i32)

41386

return SDValue();

41387

unsigned CmpOpcode = EFLAGS.getOpcode();

41388

if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)

41389

return SDValue();

41390

auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));

41391

if (!CmpConstant)

41392

return SDValue();

41393

const APInt &CmpVal = CmpConstant->getAPIntValue();

41394

41395

SDValue CmpOp = EFLAGS.getOperand(0);

41396

unsigned CmpBits = CmpOp.getValueSizeInBits();

41397

assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch")((CmpBits == CmpVal.getBitWidth() && "Value size mismatch"
) ? static_cast<void> (0) : __assert_fail ("CmpBits == CmpVal.getBitWidth() && \"Value size mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41397, __PRETTY_FUNCTION__));

41398

41399

// Peek through any truncate.

41400

if (CmpOp.getOpcode() == ISD::TRUNCATE)

41401

CmpOp = CmpOp.getOperand(0);

41402

41403

// Bail if we don't find a MOVMSK.

41404

if (CmpOp.getOpcode() != X86ISD::MOVMSK)

41405

return SDValue();

41406

41407

SDValue Vec = CmpOp.getOperand(0);

41408

MVT VecVT = Vec.getSimpleValueType();

41409

assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&(((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
"Unexpected MOVMSK operand") ? static_cast<void> (0) :
__assert_fail ("(VecVT.is128BitVector() || VecVT.is256BitVector()) && \"Unexpected MOVMSK operand\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41410, __PRETTY_FUNCTION__))

41410

"Unexpected MOVMSK operand")(((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
"Unexpected MOVMSK operand") ? static_cast<void> (0) :
__assert_fail ("(VecVT.is128BitVector() || VecVT.is256BitVector()) && \"Unexpected MOVMSK operand\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41410, __PRETTY_FUNCTION__));

41411

unsigned NumElts = VecVT.getVectorNumElements();

41412

unsigned NumEltBits = VecVT.getScalarSizeInBits();

41413

41414

bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();

41415

bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&

41416

CmpVal.isMask(NumElts);

41417

if (!IsAnyOf && !IsAllOf)

41418

return SDValue();

41419

41420

// See if we can peek through to a vector with a wider element type, if the

41421

// signbits extend down to all the sub-elements as well.

41422

// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose

41423

// potential SimplifyDemandedBits/Elts cases.

41424

if (Vec.getOpcode() == ISD::BITCAST) {

41425

SDValue BC = peekThroughBitcasts(Vec);

41426

MVT BCVT = BC.getSimpleValueType();

41427

unsigned BCNumElts = BCVT.getVectorNumElements();

41428

unsigned BCNumEltBits = BCVT.getScalarSizeInBits();

41429

if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&

41430

BCNumEltBits > NumEltBits &&

41431

DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {

41432

SDLoc DL(EFLAGS);

41433

unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);

41434

return DAG.getNode(X86ISD::CMP, DL, MVT::i32,

41435

DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),

41436

DAG.getConstant(CmpMask, DL, MVT::i32));

41437

}

41438

}

41439

41440

// MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).

41441

// MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).

41442

if (IsAllOf && Subtarget.hasSSE41()) {

41443

SDValue BC = peekThroughBitcasts(Vec);

41444

if (BC.getOpcode() == X86ISD::PCMPEQ &&

41445

ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {

41446

MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

41447

SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));

41448

return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);

41449

}

41450

}

41451

41452

// See if we can avoid a PACKSS by calling MOVMSK on the sources.

41453

// For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out

41454

// sign bits prior to the comparison with zero unless we know that

41455

// the vXi16 splats the sign bit down to the lower i8 half.

41456

// TODO: Handle all_of patterns.

41457

if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {

41458

SDValue VecOp0 = Vec.getOperand(0);

41459

SDValue VecOp1 = Vec.getOperand(1);

41460

bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;

41461

bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;

41462

// PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.

41463

if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {

41464

SDLoc DL(EFLAGS);

41465

SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);

41466

Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);

41467

Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);

41468

if (!SignExt0) {

41469

Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,

41470

DAG.getConstant(0xAAAA, DL, MVT::i16));

41471

}

41472

return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,

41473

DAG.getConstant(0, DL, MVT::i16));

41474

}

41475

// PMOVMSKB(PACKSSBW(LO(X), HI(X)))

41476

// -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.

41477

if (CmpBits == 16 && Subtarget.hasInt256() &&

41478

VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

41479

VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

41480

VecOp0.getOperand(0) == VecOp1.getOperand(0) &&

41481

VecOp0.getConstantOperandAPInt(1) == 0 &&

41482

VecOp1.getConstantOperandAPInt(1) == 8 &&

41483

(IsAnyOf || (SignExt0 && SignExt1))) {

41484

SDLoc DL(EFLAGS);

41485

SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));

41486

Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);

41487

unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;

41488

if (!SignExt0 || !SignExt1) {

41489

assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns")((IsAnyOf && "Only perform v16i16 signmasks for any_of patterns"
) ? static_cast<void> (0) : __assert_fail ("IsAnyOf && \"Only perform v16i16 signmasks for any_of patterns\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41489, __PRETTY_FUNCTION__));

41490

Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,

41491

DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));

41492

}

41493

return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,

41494

DAG.getConstant(CmpMask, DL, MVT::i32));

41495

}

41496

}

41497

41498

// MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.

41499

SmallVector<int, 32> ShuffleMask;

41500

SmallVector<SDValue, 2> ShuffleInputs;

41501

if (NumElts == CmpBits &&

41502

getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,

41503

ShuffleMask, DAG) &&

41504

ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&

41505

ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {

41506

unsigned NumShuffleElts = ShuffleMask.size();

41507

APInt DemandedElts = APInt::getNullValue(NumShuffleElts);

41508

for (int M : ShuffleMask) {

41509

assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index")((0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index"
) ? static_cast<void> (0) : __assert_fail ("0 <= M && M < (int)NumShuffleElts && \"Bad unary shuffle index\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41509, __PRETTY_FUNCTION__));

41510

DemandedElts.setBit(M);

41511

}

41512

if (DemandedElts.isAllOnesValue()) {

41513

SDLoc DL(EFLAGS);

41514

SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);

41515

Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);

41516

Result =

41517

DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());

41518

return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,

41519

EFLAGS.getOperand(1));

41520

}

41521

}

41522

41523

return SDValue();

41524

}

41525

41526

/// Optimize an EFLAGS definition used according to the condition code \p CC

41527

/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing

41528

/// uses of chain values.

41529

static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,

41530

SelectionDAG &DAG,

41531

const X86Subtarget &Subtarget) {

41532

if (CC == X86::COND_B)

41533

if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))

41534

return Flags;

41535

41536

if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))

41537

return R;

41538

41539

if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))

41540

return R;

41541

41542

if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))

41543

return R;

41544

41545

return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);

41546

}

41547

41548

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]

41549

static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,

41550

TargetLowering::DAGCombinerInfo &DCI,

41551

const X86Subtarget &Subtarget) {

41552

SDLoc DL(N);

41553

41554

SDValue FalseOp = N->getOperand(0);

41555

SDValue TrueOp = N->getOperand(1);

41556

X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

41557

SDValue Cond = N->getOperand(3);

41558

41559

// cmov X, X, ?, ? --> X

41560

if (TrueOp == FalseOp)

41561

return TrueOp;

41562

41563

// Try to simplify the EFLAGS and condition code operands.

41564

// We can't always do this as FCMOV only supports a subset of X86 cond.

41565

if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {

41566

if (!(FalseOp.getValueType() == MVT::f80 ||

41567

(FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||

41568

(FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||

41569

!Subtarget.hasCMov() || hasFPCMov(CC)) {

41570

SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),

41571

Flags};

41572

return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

41573

}

41574

}

41575

41576

// If this is a select between two integer constants, try to do some

41577

// optimizations. Note that the operands are ordered the opposite of SELECT

41578

// operands.

41579

if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {

41580

if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {

41581

// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is

41582

// larger than FalseC (the false value).

41583

if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {

41584

CC = X86::GetOppositeBranchCondition(CC);

41585

std::swap(TrueC, FalseC);

41586

std::swap(TrueOp, FalseOp);

41587

}

41588

41589

// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.

41590

// This is efficient for any integer data type (including i8/i16) and

41591

// shift amount.

41592

if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {

41593

Cond = getSETCC(CC, Cond, DL, DAG);

41594

41595

// Zero extend the condition if needed.

41596

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

41597

41598

unsigned ShAmt = TrueC->getAPIntValue().logBase2();

41599

Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,

41600

DAG.getConstant(ShAmt, DL, MVT::i8));

41601

return Cond;

41602

}

41603

41604

// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient

41605

// for any integer data type, including i8/i16.

41606

if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {

41607

Cond = getSETCC(CC, Cond, DL, DAG);

41608

41609

// Zero extend the condition if needed.

41610

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,

41611

FalseC->getValueType(0), Cond);

41612

Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,

41613

SDValue(FalseC, 0));

41614

return Cond;

41615

}

41616

41617

// Optimize cases that will turn into an LEA instruction. This requires

41618

// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).

41619

if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {

41620

APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();

41621

assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&((Diff.getBitWidth() == N->getValueType(0).getSizeInBits()
&& "Implicit constant truncation") ? static_cast<
void> (0) : __assert_fail ("Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && \"Implicit constant truncation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41622, __PRETTY_FUNCTION__))

41622

"Implicit constant truncation")((Diff.getBitWidth() == N->getValueType(0).getSizeInBits()
&& "Implicit constant truncation") ? static_cast<
void> (0) : __assert_fail ("Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && \"Implicit constant truncation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41622, __PRETTY_FUNCTION__));

41623

41624

bool isFastMultiplier = false;

41625

if (Diff.ult(10)) {

41626

switch (Diff.getZExtValue()) {

41627

default: break;

41628

case 1: // result = add base, cond

41629

case 2: // result = lea base( , cond*2)

41630

case 3: // result = lea base(cond, cond*2)

41631

case 4: // result = lea base( , cond*4)

41632

case 5: // result = lea base(cond, cond*4)

41633

case 8: // result = lea base( , cond*8)

41634

case 9: // result = lea base(cond, cond*8)

41635

isFastMultiplier = true;

41636

break;

41637

}

41638

}

41639

41640

if (isFastMultiplier) {

41641

Cond = getSETCC(CC, Cond, DL ,DAG);

41642

// Zero extend the condition if needed.

41643

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),

41644

Cond);

41645

// Scale the condition by the difference.

41646

if (Diff != 1)

41647

Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,

41648

DAG.getConstant(Diff, DL, Cond.getValueType()));

41649

41650

// Add the base if non-zero.

41651

if (FalseC->getAPIntValue() != 0)

41652

Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,

41653

SDValue(FalseC, 0));

41654

return Cond;

41655

}

41656

}

41657

}

41658

}

41659

41660

// Handle these cases:

41661

// (select (x != c), e, c) -> select (x != c), e, x),

41662

// (select (x == c), c, e) -> select (x == c), x, e)

41663

// where the c is an integer constant, and the "select" is the combination

41664

// of CMOV and CMP.

41665

//

41666

// The rationale for this change is that the conditional-move from a constant

41667

// needs two instructions, however, conditional-move from a register needs

41668

// only one instruction.

41669

//

41670

// CAVEAT: By replacing a constant with a symbolic value, it may obscure

41671

// some instruction-combining opportunities. This opt needs to be

41672

// postponed as late as possible.

41673

//

41674

if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {

41675

// the DCI.xxxx conditions are provided to postpone the optimization as

41676

// late as possible.

41677

41678

ConstantSDNode *CmpAgainst = nullptr;

41679

if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&

41680

(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&

41681

!isa<ConstantSDNode>(Cond.getOperand(0))) {

41682

41683

if (CC == X86::COND_NE &&

41684

CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {

41685

CC = X86::GetOppositeBranchCondition(CC);

41686

std::swap(TrueOp, FalseOp);

41687

}

41688

41689

if (CC == X86::COND_E &&

41690

CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {

41691

SDValue Ops[] = {FalseOp, Cond.getOperand(0),

41692

DAG.getTargetConstant(CC, DL, MVT::i8), Cond};

41693

return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

41694

}

41695

}

41696

}

41697

41698

// Fold and/or of setcc's to double CMOV:

41699

// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)

41700

// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)

41701

//

41702

// This combine lets us generate:

41703

// cmovcc1 (jcc1 if we don't have CMOV)

41704

// cmovcc2 (same)

41705

// instead of:

41706

// setcc1

41707

// setcc2

41708

// and/or

41709

// cmovne (jne if we don't have CMOV)

41710

// When we can't use the CMOV instruction, it might increase branch

41711

// mispredicts.

41712

// When we can use CMOV, or when there is no mispredict, this improves

41713

// throughput and reduces register pressure.

41714

//

41715

if (CC == X86::COND_NE) {

41716

SDValue Flags;

41717

X86::CondCode CC0, CC1;

41718

bool isAndSetCC;

41719

if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {

41720

if (isAndSetCC) {

41721

std::swap(FalseOp, TrueOp);

41722

CC0 = X86::GetOppositeBranchCondition(CC0);

41723

CC1 = X86::GetOppositeBranchCondition(CC1);

41724

}

41725

41726

SDValue LOps[] = {FalseOp, TrueOp,

41727

DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};

41728

SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);

41729

SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),

41730

Flags};

41731

SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

41732

return CMOV;

41733

}

41734

}

41735

41736

// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->

41737

// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)

41738

// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->

41739

// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)

41740

if ((CC == X86::COND_NE || CC == X86::COND_E) &&

41741

Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {

41742

SDValue Add = TrueOp;

41743

SDValue Const = FalseOp;

41744

// Canonicalize the condition code for easier matching and output.

41745

if (CC == X86::COND_E)

41746

std::swap(Add, Const);

41747

41748

// We might have replaced the constant in the cmov with the LHS of the

41749

// compare. If so change it to the RHS of the compare.

41750

if (Const == Cond.getOperand(0))

41751

Const = Cond.getOperand(1);

41752

41753

// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.

41754

if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&

41755

Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&

41756

(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||

41757

Add.getOperand(0).getOpcode() == ISD::CTTZ) &&

41758

Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {

41759

EVT VT = N->getValueType(0);

41760

// This should constant fold.

41761

SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));

41762

SDValue CMov =

41763

DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),

41764

DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);

41765

return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));

41766

}

41767

}

41768

41769

return SDValue();

41770

}

41771

41772

/// The ways a vector multiply of 32-bit elements can be narrowed.
enum class ShrinkMode {
  MULS8,  ///< Operand values lie in the signed 8-bit range (-128 ~ 127).
  MULU8,  ///< Operand values lie in the unsigned 8-bit range (0 ~ 255).
  MULS16, ///< Operand values lie in the signed 16-bit range (-32768 ~ 32767).
  MULU16  ///< Operand values lie in the unsigned 16-bit range (0 ~ 65535).
};

41774

41775

/// Decide whether the multiply \p N, whose operands have 32-bit scalar
/// elements, can be carried out on narrower values.
///
/// \param N    the multiply node (must have exactly two operands).
/// \param DAG  used to query the known sign bits of both operands.
/// \param Mode set to the selected shrink mode; written only when the
///             function returns true.
/// \returns true if one of the four shrink modes applies.
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
  EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;

  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

  SDValue LHS = N->getOperand(0);
  const unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  const bool LHSNonNegative = DAG.SignBitIsZero(LHS);

  SDValue RHS = N->getOperand(1);
  const unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  const bool RHSNonNegative = DAG.SignBitIsZero(RHS);

  const bool AllPositive = LHSNonNegative && RHSNonNegative;
  const unsigned MinSignBits = std::min(LHSSignBits, RHSSignBits);

  // Ranges from -128 ~ 127: MULS8 mode.
  if (MinSignBits >= 25) {
    Mode = ShrinkMode::MULS8;
    return true;
  }
  // Ranges from 0 ~ 255: MULU8 mode.
  if (AllPositive && MinSignBits >= 24) {
    Mode = ShrinkMode::MULU8;
    return true;
  }
  // Ranges from -32768 ~ 32767: MULS16 mode.
  if (MinSignBits >= 17) {
    Mode = ShrinkMode::MULS16;
    return true;
  }
  // Ranges from 0 ~ 65535: MULU16 mode.
  if (AllPositive && MinSignBits >= 16) {
    Mode = ShrinkMode::MULU16;
    return true;
  }
  return false;
}

41808

41809

/// When the operands of vector mul are extended from smaller size values,

41810

/// like i8 and i16, the type of mul may be shrinked to generate more

41811

/// efficient code. Two typical patterns are handled:

41812

/// Pattern1:

41813

/// %2 = sext/zext <N x i8> %1 to <N x i32>

41814

/// %4 = sext/zext <N x i8> %3 to <N x i32>

41815

// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)

41816

/// %5 = mul <N x i32> %2, %4

41817

///

41818

/// Pattern2:

41819

/// %2 = zext/sext <N x i16> %1 to <N x i32>

41820

/// %4 = zext/sext <N x i16> %3 to <N x i32>

41821

/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)

41822

/// %5 = mul <N x i32> %2, %4

41823

///

41824

/// There are four mul shrinking modes:

41825

/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is

41826

/// -128 to 128, and the scalar value range of %4 is also -128 to 128,

41827

/// generate pmullw+sext32 for it (MULS8 mode).

41828

/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is

41829

/// 0 to 255, and the scalar value range of %4 is also 0 to 255,

41830

/// generate pmullw+zext32 for it (MULU8 mode).

41831

/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is

41832

/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,

41833

/// generate pmullw+pmulhw for it (MULS16 mode).

41834

/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is

41835

/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,

41836

/// generate pmullw+pmulhuw for it (MULU16 mode).

41837

static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,

41838

const X86Subtarget &Subtarget) {

41839

// Check for legality

41840

// pmullw/pmulhw are not supported by SSE.

41841

if (!Subtarget.hasSSE2())

41842

return SDValue();

41843

41844

// Check for profitability

41845

// pmulld is supported since SSE41. It is better to use pmulld

41846

// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than

41847

// the expansion.

41848

bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();

41849

if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))

41850

return SDValue();

41851

41852

ShrinkMode Mode;

41853

if (!canReduceVMulWidth(N, DAG, Mode))

41854

return SDValue();

41855

41856

SDLoc DL(N);

41857

SDValue N0 = N->getOperand(0);

41858

SDValue N1 = N->getOperand(1);

41859

EVT VT = N->getOperand(0).getValueType();

41860

unsigned NumElts = VT.getVectorNumElements();

41861

if ((NumElts % 2) != 0)

41862

return SDValue();

41863

41864

EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

41865

41866

// Shrink the operands of mul.

41867

SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);

41868

SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

41869

41870

// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the

41871

// lower part is needed.

41872

SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);

41873

if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)

41874

return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND

41875

: ISD::SIGN_EXTEND,

41876

DL, VT, MulLo);

41877

41878

EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

41879

// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,

41880

// the higher part is also needed.

41881

SDValue MulHi =

41882

DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,

41883

ReducedVT, NewN0, NewN1);

41884

41885

// Repack the lower part and higher part result of mul into a wider

41886

// result.

41887

// Generate shuffle functioning as punpcklwd.

41888

SmallVector<int, 16> ShuffleMask(NumElts);

41889

for (unsigned i = 0, e = NumElts / 2; i < e; i++) {

41890

ShuffleMask[2 * i] = i;

41891

ShuffleMask[2 * i + 1] = i + NumElts;

41892

}

41893

SDValue ResLo =

41894

DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);

41895

ResLo = DAG.getBitcast(ResVT, ResLo);

41896

// Generate shuffle functioning as punpckhwd.

41897

for (unsigned i = 0, e = NumElts / 2; i < e; i++) {

41898

ShuffleMask[2 * i] = i + NumElts / 2;

41899

ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;

41900

}

41901

SDValue ResHi =

41902

DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);

41903

ResHi = DAG.getBitcast(ResVT, ResHi);

41904

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);

41905

}

41906

41907

/// Expand a multiply of N's first operand by one of a handful of specific
/// constants \p MulAmt into a short MUL_IMM/SHL/ADD/SUB sequence.
///
/// Returns an empty SDValue when \p MulAmt is not one of the handled values.
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, const SDLoc &DL) {
  SDValue X = N->getOperand(0);

  // Builds ((X * Mult) << Shift) +/- X.
  auto MulShlThenAddOrSub = [&](int Mult, int Shift, bool isAdd) {
    SDValue Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, X,
                                 DAG.getConstant(Mult, DL, VT));
    SDValue Shifted = DAG.getNode(ISD::SHL, DL, VT, Scaled,
                                  DAG.getConstant(Shift, DL, MVT::i8));
    return DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Shifted, X);
  };

  // Builds ((X * Mul1) * Mul2) +/- X.
  auto MulMulThenAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
    SDValue First = DAG.getNode(X86ISD::MUL_IMM, DL, VT, X,
                                DAG.getConstant(Mul1, DL, VT));
    SDValue Second = DAG.getNode(X86ISD::MUL_IMM, DL, VT, First,
                                 DAG.getConstant(Mul2, DL, VT));
    return DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Second, X);
  };

  switch (MulAmt) {
  default:
    break;

  // ((x * 5) << k) + x
  case 11: // 5 * 2 + 1
    return MulShlThenAddOrSub(5, 1, /*isAdd*/ true);
  case 21: // 5 * 4 + 1
    return MulShlThenAddOrSub(5, 2, /*isAdd*/ true);
  case 41: // 5 * 8 + 1
    return MulShlThenAddOrSub(5, 3, /*isAdd*/ true);
  case 22: { // (5 * 4 + 1) + 1
    SDValue Times21 = MulShlThenAddOrSub(5, 2, /*isAdd*/ true);
    return DAG.getNode(ISD::ADD, DL, VT, X, Times21);
  }

  // ((x * 9) << k) + x
  case 19: // 9 * 2 + 1
    return MulShlThenAddOrSub(9, 1, /*isAdd*/ true);
  case 37: // 9 * 4 + 1
    return MulShlThenAddOrSub(9, 2, /*isAdd*/ true);
  case 73: // 9 * 8 + 1
    return MulShlThenAddOrSub(9, 3, /*isAdd*/ true);

  // ((x * 3) << k) +/- x
  case 13: // 3 * 4 + 1
    return MulShlThenAddOrSub(3, 2, /*isAdd*/ true);
  case 23: // 3 * 8 - 1
    return MulShlThenAddOrSub(3, 3, /*isAdd*/ false);

  // ((x * a) * b) + x
  case 26: // 5 * 5 + 1
    return MulMulThenAddOrSub(5, 5, /*isAdd*/ true);
  case 28: // 9 * 3 + 1
    return MulMulThenAddOrSub(9, 3, /*isAdd*/ true);
  case 29: { // (9 * 3 + 1) + 1
    SDValue Times28 = MulMulThenAddOrSub(9, 3, /*isAdd*/ true);
    return DAG.getNode(ISD::ADD, DL, VT, X, Times28);
  }
  }

  // Another trick: a power of 2 plus 2/4/8 can be done with a shift followed
  // by a single LEA. Clearing the lowest set bit must leave a single bit
  // (i.e. the amount is a sum of two powers of 2); the number of trailing
  // zeros then gives the scale of the smaller term.
  // TODO: We can do this even without LEA at a cost of two shifts and an add.
  const uint64_t UpperBit = MulAmt & (MulAmt - 1);
  if (!isPowerOf2_64(UpperBit))
    return SDValue();

  const unsigned ScaleShift = countTrailingZeros(MulAmt);
  if (ScaleShift < 1 || ScaleShift >= 4)
    return SDValue();

  const unsigned ShiftAmt = Log2_64(UpperBit);
  SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X,
                               DAG.getConstant(ShiftAmt, DL, MVT::i8));
  SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, X,
                               DAG.getConstant(ScaleShift, DL, MVT::i8));
  return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}

41992

41993

// If the upper 17 bits of each element are zero then we can use PMADDWD,

41994

// which is always at least as quick as PMULLD, except on KNL.

41995

static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,

41996

const X86Subtarget &Subtarget) {

41997

if (!Subtarget.hasSSE2())

41998

return SDValue();

41999

42000

if (Subtarget.isPMADDWDSlow())

42001

return SDValue();

42002

42003

EVT VT = N->getValueType(0);

42004

42005

// Only support vXi32 vectors.

42006

if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)

42007

return SDValue();

42008

42009

// Make sure the type is legal or will be widened to a legal type.

42010

if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))

42011

return SDValue();

42012

42013

MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());

42014

42015

// Without BWI, we would need to split v32i16.

42016

if (WVT == MVT::v32i16 && !Subtarget.hasBWI())

42017

return SDValue();

42018

42019

SDValue N0 = N->getOperand(0);

42020

SDValue N1 = N->getOperand(1);

42021

42022

// If we are zero extending two steps without SSE4.1, its better to reduce

42023

// the vmul width instead.

42024

if (!Subtarget.hasSSE41() &&

42025

(N0.getOpcode() == ISD::ZERO_EXTEND &&

42026

N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&

42027

(N1.getOpcode() == ISD::ZERO_EXTEND &&

42028

N1.getOperand(0).getScalarValueSizeInBits() <= 8))

42029

return SDValue();

42030

42031

APInt Mask17 = APInt::getHighBitsSet(32, 17);

42032

if (!DAG.MaskedValueIsZero(N1, Mask17) ||

42033

!DAG.MaskedValueIsZero(N0, Mask17))

42034

return SDValue();

42035

42036

// Use SplitOpsAndApply to handle AVX splitting.

42037

auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

42038

ArrayRef<SDValue> Ops) {

42039

MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);

42040

return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);

42041

};

42042

return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,

42043

{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },

42044

PMADDWDBuilder);

42045

}

42046

42047

/// Try to lower a vXi64 multiply to a single PMULDQ or PMULUDQ when the
/// operands are known to fit in the low 32 bits of each element. Returns an
/// empty SDValue when neither form applies.
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only vXi64 vectors with a power-of-two element count (>= 2) qualify.
  const bool IsSupportedVT =
      VT.isVector() && VT.getVectorElementType() == MVT::i64 &&
      VT.getVectorNumElements() >= 2 &&
      isPowerOf2_32(VT.getVectorNumElements());
  if (!IsSupportedVT)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Emits Opc on (N0, N1), letting SplitOpsAndApply split the operands; each
  // piece keeps the value type of the operands it is given.
  auto EmitSplitMul = [&](unsigned Opc) {
    auto Builder = [Opc](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
      return DAG.getNode(Opc, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, Builder,
                            /*CheckBWI*/false);
  };

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32 bits, so it is usable if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
      DAG.ComputeNumSignBits(N1) > 32)
    return EmitSplitMul(X86ISD::PMULDQ);

  // With the upper halves known zero, a single pmuludq does the job.
  APInt UpperHalf = APInt::getHighBitsSet(64, 32);
  if (DAG.MaskedValueIsZero(N0, UpperHalf) &&
      DAG.MaskedValueIsZero(N1, UpperHalf))
    return EmitSplitMul(X86ISD::PMULUDQ);

  return SDValue();
}

42088

42089

/// Optimize a single multiply with constant into two operations in order to

42090

/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.

42091

static SDValue combineMul(SDNode *N, SelectionDAG &DAG,

42092

TargetLowering::DAGCombinerInfo &DCI,

42093

const X86Subtarget &Subtarget) {

42094

EVT VT = N->getValueType(0);

42095

42096

if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))

42097

return V;

42098

42099

if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))

42100

return V;

42101

42102

if (DCI.isBeforeLegalize() && VT.isVector())

42103

return reduceVMULWidth(N, DAG, Subtarget);

42104

42105

if (!MulConstantOptimization)

42106

return SDValue();

42107

// An imul is usually smaller than the alternative sequence.

42108

if (DAG.getMachineFunction().getFunction().hasMinSize())

42109

return SDValue();

42110

42111

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())

42112

return SDValue();

42113

42114

if (VT != MVT::i64 && VT != MVT::i32)

42115

return SDValue();

42116

42117

ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

42118

if (!C)

42119

return SDValue();

42120

if (isPowerOf2_64(C->getZExtValue()))

42121

return SDValue();

42122

42123

int64_t SignMulAmt = C->getSExtValue();

42124

assert(SignMulAmt != INT64_MIN && "Int min should have been handled!")((SignMulAmt != (-9223372036854775807L -1) && "Int min should have been handled!"
) ? static_cast<void> (0) : __assert_fail ("SignMulAmt != INT64_MIN && \"Int min should have been handled!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42124, __PRETTY_FUNCTION__));

42125

uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;

42126

42127

SDLoc DL(N);

42128

if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {

42129

SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),

42130

DAG.getConstant(AbsMulAmt, DL, VT));

42131

if (SignMulAmt < 0)

42132

NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),

42133

NewMul);

42134

42135

return NewMul;

42136

}

42137

42138

uint64_t MulAmt1 = 0;

42139

uint64_t MulAmt2 = 0;

42140

if ((AbsMulAmt % 9) == 0) {

42141

MulAmt1 = 9;

42142

MulAmt2 = AbsMulAmt / 9;

42143

} else if ((AbsMulAmt % 5) == 0) {

42144

MulAmt1 = 5;

42145

MulAmt2 = AbsMulAmt / 5;

42146

} else if ((AbsMulAmt % 3) == 0) {

42147

MulAmt1 = 3;

42148

MulAmt2 = AbsMulAmt / 3;

42149

}

42150

42151

SDValue NewMul;

42152

// For negative multiply amounts, only allow MulAmt2 to be a power of 2.

42153

if (MulAmt2 &&

42154

(isPowerOf2_64(MulAmt2) ||

42155

(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {

42156

42157

if (isPowerOf2_64(MulAmt2) &&

42158

!(SignMulAmt >= 0 && N->hasOneUse() &&

42159

N->use_begin()->getOpcode() == ISD::ADD))

42160

// If second multiplifer is pow2, issue it first. We want the multiply by

42161

// 3, 5, or 9 to be folded into the addressing mode unless the lone use

42162

// is an add. Only do this for positive multiply amounts since the

42163

// negate would prevent it from being used as an address mode anyway.

42164

std::swap(MulAmt1, MulAmt2);

42165

42166

if (isPowerOf2_64(MulAmt1))

42167

NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

42168

DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));

42169

else

42170

NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),

42171

DAG.getConstant(MulAmt1, DL, VT));

42172

42173

if (isPowerOf2_64(MulAmt2))

42174

NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,

42175

DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));

42176

else

42177

NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,

42178

DAG.getConstant(MulAmt2, DL, VT));

42179

42180

// Negate the result.

42181

if (SignMulAmt < 0)

42182

NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),

42183

NewMul);

42184

} else if (!Subtarget.slowLEA())

42185

NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);

42186

42187

if (!NewMul) {

42188

assert(C->getZExtValue() != 0 &&((C->getZExtValue() != 0 && C->getZExtValue() !=
(VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? static_cast<void> (0) : __assert_fail
("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42191, __PRETTY_FUNCTION__))

42189

C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&((C->getZExtValue() != 0 && C->getZExtValue() !=
(VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? static_cast<void> (0) : __assert_fail
("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42191, __PRETTY_FUNCTION__))

42190

"Both cases that could cause potential overflows should have "((C->getZExtValue() != 0 && C->getZExtValue() !=
(VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? static_cast<void> (0) : __assert_fail
("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42191, __PRETTY_FUNCTION__))

42191

"already been handled.")((C->getZExtValue() != 0 && C->getZExtValue() !=
(VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? static_cast<void> (0) : __assert_fail
("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42191, __PRETTY_FUNCTION__));

42192

if (isPowerOf2_64(AbsMulAmt - 1)) {

42193

// (mul x, 2^N + 1) => (add (shl x, N), x)

42194

NewMul = DAG.getNode(

42195

ISD::ADD, DL, VT, N->getOperand(0),

42196

DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

42197

DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,

42198

MVT::i8)));

42199

// To negate, subtract the number from zero

42200

if (SignMulAmt < 0)

42201

NewMul = DAG.getNode(ISD::SUB, DL, VT,

42202

DAG.getConstant(0, DL, VT), NewMul);

42203

} else if (isPowerOf2_64(AbsMulAmt + 1)) {

42204

// (mul x, 2^N - 1) => (sub (shl x, N), x)

42205

NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

42206

DAG.getConstant(Log2_64(AbsMulAmt + 1),

42207

DL, MVT::i8));

42208

// To negate, reverse the operands of the subtract.

42209

if (SignMulAmt < 0)

42210

NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);

42211

else

42212

NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));

42213

} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {

42214

// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)

42215

NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

42216

DAG.getConstant(Log2_64(AbsMulAmt - 2),

42217

DL, MVT::i8));

42218

NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));

42219

NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));

42220

} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {

42221

// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)

42222

NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

42223

DAG.getConstant(Log2_64(AbsMulAmt + 2),

42224

DL, MVT::i8));

42225

NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));

42226

NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));

42227

}

42228

}

42229

42230

return NewMul;

42231

}

42232

42233

// Try to form a MULHU or MULHS node by looking for

42234

// (srl (mul ext, ext), 16)

42235

// TODO: This is X86 specific because we want to be able to handle wide types

42236

// before type legalization. But we can only do it if the vector will be

42237

// legalized via widening/splitting. Type legalization can't handle promotion

42238

// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG

42239

// combiner.

42240

static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,

42241

const X86Subtarget &Subtarget) {

42242

assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&(((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::
SRA) && "SRL or SRA node is required here!") ? static_cast
<void> (0) : __assert_fail ("(N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && \"SRL or SRA node is required here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42243, __PRETTY_FUNCTION__))

42243

"SRL or SRA node is required here!")(((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::
SRA) && "SRL or SRA node is required here!") ? static_cast
<void> (0) : __assert_fail ("(N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && \"SRL or SRA node is required here!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42243, __PRETTY_FUNCTION__));

42244

SDLoc DL(N);

42245

42246

// Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand

42247

// the multiply.

42248

if (!Subtarget.hasSSE41())

42249

return SDValue();

42250

42251

// The operation feeding into the shift must be a multiply.

42252

SDValue ShiftOperand = N->getOperand(0);

42253

if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())

42254

return SDValue();

42255

42256

// Input type should be at least vXi32.

42257

EVT VT = N->getValueType(0);

42258

if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)

42259

return SDValue();

42260

42261

// Need a shift by 16.

42262

APInt ShiftAmt;

42263

if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||

42264

ShiftAmt != 16)

42265

return SDValue();

42266

42267

SDValue LHS = ShiftOperand.getOperand(0);

42268

SDValue RHS = ShiftOperand.getOperand(1);

42269

42270

unsigned ExtOpc = LHS.getOpcode();

42271

if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||

42272

RHS.getOpcode() != ExtOpc)

42273

return SDValue();

42274

42275

// Peek through the extends.

42276

LHS = LHS.getOperand(0);

42277

RHS = RHS.getOperand(0);

42278

42279

// Ensure the input types match.

42280

EVT MulVT = LHS.getValueType();

42281

if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)

42282

return SDValue();

42283

42284

unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;

42285

SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);

42286

42287

ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

42288

return DAG.getNode(ExtOpc, DL, VT, Mulh);

42289

}

42290

42291

// Target combine for ISD::SHL. Performs two folds:
//   (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
//   (shl V, splat 1)              -> (add V, V)
// Returns an empty SDValue when neither applies.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(N1);

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  const bool IsScalarShlOfAndConst =
      VT.isInteger() && !VT.isVector() && ShAmtC &&
      N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant;
  if (IsScalarShlOfAndConst) {
    SDValue AndLHS = N0.getOperand(0);
    APInt Mask = N0.getConstantOperandAPInt(1);
    Mask <<= ShAmtC->getAPIntValue();

    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    bool MaskOK = false;
    switch (AndLHS.getOpcode()) {
    case X86ISD::SETCC_CARRY:
      MaskOK = true;
      break;
    case ISD::SIGN_EXTEND:
      MaskOK = AndLHS.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY;
      break;
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      // The shifted mask must still fit in the width of the setcc_c itself.
      MaskOK = AndLHS.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
               Mask.isIntN(AndLHS.getOperand(0).getValueSizeInBits());
      break;
    default:
      break;
    }

    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      SDValue MaskC = DAG.getConstant(Mask, DL, VT);
      return DAG.getNode(ISD::AND, DL, VT, AndLHS, MaskC);
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  auto *ShAmtBV = dyn_cast<BuildVectorSDNode>(N1);
  if (!ShAmtBV)
    return SDValue();

  auto *ShAmtSplat = ShAmtBV->getConstantSplatNode();
  if (!ShAmtSplat)
    return SDValue();

  assert(N0.getValueType().isVector() && "Invalid vector shift type");

  // We shift all of the values by one. In many cases we do not have
  // hardware support for this operation. This is better expressed as an ADD
  // of two values.
  if (!ShAmtSplat->isOne())
    return SDValue();

  return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}

42349

42350

// Target combine for ISD::SRA. First tries the PMULH fold, then rewrites a
// scalar (ashr (shl a, C1), C2) whose shl leaves an i8/i16/i32 in the top
// bits as a sign_extend_inreg followed by at most one residual shift.
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  if (SDValue PMulH = combineShiftToPMULH(N, DAG, Subtarget))
    return PMulH;

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on sign of (SarConst - [56,48,32,24,16])

  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have 2 advantages to a SHIFT:
  // 1. MOVs can write to a register that differs from source
  // 2. MOVs accept memory operands

  const bool IsScalarSraOfShlByConsts =
      !VT.isVector() && N1.getOpcode() == ISD::Constant &&
      N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
      N0.getOperand(1).getOpcode() == ISD::Constant;
  if (!IsScalarSraOfShlByConsts)
    return SDValue();

  SDValue ShlSrc = N0.getOperand(0);
  SDValue ShlAmtOp = N0.getOperand(1);
  APInt ShlAmt = cast<ConstantSDNode>(ShlAmtOp)->getAPIntValue();
  APInt SarAmt = cast<ConstantSDNode>(N1)->getAPIntValue();
  EVT AmtVT = N1.getValueType();

  if (SarAmt.isNegative())
    return SDValue();

  for (MVT NarrowVT : {MVT::i8, MVT::i16, MVT::i32}) {
    unsigned NarrowBits = NarrowVT.getSizeInBits();
    // skipping types without corresponding sext/zext and
    // ShlConst that is not one of [56,48,32,24,16]
    if (NarrowBits >= Size)
      continue;
    unsigned HighBits = Size - NarrowBits;
    if (ShlAmt != HighBits)
      continue;

    SDLoc DL(N);
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, ShlSrc,
                               DAG.getValueType(NarrowVT));

    // Whatever shift remains once the shl has been absorbed by the sext.
    APInt Residual = SarAmt - HighBits;
    if (Residual == 0)
      return SExt;
    if (Residual.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, SExt,
                         DAG.getConstant(-Residual, DL, AmtVT));
    return DAG.getNode(ISD::SRA, DL, VT, SExt,
                       DAG.getConstant(Residual, DL, AmtVT));
  }

  return SDValue();
}

42406

42407

// Target combine for ISD::SRL. First tries the PMULH fold; then, once the DAG
// has been legalized, swaps srl (and X, C1), C2 into
// and (srl X, C2), (C1 >> C2) when that shrinks the mask constant.
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  if (SDValue PMulH = combineShiftToPMULH(N, DAG, Subtarget))
    return PMulH;

  // Only do this on the last DAG combine as it can interfere with other
  // combines.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  const bool IsSingleUseAnd = N0.getOpcode() == ISD::AND && N0.hasOneUse();
  if (!IsSingleUseAnd)
    return SDValue();

  auto *ShAmtNode = dyn_cast<ConstantSDNode>(N1);
  auto *MaskNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!(ShAmtNode && MaskNode))
    return SDValue();

  // If we can shrink the constant mask below 8-bits or 32-bits, then this
  // transform should reduce code size. It may also enable secondary transforms
  // from improved known-bits analysis or instruction selection.
  APInt OldMask = MaskNode->getAPIntValue();

  // If this can be matched by a zero extend, don't optimize.
  if (OldMask.isMask()) {
    unsigned LowOnes = OldMask.countTrailingOnes();
    if (LowOnes >= 8 && isPowerOf2_32(LowOnes))
      return SDValue();
  }

  APInt NewMask = OldMask.lshr(ShAmtNode->getAPIntValue());
  unsigned OldBits = OldMask.getMinSignedBits();
  unsigned NewBits = NewMask.getMinSignedBits();
  const bool NowFitsIn8 = OldBits > 8 && NewBits <= 8;
  const bool NowFitsIn32 = OldBits > 32 && NewBits <= 32;
  if (!NowFitsIn8 && !NowFitsIn32)
    return SDValue();

  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
  SDLoc DL(N);
  SDValue NewMaskC = DAG.getConstant(NewMask, DL, VT);
  SDValue Shifted = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
  return DAG.getNode(ISD::AND, DL, VT, Shifted, NewMaskC);
}

42459

42460

// Shared helper for the HADD/FHADD/HSUB/FHSUB/PACKSS/PACKUS combines: tries
// to move shuffles that feed the horizontal op so that they are applied to
// its result instead. Returns an empty SDValue when no fold matches.
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
          X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
          X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected hadd/hsub/pack opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT SrcVT = N0.getValueType();

  // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
  // truncation trees that help us avoid lane crossing shuffles.
  // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
  // TODO: We don't handle vXf64 shuffles yet.
  const bool IsLoHiSplitOf256 =
      N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N0.getConstantOperandAPInt(1) == 0 &&
      N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
      N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
      N0.getOperand(0).getValueType().is256BitVector() &&
      SrcVT.getScalarSizeInBits() <= 32;
  if (IsLoHiSplitOf256) {
    // TODO - support target/faux shuffles.
    SDValue Wide = peekThroughBitcasts(N0.getOperand(0));
    auto *WideShuf = dyn_cast<ShuffleVectorSDNode>(Wide);
    // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
    // shuffle to a vXi64 width - we can probably relax this in the future.
    SmallVector<int, 4> ScaledMask;
    if (WideShuf && WideShuf->getOperand(1).isUndef() &&
        scaleShuffleElements(WideShuf->getMask(), 4, ScaledMask)) {
      SDLoc DL(N);
      MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
      SDValue LoHalf, HiHalf;
      std::tie(LoHalf, HiHalf) = DAG.SplitVector(WideShuf->getOperand(0), DL);
      LoHalf = DAG.getBitcast(N0.getValueType(), LoHalf);
      HiHalf = DAG.getBitcast(N1.getValueType(), HiHalf);
      SDValue HOp = DAG.getNode(Opcode, DL, VT, LoHalf, HiHalf);
      SDValue HOpCast = DAG.getBitcast(ShufVT, HOp);
      SDValue Shuffled =
          DAG.getVectorShuffle(ShufVT, DL, HOpCast, HOpCast, ScaledMask);
      return DAG.getBitcast(VT, Shuffled);
    }
  }

  // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
  // TODO: Merge with binary shuffle folds below.
  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
    int PostShuffle[4] = {0, 1, 2, 3};

    // If the op is an unary shuffle that can scale to v2x64,
    // then we can perform this as a v4x32 post shuffle.
    // On success the two PostShuffle slots starting at Offset are updated and
    // the shuffle's input is returned; otherwise an empty SDValue is returned
    // and PostShuffle is left alone.
    auto AdjustOp = [&](SDValue V, int Offset) -> SDValue {
      auto *Shuf = dyn_cast<ShuffleVectorSDNode>(V);
      SmallVector<int, 2> HalfMask;
      const bool CanFold = Shuf && Shuf->getOperand(1).isUndef() &&
                           scaleShuffleElements(Shuf->getMask(), 2, HalfMask) &&
                           N->isOnlyUserOf(V.getNode());
      if (!CanFold)
        return SDValue();
      for (int Half = 0; Half != 2; ++Half) {
        int M = HalfMask[Half];
        PostShuffle[Offset + Half] = M < 0 ? -1 : Offset + M;
      }
      return Shuf->getOperand(0);
    };

    SDValue Src0 = AdjustOp(N0, 0);
    SDValue Src1 = AdjustOp(N1, 2);
    if (Src0 || Src1) {
      // Any side that was not a foldable shuffle keeps its original operand.
      if (!Src0)
        Src0 = N0;
      if (!Src1)
        Src1 = N1;
      SDLoc DL(N);
      MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
      SDValue HOp = DAG.getNode(Opcode, DL, VT, Src0, Src1);
      SDValue HOpCast = DAG.getBitcast(ShufVT, HOp);
      SDValue Shuffled =
          DAG.getVectorShuffle(ShufVT, DL, HOpCast, HOpCast, PostShuffle);
      return DAG.getBitcast(VT, Shuffled);
    }
  }

  // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
  // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
  if (VT.is256BitVector() && Subtarget.hasInt256()) {
    auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(N0);
    auto *Shuf1 = Shuf0 ? dyn_cast<ShuffleVectorSDNode>(N1) : nullptr;
    SmallVector<int, 2> LaneMask0, LaneMask1;
    if (Shuf0 && Shuf1 &&
        scaleShuffleElements(Shuf0->getMask(), 2, LaneMask0) &&
        scaleShuffleElements(Shuf1->getMask(), 2, LaneMask1)) {
      SDValue In00 = Shuf0->getOperand(0);
      SDValue In01 = Shuf0->getOperand(1);
      SDValue In10 = Shuf1->getOperand(0);
      SDValue In11 = Shuf1->getOperand(1);

      // If the second shuffle has the same inputs in the opposite order,
      // commute it so both shuffles agree on operand order.
      if (In00 == In11 && In01 == In10) {
        std::swap(In10, In11);
        ShuffleVectorSDNode::commuteMask(LaneMask1);
      }

      if (In00 == In10 && In01 == In11) {
        SmallVector<int, 4> PostMask(LaneMask0.begin(), LaneMask0.end());
        PostMask.append(LaneMask1.begin(), LaneMask1.end());
        SDLoc DL(N);
        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
        SDValue HOp = DAG.getNode(Opcode, DL, VT, In00, In01);
        SDValue HOpCast = DAG.getBitcast(ShufVT, HOp);
        SDValue Shuffled =
            DAG.getVectorShuffle(ShufVT, DL, HOpCast, HOpCast, PostMask);
        return DAG.getBitcast(VT, Shuffled);
      }
    }
  }

  return SDValue();
}

42574

42575

// Target combine for X86ISD::PACKSS / X86ISD::PACKUS. Tries, in order:
// constant folding, hoisting shuffles past the pack, merging with a truncate
// on AVX512, turning PACK(EXTEND(X),EXTEND(Y)) into a concat, and finally the
// recursive shuffle combiner.
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned NumDstElts = VT.getVectorNumElements();
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  bool IsSigned = (X86ISD::PACKSS == Opcode);

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        // Within each 128-bit lane the low half of the result comes from the
        // first operand and the high half from the second.
        const bool FromSecondOp = Elt >= NumSrcEltsPerLane;
        const APInt &SrcUndefs = FromSecondOp ? UndefElts1 : UndefElts0;
        const SmallVector<APInt, 32> &SrcBits =
            FromSecondOp ? EltBits1 : EltBits0;
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        unsigned DstIdx = Lane * NumDstEltsPerLane + Elt;

        if (SrcUndefs[SrcIdx]) {
          Undefs.setBit(DstIdx);
          continue;
        }

        const APInt &Src = SrcBits[SrcIdx];
        APInt &Dst = Bits[DstIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Src.isSignedIntN(DstBitsPerElt))
            Dst = Src.trunc(DstBitsPerElt);
          else if (Src.isNegative())
            Dst = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Dst = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Src.isIntN(DstBitsPerElt))
            Dst = Src.trunc(DstBitsPerElt);
          else if (Src.isNegative())
            Dst = APInt::getNullValue(DstBitsPerElt);
          else
            Dst = APInt::getAllOnesValue(DstBitsPerElt);
        }
      }
    }

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
  if (SDValue Hoisted = combineHorizOpWithShuffle(N, DAG, Subtarget))
    return Hoisted;

  // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
  // truncate to create a larger truncate.
  if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE &&
      N1.isUndef() && VT == MVT::v16i8 &&
      N0.getOperand(0).getValueType() == MVT::v8i32) {
    // The pack must not actually saturate: the bits it drops have to be sign
    // bits (PACKSS) or known zero (PACKUS).
    const bool DroppedBitsRedundant =
        IsSigned ? DAG.ComputeNumSignBits(N0) > 8
                 : DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8));
    if (DroppedBitsRedundant) {
      SDLoc dl(N);
      if (Subtarget.hasVLX())
        return DAG.getNode(X86ISD::VTRUNC, dl, VT, N0.getOperand(0));

      // Widen input to v16i32 so we can truncate that.
      SDValue Widened =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, N0.getOperand(0),
                      DAG.getUNDEF(MVT::v8i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Widened);
    }
  }

  // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
  if (VT.is128BitVector()) {
    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

    // Returns the 64-bit source of a matching extend whose elements are
    // already the destination width, or an empty SDValue.
    auto GetPreExtSrc = [&](SDValue Op) -> SDValue {
      if (Op.getOpcode() != ExtOpc)
        return SDValue();
      if (!Op.getOperand(0).getValueType().is64BitVector())
        return SDValue();
      if (Op.getOperand(0).getScalarValueSizeInBits() != DstBitsPerElt)
        return SDValue();
      return Op.getOperand(0);
    };

    SDValue Src0 = GetPreExtSrc(N0);
    SDValue Src1 = GetPreExtSrc(N1);
    if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
      assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
      if (!Src0)
        Src0 = DAG.getUNDEF(Src1.getValueType());
      if (!Src1)
        Src1 = DAG.getUNDEF(Src0.getValueType());
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
    }
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
    return Res;

  return SDValue();
}

42700

42701

// Target combine for X86ISD::HADD/FHADD/HSUB/FHSUB. The only fold attempted
// is hoisting operand shuffles past the horizontal op.
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
          X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
         "Unexpected horizontal add/sub opcode");

  // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
  SDValue Hoisted = combineHorizOpWithShuffle(N, DAG, Subtarget);
  if (!Hoisted)
    return SDValue();

  return Hoisted;
}

42714

42715

// Target combine for X86ISD::VSHL/VSRA/VSRL (shift amount held in a vector
// operand). Folds a shift of zero, converts constant shift amounts to the
// matching shift-by-constant node, and otherwise tries to simplify based on
// the demanded vector elements.
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
          X86ISD::VSRL == N->getOpcode()) &&
         "Unexpected shift opcode");
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Shift zero -> zero.
  const bool SrcIsZero = ISD::isBuildVectorAllZeros(N0.getNode());
  if (SrcIsZero)
    return DAG.getConstant(0, SDLoc(N), VT);

  // Detect constant shift amounts.
  APInt AmtUndefs;
  SmallVector<APInt, 32> AmtBits;
  const bool AmtIsConst =
      getTargetConstantBitsFromNode(N1, 64, AmtUndefs, AmtBits, true, false);
  if (AmtIsConst) {
    // Only the first 64-bit element of the amount vector is used.
    unsigned ImmOpc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
    uint64_t ShiftAmt = AmtBits[0].getZExtValue();
    return getTargetVShiftByConstNode(ImmOpc, SDLoc(N), VT.getSimpleVT(), N0,
                                      ShiftAmt, DAG);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  APInt KnownUndef, KnownZero;
  const bool Simplified = TLI.SimplifyDemandedVectorElts(
      SDValue(N, 0), AllElts, KnownUndef, KnownZero, DCI);
  if (!Simplified)
    return SDValue();

  return SDValue(N, 0);
}

42747

42748

/// DAG combine for the immediate-count vector shifts X86ISD::VSHLI,
/// X86ISD::VSRLI and X86ISD::VSRAI.
///
/// \param N         The shift node. Operand 0 is the shifted vector, operand 1
///                  is the i8 constant shift amount.
/// \param DAG       Used to build replacement nodes.
/// \param DCI       Combiner state, forwarded to SimplifyDemandedBits.
/// \param Subtarget Forwarded to the recursive shuffle combiner.
/// \returns A replacement value, SDValue(N, 0) if SimplifyDemandedBits changed
///          the node, or an empty SDValue if no combine applied.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
         "Unexpected value type");
  assert(N->getOperand(1).getValueType() == MVT::i8 &&
         "Unexpected shift amount type");

  const bool IsLogical = Opcode != X86ISD::VSRAI;
  const SDLoc DL(N);

  // An out-of-range logical shift yields zero; an out-of-range arithmetic
  // shift splats the sign bit, which is the same as shifting by (width - 1).
  unsigned ShAmt = N->getConstantOperandVal(1);
  if (ShAmt >= NumBitsPerElt) {
    if (IsLogical)
      return DAG.getConstant(0, DL, VT);
    ShAmt = NumBitsPerElt - 1;
  }

  // (shift X, 0) -> X
  if (ShAmt == 0)
    return N0;

  // (shift 0, C) -> 0
  // The source may be all zeros or undef; the result is pinned to zero so
  // that the shifted-in bits are zeros rather than undef.
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return DAG.getConstant(0, DL, VT);

  // (VSRAI -1, C) -> -1
  // Same reasoning as above: the result is pinned to all ones, not undef.
  if (!IsLogical && ISD::isBuildVectorAllOnes(N0.getNode()))
    return DAG.getConstant(-1, DL, VT);

  // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
  if (N0.getOpcode() == Opcode) {
    unsigned Combined = ShAmt + N0.getConstantOperandVal(1);
    if (Combined >= NumBitsPerElt) {
      // Apply the same out-of-range rules to the summed amount.
      if (IsLogical)
        return DAG.getConstant(0, DL, VT);
      Combined = NumBitsPerElt - 1;
    }
    return DAG.getNode(Opcode, DL, VT, N0.getOperand(0),
                       DAG.getTargetConstant(Combined, DL, MVT::i8));
  }

  // Logical shifts by a whole number of bytes can be decoded as shuffles, so
  // give the shuffle combiner a chance at them.
  if (IsLogical && (ShAmt % 8) == 0)
    if (SDValue Res = combineX86ShufflesRecursively(SDValue(N, 0), DAG,
                                                    Subtarget))
      return Res;

  // Constant folding, only when this node is the sole user of the source.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (N->isOnlyUserOf(N0.getNode()) &&
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
    assert(EltBits.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    // Undef elements must fold to 0: SimplifyDemandedBits may have produced an
    // undef input because no input bits were demanded, yet users still expect
    // 0 in the remaining bits.
    for (unsigned Idx = 0, NumElts = EltBits.size(); Idx != NumElts; ++Idx) {
      APInt &Bits = EltBits[Idx];
      if (UndefElts[Idx]) {
        Bits = 0;
        continue;
      }
      switch (Opcode) {
      case X86ISD::VSHLI:
        Bits <<= ShAmt;
        break;
      case X86ISD::VSRAI:
        Bits.ashrInPlace(ShAmt);
        break;
      default:
        Bits.lshrInPlace(ShAmt);
        break;
      }
    }
    // Every undef element was zeroed above, so none remain undef.
    UndefElts = 0;
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, DL);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
    return SDValue(N, 0);

  return SDValue();
}

42844

42845

/// DAG combine for vector element insertion: X86ISD::PINSRB, X86ISD::PINSRW
/// and ISD::INSERT_VECTOR_ELT.
///
/// \param N         The insertion node.
/// \param DAG       Used to query target lowering and build nodes.
/// \param DCI       Combiner state; also consulted for the legalization phase.
/// \param Subtarget Forwarded to the recursive shuffle combiner.
/// \returns A replacement value, SDValue(N, 0) if SimplifyDemandedBits changed
///          the node, or an empty SDValue if no combine applied.
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
          (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
          N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
         "Unexpected vector insertion");

  // The demanded-bits simplification is applied to the X86-specific PINSR
  // nodes only, not to the generic INSERT_VECTOR_ELT.
  const unsigned Opc = N->getOpcode();
  const bool IsPInsert = Opc == X86ISD::PINSRB || Opc == X86ISD::PINSRW;
  if (IsPInsert) {
    const APInt AllBits = APInt::getAllOnesValue(VT.getScalarSizeInBits());
    if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(SDValue(N, 0),
                                                         AllBits, DCI))
      return SDValue(N, 0);
  }

  // Attempt to combine insertion patterns to a shuffle. This is tried only
  // for simple value types, after the DAG has been legalized.
  if (!VT.isSimple() || !DCI.isAfterLegalizeDAG())
    return SDValue();

  if (SDValue Res = combineX86ShufflesRecursively(SDValue(N, 0), DAG,
                                                  Subtarget))
    return Res;

  return SDValue();
}

42871

42872

/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs

42873

/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for

42874

/// OR -> CMPNEQSS.

42875

static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,

42876

TargetLowering::DAGCombinerInfo &DCI,

42877

const X86Subtarget &Subtarget) {

42878

unsigned opcode;

42879

42880

// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but

42881

// we're requiring SSE2 for both.

42882

if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {

42883

SDValue N0 = N->getOperand(0);

42884

SDValue N1 = N->getOperand(1);

42885

SDValue CMP0 = N0.getOperand(1);

42886

SDValue CMP1 = N1.getOperand(1);

42887

SDLoc DL(N);

42888

42889

// The SETCCs should both refer to the same CMP.

42890

if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)

42891

return SDValue();

42892

42893

SDValue CMP00 = CMP0->getOperand(0);

42894

SDValue CMP01 = CMP0->getOperand(1);

42895

EVT VT = CMP00.getValueType();

42896

42897

if (VT == MVT::f32 || VT == MVT::f64) {

42898

bool ExpectingFlags = false;

42899

// Check for any users that want flags:

42900

for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();

42901

!ExpectingFlags && UI != UE; ++UI)

42902

switch (UI->getOpcode()) {

42903

default:

42904

case ISD::BR_CC:

42905

case ISD::BRCOND:

42906

case ISD::SELECT:

42907

ExpectingFlags = true;

42908

break;

42909

case ISD::CopyToReg:

42910

case ISD::SIGN_EXTEND:

42911

case ISD::ZERO_EXTEND:

42912

case ISD::ANY_EXTEND:

42913

break;

42914

}

42915

42916

if (!ExpectingFlags) {

42917

enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);

42918

enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

42919

42920

if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {

42921

X86::CondCode tmp = cc0;

42922

cc0 = cc1;

42923

cc1 = tmp;

42924

}

42925

42926

if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||

42927

(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {

42928

// FIXME: need symbolic constants for these magic numbers.

42929

// See X86ATTInstPrinter.cpp:printSSECC().

42930

unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;

42931

if (Subtarget.hasAVX512()) {

42932

SDValue FSetCC =

42933

DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,

42934

DAG.getTargetConstant(x86cc, DL, MVT::i8));

42935

// Need to fill with zeros to ensure the bitcast will produce zeroes

42936

// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.

42937

SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,

42938

DAG.getConstant(0, DL, MVT::v16i1),

42939

FSetCC, DAG.getIntPtrConstant(0, DL));

42940

return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,

42941

N->getSimpleValueType(0));

42942

}

42943

SDValue OnesOrZeroesF =

42944

DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,

42945

CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));

42946

42947

bool is64BitFP = (CMP00.getValueType() == MVT::f64);

42948

MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

42949

42950

if (is64BitFP && !Subtarget.is64Bit()) {

42951

// On a 32-bit target, we cannot bitcast the 64-bit float to a

42952

// 64-bit integer, since that's not a legal type. Since

42953

// OnesOrZeroesF is all ones of all zeroes, we don't need all the

42954

// bits, but can do this little dance to extract the lowest 32 bits

42955

// and work with those going forward.

42956

SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,

42957

OnesOrZeroesF);

42958

SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);

42959

OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,

42960

Vector32, DAG.getIntPtrConstant(0, DL));

42961

IntVT = MVT::i32;

42962

}

42963

42964

SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);

42965

SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,

42966

DAG.getConstant(1, DL, IntVT));

42967

SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,

42968

ANDed);

42969

return OneBitOfTruth;

42970

}

42971

}

42972

}

42973

}

42974

return SDValue();

42975

}

42976

42977

/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).

42978

static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {

42979

assert(N->getOpcode() == ISD::AND)((N->getOpcode() == ISD::AND) ? static_cast<void> (0
) : __assert_fail ("N->getOpcode() == ISD::AND", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 42979, __PRETTY_FUNCTION__));

42980

42981

MVT VT = N->getSimpleValueType(0);

42982

if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())

42983

return SDValue();

42984

42985

SDValue X, Y;

42986

SDValue N0 = N->getOperand(0);

42987

SDValue N1 = N->getOperand(1);

42988

42989

auto GetNot = [&VT, &DAG](SDValue V) {

42990

// Basic X = NOT(Y) detection.

42991

if (SDValue Not = IsNOT(V, DAG))

42992

return Not;

42993

// Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).

42994

if (V.getOpcode() == X86ISD::VBROADCAST) {

42995

SDValue Src = V.getOperand(0);

42996

EVT SrcVT = Src.getValueType();

42997

if (!SrcVT.isVector())

42998

return SDValue();

42999

if (SDValue Not = IsNOT(Src, DAG))

43000

return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,

43001

DAG.getBitcast(SrcVT, Not));

43002

}

43003

return SDValue();

43004

};

43005

43006

if (SDValue Not = GetNot(N0)) {

43007

X = Not;

43008

Y = N1;

43009

} else if (SDValue Not = GetNot(N1)) {

43010

X = Not;

43011

Y = N0;

43012

} else

43013

return SDValue();

43014

43015

X = DAG.getBitcast(VT, X);

43016

Y = DAG.getBitcast(VT, Y);

43017

return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);

43018

}

43019

43020

// Try to widen AND, OR and XOR nodes to VT in order to remove casts around

43021

// logical operations, like in the example below.

43022

// or (and (truncate x, truncate y)),

43023

// (xor (truncate z, build_vector (constants)))

43024

// Given a target type \p VT, we generate

43025

// or (and x, y), (xor z, zext(build_vector (constants)))

43026

// given x, y and z are of type \p VT. We can do so, if operands are either

43027

// truncates from VT types, the second operand is a vector of constants or can

43028

// be recursively promoted.

43029

static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,

43030

unsigned Depth) {

43031

// Limit recursion to avoid excessive compile times.

43032

if (Depth >= SelectionDAG::MaxRecursionDepth)

43033

return SDValue();

43034

43035

if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&

43036

N->getOpcode() != ISD::OR)

43037

return SDValue();

43038

43039

SDValue N0 = N->getOperand(0);

43040

SDValue N1 = N->getOperand(1);

43041

SDLoc DL(N);

43042

43043

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

43044

if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))

43045

return SDValue();

43046

43047

if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))

43048

N0 = NN0;

43049

else {

43050

// The Left side has to be a trunc.

43051

if (N0.getOpcode() != ISD::TRUNCATE)

43052

return SDValue();

43053

43054

// The type of the truncated inputs.

43055

if (N0.getOperand(0).getValueType() != VT)

43056

return SDValue();

43057

43058

N0 = N0.getOperand(0);

43059

}

43060

43061

if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))

43062

N1 = NN1;

43063

else {

43064

// The right side has to be a 'trunc' or a constant vector.

43065

bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&

43066

N1.getOperand(0).getValueType() == VT;

43067

if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))

43068

return SDValue();

43069

43070

if (RHSTrunc)

43071

N1 = N1.getOperand(0);

43072

else

43073

N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

43074

}

43075

43076

return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);

43077

}

43078

43079

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized

43080

// register. In most cases we actually compare or select YMM-sized registers

43081

// and mixing the two types creates horrible code. This method optimizes

43082

// some of the transition sequences.

43083

// Even with AVX-512 this is still useful for removing casts around logical

43084

// operations on vXi1 mask types.

43085

static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,

43086

const X86Subtarget &Subtarget) {

43087

EVT VT = N->getValueType(0);

43088

assert(VT.isVector() && "Expected vector type")((VT.isVector() && "Expected vector type") ? static_cast
<void> (0) : __assert_fail ("VT.isVector() && \"Expected vector type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43088, __PRETTY_FUNCTION__));

43089

43090

SDLoc DL(N);

43091

assert((N->getOpcode() == ISD::ANY_EXTEND ||(((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() ==
ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) &&
"Invalid Node") ? static_cast<void> (0) : __assert_fail
("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43093, __PRETTY_FUNCTION__))

43092

N->getOpcode() == ISD::ZERO_EXTEND ||(((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() ==
ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) &&
"Invalid Node") ? static_cast<void> (0) : __assert_fail
("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43093, __PRETTY_FUNCTION__))

43093

N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node")(((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() ==
ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) &&
"Invalid Node") ? static_cast<void> (0) : __assert_fail
("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43093, __PRETTY_FUNCTION__));

43094

43095

SDValue Narrow = N->getOperand(0);

43096

EVT NarrowVT = Narrow.getValueType();

43097

43098

// Generate the wide operation.

43099

SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);

43100

if (!Op)

43101

return SDValue();

43102

switch (N->getOpcode()) {

43103

default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43103);

43104

case ISD::ANY_EXTEND:

43105

return Op;

43106

case ISD::ZERO_EXTEND:

43107

return DAG.getZeroExtendInReg(Op, DL, NarrowVT);

43108

case ISD::SIGN_EXTEND:

43109

return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,

43110

Op, DAG.getValueType(NarrowVT));

43111

}

43112

}

43113

43114

static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {

43115

unsigned FPOpcode;

43116

switch (Opcode) {

43117

default: llvm_unreachable("Unexpected input node for FP logic conversion")::llvm::llvm_unreachable_internal("Unexpected input node for FP logic conversion"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43117);

43118

case ISD::AND: FPOpcode = X86ISD::FAND; break;

43119

case ISD::OR: FPOpcode = X86ISD::FOR; break;

43120

case ISD::XOR: FPOpcode = X86ISD::FXOR; break;

43121

}

43122

return FPOpcode;

43123

}

43124

43125

/// If both input operands of a logic op are being cast from floating point

43126

/// types, try to convert this into a floating point logic node to avoid

43127

/// unnecessary moves from SSE to integer registers.

43128

static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,

43129

const X86Subtarget &Subtarget) {

43130

EVT VT = N->getValueType(0);

43131

SDValue N0 = N->getOperand(0);

43132

SDValue N1 = N->getOperand(1);

43133

SDLoc DL(N);

43134

43135

if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)

43136

return SDValue();

43137

43138

SDValue N00 = N0.getOperand(0);

43139

SDValue N10 = N1.getOperand(0);

43140

EVT N00Type = N00.getValueType();

43141

EVT N10Type = N10.getValueType();

43142

43143

// Ensure that both types are the same and are legal scalar fp types.

43144

if (N00Type != N10Type ||

43145

!((Subtarget.hasSSE1() && N00Type == MVT::f32) ||

43146

(Subtarget.hasSSE2() && N00Type == MVT::f64)))

43147

return SDValue();

43148

43149

unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());

43150

SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);

43151

return DAG.getBitcast(VT, FPLogic);

43152

}

43153

43154

// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))

43155

// to reduce XMM->GPR traffic.

43156

static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {

43157

unsigned Opc = N->getOpcode();

43158

assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&(((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
"Unexpected bit opcode") ? static_cast<void> (0) : __assert_fail
("(Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && \"Unexpected bit opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43159, __PRETTY_FUNCTION__))

43159

"Unexpected bit opcode")(((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
"Unexpected bit opcode") ? static_cast<void> (0) : __assert_fail
("(Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && \"Unexpected bit opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43159, __PRETTY_FUNCTION__));

43160

43161

SDValue N0 = N->getOperand(0);

43162

SDValue N1 = N->getOperand(1);

43163

43164

// Both operands must be single use MOVMSK.

43165

if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||

43166

N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())

43167

return SDValue();

43168

43169

SDValue Vec0 = N0.getOperand(0);

43170

SDValue Vec1 = N1.getOperand(0);

43171

EVT VecVT0 = Vec0.getValueType();

43172

EVT VecVT1 = Vec1.getValueType();

43173

43174

// Both MOVMSK operands must be from vectors of the same size and same element

43175

// size, but its OK for a fp/int diff.

43176

if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||

43177

VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())

43178

return SDValue();

43179

43180

SDLoc DL(N);

43181

unsigned VecOpc =

43182

VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;

43183

SDValue Result =

43184

DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));

43185

return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);

43186

}

43187

43188

/// If this is a zero/all-bits result that is bitwise-anded with a low bits

43189

/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'

43190

/// with a shift-right to eliminate loading the vector constant mask value.

43191

static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,

43192

const X86Subtarget &Subtarget) {

43193

SDValue Op0 = peekThroughBitcasts(N->getOperand(0));

43194

SDValue Op1 = peekThroughBitcasts(N->getOperand(1));

43195

EVT VT0 = Op0.getValueType();

43196

EVT VT1 = Op1.getValueType();

43197

43198

if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())

43199

return SDValue();

43200

43201

APInt SplatVal;

43202

if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||

43203

!SplatVal.isMask())

43204

return SDValue();

43205

43206

// Don't prevent creation of ANDN.

43207

if (isBitwiseNot(Op0))

43208

return SDValue();

43209

43210

if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))

43211

return SDValue();

43212

43213

unsigned EltBitWidth = VT0.getScalarSizeInBits();

43214

if (EltBitWidth != DAG.ComputeNumSignBits(Op0))

43215

return SDValue();

43216

43217

SDLoc DL(N);

43218

unsigned ShiftVal = SplatVal.countTrailingOnes();

43219

SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);

43220

SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);

43221

return DAG.getBitcast(N->getValueType(0), Shift);

43222

}

43223

43224

// Get the index node from the lowered DAG of a GEP IR instruction with one

43225

// indexing dimension.

43226

static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {

43227

if (Ld->isIndexed())

43228

return SDValue();

43229

43230

SDValue Base = Ld->getBasePtr();

43231

43232

if (Base.getOpcode() != ISD::ADD)

43233

return SDValue();

43234

43235

SDValue ShiftedIndex = Base.getOperand(0);

43236

43237

if (ShiftedIndex.getOpcode() != ISD::SHL)

43238

return SDValue();

43239

43240

return ShiftedIndex.getOperand(0);

43241

43242

}

43243

43244

static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {

43245

if (Subtarget.hasBMI2() && VT.isScalarInteger()) {

43246

switch (VT.getSizeInBits()) {

43247

default: return false;

43248

case 64: return Subtarget.is64Bit() ? true : false;

43249

case 32: return true;

43250

}

43251

}

43252

return false;

43253

}

43254

43255

// This function recognizes cases where X86 bzhi instruction can replace and

43256

// 'and-load' sequence.

43257

// In case of loading integer value from an array of constants which is defined

43258

// as follows:

43259

//

43260

// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}

43261

//

43262

// then applying a bitwise and on the result with another input.

43263

// It's equivalent to performing bzhi (zero high bits) on the input, with the

43264

// same index of the load.

43265

static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,

43266

const X86Subtarget &Subtarget) {

43267

MVT VT = Node->getSimpleValueType(0);

43268

SDLoc dl(Node);

43269

43270

// Check if subtarget has BZHI instruction for the node's type

43271

if (!hasBZHI(Subtarget, VT))

43272

return SDValue();

43273

43274

// Try matching the pattern for both operands.

43275

for (unsigned i = 0; i < 2; i++) {

43276

SDValue N = Node->getOperand(i);

43277

LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

43278

43279

// continue if the operand is not a load instruction

43280

if (!Ld)

43281

return SDValue();

43282

43283

const Value *MemOp = Ld->getMemOperand()->getValue();

43284

43285

if (!MemOp)

43286

return SDValue();

43287

43288

if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {

43289

if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {

43290

if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

43291

43292

Constant *Init = GV->getInitializer();

43293

Type *Ty = Init->getType();

43294

if (!isa<ConstantDataArray>(Init) ||

43295

!Ty->getArrayElementType()->isIntegerTy() ||

43296

Ty->getArrayElementType()->getScalarSizeInBits() !=

43297

VT.getSizeInBits() ||

43298

Ty->getArrayNumElements() >

43299

Ty->getArrayElementType()->getScalarSizeInBits())

43300

continue;

43301

43302

// Check if the array's constant elements are suitable to our case.

43303

uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();

43304

bool ConstantsMatch = true;

43305

for (uint64_t j = 0; j < ArrayElementCount; j++) {

43306

ConstantInt *Elem =

43307

dyn_cast<ConstantInt>(Init->getAggregateElement(j));

43308

if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {

43309

ConstantsMatch = false;

43310

break;

43311

}

43312

}

43313

if (!ConstantsMatch)

43314

continue;

43315

43316

// Do the transformation (For 32-bit type):

43317

// -> (and (load arr[idx]), inp)

43318

// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))

43319

// that will be replaced with one bzhi instruction.

43320

SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);

43321

SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

43322

43323

// Get the Node which indexes into the array.

43324

SDValue Index = getIndexFromUnindexedLoad(Ld);

43325

if (!Index)

43326

return SDValue();

43327

Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);

43328

43329

SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);

43330

Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

43331

43332

SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);

43333

SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

43334

43335

return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);

43336

}

43337

}

43338

}

43339

}

43340

return SDValue();

43341

}

43342

43343

// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)

43344

// Where C is a mask containing the same number of bits as the setcc and

43345

// where the setcc will freely 0 upper bits of k-register. We can replace the

43346

// undef in the concat with 0s and remove the AND. This mainly helps with

43347

// v2i1/v4i1 setcc being casted to scalar.

43348

static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,

43349

const X86Subtarget &Subtarget) {

43350

assert(N->getOpcode() == ISD::AND && "Unexpected opcode!")((N->getOpcode() == ISD::AND && "Unexpected opcode!"
) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == ISD::AND && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43350, __PRETTY_FUNCTION__));

43351

43352

EVT VT = N->getValueType(0);

43353

43354

// Make sure this is an AND with constant. We will check the value of the

43355

// constant later.

43356

if (!isa<ConstantSDNode>(N->getOperand(1)))

43357

return SDValue();

43358

43359

// This is implied by the ConstantSDNode.

43360

assert(!VT.isVector() && "Expected scalar VT!")((!VT.isVector() && "Expected scalar VT!") ? static_cast
<void> (0) : __assert_fail ("!VT.isVector() && \"Expected scalar VT!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43360, __PRETTY_FUNCTION__));

43361

43362

if (N->getOperand(0).getOpcode() != ISD::BITCAST ||

43363

!N->getOperand(0).hasOneUse() ||

43364

!N->getOperand(0).getOperand(0).hasOneUse())

43365

return SDValue();

43366

43367

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

43368

SDValue Src = N->getOperand(0).getOperand(0);

43369

EVT SrcVT = Src.getValueType();

43370

if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||

43371

!TLI.isTypeLegal(SrcVT))

43372

return SDValue();

43373

43374

if (Src.getOpcode() != ISD::CONCAT_VECTORS)

43375

return SDValue();

43376

43377

// We only care about the first subvector of the concat, we expect the

43378

// other subvectors to be ignored due to the AND if we make the change.

43379

SDValue SubVec = Src.getOperand(0);

43380

EVT SubVecVT = SubVec.getValueType();

43381

43382

// First subvector should be a setcc with a legal result type. The RHS of the

43383

// AND should be a mask with this many bits.

43384

if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||

43385

!N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))

43386

return SDValue();

43387

43388

EVT SetccVT = SubVec.getOperand(0).getValueType();

43389

if (!TLI.isTypeLegal(SetccVT) ||

43390

!(Subtarget.hasVLX() || SetccVT.is512BitVector()))

43391

return SDValue();

43392

43393

if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))

43394

return SDValue();

43395

43396

// We passed all the checks. Rebuild the concat_vectors with zeroes

43397

// and cast it back to VT.

43398

SDLoc dl(N);

43399

SmallVector<SDValue, 4> Ops(Src.getNumOperands(),

43400

DAG.getConstant(0, dl, SubVecVT));

43401

Ops[0] = SubVec;

43402

SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,

43403

Ops);

43404

return DAG.getBitcast(VT, Concat);

43405

}

43406

43407

// Target-specific DAG combine entry point for an AND node. Each fold below is
// tried in order and the first one that produces a value wins; an empty
// SDValue is returned when nothing applies.
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // With SSE1 but no SSE2, perform a v4i32 AND as an FAND on v4f32 so that it
  // does not get scalarized.
  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (IsSSE1Only && VT == MVT::v4i32) {
    SDValue FpLHS = DAG.getBitcast(MVT::v4f32, Op0);
    SDValue FpRHS = DAG.getBitcast(MVT::v4f32, Op1);
    SDValue FpAnd =
        DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, FpLHS, FpRHS);
    return DAG.getBitcast(MVT::v4i32, FpAnd);
  }

  // For a non-constant i64 AND on a 64-bit target, narrow to a 32-bit AND plus
  // zero extension when the upper 32 bits of either operand are known zero.
  if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(Op1)) {
    const APInt UpperHalf = APInt::getHighBitsSet(64, 32);
    const bool UpperBitsKnownZero = DAG.MaskedValueIsZero(Op1, UpperHalf) ||
                                    DAG.MaskedValueIsZero(Op0, UpperHalf);
    if (UpperBitsKnownZero) {
      SDLoc DL(N);
      SDValue Lo0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
      SDValue Lo1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
      SDValue NarrowAnd = DAG.getNode(ISD::AND, DL, MVT::i32, Lo0, Lo1);
      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NarrowAnd);
    }
  }

  // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
    SmallVector<SDValue, 2> ReductionSrcs;
    SmallVector<APInt, 2> ReductionPartials;
    const bool IsReduction = matchScalarReduction(
        SDValue(N, 0), ISD::AND, ReductionSrcs, &ReductionPartials);
    if (IsReduction && ReductionSrcs.size() == 1) {
      SDLoc DL(N);
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue Vec = ReductionSrcs[0];
      unsigned NumElts = Vec.getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);

      // Prefer the dedicated vXi1 bitcast combine; fall back to a plain
      // bitcast only when the source vector type is legal.
      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, Vec, DL, Subtarget);
      if (!Mask && TLI.isTypeLegal(Vec.getValueType()))
        Mask = DAG.getBitcast(MaskVT, Vec);

      if (Mask) {
        assert(ReductionPartials[0].getBitWidth() == NumElts &&
               "Unexpected partial reduction mask");
        // All-of: every bit taking part in the reduction must be set.
        SDValue PartialBits =
            DAG.getConstant(ReductionPartials[0], DL, MaskVT);
        Mask = DAG.getNode(ISD::AND, DL, MaskVT, Mask, PartialBits);
        return DAG.getSetCC(DL, MVT::i1, Mask, PartialBits, ISD::SETEQ);
      }
    }
  }

  if (SDValue MaskSetcc = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
    return MaskSetcc;

  if (SDValue MovMsk = combineBitOpWithMOVMSK(N, DAG))
    return MovMsk;

  // Everything below is skipped while the combiner is still running before
  // operation legalization.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue CmpEq = combineCompareEqual(N, DAG, DCI, Subtarget))
    return CmpEq;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue AndNot = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return AndNot;

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;

  if (SDValue Bzhi = combineAndLoadToBZHI(N, DAG, Subtarget))
    return Bzhi;

  const bool HasByteSizedElts = (VT.getScalarSizeInBits() % 8) == 0;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && HasByteSizedElts) {
    SDValue Root(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Root, DAG, Subtarget))
      return Res;
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle. This is
  // the last fold, so every mismatch from here on simply bails out.
  if (!HasByteSizedElts)
    return SDValue();
  if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  if (!isa<ConstantSDNode>(Op0.getOperand(1)))
    return SDValue();

  SDValue BitMask = Op1;
  SDValue SrcVec = Op0.getOperand(0);
  EVT SrcVecVT = SrcVec.getValueType();

  if (VT != SrcVecVT.getScalarType())
    return SDValue();
  if (!Op0->isOnlyUserOf(SrcVec.getNode()))
    return SDValue();

  // Check that the constant bitmask masks whole bytes.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (!getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits))
    return SDValue();
  auto IsWholeByte = [](const APInt &M) {
    return M.isNullValue() || M.isAllOnesValue();
  };
  if (!llvm::all_of(EltBits, IsWholeByte))
    return SDValue();

  unsigned NumElts = SrcVecVT.getVectorNumElements();
  unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
  unsigned Idx = Op0.getConstantOperandVal(1);

  // Create a root shuffle mask from the byte mask and the extracted index:
  // only the bytes of the extracted element are constrained, everything else
  // stays undef.
  SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
  for (unsigned Byte = 0; Byte != Scale; ++Byte) {
    if (UndefElts[Byte])
      continue;
    int VecIdx = Scale * Idx + Byte;
    if (EltBits[Byte].isNullValue())
      ShuffleMask[VecIdx] = SM_SentinelZero;
    else
      ShuffleMask[VecIdx] = VecIdx;
  }

  if (SDValue Shuffle = combineX86ShufflesRecursively(
          {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
          X86::MaxShuffleCombineDepth,
          /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                       Op0.getOperand(1));

  return SDValue();
}

43531

43532

// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))

43533

static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,

43534

const X86Subtarget &Subtarget) {

43535

assert(N->getOpcode() == ISD::OR && "Unexpected Opcode")((N->getOpcode() == ISD::OR && "Unexpected Opcode"
) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == ISD::OR && \"Unexpected Opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43535, __PRETTY_FUNCTION__));

43536

43537

MVT VT = N->getSimpleValueType(0);

43538

if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)

43539

return SDValue();

43540

43541

SDValue N0 = peekThroughBitcasts(N->getOperand(0));

43542

SDValue N1 = peekThroughBitcasts(N->getOperand(1));

43543

if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)

43544

return SDValue();

43545

43546

// On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use

43547

// VPTERNLOG. Otherwise only do this if either mask has multiple uses already.

43548

bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||

43549

Subtarget.hasVLX();

43550

if (!(Subtarget.hasXOP() || UseVPTERNLOG ||

43551

!N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))

43552

return SDValue();

43553

43554

// Attempt to extract constant byte masks.

43555

APInt UndefElts0, UndefElts1;

43556

SmallVector<APInt, 32> EltBits0, EltBits1;

43557

if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,

43558

false, false))

43559

return SDValue();

43560

if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,

43561

false, false))

43562

return SDValue();

43563

43564

for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {

43565

// TODO - add UNDEF elts support.

43566

if (UndefElts0[i] || UndefElts1[i])

43567

return SDValue();

43568

if (EltBits0[i] != ~EltBits1[i])

43569

return SDValue();

43570

}

43571

43572

SDLoc DL(N);

43573

43574

if (UseVPTERNLOG) {

43575

// Emit a VPTERNLOG node directly.

43576

SDValue A = DAG.getBitcast(VT, N0.getOperand(1));

43577

SDValue B = DAG.getBitcast(VT, N0.getOperand(0));

43578

SDValue C = DAG.getBitcast(VT, N1.getOperand(0));

43579

SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);

43580

return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);

43581

}

43582

43583

SDValue X = N->getOperand(0);

43584

SDValue Y =

43585

DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),

43586

DAG.getBitcast(VT, N1.getOperand(0)));

43587

return DAG.getNode(ISD::OR, DL, VT, X, Y);

43588

}

43589

43590

// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.

43591

static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {

43592

if (N->getOpcode() != ISD::OR)

43593

return false;

43594

43595

SDValue N0 = N->getOperand(0);

43596

SDValue N1 = N->getOperand(1);

43597

43598

// Canonicalize AND to LHS.

43599

if (N1.getOpcode() == ISD::AND)

43600

std::swap(N0, N1);

43601

43602

// Attempt to match OR(AND(M,Y),ANDNP(M,X)).

43603

if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)

43604

return false;

43605

43606

Mask = N1.getOperand(0);

43607

X = N1.getOperand(1);

43608

43609

// Check to see if the mask appeared in both the AND and ANDNP.

43610

if (N0.getOperand(0) == Mask)

43611

Y = N0.getOperand(1);

43612

else if (N0.getOperand(1) == Mask)

43613

Y = N0.getOperand(0);

43614

else

43615

return false;

43616

43617

// TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for

43618

// ANDNP combine allows other combines to happen that prevent matching.

43619

return true;

43620

}

43621

43622

// Try to fold:

43623

// (or (and (m, y), (pandn m, x)))

43624

// into:

43625

// (vselect m, x, y)

43626

// As a special case, try to fold:

43627

// (or (and (m, (sub 0, x)), (pandn m, x)))

43628

// into:

43629

// (sub (xor X, M), M)

43630

static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,

43631

const X86Subtarget &Subtarget) {

43632

assert(N->getOpcode() == ISD::OR && "Unexpected Opcode")((N->getOpcode() == ISD::OR && "Unexpected Opcode"
) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == ISD::OR && \"Unexpected Opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43632, __PRETTY_FUNCTION__));

43633

43634

EVT VT = N->getValueType(0);

43635

if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||

43636

(VT.is256BitVector() && Subtarget.hasInt256())))

43637

return SDValue();

43638

43639

SDValue X, Y, Mask;

43640

if (!matchLogicBlend(N, X, Y, Mask))

43641

return SDValue();

43642

43643

// Validate that X, Y, and Mask are bitcasts, and see through them.

43644

Mask = peekThroughBitcasts(Mask);

43645

X = peekThroughBitcasts(X);

43646

Y = peekThroughBitcasts(Y);

43647

43648

EVT MaskVT = Mask.getValueType();

43649

unsigned EltBits = MaskVT.getScalarSizeInBits();

43650

43651

// TODO: Attempt to handle floating point cases as well?

43652

if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)

43653

return SDValue();

43654

43655

SDLoc DL(N);

43656

43657

// Attempt to combine to conditional negate: (sub (xor X, M), M)

43658

if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,

43659

DAG, Subtarget))

43660

return Res;

43661

43662

// PBLENDVB is only available on SSE 4.1.

43663

if (!Subtarget.hasSSE41())

43664

return SDValue();

43665

43666

// If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.

43667

if (Subtarget.hasVLX())

43668

return SDValue();

43669

43670

MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;

43671

43672

X = DAG.getBitcast(BlendVT, X);

43673

Y = DAG.getBitcast(BlendVT, Y);

43674

Mask = DAG.getBitcast(BlendVT, Mask);

43675

Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);

43676

return DAG.getBitcast(VT, Mask);

43677

}

43678

43679

// Helper function for combineOrCmpEqZeroToCtlzSrl

43680

// Transforms:

43681

// seteq(cmp x, 0)

43682

// into:

43683

// srl(ctlz x), log2(bitsize(x))

43684

// Input pattern is checked by caller.

43685

static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,

43686

SelectionDAG &DAG) {

43687

SDValue Cmp = Op.getOperand(1);

43688

EVT VT = Cmp.getOperand(0).getValueType();

43689

unsigned Log2b = Log2_32(VT.getSizeInBits());

43690

SDLoc dl(Op);

43691

SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));

43692

// The result of the shift is true or false, and on X86, the 32-bit

43693

// encoding of shr and lzcnt is more desirable.

43694

SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);

43695

SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,

43696

DAG.getConstant(Log2b, dl, MVT::i8));

43697

return DAG.getZExtOrTrunc(Scc, dl, ExtTy);

43698

}

43699

43700

// Try to transform:

43701

// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))

43702

// into:

43703

// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))

43704

// Will also attempt to match more generic cases, eg:

43705

// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))

43706

// Only applies if the target supports the FastLZCNT feature.

43707

static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,

43708

TargetLowering::DAGCombinerInfo &DCI,

43709

const X86Subtarget &Subtarget) {

43710

if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())

43711

return SDValue();

43712

43713

auto isORCandidate = [](SDValue N) {

43714

return (N->getOpcode() == ISD::OR && N->hasOneUse());

43715

};

43716

43717

// Check the zero extend is extending to 32-bit or more. The code generated by

43718

// srl(ctlz) for 16-bit or less variants of the pattern would require extra

43719

// instructions to clear the upper bits.

43720

if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||

43721

!isORCandidate(N->getOperand(0)))

43722

return SDValue();

43723

43724

// Check the node matches: setcc(eq, cmp 0)

43725

auto isSetCCCandidate = [](SDValue N) {

43726

return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&

43727

X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&

43728

N->getOperand(1).getOpcode() == X86ISD::CMP &&

43729

isNullConstant(N->getOperand(1).getOperand(1)) &&

43730

N->getOperand(1).getValueType().bitsGE(MVT::i32);

43731

};

43732

43733

SDNode *OR = N->getOperand(0).getNode();

43734

SDValue LHS = OR->getOperand(0);

43735

SDValue RHS = OR->getOperand(1);

43736

43737

// Save nodes matching or(or, setcc(eq, cmp 0)).

43738

SmallVector<SDNode *, 2> ORNodes;

43739

while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||

43740

(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {

43741

ORNodes.push_back(OR);

43742

OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();

43743

LHS = OR->getOperand(0);

43744

RHS = OR->getOperand(1);

43745

}

43746

43747

// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).

43748

if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||

43749

!isORCandidate(SDValue(OR, 0)))

43750

return SDValue();

43751

43752

// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it

43753

// to

43754

// or(srl(ctlz),srl(ctlz)).

43755

// The dag combiner can then fold it into:

43756

// srl(or(ctlz, ctlz)).

43757

EVT VT = OR->getValueType(0);

43758

SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);

43759

SDValue Ret, NewRHS;

43760

if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))

43761

Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

43762

43763

if (!Ret)

43764

return SDValue();

43765

43766

// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.

43767

while (ORNodes.size() > 0) {

43768

OR = ORNodes.pop_back_val();

43769

LHS = OR->getOperand(0);

43770

RHS = OR->getOperand(1);

43771

// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).

43772

if (RHS->getOpcode() == ISD::OR)

43773

std::swap(LHS, RHS);

43774

NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);

43775

if (!NewRHS)

43776

return SDValue();

43777

Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);

43778

}

43779

43780

if (Ret)

43781

Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

43782

43783

return Ret;

43784

}

43785

43786

// Target-specific DAG combine entry point for an OR node. Each fold below is
// tried in order and the first one that produces a value wins; an empty
// SDValue is returned when nothing applies.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // With SSE1 but no SSE2, perform a v4i32 OR as an FOR on v4f32 so that it
  // does not get scalarized.
  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (IsSSE1Only && VT == MVT::v4i32) {
    SDValue FpLHS = DAG.getBitcast(MVT::v4f32, N0);
    SDValue FpRHS = DAG.getBitcast(MVT::v4f32, N1);
    SDValue FpOr =
        DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, FpLHS, FpRHS);
    return DAG.getBitcast(MVT::v4i32, FpOr);
  }

  // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
    SmallVector<SDValue, 2> ReductionSrcs;
    SmallVector<APInt, 2> ReductionPartials;
    const bool IsReduction = matchScalarReduction(
        SDValue(N, 0), ISD::OR, ReductionSrcs, &ReductionPartials);
    if (IsReduction && ReductionSrcs.size() == 1) {
      SDLoc DL(N);
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue Vec = ReductionSrcs[0];
      unsigned NumElts = Vec.getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);

      // Prefer the dedicated vXi1 bitcast combine; fall back to a plain
      // bitcast only when the source vector type is legal.
      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, Vec, DL, Subtarget);
      if (!Mask && TLI.isTypeLegal(Vec.getValueType()))
        Mask = DAG.getBitcast(MaskVT, Vec);

      if (Mask) {
        assert(ReductionPartials[0].getBitWidth() == NumElts &&
               "Unexpected partial reduction mask");
        // Any-of: at least one bit taking part in the reduction must be set.
        SDValue ZeroBits = DAG.getConstant(0, DL, MaskVT);
        SDValue PartialBits =
            DAG.getConstant(ReductionPartials[0], DL, MaskVT);
        Mask = DAG.getNode(ISD::AND, DL, MaskVT, Mask, PartialBits);
        return DAG.getSetCC(DL, MVT::i1, Mask, ZeroBits, ISD::SETNE);
      }
    }
  }

  if (SDValue MovMsk = combineBitOpWithMOVMSK(N, DAG))
    return MovMsk;

  // Everything below is skipped while the combiner is still running before
  // operation legalization.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue CmpEq = combineCompareEqual(N, DAG, DCI, Subtarget))
    return CmpEq;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue BitSel = canonicalizeBitSelect(N, DAG, Subtarget))
    return BitSel;

  if (SDValue Blend = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return Blend;

  // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
  // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
  // iff the upper elements of the non-shifted arg are zero.
  // KUNPCK require 16+ bool vector elements.
  if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned HalfElts = NumElts / 2;
    APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);

    // Low supplies the lower half of the result; Shifted must be a KSHIFTL by
    // exactly half the element count and supplies the upper half.
    auto TryKUnpck = [&](SDValue Low, SDValue Shifted) -> SDValue {
      if (NumElts < 16)
        return SDValue();
      if (Shifted.getOpcode() != X86ISD::KSHIFTL)
        return SDValue();
      if (Shifted.getConstantOperandAPInt(1) != HalfElts)
        return SDValue();
      if (!DAG.MaskedValueIsZero(Low, APInt(1, 1), UpperElts))
        return SDValue();
      SDLoc DL(N);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          extractSubVector(Low, 0, DAG, DL, HalfElts),
          extractSubVector(Shifted.getOperand(0), 0, DAG, DL, HalfElts));
    };

    // Try the shift on the RHS first, then on the LHS.
    if (SDValue Unpck = TryKUnpck(N0, N1))
      return Unpck;
    if (SDValue Unpck = TryKUnpck(N1, N0))
      return Unpck;
  }

  // Attempt to recursively combine an OR of shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Root(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Root, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}

43881

43882

/// Try to turn tests against the signbit in the form of:

43883

/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)

43884

/// into:

43885

/// SETGT(X, -1)

43886

static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {

43887

// This is only worth doing if the output type is i8 or i1.

43888

EVT ResultType = N->getValueType(0);

43889

if (ResultType != MVT::i8 && ResultType != MVT::i1)

43890

return SDValue();

43891

43892

SDValue N0 = N->getOperand(0);

43893

SDValue N1 = N->getOperand(1);

43894

43895

// We should be performing an xor against a truncated shift.

43896

if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())

43897

return SDValue();

43898

43899

// Make sure we are performing an xor against one.

43900

if (!isOneConstant(N1))

43901

return SDValue();

43902

43903

// SetCC on x86 zero extends so only act on this if it's a logical shift.

43904

SDValue Shift = N0.getOperand(0);

43905

if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())

43906

return SDValue();

43907

43908

// Make sure we are truncating from one of i16, i32 or i64.

43909

EVT ShiftTy = Shift.getValueType();

43910

if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)

43911

return SDValue();

43912

43913

// Make sure the shift amount extracts the sign bit.

43914

if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||

43915

Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))

43916

return SDValue();

43917

43918

// Create a greater-than comparison against -1.

43919

// N.B. Using SETGE against 0 works but we want a canonical looking

43920

// comparison, using SETGT matches up with what TranslateX86CC.

43921

SDLoc DL(N);

43922

SDValue ShiftOp = Shift.getOperand(0);

43923

EVT ShiftOpTy = ShiftOp.getValueType();

43924

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

43925

EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),

43926

*DAG.getContext(), ResultType);

43927

SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,

43928

DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);

43929

if (SetCCResultType != ResultType)

43930

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);

43931

return Cond;

43932

}

43933

43934

/// Turn vector tests of the signbit in the form of:

43935

/// xor (sra X, elt_size(X)-1), -1

43936

/// into:

43937

/// pcmpgt X, -1

43938

///

43939

/// This should be called before type legalization because the pattern may not

43940

/// persist after that.

43941

static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,

43942

const X86Subtarget &Subtarget) {

43943

EVT VT = N->getValueType(0);

43944

if (!VT.isSimple())

43945

return SDValue();

43946

43947

switch (VT.getSimpleVT().SimpleTy) {

43948

default: return SDValue();

43949

case MVT::v16i8:

43950

case MVT::v8i16:

43951

case MVT::v4i32:

43952

case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;

43953

case MVT::v32i8:

43954

case MVT::v16i16:

43955

case MVT::v8i32:

43956

case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;

43957

}

43958

43959

// There must be a shift right algebraic before the xor, and the xor must be a

43960

// 'not' operation.

43961

SDValue Shift = N->getOperand(0);

43962

SDValue Ones = N->getOperand(1);

43963

if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||

43964

!ISD::isBuildVectorAllOnes(Ones.getNode()))

43965

return SDValue();

43966

43967

// The shift should be smearing the sign bit across each vector element.

43968

auto *ShiftAmt =

43969

isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);

43970

if (!ShiftAmt ||

43971

ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))

43972

return SDValue();

43973

43974

// Create a greater-than comparison against -1. We don't use the more obvious

43975

// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.

43976

return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);

43977

}

43978

43979

/// Detect patterns of truncation with unsigned saturation:

43980

///

43981

/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).

43982

/// Return the source value x to be truncated or SDValue() if the pattern was

43983

/// not matched.

43984

///

43985

/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),

43986

/// where C1 >= 0 and C2 is unsigned max of destination type.

43987

///

43988

/// (truncate (smax (smin (x, C2), C1)) to dest_type)

43989

/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.

43990

///

43991

/// These two patterns are equivalent to:

43992

/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)

43993

/// So return the smax(x, C1) value to be truncated or SDValue() if the

43994

/// pattern was not matched.

43995

static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const SDLoc &DL) {
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  const unsigned DstBits = VT.getScalarSizeInBits();

  // If Node is `Opc(x, splat-constant)`, store the splat value in Bound and
  // hand back x; otherwise hand back an empty SDValue.
  auto PeelClamp = [](SDValue Node, unsigned Opc, APInt &Bound) -> SDValue {
    if (Node.getOpcode() != Opc)
      return SDValue();
    if (!ISD::isConstantSplatVector(Node.getOperand(1).getNode(), Bound))
      return SDValue();
    return Node.getOperand(0);
  };

  // LowerBound plays the role of C1, UpperBound the role of C2.
  APInt LowerBound, UpperBound;

  // umin(x, C2): C2 must be the unsigned maximum of the destination element
  // type (UINT8_MAX / UINT16_MAX / UINT32_MAX).
  if (SDValue Src = PeelClamp(In, ISD::UMIN, UpperBound)) {
    if (UpperBound.isMask(DstBits))
      return Src;
  }

  // smin(smax(x, C1), C2): the inner smax is the value to be truncated.
  if (SDValue Inner = PeelClamp(In, ISD::SMIN, UpperBound)) {
    if (PeelClamp(Inner, ISD::SMAX, LowerBound) &&
        LowerBound.isNonNegative() && UpperBound.isMask(DstBits))
      return Inner;
  }

  // smax(smin(x, C2), C1): rebuild smax(x, C1) as the value to be truncated.
  if (SDValue Inner = PeelClamp(In, ISD::SMAX, LowerBound)) {
    if (SDValue Src = PeelClamp(Inner, ISD::SMIN, UpperBound)) {
      if (LowerBound.isNonNegative() && UpperBound.isMask(DstBits) &&
          UpperBound.uge(LowerBound))
        return DAG.getNode(ISD::SMAX, DL, InVT, Src, In.getOperand(1));
    }
  }

  return SDValue();
}

44032

44033

/// Detect patterns of truncation with signed saturation:

44034

/// (truncate (smin ((smax (x, signed_min_of_dest_type)),

44035

/// signed_max_of_dest_type)) to dest_type)

44036

/// or:

44037

/// (truncate (smax ((smin (x, signed_max_of_dest_type)),

44038

/// signed_min_of_dest_type)) to dest_type).

44039

/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].

44040

/// Return the source value to be truncated or SDValue() if the pattern was not

44041

/// matched.

44042

static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {

44043

unsigned NumDstBits = VT.getScalarSizeInBits();

44044

unsigned NumSrcBits = In.getScalarValueSizeInBits();

44045

assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation")((NumSrcBits > NumDstBits && "Unexpected types for truncate operation"
) ? static_cast<void> (0) : __assert_fail ("NumSrcBits > NumDstBits && \"Unexpected types for truncate operation\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44045, __PRETTY_FUNCTION__));

44046

44047

auto MatchMinMax = [](SDValue V, unsigned Opcode,

44048

const APInt &Limit) -> SDValue {

44049

APInt C;

44050

if (V.getOpcode() == Opcode &&

44051

ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)

44052

return V.getOperand(0);

44053

return SDValue();

44054

};

44055

44056

APInt SignedMax, SignedMin;

44057

if (MatchPackUS) {

44058

SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);

44059

SignedMin = APInt(NumSrcBits, 0);

44060

} else {

44061

SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);

44062

SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);

44063

}

44064

44065

if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))

44066

if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))

44067

return SMax;

44068

44069

if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))

44070

if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))

44071

return SMin;

44072

44073

return SDValue();

44074

}

44075

44076

/// Try to lower a truncate whose input \p In is a saturating clamp into PACK
/// or AVX512 saturating-truncate nodes. Returns SDValue() if no clamp pattern
/// is matched or none of the lowerings below applies.
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!VT.isVector() || !Subtarget.hasSSE2())
    return SDValue();

  EVT DstEltVT = VT.getVectorElementType();
  EVT InVT = In.getValueType();
  EVT SrcEltVT = InVT.getVectorElementType();

  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
  // and concatenate at the same time. Then we can use a final vpmovuswb to
  // clip to 0-255.
  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      InVT == MVT::v16i32 && VT == MVT::v16i8) {
    if (SDValue Clamped = detectSSatPattern(In, VT, true)) {
      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, Clamped,
                                           DL, DAG, Subtarget);
      assert(Mid && "Failed to pack!");
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
    }
  }

  // vXi32 truncate instructions are available with AVX512F.
  // vXi16 truncate instructions are only available with AVX512BW.
  // For 256-bit or smaller vectors, we require VLX.
  // FIXME: We could widen truncates to 512 to remove the VLX restriction.
  // If the result type is 256-bits or larger and we have disabled 512-bit
  // registers, we should go ahead and use the pack instructions if possible.
  bool PreferAVX512 = false;
  if ((Subtarget.hasAVX512() && SrcEltVT == MVT::i32) ||
      (Subtarget.hasBWI() && SrcEltVT == MVT::i16)) {
    const bool WideEnough = InVT.getSizeInBits() > 128;
    const bool Encodable = Subtarget.hasVLX() || InVT.getSizeInBits() > 256;
    const bool PackIsBetter =
        !Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256;
    PreferAVX512 = WideEnough && Encodable && !PackIsBetter;
  }

  const bool PackableElts =
      (DstEltVT == MVT::i8 || DstEltVT == MVT::i16) &&
      (SrcEltVT == MVT::i16 || SrcEltVT == MVT::i32);

  if (!PreferAVX512 && PackableElts && VT.getSizeInBits() >= 64 &&
      isPowerOf2_32(VT.getVectorNumElements())) {
    if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      // Only do this when the result is at least 64 bits or we'll be leaving
      // dangling PACKSSDW nodes.
      if (DstEltVT == MVT::i8 && SrcEltVT == MVT::i32) {
        EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                     VT.getVectorNumElements());
        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                             DAG, Subtarget);
        assert(Mid && "Failed to pack!");
        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
                                           Subtarget);
        assert(V && "Failed to pack!");
        return V;
      }
      if (DstEltVT == MVT::i8 || Subtarget.hasSSE41())
        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                      Subtarget);
    }
    if (SDValue SSatVal = detectSSatPattern(In, VT))
      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
                                    Subtarget);
  }

  // Remaining option: the AVX512 saturating truncate nodes.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const bool CanUseVTrunc =
      TLI.isTypeLegal(InVT) && InVT.isVector() && DstEltVT != MVT::i1 &&
      Subtarget.hasAVX512() &&
      (SrcEltVT != MVT::i16 || Subtarget.hasBWI());
  if (!CanUseVTrunc)
    return SDValue();

  unsigned TruncOpc = 0;
  SDValue SatVal;
  if (SDValue SSatVal = detectSSatPattern(In, VT)) {
    SatVal = SSatVal;
    TruncOpc = X86ISD::VTRUNCS;
  } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
    SatVal = USatVal;
    TruncOpc = X86ISD::VTRUNCUS;
  }
  if (!SatVal)
    return SDValue();

  unsigned ResElts = VT.getVectorNumElements();
  // If the input type is less than 512 bits and we don't have VLX, we need
  // to widen to 512 bits.
  if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
    unsigned NumConcats = 512 / InVT.getSizeInBits();
    ResElts *= NumConcats;
    SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
    ConcatOps[0] = SatVal;
    EVT WideInVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT,
                                    NumConcats * InVT.getVectorNumElements());
    SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideInVT, ConcatOps);
  }

  // Widen the result if it's narrower than 128 bits.
  if (ResElts * DstEltVT.getSizeInBits() < 128)
    ResElts = 128 / DstEltVT.getSizeInBits();

  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, ResElts);
  SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                     DAG.getIntPtrConstant(0, DL));
}

44177

44178

/// This function detects the AVG pattern between vectors of unsigned i8/i16,

44179

/// which is c = (a + b + 1) / 2, and replace this operation with the efficient

44180

/// X86ISD::AVG instruction.

44181

static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,

44182

const X86Subtarget &Subtarget,

44183

const SDLoc &DL) {

44184

if (!VT.isVector())

44185

return SDValue();

44186

EVT InVT = In.getValueType();

44187

unsigned NumElems = VT.getVectorNumElements();

44188

44189

EVT ScalarVT = VT.getVectorElementType();

44190

if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))

44191

return SDValue();

44192

44193

// InScalarVT is the intermediate type in AVG pattern and it should be greater

44194

// than the original input type (i8/i16).

44195

EVT InScalarVT = InVT.getVectorElementType();

44196

if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())

44197

return SDValue();

44198

44199

if (!Subtarget.hasSSE2())

44200

return SDValue();

44201

44202

// Detect the following pattern:

44203

//

44204

// %1 = zext <N x i8> %a to <N x i32>

44205

// %2 = zext <N x i8> %b to <N x i32>

44206

// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>

44207

// %4 = add nuw nsw <N x i32> %3, %2

44208

// %5 = lshr <N x i32> %N, <i32 1 x N>

44209

// %6 = trunc <N x i32> %5 to <N x i8>

44210

//

44211

// In AVX512, the last instruction can also be a trunc store.

44212

if (In.getOpcode() != ISD::SRL)

44213

return SDValue();

44214

44215

// A lambda checking the given SDValue is a constant vector and each element

44216

// is in the range [Min, Max].

44217

auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {

44218

return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {

44219

return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));

44220

});

44221

};

44222

44223

// Check if each element of the vector is right-shifted by one.

44224

auto LHS = In.getOperand(0);

44225

auto RHS = In.getOperand(1);

44226

if (!IsConstVectorInRange(RHS, 1, 1))

44227

return SDValue();

44228

if (LHS.getOpcode() != ISD::ADD)

44229

return SDValue();

44230

44231

// Detect a pattern of a + b + 1 where the order doesn't matter.

44232

SDValue Operands[3];

44233

Operands[0] = LHS.getOperand(0);

44234

Operands[1] = LHS.getOperand(1);

44235

44236

auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

44237

ArrayRef<SDValue> Ops) {

44238

return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);

44239

};

44240

44241

auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {

44242

// Pad to a power-of-2 vector, split+apply and extract the original vector.

44243

unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);

44244

EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);

44245

if (NumElemsPow2 != NumElems) {

44246

SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));

44247

SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));

44248

for (unsigned i = 0; i != NumElems; ++i) {

44249

SDValue Idx = DAG.getIntPtrConstant(i, DL);

44250

Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);

44251

Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);

44252

}

44253

Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);

44254

Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);

44255

}

44256

SDValue Res =

44257

SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);

44258

if (NumElemsPow2 == NumElems)

44259

return Res;

44260

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,

44261

DAG.getIntPtrConstant(0, DL));

44262

};

44263

44264

// Take care of the case when one of the operands is a constant vector whose

44265

// element is in the range [1, 256].

44266

if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&

44267

Operands[0].getOpcode() == ISD::ZERO_EXTEND &&

44268

Operands[0].getOperand(0).getValueType() == VT) {

44269

// The pattern is detected. Subtract one from the constant vector, then

44270

// demote it and emit X86ISD::AVG instruction.

44271

SDValue VecOnes = DAG.getConstant(1, DL, InVT);

44272

Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);

44273

Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);

44274

return AVGSplitter(Operands[0].getOperand(0), Operands[1]);

44275

}

44276

44277

// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).

44278

// Match the or case only if its 'add-like' - can be replaced by an add.

44279

auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {

44280

if (ISD::ADD == V.getOpcode()) {

44281

Op0 = V.getOperand(0);

44282

Op1 = V.getOperand(1);

44283

return true;

44284

}

44285

if (ISD::ZERO_EXTEND != V.getOpcode())

44286

return false;

44287

V = V.getOperand(0);

44288

if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||

44289

!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))

44290

return false;

44291

Op0 = V.getOperand(0);

44292

Op1 = V.getOperand(1);

44293

return true;

44294

};

44295

44296

SDValue Op0, Op1;

44297

if (FindAddLike(Operands[0], Op0, Op1))

44298

std::swap(Operands[0], Operands[1]);

44299

else if (!FindAddLike(Operands[1], Op0, Op1))

44300

return SDValue();

44301

Operands[2] = Op0;

44302

Operands[1] = Op1;

44303

44304

// Now we have three operands of two additions. Check that one of them is a

44305

// constant vector with ones, and the other two can be promoted from i8/i16.

44306

for (int i = 0; i < 3; ++i) {

44307

if (!IsConstVectorInRange(Operands[i], 1, 1))

44308

continue;

44309

std::swap(Operands[i], Operands[2]);

44310

44311

// Check if Operands[0] and Operands[1] are results of type promotion.

44312

for (int j = 0; j < 2; ++j)

44313

if (Operands[j].getValueType() != VT) {

44314

if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||

44315

Operands[j].getOperand(0).getValueType() != VT)

44316

return SDValue();

44317

Operands[j] = Operands[j].getOperand(0);

44318

}

44319

44320

// The pattern is detected, emit X86ISD::AVG instruction(s).

44321

return AVGSplitter(Operands[0], Operands[1]);

44322

}

44323

44324

return SDValue();

44325

}

44326

44327

/// DAG combine for load nodes. Three rewrites are attempted, in order:
///  1. Split a 256-bit non-extending vector load into two 128-bit loads when
///     the access is reported as slow, or when it is an aligned non-temporal
///     load on a target without Int256 support.
///  2. Before type legalization and without AVX512, load a vXi1 vector as a
///     scalar integer of the same bit count and bitcast it back.
///  3. Cast ptr32/ptr64 address-space pointers to the default pointer type
///     before loading.
/// Returns the replacement value, or SDValue() if nothing applies.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  // Only read after allowsMemoryAccess() has returned true; initialized so it
  // is never observed in an indeterminate state.
  bool Fast = false;
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
        Ld->getAlignment() >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               *Ld->getMemOperand(), &Fast) &&
        !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    // The second half lives 16 bytes past the base pointer.
    unsigned HalfOffset = 16;
    SDValue Ptr1 = Ld->getBasePtr();
    SDValue Ptr2 =
        DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
                    Ld->getOriginalAlign(),
                    Ld->getMemOperand()->getFlags());
    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
                                Ld->getPointerInfo().getWithOffset(HalfOffset),
                                Ld->getOriginalAlign(),
                                Ld->getMemOperand()->getFlags());
    // Merge the output chains of both half loads.
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1), Load2.getValue(1));

    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  // Bool vector load - attempt to cast to an integer, as we have good
  // (vXiY *ext(vXi1 bitcast(iX))) handling.
  if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
      RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
    unsigned NumElts = RegVT.getVectorNumElements();
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
    if (TLI.isTypeLegal(IntVT)) {
      SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
                                    Ld->getPointerInfo(),
                                    Ld->getOriginalAlign(),
                                    Ld->getMemOperand()->getFlags());
      SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
      return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
    }
  }

  // Cast ptr32 and ptr64 pointers to the default address space before a load.
  unsigned AddrSpace = Ld->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
      // This path is reached for extending loads too, so the rebuilt load must
      // keep the original extension kind and memory type. A plain
      // getLoad(RegVT, ...) would turn e.g. a zextload of i8 into a full-width
      // load of RegVT.
      return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
                            Ld->getPointerInfo(), MemVT,
                            Ld->getOriginalAlign(),
                            Ld->getMemOperand()->getFlags());
    }
  }

  return SDValue();
}

44405

44406

/// If V is a build vector of boolean constants and exactly one of those

44407

/// constants is true, return the operand index of that true element.

44408

/// Otherwise, return -1.

44409

static int getOneTrueElt(SDValue V) {

44410

// This needs to be a build vector of booleans.

44411

// TODO: Checking for the i1 type matches the IR definition for the mask,

44412

// but the mask check could be loosened to i8 or other types. That might

44413

// also require checking more than 'allOnesValue'; eg, the x86 HW

44414

// instructions only require that the MSB is set for each mask element.

44415

// The ISD::MSTORE comments/definition do not specify how the mask operand

44416

// is formatted.

44417

auto *BV = dyn_cast<BuildVectorSDNode>(V);

44418

if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)

44419

return -1;

44420

44421

int TrueIndex = -1;

44422

unsigned NumElts = BV->getValueType(0).getVectorNumElements();

44423

for (unsigned i = 0; i < NumElts; ++i) {

44424

const SDValue &Op = BV->getOperand(i);

44425

if (Op.isUndef())

44426

continue;

44427

auto *ConstNode = dyn_cast<ConstantSDNode>(Op);

44428

if (!ConstNode)

44429

return -1;

44430

if (ConstNode->getAPIntValue().isAllOnesValue()) {

44431

// If we already found a one, this is too many.

44432

if (TrueIndex >= 0)

44433

return -1;

44434

TrueIndex = i;

44435

}

44436

}

44437

return TrueIndex;

44438

}

44439

44440

/// Given a masked memory load/store operation, return true if it has one mask

44441

/// bit set. If it has one mask bit set, then also return the memory address of

44442

/// the scalar element to load/store, the vector index to insert/extract that

44443

/// scalar element, and the alignment for the scalar memory access.

44444

static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,

44445

SelectionDAG &DAG, SDValue &Addr,

44446

SDValue &Index, Align &Alignment,

44447

unsigned &Offset) {

44448

int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());

44449

if (TrueMaskElt < 0)

44450

return false;

44451

44452

// Get the address of the one scalar element that is specified by the mask

44453

// using the appropriate offset from the base pointer.

44454

EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();

44455

Offset = 0;

44456

Addr = MaskedOp->getBasePtr();

44457

if (TrueMaskElt != 0) {

44458

Offset = TrueMaskElt * EltVT.getStoreSize();

44459

Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),

44460

SDLoc(MaskedOp));

44461

}

44462

44463

Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));

44464

Alignment = commonAlignment(MaskedOp->getOriginalAlign(),

44465

EltVT.getStoreSize());

44466

return true;

44467

}

44468

44469

/// If exactly one element of the mask is set for a non-extending masked load,

44470

/// it is a scalar load and vector insert.

44471

/// Note: It is expected that the degenerate cases of an all-zeros or all-ones

44472

/// mask have already been optimized in IR, so we don't bother with those here.

44473

static SDValue

44474

reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,

44475

TargetLowering::DAGCombinerInfo &DCI) {

44476

assert(ML->isUnindexed() && "Unexpected indexed masked load!")((ML->isUnindexed() && "Unexpected indexed masked load!"
) ? static_cast<void> (0) : __assert_fail ("ML->isUnindexed() && \"Unexpected indexed masked load!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44476, __PRETTY_FUNCTION__));

44477

// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.

44478

// However, some target hooks may need to be added to know when the transform

44479

// is profitable. Endianness would also have to be considered.

44480

44481

SDValue Addr, VecIndex;

44482

Align Alignment;

44483

unsigned Offset;

44484

if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))

44485

return SDValue();

44486

44487

// Load the one scalar element that is specified by the mask using the

44488

// appropriate offset from the base pointer.

44489

SDLoc DL(ML);

44490

EVT VT = ML->getValueType(0);

44491

EVT EltVT = VT.getVectorElementType();

44492

SDValue Load =

44493

DAG.getLoad(EltVT, DL, ML->getChain(), Addr,

44494

ML->getPointerInfo().getWithOffset(Offset),

44495

Alignment, ML->getMemOperand()->getFlags());

44496

44497

// Insert the loaded element into the appropriate place in the vector.

44498

SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,

44499

ML->getPassThru(), Load, VecIndex);

44500

return DCI.CombineTo(ML, Insert, Load.getValue(1), true);

44501

}

44502

44503

/// Simplify a masked load whose mask is a build vector of constants: either
/// replace it with a full vector load plus a select, or with a masked load
/// that has an undef pass-through plus a select on the original pass-through.
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
  SDValue Mask = ML->getMask();
  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
  const bool FirstEltLive = !isNullConstant(MaskBV->getOperand(0));
  const bool LastEltLive = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (FirstEltLive && LastEltLive) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, Mask, VecLd, ML->getPassThru());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  // An all-zeros pass-through is likewise left alone.
  SDValue PassThru = ML->getPassThru();
  if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue UndefPassThruML = DAG.getMaskedLoad(
      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), Mask,
      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
      ML->getAddressingMode(), ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, Mask, UndefPassThruML, PassThru);

  return DCI.CombineTo(ML, Blend, UndefPassThruML.getValue(1), true);
}

44551

44552

/// DAG combine for masked load nodes: reduce single-element and
/// constant-mask loads, and simplify the computation feeding a mask that is
/// not an i1 vector.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  auto *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;

    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512()) {
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
    }
  }

  // If the mask value has been legalized to a non-boolean vector, try to
  // simplify ops leading up to it. We only demand the MSB of each lane.
  SDValue Mask = Mld->getMask();
  if (Mask.getScalarValueSizeInBits() == 1)
    return SDValue();

  EVT VT = Mld->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt SignBit(APInt::getSignMask(VT.getScalarSizeInBits()));

  if (TLI.SimplifyDemandedBits(Mask, SignBit, DCI)) {
    // Revisit this node unless the simplification deleted it.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  SDValue NewMask = TLI.SimplifyMultipleUseDemandedBits(Mask, SignBit, DAG);
  if (!NewMask)
    return SDValue();

  return DAG.getMaskedLoad(
      VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
      NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
      Mld->getAddressingMode(), Mld->getExtensionType());
}

44593

44594

/// If exactly one element of the mask is set for a non-truncating masked store,

44595

/// it is a vector extract and scalar store.

44596

/// Note: It is expected that the degenerate cases of an all-zeros or all-ones

44597

/// mask have already been optimized in IR, so we don't bother with those here.

44598

static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,

44599

SelectionDAG &DAG) {

44600

// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.

44601

// However, some target hooks may need to be added to know when the transform

44602

// is profitable. Endianness would also have to be considered.

44603

44604

SDValue Addr, VecIndex;

44605

Align Alignment;

44606

unsigned Offset;

44607

if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))

44608

return SDValue();

44609

44610

// Extract the one scalar element that is actually being stored.

44611

SDLoc DL(MS);

44612

EVT VT = MS->getValue().getValueType();

44613

EVT EltVT = VT.getVectorElementType();

44614

SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,

44615

MS->getValue(), VecIndex);

44616

44617

// Store that element at the appropriate offset from the base pointer.

44618

return DAG.getStore(MS->getChain(), DL, Extract, Addr,

44619

MS->getPointerInfo().getWithOffset(Offset),

44620

Alignment, MS->getMemOperand()->getFlags());

44621

}

44622

44623

/// DAG combine for masked store nodes: reduce single-element stores, simplify
/// the computation feeding a mask that is not an i1 vector, and fold a
/// one-use truncate of the stored value into a truncating masked store when
/// that is legal.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
  auto *Mst = cast<MaskedStoreSDNode>(N);

  // Compressing and truncating stores are left untouched.
  if (Mst->isCompressingStore() || Mst->isTruncatingStore())
    return SDValue();

  SDValue StoredVal = Mst->getValue();
  EVT VT = StoredVal.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
    return ScalarStore;

  // If the mask value has been legalized to a non-boolean vector, try to
  // simplify ops leading up to it. We only demand the MSB of each lane.
  SDValue Mask = Mst->getMask();
  if (Mask.getScalarValueSizeInBits() != 1) {
    APInt SignBit(APInt::getSignMask(VT.getScalarSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, SignBit, DCI)) {
      // Revisit this node unless the simplification deleted it.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
    if (SDValue NewMask =
            TLI.SimplifyMultipleUseDemandedBits(Mask, SignBit, DAG))
      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), StoredVal,
                                Mst->getBasePtr(), Mst->getOffset(), NewMask,
                                Mst->getMemoryVT(), Mst->getMemOperand(),
                                Mst->getAddressingMode());
  }

  // Fold store(trunc(x)) into a truncating masked store of x.
  const bool FoldableTrunc =
      StoredVal.getOpcode() == ISD::TRUNCATE &&
      StoredVal.getNode()->hasOneUse() &&
      TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(),
                            Mst->getMemoryVT());
  if (!FoldableTrunc)
    return SDValue();

  return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), StoredVal.getOperand(0),
                            Mst->getBasePtr(), Mst->getOffset(), Mask,
                            Mst->getMemoryVT(), Mst->getMemOperand(),
                            Mst->getAddressingMode(), true);
}

44670

44671

/// Perform X86-specific DAG combines on an ISD::STORE node.
///
/// The combines are tried in order and the first one that applies wins:
///  - vXi1 stores without AVX512 become a bitcast + integer store;
///  - v1i1/v2i1/v4i1 stores with AVX512 become byte-sized stores whose unused
///    bits are explicitly zeroed;
///  - legal vXi1 stores of constants become scalar integer stores;
///  - slow 256-bit stores and under-aligned non-temporal stores are split or
///    scalarized;
///  - various truncate/saturate patterns are folded into truncating stores;
///  - ptr32/ptr64 address-space pointers are cast to the default pointer type;
///  - on 32-bit targets with usable SSE2, 64-bit integer load->store and
///    extract->store pairs are rewritten to use f64.
///
/// \param N         The store node (must be a StoreSDNode).
/// \param DAG       The selection DAG that owns \p N.
/// \param DCI       Combiner state, used to query the legalization phase.
/// \param Subtarget The X86 subtarget being compiled for.
/// \returns The replacement value, or an empty SDValue if nothing applied.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getValue();
  EVT VT = StoredVal.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Convert a store of vXi1 into a store of iX and a bitcast.
  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1) {

    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
    StoredVal = DAG.getBitcast(NewVT, StoredVal);

    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    SDValue Val = StoredVal.getOperand(0);
    // Only bit 0 of the i8 operand is the v1i1 payload. A whole byte is
    // written, so we must store zeros to the unused bits rather than whatever
    // happens to be in the upper bits of the scalar.
    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
    return DAG.getStore(St->getChain(), dl, Val,
                        St->getBasePtr(), St->getPointerInfo(),
                        St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Widen v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    // The widened store writes a full byte; we must store zeros to the unused
    // bits, so pad with zero vectors instead of UNDEF.
    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If its a v64i1 store without 64-bit support, we need two stores.
    if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      Lo = combinevXi1ConstantToInteger(Lo, DAG);
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));
      Hi = combinevXi1ConstantToInteger(Hi, DAG);

      // The high 32 mask bits live 4 bytes past the low half.
      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       St->getOriginalAlign(),
                       St->getMemOperand()->getFlags());
      SDValue Ch1 =
          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
                       St->getPointerInfo().getWithOffset(4),
                       St->getOriginalAlign(),
                       St->getMemOperand()->getFlags());
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
  // Sandy Bridge, perform two 16-byte stores.
  // Fast is only read after allowsMemoryAccess() has returned true.
  bool Fast;
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *St->getMemOperand(), &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    return splitVectorStore(St, DAG);
  }

  // Split under-aligned vector non-temporal stores.
  if (St->isNonTemporal() && StVT == VT &&
      St->getAlignment() < VT.getStoreSize()) {
    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
    // vectors or the legalizer can scalarize it to use MOVNTI.
    if (VT.is256BitVector() || VT.is512BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
        return SDValue();
      return splitVectorStore(St, DAG);
    }

    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
    // to use MOVNTI.
    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
      MVT NTVT = Subtarget.hasSSE4A()
                     ? MVT::v2f64
                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
      return scalarizeVectorStore(St, NTVT, DAG);
    }
  }

  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
  // supported, but avx512f is by extending to v16i32 and truncating.
  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
      St->getValue().getOpcode() == ISD::TRUNCATE &&
      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
      TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
      St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
                             MVT::v16i8, St->getMemOperand());
  }

  // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
  if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
      (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
       StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
      TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
    bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
    return EmitTruncSStore(IsSigned, St->getChain(),
                           dl, StoredVal.getOperand(0), St->getBasePtr(),
                           VT, St->getMemOperand(), DAG);
  }

  // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
  if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
    // Returns the vector being extracted from if V is a one-use extract of
    // element 0 (optionally behind a one-use-operand TRUNCATE), else an empty
    // SDValue.
    auto IsExtractedElement = [](SDValue V) {
      if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
        V = V.getOperand(0);
      unsigned Opc = V.getOpcode();
      if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
        if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
          return V.getOperand(0);
      }
      return SDValue();
    };
    if (SDValue Extract = IsExtractedElement(StoredVal)) {
      SDValue Trunc = peekThroughOneUseBitcasts(Extract);
      if (Trunc.getOpcode() == X86ISD::VTRUNC) {
        SDValue Src = Trunc.getOperand(0);
        MVT DstVT = Trunc.getSimpleValueType();
        MVT SrcVT = Src.getSimpleValueType();
        unsigned NumSrcElts = SrcVT.getVectorNumElements();
        unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
        MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
        // Only fold when the stored scalar covers exactly the truncated bits.
        if (NumTruncBits == VT.getSizeInBits() &&
            TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
          return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
                                   TruncVT, St->getMemOperand());
        }
      }
    }
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
      if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                         Subtarget, dl))
        return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                            St->getPointerInfo(), St->getOriginalAlign(),
                            St->getMemOperand()->getFlags());

    if (TLI.isTruncStoreLegal(VT, StVT)) {
      if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
                                          DAG, dl))
        return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
    }

    return SDValue();
  }

  // Cast ptr32 and ptr64 pointers to the default address space before a store.
  unsigned AddrSpace = St->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != St->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
      return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
                          St->getPointerInfo(), St->getOriginalAlign(),
                          St->getMemOperand()->getFlags(), St->getAAInfo());
    }
  }

  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
      isa<LoadSDNode>(St->getValue()) &&
      cast<LoadSDNode>(St->getValue())->isSimple() &&
      St->getChain().hasOneUse() && St->isSimple()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // Avoid the transformation if there are multiple uses of the loaded value.
    if (!Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // Lower to a single movq load/store pair.
    SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
                                Ld->getBasePtr(), Ld->getMemOperand());

    // Make sure new load is placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
    return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                        St->getMemOperand());
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

44942

44943

/// Simplify the vector operand of a VEXTRACT_STORE-style memory intrinsic by
/// telling the generic simplifier that only the low elements which are
/// actually written to memory are demanded.
///
/// \returns SDValue(N, 0) if the stored vector was simplified in place,
/// otherwise an empty SDValue.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  auto *MemNode = cast<MemIntrinsicSDNode>(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDValue Vec = N->getOperand(1);
  MVT VecVT = Vec.getSimpleValueType();
  EVT StoredVT = MemNode->getMemoryVT();

  // Only as many low elements as fit in the memory type are demanded.
  unsigned NumStoredElts =
      StoredVT.getSizeInBits() / VecVT.getScalarSizeInBits();
  APInt DemandedElts =
      APInt::getLowBitsSet(VecVT.getVectorNumElements(), NumStoredElts);

  APInt KnownUndef, KnownZero;
  if (!TLI.SimplifyDemandedVectorElts(Vec, DemandedElts, KnownUndef, KnownZero,
                                      DCI))
    return SDValue();

  // The operand changed under us; revisit N unless it was deleted meanwhile.
  if (N->getOpcode() != ISD::DELETED_NODE)
    DCI.AddToWorklist(N);
  return SDValue(N, 0);
}

44967

44968

/// Return 'true' if this vector operation is "horizontal"

44969

/// and return the operands for the horizontal operation in LHS and RHS. A

44970

/// horizontal operation performs the binary operation on successive elements

44971

/// of its first operand, then on successive elements of its second operand,

44972

/// returning the resulting values in a vector. For example, if

44973

/// A = < float a0, float a1, float a2, float a3 >

44974

/// and

44975

/// B = < float b0, float b1, float b2, float b3 >

44976

/// then the result of doing a horizontal operation on A and B is

44977

/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.

44978

/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form

44979

/// A horizontal-op B, for some already available A and B, and if so then LHS is

44980

/// set to A, RHS to B, and the routine returns 'true'.

44981

static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,

44982

const X86Subtarget &Subtarget, bool IsCommutative,

44983

SmallVectorImpl<int> &PostShuffleMask) {

44984

// If either operand is undef, bail out. The binop should be simplified.

44985

if (LHS.isUndef() || RHS.isUndef())

44986

return false;

44987

44988

// Look for the following pattern:

44989

// A = < float a0, float a1, float a2, float a3 >

44990

// B = < float b0, float b1, float b2, float b3 >

44991

// and

44992

// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>

44993

// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>

44994

// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >

44995

// which is A horizontal-op B.

44996

44997

MVT VT = LHS.getSimpleValueType();

44998

assert((VT.is128BitVector() || VT.is256BitVector()) &&(((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unsupported vector type for horizontal add/sub\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44999, __PRETTY_FUNCTION__))

44999

"Unsupported vector type for horizontal add/sub")(((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"
) ? static_cast<void> (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unsupported vector type for horizontal add/sub\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44999, __PRETTY_FUNCTION__));

45000

unsigned NumElts = VT.getVectorNumElements();

45001

45002

// TODO - can we make a general helper method that does all of this for us?

45003

auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,

45004

SmallVectorImpl<int> &ShuffleMask) {

45005

if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {

45006

if (!Op.getOperand(0).isUndef())

45007

N0 = Op.getOperand(0);

45008

if (!Op.getOperand(1).isUndef())

45009

N1 = Op.getOperand(1);

45010

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

45011

ShuffleMask.append(Mask.begin(), Mask.end());

45012

return;

45013

}

45014

bool UseSubVector = false;

45015

if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

45016

Op.getOperand(0).getValueType().is256BitVector() &&

45017

llvm::isNullConstant(Op.getOperand(1))) {

45018

Op = Op.getOperand(0);

45019

UseSubVector = true;

45020

}

45021

bool IsUnary;

45022

SmallVector<SDValue, 2> SrcOps;

45023

SmallVector<int, 16> SrcShuffleMask;

45024

SDValue BC = peekThroughBitcasts(Op);

45025

if (isTargetShuffle(BC.getOpcode()) &&

45026

getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,

45027

SrcOps, SrcShuffleMask, IsUnary)) {

45028

if (!UseSubVector && SrcShuffleMask.size() == NumElts &&

45029

SrcOps.size() <= 2) {

45030

N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();

45031

N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();

45032

ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());

45033

}

45034

if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&

45035

SrcOps.size() == 1) {

45036

N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));

45037

N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));

45038

ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);

45039

ShuffleMask.append(Mask.begin(), Mask.end());

45040

}

45041

}

45042

};

45043

45044

// View LHS in the form

45045

// LHS = VECTOR_SHUFFLE A, B, LMask

45046

// If LHS is not a shuffle, then pretend it is the identity shuffle:

45047

// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>

45048

// NOTE: A default initialized SDValue represents an UNDEF of type VT.

45049

SDValue A, B;

45050

SmallVector<int, 16> LMask;

45051

GetShuffle(LHS, A, B, LMask);

45052

45053

// Likewise, view RHS in the form

45054

// RHS = VECTOR_SHUFFLE C, D, RMask

45055

SDValue C, D;

45056

SmallVector<int, 16> RMask;

45057

GetShuffle(RHS, C, D, RMask);

45058

45059

// At least one of the operands should be a vector shuffle.

45060

unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);

45061

if (NumShuffles == 0)

45062

return false;

45063

45064

if (LMask.empty()) {

45065

A = LHS;

45066

for (unsigned i = 0; i != NumElts; ++i)

45067

LMask.push_back(i);

45068

}

45069

45070

if (RMask.empty()) {

45071

C = RHS;

45072

for (unsigned i = 0; i != NumElts; ++i)

45073

RMask.push_back(i);

45074

}

45075

45076

// If A and B occur in reverse order in RHS, then canonicalize by commuting

45077

// RHS operands and shuffle mask.

45078

if (A != C) {

45079

std::swap(C, D);

45080

ShuffleVectorSDNode::commuteMask(RMask);

45081

}

45082

// Check that the shuffles are both shuffling the same vectors.

45083

if (!(A == C && B == D))

45084

return false;

45085

45086

PostShuffleMask.clear();

45087

PostShuffleMask.append(NumElts, SM_SentinelUndef);

45088

45089

// LHS and RHS are now:

45090

// LHS = shuffle A, B, LMask

45091

// RHS = shuffle A, B, RMask

45092

// Check that the masks correspond to performing a horizontal operation.

45093

// AVX defines horizontal add/sub to operate independently on 128-bit lanes,

45094

// so we just repeat the inner loop if this is a 256-bit op.

45095

unsigned Num128BitChunks = VT.getSizeInBits() / 128;

45096

unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;

45097

unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;

45098

assert((NumEltsPer128BitChunk % 2 == 0) &&(((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"
) ? static_cast<void> (0) : __assert_fail ("(NumEltsPer128BitChunk % 2 == 0) && \"Vector type should have an even number of elements in each lane\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45099, __PRETTY_FUNCTION__))

45099

"Vector type should have an even number of elements in each lane")(((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"
) ? static_cast<void> (0) : __assert_fail ("(NumEltsPer128BitChunk % 2 == 0) && \"Vector type should have an even number of elements in each lane\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45099, __PRETTY_FUNCTION__));

45100

for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {

45101

for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {

45102

// Ignore undefined components.

45103

int LIdx = LMask[i + j], RIdx = RMask[i + j];

45104

if (LIdx < 0 || RIdx < 0 ||

45105

(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||

45106

(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))

45107

continue;

45108

45109

// Check that successive odd/even elements are being operated on. If not,

45110

// this is not a horizontal operation.

45111

if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&

45112

!((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))

45113

return false;

45114

45115

// Compute the post-shuffle mask index based on where the element

45116

// is stored in the HOP result, and where it needs to be moved to.

45117

int Base = LIdx & ~1u;

45118

int Index = ((Base % NumEltsPer128BitChunk) / 2) +

45119

((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

45120

45121

// The low half of the 128-bit result must choose from A.

45122

// The high half of the 128-bit result must choose from B,

45123

// unless B is undef. In that case, we are always choosing from A.

45124

if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))

45125

Index += NumEltsPer64BitChunk;

45126

PostShuffleMask[i + j] = Index;

45127

}

45128

}

45129

45130

SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.

45131

SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

45132

45133

bool IsIdentityPostShuffle =

45134

isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);

45135

if (IsIdentityPostShuffle)

45136

PostShuffleMask.clear();

45137

45138

// Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).

45139

if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&

45140

isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))

45141

return false;

45142

45143

// Assume a SingleSource HOP if we only shuffle one input and don't need to

45144

// shuffle the result.

45145

if (!shouldUseHorizontalOp(NewLHS == NewRHS &&

45146

(NumShuffles < 2 || !IsIdentityPostShuffle),

45147

DAG, Subtarget))

45148

return false;

45149

45150

LHS = DAG.getBitcast(VT, NewLHS);

45151

RHS = DAG.getBitcast(VT, NewRHS);

45152

return true;

45153

}

45154

45155

/// Do target-specific dag combines on floating-point adds/subs.

45156

static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,

45157

const X86Subtarget &Subtarget) {

45158

EVT VT = N->getValueType(0);

45159

SDValue LHS = N->getOperand(0);

45160

SDValue RHS = N->getOperand(1);

45161

bool IsFadd = N->getOpcode() == ISD::FADD;

45162

auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;

45163

assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode")(((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"
) ? static_cast<void> (0) : __assert_fail ("(IsFadd || N->getOpcode() == ISD::FSUB) && \"Wrong opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45163, __PRETTY_FUNCTION__));

45164

45165

// Try to synthesize horizontal add/sub from adds/subs of shuffles.

45166

SmallVector<int, 8> PostShuffleMask;

45167

if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||

45168

(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&

45169

isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {

45170

SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);

45171

if (!PostShuffleMask.empty())

45172

HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,

45173

DAG.getUNDEF(VT), PostShuffleMask);

45174

return HorizBinOp;

45175

}

45176

45177

return SDValue();

45178

}

45179

45180

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify

45181

/// the codegen.

45182

/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )

45183

/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove

45184

/// anything that is guaranteed to be transformed by DAGCombiner.

45185

static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,

45186

const X86Subtarget &Subtarget,

45187

const SDLoc &DL) {

45188

assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode")((N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"
) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == ISD::TRUNCATE && \"Wrong opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45188, __PRETTY_FUNCTION__));

45189

SDValue Src = N->getOperand(0);

45190

unsigned SrcOpcode = Src.getOpcode();

45191

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

45192

45193

EVT VT = N->getValueType(0);

45194

EVT SrcVT = Src.getValueType();

45195

45196

auto IsFreeTruncation = [VT](SDValue Op) {

45197

unsigned TruncSizeInBits = VT.getScalarSizeInBits();

45198

45199

// See if this has been extended from a smaller/equal size to

45200

// the truncation size, allowing a truncation to combine with the extend.

45201

unsigned Opcode = Op.getOpcode();

45202

if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||

45203

Opcode == ISD::ZERO_EXTEND) &&

45204

Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)

45205

return true;

45206

45207

// See if this is a single use constant which can be constant folded.

45208

// NOTE: We don't peek throught bitcasts here because there is currently

45209

// no support for constant folding truncate+bitcast+vector_of_constants. So

45210

// we'll just send up with a truncate on both operands which will

45211

// get turned back into (truncate (binop)) causing an infinite loop.

45212

return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());

45213

};

45214

45215

auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {

45216

SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);

45217

SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);

45218

return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);

45219

};

45220

45221

// Don't combine if the operation has other uses.

45222

if (!Src.hasOneUse())

45223

return SDValue();

45224

45225

// Only support vector truncation for now.

45226

// TODO: i64 scalar math would benefit as well.

45227

if (!VT.isVector())

45228

return SDValue();

45229

45230

// In most cases its only worth pre-truncating if we're only facing the cost

45231

// of one truncation.

45232

// i.e. if one of the inputs will constant fold or the input is repeated.

45233

switch (SrcOpcode) {

45234

case ISD::MUL:

45235

// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its

45236

// better to truncate if we have the chance.

45237

if (SrcVT.getScalarType() == MVT::i64 &&

45238

TLI.isOperationLegal(SrcOpcode, VT) &&

45239

!TLI.isOperationLegal(SrcOpcode, SrcVT))

45240

return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));

45241

LLVM_FALLTHROUGH[[gnu::fallthrough]];

45242

case ISD::AND:

45243

case ISD::XOR:

45244

case ISD::OR:

45245

case ISD::ADD:

45246

case ISD::SUB: {

45247

SDValue Op0 = Src.getOperand(0);

45248

SDValue Op1 = Src.getOperand(1);

45249

if (TLI.isOperationLegal(SrcOpcode, VT) &&

45250

(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))

45251

return TruncateArithmetic(Op0, Op1);

45252

break;

45253

}

45254

}

45255

45256

return SDValue();

45257

}

45258

45259

/// Truncate using ISD::AND mask and X86ISD::PACKUS.

45260

/// e.g. trunc <8 x i32> X to <8 x i16> -->

45261

/// MaskX = X & 0xffff (clear high bits to prevent saturation)

45262

/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)

45263

static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,

45264

const X86Subtarget &Subtarget,

45265

SelectionDAG &DAG) {

45266

SDValue In = N->getOperand(0);

45267

EVT InVT = In.getValueType();

45268

EVT OutVT = N->getValueType(0);

45269

45270

APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),

45271

OutVT.getScalarSizeInBits());

45272

In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));

45273

return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);

45274

}

45275

45276

/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.

45277

static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,

45278

const X86Subtarget &Subtarget,

45279

SelectionDAG &DAG) {

45280

SDValue In = N->getOperand(0);

45281

EVT InVT = In.getValueType();

45282

EVT OutVT = N->getValueType(0);

45283

In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,

45284

DAG.getValueType(OutVT));

45285

return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);

45286

}

45287

45288

/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into

45289

/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type

45290

/// legalization the truncation will be translated into a BUILD_VECTOR with each

45291

/// element that is extracted from a vector and then truncated, and it is

45292

/// difficult to do this optimization based on them.

45293

static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,

45294

const X86Subtarget &Subtarget) {

45295

EVT OutVT = N->getValueType(0);

45296

if (!OutVT.isVector())

45297

return SDValue();

45298

45299

SDValue In = N->getOperand(0);

45300

if (!In.getValueType().isSimple())

45301

return SDValue();

45302

45303

EVT InVT = In.getValueType();

45304

unsigned NumElems = OutVT.getVectorNumElements();

45305

45306

// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on

45307

// SSE2, and we need to take care of it specially.

45308

// AVX512 provides vpmovdb.

45309

if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())

45310

return SDValue();

45311

45312

EVT OutSVT = OutVT.getVectorElementType();

45313

EVT InSVT = InVT.getVectorElementType();

45314

if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&

45315

(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&

45316

NumElems >= 8))

45317

return SDValue();

45318

45319

// SSSE3's pshufb results in less instructions in the cases below.

45320

if (Subtarget.hasSSSE3() && NumElems == 8 &&

45321

((OutSVT == MVT::i8 && InSVT != MVT::i64) ||

45322

(InSVT == MVT::i32 && OutSVT == MVT::i16)))

45323

return SDValue();

45324

45325

SDLoc DL(N);

45326

// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS

45327

// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to

45328

// truncate 2 x v4i32 to v8i16.

45329

if (Subtarget.hasSSE41() || OutSVT == MVT::i8)

45330

return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);

45331

if (InSVT == MVT::i32)

45332

return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);

45333

45334

return SDValue();

45335

}

45336

45337

/// This function transforms vector truncation of 'extended sign-bits' or

45338

/// 'extended zero-bits' values.

45339

/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.

45340

static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,

45341

SelectionDAG &DAG,

45342

const X86Subtarget &Subtarget) {

45343

// Requires SSE2.

45344

if (!Subtarget.hasSSE2())

45345

return SDValue();

45346

45347

if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())

45348

return SDValue();

45349

45350

SDValue In = N->getOperand(0);

45351

if (!In.getValueType().isSimple())

45352

return SDValue();

45353

45354

MVT VT = N->getValueType(0).getSimpleVT();

45355

MVT SVT = VT.getScalarType();

45356

45357

MVT InVT = In.getValueType().getSimpleVT();

45358

MVT InSVT = InVT.getScalarType();

45359

45360

// Check we have a truncation suited for PACKSS/PACKUS.

45361

if (!isPowerOf2_32(VT.getVectorNumElements()))

45362

return SDValue();

45363

if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)

45364

return SDValue();

45365

if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)

45366

return SDValue();

45367

45368

// Truncation to sub-128bit vXi32 can be better handled with shuffles.

45369

if (SVT == MVT::i32 && VT.getSizeInBits() < 128)

45370

return SDValue();

45371

45372

// AVX512 has fast truncate, but if the input is already going to be split,

45373

// there's no harm in trying pack.

45374

if (Subtarget.hasAVX512() &&

45375

!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&

45376

InVT.is512BitVector()))

45377

return SDValue();

45378

45379

unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);

45380

unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

45381

45382

// Use PACKUS if the input has zero-bits that extend all the way to the

45383

// packed/truncated value. e.g. masks, zext_in_reg, etc.

45384

KnownBits Known = DAG.computeKnownBits(In);

45385

unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();

45386

if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))

45387

return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

45388

45389

// Use PACKSS if the input has sign-bits that extend all the way to the

45390

// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.

45391

unsigned NumSignBits = DAG.ComputeNumSignBits(In);

45392

45393

// Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with

45394

// a sign splat. ComputeNumSignBits struggles to see through BITCASTs later

45395

// on and combines/simplifications can't then use it.

45396

if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())

45397

return SDValue();

45398

45399

if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))

45400

return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

45401

45402

return SDValue();

45403

}

45404

45405

// Try to form a MULHU or MULHS node by looking for

45406

// (trunc (srl (mul ext, ext), 16))

45407

// TODO: This is X86 specific because we want to be able to handle wide types

45408

// before type legalization. But we can only do it if the vector will be

45409

// legalized via widening/splitting. Type legalization can't handle promotion

45410

// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG

45411

// combiner.

45412

static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,

45413

SelectionDAG &DAG, const X86Subtarget &Subtarget) {

45414

// First instruction should be a right shift of a multiply.

45415

if (Src.getOpcode() != ISD::SRL ||

45416

Src.getOperand(0).getOpcode() != ISD::MUL)

45417

return SDValue();

45418

45419

if (!Subtarget.hasSSE2())

45420

return SDValue();

45421

45422

// Only handle vXi16 types that are at least 128-bits unless they will be

45423

// widened.

45424

if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)

45425

return SDValue();

45426

45427

// Input type should be at least vXi32.

45428

EVT InVT = Src.getValueType();

45429

if (InVT.getVectorElementType().getSizeInBits() < 32)

45430

return SDValue();

45431

45432

// Need a shift by 16.

45433

APInt ShiftAmt;

45434

if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||

45435

ShiftAmt != 16)

45436

return SDValue();

45437

45438

SDValue LHS = Src.getOperand(0).getOperand(0);

45439

SDValue RHS = Src.getOperand(0).getOperand(1);

45440

45441

unsigned ExtOpc = LHS.getOpcode();

45442

if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||

45443

RHS.getOpcode() != ExtOpc)

45444

return SDValue();

45445

45446

// Peek through the extends.

45447

LHS = LHS.getOperand(0);

45448

RHS = RHS.getOperand(0);

45449

45450

// Ensure the input types match.

45451

if (LHS.getValueType() != VT || RHS.getValueType() != VT)

45452

return SDValue();

45453

45454

unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;

45455

return DAG.getNode(Opc, DL, VT, LHS, RHS);

45456

}

45457

45458

// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes

45459

// from one vector with signed bytes from another vector, adds together

45460

// adjacent pairs of 16-bit products, and saturates the result before

45461

// truncating to 16-bits.

45462

//

45463

// Which looks something like this:

45464

// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),

45465

// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))

45466

static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,

45467

const X86Subtarget &Subtarget,

45468

const SDLoc &DL) {

45469

if (!VT.isVector() || !Subtarget.hasSSSE3())

45470

return SDValue();

45471

45472

unsigned NumElems = VT.getVectorNumElements();

45473

EVT ScalarVT = VT.getVectorElementType();

45474

if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))

45475

return SDValue();

45476

45477

SDValue SSatVal = detectSSatPattern(In, VT);

45478

if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)

45479

return SDValue();

45480

45481

// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs

45482

// of multiplies from even/odd elements.

45483

SDValue N0 = SSatVal.getOperand(0);

45484

SDValue N1 = SSatVal.getOperand(1);

45485

45486

if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)

45487

return SDValue();

45488

45489

SDValue N00 = N0.getOperand(0);

45490

SDValue N01 = N0.getOperand(1);

45491

SDValue N10 = N1.getOperand(0);

45492

SDValue N11 = N1.getOperand(1);

45493

45494

// TODO: Handle constant vectors and use knownbits/computenumsignbits?

45495

// Canonicalize zero_extend to LHS.

45496

if (N01.getOpcode() == ISD::ZERO_EXTEND)

45497

std::swap(N00, N01);

45498

if (N11.getOpcode() == ISD::ZERO_EXTEND)

45499

std::swap(N10, N11);

45500

45501

// Ensure we have a zero_extend and a sign_extend.

45502

if (N00.getOpcode() != ISD::ZERO_EXTEND ||

45503

N01.getOpcode() != ISD::SIGN_EXTEND ||

45504

N10.getOpcode() != ISD::ZERO_EXTEND ||

45505

N11.getOpcode() != ISD::SIGN_EXTEND)

45506

return SDValue();

45507

45508

// Peek through the extends.

45509

N00 = N00.getOperand(0);

45510

N01 = N01.getOperand(0);

45511

N10 = N10.getOperand(0);

45512

N11 = N11.getOperand(0);

45513

45514

// Ensure the extend is from vXi8.

45515

if (N00.getValueType().getVectorElementType() != MVT::i8 ||

45516

N01.getValueType().getVectorElementType() != MVT::i8 ||

45517

N10.getValueType().getVectorElementType() != MVT::i8 ||

45518

N11.getValueType().getVectorElementType() != MVT::i8)

45519

return SDValue();

45520

45521

// All inputs should be build_vectors.

45522

if (N00.getOpcode() != ISD::BUILD_VECTOR ||

45523

N01.getOpcode() != ISD::BUILD_VECTOR ||

45524

N10.getOpcode() != ISD::BUILD_VECTOR ||

45525

N11.getOpcode() != ISD::BUILD_VECTOR)

45526

return SDValue();

45527

45528

// N00/N10 are zero extended. N01/N11 are sign extended.

45529

45530

// For each element, we need to ensure we have an odd element from one vector

45531

// multiplied by the odd element of another vector and the even element from

45532

// one of the same vectors being multiplied by the even element from the

45533

// other vector. So we need to make sure for each element i, this operator

45534

// is being performed:

45535

// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]

45536

SDValue ZExtIn, SExtIn;

45537

for (unsigned i = 0; i != NumElems; ++i) {

45538

SDValue N00Elt = N00.getOperand(i);

45539

SDValue N01Elt = N01.getOperand(i);

45540

SDValue N10Elt = N10.getOperand(i);

45541

SDValue N11Elt = N11.getOperand(i);

45542

// TODO: Be more tolerant to undefs.

45543

if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

45544

N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

45545

N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

45546

N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

45547

return SDValue();

45548

auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));

45549

auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));

45550

auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));

45551

auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));

45552

if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)

45553

return SDValue();

45554

unsigned IdxN00 = ConstN00Elt->getZExtValue();

45555

unsigned IdxN01 = ConstN01Elt->getZExtValue();

45556

unsigned IdxN10 = ConstN10Elt->getZExtValue();

45557

unsigned IdxN11 = ConstN11Elt->getZExtValue();

45558

// Add is commutative so indices can be reordered.

45559

if (IdxN00 > IdxN10) {

45560

std::swap(IdxN00, IdxN10);

45561

std::swap(IdxN01, IdxN11);

45562

}

45563

// N0 indices be the even element. N1 indices must be the next odd element.

45564

if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||

45565

IdxN01 != 2 * i || IdxN11 != 2 * i + 1)

45566

return SDValue();

45567

SDValue N00In = N00Elt.getOperand(0);

45568

SDValue N01In = N01Elt.getOperand(0);

45569

SDValue N10In = N10Elt.getOperand(0);

45570

SDValue N11In = N11Elt.getOperand(0);

45571

// First time we find an input capture it.

45572

if (!ZExtIn) {

45573

ZExtIn = N00In;

45574

SExtIn = N01In;

45575

}

45576

if (ZExtIn != N00In || SExtIn != N01In ||

45577

ZExtIn != N10In || SExtIn != N11In)

45578

return SDValue();

45579

}

45580

45581

auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

45582

ArrayRef<SDValue> Ops) {

45583

// Shrink by adding truncate nodes and let DAGCombine fold with the

45584

// sources.

45585

EVT InVT = Ops[0].getValueType();

45586

assert(InVT.getScalarType() == MVT::i8 &&((InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type"
) ? static_cast<void> (0) : __assert_fail ("InVT.getScalarType() == MVT::i8 && \"Unexpected scalar element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45587, __PRETTY_FUNCTION__))

45587

"Unexpected scalar element type")((InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type"
) ? static_cast<void> (0) : __assert_fail ("InVT.getScalarType() == MVT::i8 && \"Unexpected scalar element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45587, __PRETTY_FUNCTION__));

45588

assert(InVT == Ops[1].getValueType() && "Operands' types mismatch")((InVT == Ops[1].getValueType() && "Operands' types mismatch"
) ? static_cast<void> (0) : __assert_fail ("InVT == Ops[1].getValueType() && \"Operands' types mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45588, __PRETTY_FUNCTION__));

45589

EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,

45590

InVT.getVectorNumElements() / 2);

45591

return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);

45592

};

45593

return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },

45594

PMADDBuilder);

45595

}

45596

45597

/// DAG combine for the truncate node \p N. Tries a sequence of folds in a
/// fixed order and returns the first one that produces a value.
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  SDValue Folded = combineTruncatedArithmetic(N, DAG, Subtarget, DL);
  if (Folded)
    return Folded;

  // Try to detect AVG pattern first.
  Folded = detectAVGPattern(Src, VT, DAG, Subtarget, DL);
  if (Folded)
    return Folded;

  // Try to detect PMADD
  Folded = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL);
  if (Folded)
    return Folded;

  // Try to combine truncation with signed/unsigned saturation.
  Folded = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget);
  if (Folded)
    return Folded;

  // Try to combine PMULHUW/PMULHW for vXi16.
  Folded = combinePMULH(Src, VT, DL, DAG, Subtarget);
  if (Folded)
    return Folded;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  Folded = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget);
  if (Folded)
    return Folded;

  return combineVectorTruncation(N, DAG, Subtarget);
}

45637

45638

/// DAG combine for the VTRUNC node \p N: fold a signed/unsigned saturation
/// pattern on the input into VTRUNCS/VTRUNCUS, otherwise try to simplify the
/// node through its demanded bits.
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SDValue In = N->getOperand(0);
  SDLoc DL(N);

  // Signed saturation is checked before unsigned saturation.
  SDValue SSatVal = detectSSatPattern(In, VT);
  if (SSatVal)
    return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);

  SDValue USatVal = detectUSatPattern(In, VT, DAG, DL);
  if (USatVal)
    return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);

  // All bits of each result element are demanded.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask = APInt::getAllOnesValue(VT.getScalarSizeInBits());
  if (!TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
    return SDValue();

  return SDValue(N, 0);
}

45656

45657

/// Returns the negated value if the node \p N flips sign of FP value.

45658

///

45659

/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)

45660

/// or FSUB(0, x)

45661

/// AVX512F does not have FXOR, so FNEG is lowered as

45662

/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).

45663

/// In this case we go though all bitcasts.

45664

/// This also recognizes splat of a negated value and returns the splat of that

45665

/// value.

45666

static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {

45667

if (N->getOpcode() == ISD::FNEG)

45668

return N->getOperand(0);

45669

45670

// Don't recurse exponentially.

45671

if (Depth > SelectionDAG::MaxRecursionDepth)

45672

return SDValue();

45673

45674

unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();

45675

45676

SDValue Op = peekThroughBitcasts(SDValue(N, 0));

45677

EVT VT = Op->getValueType(0);

45678

45679

// Make sure the element size doesn't change.

45680

if (VT.getScalarSizeInBits() != ScalarSize)

45681

return SDValue();

45682

45683

unsigned Opc = Op.getOpcode();

45684

switch (Opc) {

45685

case ISD::VECTOR_SHUFFLE: {

45686

// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate

45687

// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.

45688

if (!Op.getOperand(1).isUndef())

45689

return SDValue();

45690

if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))

45691

if (NegOp0.getValueType() == VT) // FIXME: Can we do better?

45692

return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),

45693

cast<ShuffleVectorSDNode>(Op)->getMask());

45694

break;

45695

}

45696

case ISD::INSERT_VECTOR_ELT: {

45697

// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,

45698

// -V, INDEX).

45699

SDValue InsVector = Op.getOperand(0);

45700

SDValue InsVal = Op.getOperand(1);

45701

if (!InsVector.isUndef())

45702

return SDValue();

45703

if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))

45704

if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME

45705

return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,

45706

NegInsVal, Op.getOperand(2));

45707

break;

45708

}

45709

case ISD::FSUB:

45710

case ISD::XOR:

45711

case X86ISD::FXOR: {

45712

SDValue Op1 = Op.getOperand(1);

45713

SDValue Op0 = Op.getOperand(0);

45714

45715

// For XOR and FXOR, we want to check if constant

45716

// bits of Op1 are sign bit masks. For FSUB, we

45717

// have to check if constant bits of Op0 are sign

45718

// bit masks and hence we swap the operands.

45719

if (Opc == ISD::FSUB)

45720

std::swap(Op0, Op1);

45721

45722

APInt UndefElts;

45723

SmallVector<APInt, 16> EltBits;

45724

// Extract constant bits and see if they are all

45725

// sign bit masks. Ignore the undef elements.

45726

if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,

45727

/* AllowWholeUndefs */ true,

45728

/* AllowPartialUndefs */ false)) {

45729

for (unsigned I = 0, E = EltBits.size(); I < E; I++)

45730

if (!UndefElts[I] && !EltBits[I].isSignMask())

45731

return SDValue();

45732

45733

return peekThroughBitcasts(Op0);

45734

}

45735

}

45736

}

45737

45738

return SDValue();

45739

}

45740

45741

static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,

45742

bool NegRes) {

45743

if (NegMul) {

45744

switch (Opcode) {

45745

default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45745);

45746

case ISD::FMA: Opcode = X86ISD::FNMADD; break;

45747

case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;

45748

case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;

45749

case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;

45750

case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;

45751

case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;

45752

case X86ISD::FNMADD: Opcode = ISD::FMA; break;

45753

case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;

45754

case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;

45755

case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;

45756

case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;

45757

case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;

45758

}

45759

}

45760

45761

if (NegAcc) {

45762

switch (Opcode) {

45763

default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45763);

45764

case ISD::FMA: Opcode = X86ISD::FMSUB; break;

45765

case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;

45766

case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;

45767

case X86ISD::FMSUB: Opcode = ISD::FMA; break;

45768

case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;

45769

case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;

45770

case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;

45771

case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;

45772

case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;

45773

case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;

45774

case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;

45775

case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;

45776

case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;

45777

case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;

45778

case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;

45779

case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;

45780

}

45781

}

45782

45783

if (NegRes) {

45784

switch (Opcode) {

45785

// For accuracy reason, we never combine fneg and fma under strict FP.

45786

default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45786);

45787

case ISD::FMA: Opcode = X86ISD::FNMSUB; break;

45788

case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;

45789

case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;

45790

case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;

45791

case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;

45792

case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;

45793

case X86ISD::FNMSUB: Opcode = ISD::FMA; break;

45794

case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;

45795

}

45796

}

45797

45798

return Opcode;

45799

}

45800

45801

/// Do target-specific dag combines on floating point negations.

45802

static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,

45803

TargetLowering::DAGCombinerInfo &DCI,

45804

const X86Subtarget &Subtarget) {

45805

EVT OrigVT = N->getValueType(0);

45806

SDValue Arg = isFNEG(DAG, N);

45807

if (!Arg)

45808

return SDValue();

45809

45810

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

45811

EVT VT = Arg.getValueType();

45812

EVT SVT = VT.getScalarType();

45813

SDLoc DL(N);

45814

45815

// Let legalize expand this if it isn't a legal type yet.

45816

if (!TLI.isTypeLegal(VT))

45817

return SDValue();

45818

45819

// If we're negating a FMUL node on a target with FMA, then we can avoid the

45820

// use of a constant by performing (-0 - A*B) instead.

45821

// FIXME: Check rounding control flags as well once it becomes available.

45822

if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&

45823

Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {

45824

SDValue Zero = DAG.getConstantFP(0.0, DL, VT);

45825

SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),

45826

Arg.getOperand(1), Zero);

45827

return DAG.getBitcast(OrigVT, NewNode);

45828

}

45829

45830

bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();

45831

bool LegalOperations = !DCI.isBeforeLegalizeOps();

45832

if (SDValue NegArg =

45833

TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))

45834

return DAG.getBitcast(OrigVT, NegArg);

45835

45836

return SDValue();

45837

}

45838

45839

/// X86 override of TargetLowering::getNegatedExpression. Handles the negation
/// patterns recognised by isFNEG(), the FMA-family opcodes and X86ISD::FRCP,
/// and defers to the generic implementation for everything else. \p Cost is
/// set by the cases handled here.
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                                bool LegalOperations,
                                                bool ForCodeSize,
                                                NegatibleCost &Cost,
                                                unsigned Depth) const {
  // fneg patterns are removable even if they have multiple uses.
  SDValue NegatedArg = isFNEG(DAG, Op.getNode(), Depth);
  if (NegatedArg) {
    Cost = NegatibleCost::Cheaper;
    return DAG.getBitcast(Op.getValueType(), NegatedArg);
  }

  EVT VT = Op.getValueType();
  EVT SVT = VT.getScalarType();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::FMA:
  case X86ISD::FMSUB:
  case X86ISD::FNMADD:
  case X86ISD::FNMSUB:
  case X86ISD::FMADD_RND:
  case X86ISD::FMSUB_RND:
  case X86ISD::FNMADD_RND:
  case X86ISD::FNMSUB_RND: {
    // Only single-use f32/f64 FMAs of a legal type on an FMA-capable target
    // are handled here; anything else goes to the generic implementation.
    bool CanNegateFMA = Op.hasOneUse() && Subtarget.hasAnyFMA() &&
                        isTypeLegal(VT) &&
                        (SVT == MVT::f32 || SVT == MVT::f64) &&
                        isOperationLegal(ISD::FMA, VT);
    if (!CanNegateFMA)
      break;

    // This is always negatible for free but we might be able to remove some
    // extra operand negations as well.
    unsigned NumOps = Op.getNumOperands();
    SmallVector<SDValue, 4> NewOps(NumOps, SDValue());
    for (unsigned OpIdx = 0; OpIdx != 3; ++OpIdx)
      NewOps[OpIdx] = getCheaperNegatedExpression(
          Op.getOperand(OpIdx), DAG, LegalOperations, ForCodeSize, Depth + 1);

    bool NegA = static_cast<bool>(NewOps[0]);
    bool NegB = static_cast<bool>(NewOps[1]);
    bool NegC = static_cast<bool>(NewOps[2]);
    // The product is negated only when exactly one multiplicand was negated.
    unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);

    if (NegA || NegB || NegC)
      Cost = NegatibleCost::Cheaper;
    else
      Cost = NegatibleCost::Neutral;

    // Fill in the non-negated ops with the original values.
    for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
      if (NewOps[OpIdx])
        continue;
      NewOps[OpIdx] = Op.getOperand(OpIdx);
    }
    return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
  }
  case X86ISD::FRCP: {
    // Rebuild the node on the negated operand when one is available.
    SDValue NegOp0 = getNegatedExpression(Op.getOperand(0), DAG,
                                          LegalOperations, ForCodeSize, Cost,
                                          Depth + 1);
    if (NegOp0)
      return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
    break;
  }
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

45899

45900

/// Rewrite the vector X86ISD::FOR/FXOR/FAND/FANDN node \p N as the matching
/// integer logic op on bitcast operands, bitcasting the result back.
/// Returns an empty SDValue for scalar types or when SSE2 is unavailable.
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (!(VT.isVector() && Subtarget.hasSSE2()))
    return SDValue();

  SDLoc dl(N);

  // Integer vector type with the same element width and total size as VT.
  unsigned IntBits = VT.getScalarSizeInBits();
  MVT IntSVT = MVT::getIntegerVT(IntBits);
  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);

  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));

  unsigned IntOpcode;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected FP logic op");
  case X86ISD::FOR:
    IntOpcode = ISD::OR;
    break;
  case X86ISD::FXOR:
    IntOpcode = ISD::XOR;
    break;
  case X86ISD::FAND:
    IntOpcode = ISD::AND;
    break;
  case X86ISD::FANDN:
    IntOpcode = X86ISD::ANDNP;
    break;
  }

  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
  return DAG.getBitcast(VT, IntOp);
}

45926

45927

45928

/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)

45929

static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {

45930

if (N->getOpcode() != ISD::XOR)

45931

return SDValue();

45932

45933

SDValue LHS = N->getOperand(0);

45934

if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)

45935

return SDValue();

45936

45937

X86::CondCode NewCC = X86::GetOppositeBranchCondition(

45938

X86::CondCode(LHS->getConstantOperandVal(0)));

45939

SDLoc DL(N);

45940

return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);

45941

}

45942

45943

/// DAG combine for the XOR node \p N. Tries a sequence of folds in a fixed
/// order and returns the first one that produces a value.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // If this is SSE1 only convert to FXOR to avoid scalarization.
  bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (IsSSE1Only && N->getValueType(0) == MVT::v4i32) {
    SDValue FPLHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
    SDValue FPRHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
    SDValue FPXor =
        DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, FPLHS, FPRHS);
    return DAG.getBitcast(MVT::v4i32, FPXor);
  }

  SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
  if (Cmp)
    return Cmp;

  SDValue R = combineBitOpWithMOVMSK(N, DAG);
  if (R)
    return R;

  // The remaining folds are only attempted once operations are legalized.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue SetCC = foldXor1SetCC(N, DAG);
  if (SetCC)
    return SetCC;

  SDValue RV = foldXorTruncShiftIntoCmp(N, DAG);
  if (RV)
    return RV;

  SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget);
  if (FPLogic)
    return FPLogic;

  return combineFneg(N, DAG, DCI, Subtarget);
}

45975

45976

/// DAG combine for the BEXTR node \p N: try to simplify the node through its
/// demanded bits, with every result bit demanded.
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  unsigned NumBits = VT.getSizeInBits();

  // TODO - Constant Folding.

  // Simplify the inputs.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask = APInt::getAllOnesValue(NumBits);
  if (!TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
    return SDValue();

  return SDValue(N, 0);
}

45992

45993

/// Return true if \p V is a null FP constant or an all-zeros build vector.
static bool isNullFPScalarOrVectorConst(SDValue V) {
  if (isNullFPConstant(V))
    return true;
  return ISD::isBuildVectorAllZeros(V.getNode());
}

45996

45997

/// If a value is a scalar FP zero or a vector FP zero (potentially including

45998

/// undefined elements), return a zero constant that may be used to fold away

45999

/// that value. In the case of a vector, the returned constant will not contain

46000

/// undefined elements even if the input parameter does. This makes it suitable

46001

/// to be used as a replacement operand with operations (eg, bitwise-and) where

46002

/// an undef should not propagate.

46003

static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,

46004

const X86Subtarget &Subtarget) {

46005

if (!isNullFPScalarOrVectorConst(V))

46006

return SDValue();

46007

46008

if (V.getValueType().isVector())

46009

return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

46010

46011

return V;

46012

}

46013

46014

/// Fold an FAND where one operand is an FXOR with all-ones into an FANDN.
/// Applies to f32 with SSE1, f64 with SSE2, and v4f32 on SSE1-only targets.
/// Returns an empty SDValue otherwise.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  bool IsSupportedType =
      (VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::f64 && Subtarget.hasSSE2()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2());
  if (!IsSupportedType)
    return SDValue();

  // True if V is an all-ones build vector or an all-ones scalar FP constant.
  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // True if V is (fxor X, -1).
  auto IsFNot = [&isAllOnesConstantFP](SDValue V) {
    return V.getOpcode() == X86ISD::FXOR &&
           isAllOnesConstantFP(V.getOperand(1));
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (IsFNot(N0))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (IsFNot(N1))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

46044

46045

/// Do target-specific dag combines on X86ISD::FAND nodes.

46046

static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,

46047

const X86Subtarget &Subtarget) {

46048

// FAND(0.0, x) -> 0.0

46049

if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))

46050

return V;

46051

46052

// FAND(x, 0.0) -> 0.0

46053

if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))

46054

return V;

46055

46056

if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))

46057

return V;

46058

46059

return lowerX86FPLogicOp(N, DAG, Subtarget);

46060

}

46061

46062

/// Do target-specific dag combines on X86ISD::FANDN nodes.

46063

static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,

46064

const X86Subtarget &Subtarget) {

46065

// FANDN(0.0, x) -> x

46066

if (isNullFPScalarOrVectorConst(N->getOperand(0)))

46067

return N->getOperand(1);

46068

46069

// FANDN(x, 0.0) -> 0.0

46070

if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))

46071

return V;

46072

46073

return lowerX86FPLogicOp(N, DAG, Subtarget);

46074

}

46075

46076

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.

46077

static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,

46078

TargetLowering::DAGCombinerInfo &DCI,

46079

const X86Subtarget &Subtarget) {

46080

assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR)((N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD
::FXOR) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46080, __PRETTY_FUNCTION__));

46081

46082

// F[X]OR(0.0, x) -> x

46083

if (isNullFPScalarOrVectorConst(N->getOperand(0)))

46084

return N->getOperand(1);

46085

46086

// F[X]OR(x, 0.0) -> x

46087

if (isNullFPScalarOrVectorConst(N->getOperand(1)))

46088

return N->getOperand(0);

46089

46090

if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))

46091

return NewVal;

46092

46093

return lowerX86FPLogicOp(N, DAG, Subtarget);

46094

}

46095

46096

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.

46097

static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {

46098

assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX)((N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD
::FMAX) ? static_cast<void> (0) : __assert_fail ("N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46098, __PRETTY_FUNCTION__));

46099

46100

// FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.

46101

if (!DAG.getTarget().Options.NoNaNsFPMath ||

46102

!DAG.getTarget().Options.NoSignedZerosFPMath)

46103

return SDValue();

46104

46105

// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes

46106

// into FMINC and FMAXC, which are Commutative operations.

46107

unsigned NewOp = 0;

46108

switch (N->getOpcode()) {

46109

default: llvm_unreachable("unknown opcode")::llvm::llvm_unreachable_internal("unknown opcode", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46109);

46110

case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;

46111

case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;

46112

}

46113

46114

return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),

46115

N->getOperand(0), N->getOperand(1));

46116

}

46117

46118

/// Combine ISD::FMAXNUM / ISD::FMINNUM into X86ISD::FMAX / X86ISD::FMIN.
///
/// Any opcode other than ISD::FMAXNUM is mapped to X86ISD::FMIN. The direct
/// translation is used when NaNs can be ignored or one operand is known never
/// to be NaN; otherwise the NaN case is patched up with a compare and select.
/// Returns an empty SDValue for soft-float, unsupported types, or scalar
/// operations in minsize functions that would need the NaN fix-up.
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);

  // Supported: f32 with SSE1, f64 with SSE2, or any legal vector type.
  auto IsSupportedType = [&]() {
    if (Subtarget.hasSSE1() && VT == MVT::f32)
      return true;
    if (Subtarget.hasSSE2() && VT == MVT::f64)
      return true;
    return VT.isVector() && TLI.isTypeLegal(VT);
  };
  if (!IsSupportedType())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  const unsigned MinMaxOp =
      N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

  // If we don't have to respect NaN inputs, this is a direct translation to
  // the x86 min/max instructions. The same operand order also works when Op1
  // is known never to be NaN, because the native instructions want the
  // non-NaN input as the second operand.
  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs() ||
      DAG.isKnownNeverNaN(Op1))
    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());

  // Op0 is the known non-NaN input: swap so it becomes the second operand.
  if (DAG.isKnownNeverNaN(Op0))
    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());

  // If we have to respect NaN inputs, this takes at least 3 instructions.
  // Favor a library call when operating on a scalar and minimizing code size.
  const bool IsScalar = !VT.isVector();
  if (IsScalar && DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  EVT SetCCType =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  // An unordered self-compare is true exactly when Op0 is NaN.
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both
  // operands are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

46182

46183

/// DAG combine for X86-specific vector integer-to-FP conversion nodes (per the
/// function name; the dispatch site is outside this view).
///
/// First runs demanded-vector-elements simplification on the node itself.
/// Then, if the result has fewer elements than the input and the input is a
/// single-use normal load, tries to narrow that load via narrowLoadToVZLoad.
/// Returns SDValue(N, 0) when the DAG was changed, an empty SDValue otherwise.
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // All result elements are demanded at this level.
  APInt KnownUndef, KnownZero;
  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), AllElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);

  // Convert a full vector load into vzload when not all bits are needed.
  SDValue In = N->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  if (VT.getVectorNumElements() >= InVT.getVectorNumElements() ||
      !ISD::isNormalLoad(In.getNode()) || !In.hasOneUse())
    return SDValue();

  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
  LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));

  // Only the low NumBits of the loaded vector feed the conversion.
  const unsigned NumBits =
      InVT.getScalarSizeInBits() * VT.getVectorNumElements();
  MVT MemVT = MVT::getIntegerVT(NumBits);
  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);

  SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG);
  if (!VZLoad)
    return SDValue();

  SDLoc dl(N);
  SDValue NarrowIn = DAG.getBitcast(InVT, VZLoad);
  SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, NarrowIn);
  DCI.CombineTo(N, Convert);
  // Redirect users of the old load's second result to the new load's.
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
  DCI.recursivelyDeleteUnusedNodes(LN);
  return SDValue(N, 0);
}

46217

46218

/// DAG combine for the X86 packed FP-to-integer conversion nodes (per the
/// function name), including their strict variants.
///
/// If the result has fewer elements than the input and the input is a
/// single-use normal load, tries to narrow that load via narrowLoadToVZLoad.
/// Returns SDValue(N, 0) when the DAG was changed, an empty SDValue otherwise.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  const bool IsStrict = N->isTargetStrictFPOpcode();
  EVT VT = N->getValueType(0);

  // For strict nodes the source is operand 1; operand 0 is forwarded
  // unchanged to the replacement node below (presumably the chain).
  SDValue In = N->getOperand(IsStrict ? 1 : 0);
  MVT InVT = In.getSimpleValueType();

  // Convert a full vector load into vzload when not all bits are needed.
  if (VT.getVectorNumElements() >= InVT.getVectorNumElements() ||
      !ISD::isNormalLoad(In.getNode()) || !In.hasOneUse())
    return SDValue();

  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
  LoadSDNode *LN = cast<LoadSDNode>(In);

  // Only the low NumBits of the loaded vector feed the conversion.
  const unsigned NumBits =
      InVT.getScalarSizeInBits() * VT.getVectorNumElements();
  MVT MemVT = MVT::getFloatingPointVT(NumBits);
  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);

  SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG);
  if (!VZLoad)
    return SDValue();

  SDLoc dl(N);
  if (IsStrict) {
    // The strict form yields two results; both replace the old node's.
    SDValue Convert =
        DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
                    {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
    DCI.CombineTo(N, Convert, Convert.getValue(1));
  } else {
    SDValue NarrowIn = DAG.getBitcast(InVT, VZLoad);
    SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, NarrowIn);
    DCI.CombineTo(N, Convert);
  }

  // Redirect users of the old load's second result to the new load's.
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
  DCI.recursivelyDeleteUnusedNodes(LN);
  return SDValue(N, 0);
}

46253

46254

/// Do target-specific dag combines on X86ISD::ANDNP nodes.

46255

static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,

46256

TargetLowering::DAGCombinerInfo &DCI,

46257

const X86Subtarget &Subtarget) {

46258

MVT VT = N->getSimpleValueType(0);

46259

46260

// ANDNP(0, x) -> x

46261

if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))

46262

return N->getOperand(1);

46263

46264

// ANDNP(x, 0) -> 0

46265

if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))

46266

return DAG.getConstant(0, SDLoc(N), VT);

46267

46268

// Turn ANDNP back to AND if input is inverted.

46269

if (SDValue Not = IsNOT(N->getOperand(0), DAG))

46270

return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),

46271

N->getOperand(1));

46272

46273

// Attempt to recursively combine a bitmask ANDNP with shuffles.

46274

if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {

46275

SDValue Op(N, 0);

46276

if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))

46277

return Res;

46278

}

46279

46280

return SDValue();

46281

}

46282

46283

/// DAG combine for bit-test nodes: simplifies the bit-index operand
/// (operand 1) given that only its low log2(width) bits are demanded.
///
/// Returns SDValue(N, 0) if SimplifyDemandedBits changed anything, an empty
/// SDValue otherwise.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue BitIndex = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  const unsigned IndexWidth = BitIndex.getValueSizeInBits();
  APInt LowBits = APInt::getLowBitsSet(IndexWidth, Log2_32(IndexWidth));

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.SimplifyDemandedBits(BitIndex, LowBits, DCI))
    return SDValue();

  // Revisit N unless the simplification deleted it.
  if (N->getOpcode() != ISD::DELETED_NODE)
    DCI.AddToWorklist(N);
  return SDValue(N, 0);
}

46298

46299

/// DAG combine for X86ISD::CVTPH2PS and X86ISD::STRICT_CVTPH2PS.
///
/// Only the v8i16 -> v4f32 form is handled. Just the low four source elements
/// are demanded, so the source is first simplified on that basis; failing
/// that, a single-use normal load feeding the node is narrowed via
/// narrowLoadToVZLoad. Returns SDValue(N, 0) when the DAG was changed and an
/// empty SDValue otherwise.
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI) {
  const bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
  // For the strict opcode the source is operand 1 rather than operand 0.
  const unsigned SrcIdx = IsStrict ? 1 : 0;
  SDValue Src = N->getOperand(SrcIdx);

  if (N->getValueType(0) != MVT::v4f32 || Src.getValueType() != MVT::v8i16)
    return SDValue();

  // Only the low 4 of the 8 source elements contribute to the result.
  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt LowHalf = APInt::getLowBitsSet(8, 4);
  if (TLI.SimplifyDemandedVectorElts(Src, LowHalf, KnownUndef, KnownZero,
                                     DCI)) {
    // Revisit N unless the simplification deleted it.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Convert a full vector load into vzload when not all bits are needed.
  if (!ISD::isNormalLoad(Src.getNode()) || !Src.hasOneUse())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(SrcIdx));
  SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG);
  if (!VZLoad)
    return SDValue();

  SDLoc dl(N);
  if (IsStrict) {
    // The strict form yields two results; both replace the old node's.
    SDValue Convert =
        DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
                    {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
    DCI.CombineTo(N, Convert, Convert.getValue(1));
  } else {
    SDValue NarrowSrc = DAG.getBitcast(MVT::v8i16, VZLoad);
    SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, NarrowSrc);
    DCI.CombineTo(N, Convert);
  }

  // Redirect users of the old load's second result to the new load's.
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
  DCI.recursivelyDeleteUnusedNodes(LN);
  return SDValue(N, 0);
}

46340

46341

// Try to combine sext_in_reg of a cmov of constants by extending the constants.

46342

static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {

46343

assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG)((N->getOpcode() == ISD::SIGN_EXTEND_INREG) ? static_cast<
void> (0) : __assert_fail ("N->getOpcode() == ISD::SIGN_EXTEND_INREG"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46343, __PRETTY_FUNCTION__));

46344

46345

EVT DstVT = N->getValueType(0);

46346

46347

SDValue N0 = N->getOperand(0);

46348

SDValue N1 = N->getOperand(1);

46349

EVT ExtraVT = cast<VTSDNode>(N1)->getVT();

46350

46351

if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)

46352

return SDValue();

46353

46354

// Look through single use any_extends / truncs.

46355

SDValue IntermediateBitwidthOp;

46356

if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&

46357

N0.hasOneUse()) {

46358

IntermediateBitwidthOp = N0;

46359

N0 = N0.getOperand(0);

46360

}

46361

46362

// See if we have a single use cmov.

46363

if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())

46364

return SDValue();

46365

46366

SDValue CMovOp0 = N0.getOperand(0);

46367

SDValue CMovOp1 = N0.getOperand(1);

46368

46369

// Make sure both operands are constants.

46370

if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||

46371

!isa<ConstantSDNode>(CMovOp1.getNode()))

46372

return SDValue();

46373

46374

SDLoc DL(N);

46375

46376

// If we looked through an any_extend/trunc above, add one to the constants.

46377

if (IntermediateBitwidthOp) {

46378

unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();

46379

CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);

46380

CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);

46381

}

46382

46383

CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);

46384

CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);

46385

46386

EVT CMovVT = DstVT;

46387

// We do not want i16 CMOV's. Promote to i32 and truncate afterwards.

46388

if (DstVT == MVT::i16) {

46389

CMovVT = MVT::i32;

46390

CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);

46391

CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);

46392

}

46393

46394

SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,

46395

N0.getOperand(2), N0.getOperand(3));

46396

46397

if (CMovVT != DstVT)

46398

CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);

46399

46400

return CMov;

46401

}

46402

46403

/// DAG combine for ISD::SIGN_EXTEND_INREG.
///
/// First tries combineSextInRegCmov. Otherwise only v4i64 results whose input
/// is an any_extend or sign_extend are handled: mask arithmetic is promoted if
/// possible, else the sext_in_reg is performed on the narrower v4i32 input and
/// sign-extended afterwards.
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

  if (SDValue CMovFold = combineSextInRegCmov(N, DAG))
    return CMovFold;

  EVT VT = N->getValueType(0);
  SDValue Ext = N->getOperand(0);
  SDValue ExtVTOp = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(ExtVTOp)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2,
  // since there is no sign-extending shift right on vectors with 64-bit
  // elements.
  //   (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //     (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  const unsigned ExtOpc = Ext.getOpcode();
  if (VT != MVT::v4i64 ||
      (ExtOpc != ISD::ANY_EXTEND && ExtOpc != ISD::SIGN_EXTEND))
    return SDValue();

  SDValue Narrow = Ext.getOperand(0);

  // EXTLOAD has a better solution on AVX2,
  // it may be replaced with X86ISD::VSEXT node.
  if (Narrow.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
      !ISD::isNormalLoad(Narrow.getNode()))
    return SDValue();

  // Attempt to promote any comparison mask ops before moving the
  // SIGN_EXTEND_INREG in the way.
  SDValue Promoted = PromoteMaskArithmetic(Ext.getNode(), DAG, Subtarget);
  if (Promoted)
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promoted, ExtVTOp);

  if (Narrow.getValueType() != MVT::v4i32 || ExtraVT.getSizeInBits() >= 128)
    return SDValue();

  SDValue NarrowSext =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Narrow, ExtVTOp);
  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, NarrowSext);
}

46444

46445

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)

46446

/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)

46447

/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes

46448

/// opportunities to combine math ops, use an LEA, or use a complex addressing

46449

/// mode. This can eliminate extend, add, and shift instructions.

46450

static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,

46451

const X86Subtarget &Subtarget) {

46452

if (Ext->getOpcode() != ISD::SIGN_EXTEND &&

46453

Ext->getOpcode() != ISD::ZERO_EXTEND)

46454

return SDValue();

46455

46456

// TODO: This should be valid for other integer types.

46457

EVT VT = Ext->getValueType(0);

46458

if (VT != MVT::i64)

46459

return SDValue();

46460

46461

SDValue Add = Ext->getOperand(0);

46462

if (Add.getOpcode() != ISD::ADD)

46463

return SDValue();

46464

46465

bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;

46466

bool NSW = Add->getFlags().hasNoSignedWrap();

46467

bool NUW = Add->getFlags().hasNoUnsignedWrap();

46468

46469

// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding

46470

// into the 'zext'

46471

if ((Sext && !NSW) || (!Sext && !NUW))

46472

return SDValue();

46473

46474

// Having a constant operand to the 'add' ensures that we are not increasing

46475

// the instruction count because the constant is extended for free below.

46476

// A constant operand can also become the displacement field of an LEA.

46477

auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));

46478

if (!AddOp1)

46479

return SDValue();

46480

46481

// Don't make the 'add' bigger if there's no hope of combining it with some

46482

// other 'add' or 'shl' instruction.

46483

// TODO: It may be profitable to generate simpler LEA instructions in place

46484

// of single 'add' instructions, but the cost model for selecting an LEA

46485

// currently has a high threshold.

46486

bool HasLEAPotential = false;

46487

for (auto *User : Ext->uses()) {

46488

if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {

46489

HasLEAPotential = true;

46490

break;

46491

}

46492

}

46493

if (!HasLEAPotential)

46494

return SDValue();

46495

46496

// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.

46497

int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();

46498

SDValue AddOp0 = Add.getOperand(0);

46499

SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);

46500

SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

46501

46502

// The wider add is guaranteed to not wrap because both operands are

46503

// sign-extended.

46504

SDNodeFlags Flags;

46505

Flags.setNoSignedWrap(NSW);

46506

Flags.setNoUnsignedWrap(NUW);

46507

return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);

46508

}

46509

46510

// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant

46511

// operands and the result of CMOV is not used anywhere else - promote CMOV

46512

// itself instead of promoting its result. This could be beneficial, because:

46513

// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two

46514

// (or more) pseudo-CMOVs only when they go one-after-another and

46515

// getting rid of result extension code after CMOV will help that.

46516

// 2) Promotion of constant CMOV arguments is free, hence the

46517

// {ANY,SIGN,ZERO}_EXTEND will just be deleted.

46518

// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this

46519

// promotion is also good in terms of code-size.

46520

// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit

46521

// promotion).

46522

static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {

46523

SDValue CMovN = Extend->getOperand(0);

46524

if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())

46525

return SDValue();

46526

46527

EVT TargetVT = Extend->getValueType(0);

46528

unsigned ExtendOpcode = Extend->getOpcode();

46529

SDLoc DL(Extend);

46530

46531

EVT VT = CMovN.getValueType();

46532

SDValue CMovOp0 = CMovN.getOperand(0);

46533

SDValue CMovOp1 = CMovN.getOperand(1);

46534

46535

if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||

46536

!isa<ConstantSDNode>(CMovOp1.getNode()))

46537

return SDValue();

46538

46539

// Only extend to i32 or i64.

46540

if (TargetVT != MVT::i32 && TargetVT != MVT::i64)

46541

return SDValue();

46542

46543

// Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32

46544

// are free.

46545

if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))

46546

return SDValue();

46547

46548

// If this a zero extend to i64, we should only extend to i32 and use a free

46549

// zero extend to finish.

46550

EVT ExtendVT = TargetVT;

46551

if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)

46552

ExtendVT = MVT::i32;

46553

46554

CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);

46555

CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);

46556

46557

SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,

46558

CMovN.getOperand(2), CMovN.getOperand(3));

46559

46560

// Finish extending if needed.

46561

if (ExtendVT != TargetVT)

46562

Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);

46563

46564

return Res;

46565

}

46566

46567

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).

46568

// This is more or less the reverse of combineBitcastvxi1.

46569

static SDValue

46570

combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,

46571

TargetLowering::DAGCombinerInfo &DCI,

46572

const X86Subtarget &Subtarget) {

46573

unsigned Opcode = N->getOpcode();

46574

if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&

46575

Opcode != ISD::ANY_EXTEND)

46576

return SDValue();

46577

if (!DCI.isBeforeLegalizeOps())

46578

return SDValue();

46579

if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())

46580

return SDValue();

46581

46582

SDValue N0 = N->getOperand(0);

46583

EVT VT = N->getValueType(0);

46584

EVT SVT = VT.getScalarType();

46585

EVT InSVT = N0.getValueType().getScalarType();

46586

unsigned EltSizeInBits = SVT.getSizeInBits();

46587

46588

// Input type must be extending a bool vector (bit-casted from a scalar

46589

// integer) to legal integer types.

46590

if (!VT.isVector())

46591

return SDValue();

46592

if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)

46593

return SDValue();

46594

if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)

46595

return SDValue();

46596

46597

SDValue N00 = N0.getOperand(0);

46598

EVT SclVT = N0.getOperand(0).getValueType();

46599

if (!SclVT.isScalarInteger())

46600

return SDValue();

46601

46602

SDLoc DL(N);

46603

SDValue Vec;

46604

SmallVector<int, 32> ShuffleMask;

46605

unsigned NumElts = VT.getVectorNumElements();

46606

assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size")((NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"
) ? static_cast<void> (0) : __assert_fail ("NumElts == SclVT.getSizeInBits() && \"Unexpected bool vector size\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46606, __PRETTY_FUNCTION__));

46607

46608

// Broadcast the scalar integer to the vector elements.

46609

if (NumElts > EltSizeInBits) {

46610

// If the scalar integer is greater than the vector element size, then we

46611

// must split it down into sub-sections for broadcasting. For example:

46612

// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.

46613

// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.

46614

assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale")(((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"
) ? static_cast<void> (0) : __assert_fail ("(NumElts % EltSizeInBits) == 0 && \"Unexpected integer scale\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46614, __PRETTY_FUNCTION__));

46615

unsigned Scale = NumElts / EltSizeInBits;

46616

EVT BroadcastVT =

46617

EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);

46618

Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);

46619

Vec = DAG.getBitcast(VT, Vec);

46620

46621

for (unsigned i = 0; i != Scale; ++i)

46622

ShuffleMask.append(EltSizeInBits, i);

46623

Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

46624

} else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&

46625

(SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {

46626

// If we have register broadcast instructions, use the scalar size as the

46627

// element type for the shuffle. Then cast to the wider element type. The

46628

// widened bits won't be used, and this might allow the use of a broadcast

46629

// load.

46630

assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale")(((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"
) ? static_cast<void> (0) : __assert_fail ("(EltSizeInBits % NumElts) == 0 && \"Unexpected integer scale\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46630, __PRETTY_FUNCTION__));

46631

unsigned Scale = EltSizeInBits / NumElts;

46632

EVT BroadcastVT =

46633

EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);

46634

Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);

46635

ShuffleMask.append(NumElts * Scale, 0);

46636

Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);

46637

Vec = DAG.getBitcast(VT, Vec);

46638

} else {

46639

// For smaller scalar integers, we can simply any-extend it to the vector

46640

// element size (we don't care about the upper bits) and broadcast it to all

46641

// elements.

46642

SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);

46643

Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);

46644

ShuffleMask.append(NumElts, 0);

46645

Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

46646

}

46647

46648

// Now, mask the relevant bit in each element.

46649

SmallVector<SDValue, 32> Bits;

46650

for (unsigned i = 0; i != NumElts; ++i) {

46651

int BitIdx = (i % EltSizeInBits);

46652

APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);

46653

Bits.push_back(DAG.getConstant(Bit, DL, SVT));

46654

}

46655

SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);

46656

Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

46657

46658

// Compare against the bitmask and extend the result.

46659

EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);

46660

Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);

46661

Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

46662

46663

// For SEXT, this is now done, otherwise shift the result down for

46664

// zero-extension.

46665

if (Opcode == ISD::SIGN_EXTEND)

46666

return Vec;

46667

return DAG.getNode(ISD::SRL, DL, VT, Vec,

46668

DAG.getConstant(EltSizeInBits - 1, DL, VT));

46669

}

46670

46671

// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm

46672

// result type.

46673

static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,

46674

const X86Subtarget &Subtarget) {

46675

SDValue N0 = N->getOperand(0);

46676

EVT VT = N->getValueType(0);

46677

SDLoc dl(N);

46678

46679

// Only do this combine with AVX512 for vector extends.

46680

if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)

46681

return SDValue();

46682

46683

// Only combine legal element types.

46684

EVT SVT = VT.getVectorElementType();

46685

if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&

46686

SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)

46687

return SDValue();

46688

46689

// We can only do this if the vector size in 256 bits or less.

46690

unsigned Size = VT.getSizeInBits();

46691

if (Size > 256 && Subtarget.useAVX512Regs())

46692

return SDValue();

46693

46694

// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since

46695

// that's the only integer compares with we have.

46696

ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();

46697

if (ISD::isUnsignedIntSetCC(CC))

46698

return SDValue();

46699

46700

// Only do this combine if the extension will be fully consumed by the setcc.

46701

EVT N00VT = N0.getOperand(0).getValueType();

46702

EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();

46703

if (Size != MatchingVecType.getSizeInBits())

46704

return SDValue();

46705

46706

SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);

46707

46708

if (N->getOpcode() == ISD::ZERO_EXTEND)

46709

Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());

46710

46711

return Res;

46712

}

46713

46714

/// DAG combine for sign-extension nodes (per the function name; the dispatch
/// site is outside this view).
///
/// After operation legalization only the SETCC_CARRY widening and the
/// extend-of-CMOV promotion are attempted. Before it, the remaining combines
/// are tried in a fixed order: ext-of-setcc, sext-of-not on i1, bool-vector
/// extension in register, mask-arithmetic promotion for vectors, and finally
/// hoisting the extend above a no-wrap add.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  SDLoc DL(N);

  // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
  if (!DCI.isBeforeLegalizeOps() && Src.getOpcode() == X86ISD::SETCC_CARRY) {
    SDValue WideSetcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                    Src->getOperand(0), Src->getOperand(1));
    // Sample the use count before N itself is replaced.
    const bool HasOtherUses = !Src.hasOneUse();
    DCI.CombineTo(N, WideSetcc);
    // Replace other uses with a truncate of the widened setcc_carry.
    if (HasOtherUses) {
      SDValue Narrowed =
          DAG.getNode(ISD::TRUNCATE, SDLoc(Src), Src.getValueType(), WideSetcc);
      DCI.CombineTo(Src.getNode(), Narrowed);
    }

    return SDValue(N, 0);
  }

  if (SDValue WideCMov = combineToExtendCMOV(N, DAG))
    return WideCMov;

  // Everything below is restricted to the pre-legalize-ops phase.
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue WideSetCC = combineExtSetcc(N, DAG, Subtarget))
    return WideSetCC;

  const bool IsNotOfBool = SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
                           isAllOnesConstant(Src.getOperand(1)) &&
                           Src.hasOneUse();
  if (IsNotOfBool) {
    // Invert and sign-extend a boolean is the same as zero-extend and subtract
    // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
    // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src.getOperand(0));
    SDValue One = DAG.getConstant(1, DL, VT);
    return DAG.getNode(ISD::SUB, DL, VT, Zext, One);
  }

  if (SDValue InReg = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return InReg;

  if (VT.isVector()) {
    SDValue Promoted = PromoteMaskArithmetic(N, DAG, Subtarget);
    if (Promoted)
      return Promoted;
  }

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

46770

46771

/// Combine an FMA-family node \p N by folding operand negations into the
/// opcode.
///
/// Returns an empty SDValue when the result type is not legal yet, when the
/// scalar type is neither f32 nor f64, when the subtarget reports no FMA
/// support, or when none of the three arithmetic operands has a cheaper
/// negated form. For non-strict nodes that allow reassociation and for which
/// ISD::FMA is marked Expand, the node is split into FMUL + FADD instead.
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();

  // Let legalize expand this if it isn't a legal type yet.
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // For strict nodes operand 0 is passed through as the first operand of the
  // rebuilt node, so the arithmetic operands start at index 1.
  const unsigned FirstArg = IsStrict ? 1 : 0;
  SDValue A = N->getOperand(FirstArg);
  SDValue B = N->getOperand(FirstArg + 1);
  SDValue C = N->getOperand(FirstArg + 2);

  // If the operation allows fast-math and the target does not support FMA,
  // split this into mul+add to avoid libcall(s).
  SDNodeFlags Flags = N->getFlags();
  if (!IsStrict && Flags.hasAllowReassociation() &&
      TLI.isOperationExpand(ISD::FMA, VT)) {
    SDValue Product = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
    return DAG.getNode(ISD::FADD, dl, VT, Product, C, Flags);
  }

  EVT ScalarVT = VT.getScalarType();
  const bool IsF32OrF64 = ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
    return SDValue();

  const bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
  const bool AfterLegalizeOps = !DCI.isBeforeLegalizeOps();

  // Replaces V with its negated form when one is cheaper and reports whether
  // it did so.
  auto FoldNegation = [&](SDValue &V) -> bool {
    if (SDValue Negated = TLI.getCheaperNegatedExpression(
            V, DAG, AfterLegalizeOps, OptForSize)) {
      V = Negated;
      return true;
    }

    // Look through an extract of element 0: if the source vector has a
    // cheaper negated form, extract from that instead.
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isNullConstant(V.getOperand(1)))
      return false;

    SDValue NegatedVec = TLI.getCheaperNegatedExpression(
        V.getOperand(0), DAG, AfterLegalizeOps, OptForSize);
    if (!NegatedVec)
      return false;

    V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
                    NegatedVec, V.getOperand(1));
    return true;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  const bool NegA = FoldNegation(A);
  const bool NegB = FoldNegation(B);
  const bool NegC = FoldNegation(C);
  if (!(NegA || NegB || NegC))
    return SDValue();

  // Negating both multiplicands cancels out, so only a mismatch negates the
  // product.
  const bool NegateProduct = NegA != NegB;
  unsigned NewOpcode =
      negateFMAOpcode(N->getOpcode(), NegateProduct, NegC, false);

  if (IsStrict) {
    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
                       {N->getOperand(0), A, B, C});
  }

  // Non-strict nodes may carry a fourth operand that is forwarded untouched.
  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}

46846

46847

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)

46848

// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)

46849

static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,

46850

TargetLowering::DAGCombinerInfo &DCI) {

46851

SDLoc dl(N);

46852

EVT VT = N->getValueType(0);

46853

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

46854

bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();

46855

bool LegalOperations = !DCI.isBeforeLegalizeOps();

46856

46857

SDValue N2 = N->getOperand(2);

46858

46859

SDValue NegN2 =

46860

TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);

46861

if (!NegN2)

46862

return SDValue();

46863

unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);

46864

46865

if (N->getNumOperands() == 4)

46866

return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),

46867

NegN2, N->getOperand(3));

46868

return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),

46869

NegN2);

46870

}

46871

46872

/// DAG combine for the extension node \p N.
///
/// Tries, in order: widening an X86ISD::SETCC_CARRY operand (ISD::ANY_EXTEND
/// only, after operation legalization), the shared CMOV / setcc / bool-vector /
/// mask-arithmetic / add-promotion / or-cmp-eq-zero helpers, and finally
/// replacing an extension of a 128-bit PACKUS whose inputs already have zero
/// upper halves with a concatenation of those inputs.
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
  // FIXME: Is this needed? We don't seem to have any tests for it.
  const bool IsAnyExtOfSetccCarry = !DCI.isBeforeLegalizeOps() &&
                                    N->getOpcode() == ISD::ANY_EXTEND &&
                                    Src.getOpcode() == X86ISD::SETCC_CARRY;
  if (IsAnyExtOfSetccCarry) {
    SDValue WideSetcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                    Src->getOperand(0), Src->getOperand(1));
    // Sample the use count before N is replaced.
    const bool HasOtherUsers = !Src.hasOneUse();
    DCI.CombineTo(N, WideSetcc);

    // Replace other uses with a truncate of the widened setcc_carry.
    if (HasOtherUsers) {
      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, SDLoc(Src),
                                     Src.getValueType(), WideSetcc);
      DCI.CombineTo(Src.getNode(), Narrowed);
    }

    return SDValue(N, 0);
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (DCI.isBeforeLegalizeOps()) {
    if (SDValue ExtSetcc = combineExtSetcc(N, DAG, Subtarget))
      return ExtSetcc;
  }

  if (SDValue BoolVec = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return BoolVec;

  if (VT.isVector()) {
    if (SDValue Promoted = PromoteMaskArithmetic(N, DAG, Subtarget))
      return Promoted;
  }

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue Ctlz = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return Ctlz;

  // TODO: Combine with any target/faux shuffle.
  if (Src.getOpcode() == X86ISD::PACKUS && Src.getValueSizeInBits() == 128) {
    SDValue Lo = Src.getOperand(0);
    SDValue Hi = Src.getOperand(1);
    unsigned SrcEltBits = Lo.getScalarValueSizeInBits();
    if (VT.getScalarSizeInBits() == SrcEltBits) {
      // The upper half of every source element must be known zero (or the
      // input must be undef) for the pack + extend to round-trip.
      APInt UpperHalf = APInt::getHighBitsSet(SrcEltBits, SrcEltBits / 2);
      const bool LoIsClean =
          Lo.isUndef() || DAG.MaskedValueIsZero(Lo, UpperHalf);
      if (LoIsClean &&
          (Hi.isUndef() || DAG.MaskedValueIsZero(Hi, UpperHalf)))
        return concatSubVectors(Lo, Hi, DAG, dl);
    }
  }

  return SDValue();
}

46932

46933

/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a

46934

/// recognizable memcmp expansion.

46935

static bool isOrXorXorTree(SDValue X, bool Root = true) {

46936

if (X.getOpcode() == ISD::OR)

46937

return isOrXorXorTree(X.getOperand(0), false) &&

46938

isOrXorXorTree(X.getOperand(1), false);

46939

if (Root)

46940

return false;

46941

return X.getOpcode() == ISD::XOR;

46942

}

46943

46944

/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp

46945

/// expansion.

46946

template<typename F>

46947

static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,

46948

EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {

46949

SDValue Op0 = X.getOperand(0);

46950

SDValue Op1 = X.getOperand(1);

46951

if (X.getOpcode() == ISD::OR) {

46952

SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);

46953

SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);

46954

if (VecVT != CmpVT)

46955

return DAG.getNode(ISD::OR, DL, CmpVT, A, B);

46956

if (HasPT)

46957

return DAG.getNode(ISD::OR, DL, VecVT, A, B);

46958

return DAG.getNode(ISD::AND, DL, CmpVT, A, B);

46959

} else if (X.getOpcode() == ISD::XOR) {

46960

SDValue A = SToV(Op0);

46961

SDValue B = SToV(Op1);

46962

if (VecVT != CmpVT)

46963

return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);

46964

if (HasPT)

46965

return DAG.getNode(ISD::XOR, DL, VecVT, A, B);

46966

return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);

46967

}

46968

llvm_unreachable("Impossible")::llvm::llvm_unreachable_internal("Impossible", "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46968);

46969

}

46970

46971

/// Try to map a 128-bit or larger integer comparison to vector instructions

46972

/// before type legalization splits it up into chunks.

46973

static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,

46974

const X86Subtarget &Subtarget) {

46975

ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();

46976

assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate")(((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"
) ? static_cast<void> (0) : __assert_fail ("(CC == ISD::SETNE || CC == ISD::SETEQ) && \"Bad comparison predicate\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46976, __PRETTY_FUNCTION__));

46977

46978

// We're looking for an oversized integer equality comparison.

46979

SDValue X = SetCC->getOperand(0);

46980

SDValue Y = SetCC->getOperand(1);

46981

EVT OpVT = X.getValueType();

46982

unsigned OpSize = OpVT.getSizeInBits();

46983

if (!OpVT.isScalarInteger() || OpSize < 128)

46984

return SDValue();

46985

46986

// Ignore a comparison with zero because that gets special treatment in

46987

// EmitTest(). But make an exception for the special case of a pair of

46988

// logically-combined vector-sized operands compared to zero. This pattern may

46989

// be generated by the memcmp expansion pass with oversized integer compares

46990

// (see PR33325).

46991

bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);

46992

if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)

46993

return SDValue();

46994

46995

// Don't perform this combine if constructing the vector will be expensive.

46996

auto IsVectorBitCastCheap = [](SDValue X) {

46997

X = peekThroughBitcasts(X);

46998

return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||

46999

X.getOpcode() == ISD::LOAD;

47000

};

47001

if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&

47002

!IsOrXorXorTreeCCZero)

47003

return SDValue();

47004

47005

EVT VT = SetCC->getValueType(0);

47006

SDLoc DL(SetCC);

47007

47008

// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.

47009

// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.

47010

// Otherwise use PCMPEQ (plus AND) and mask testing.

47011

if ((OpSize == 128 && Subtarget.hasSSE2()) ||

47012

(OpSize == 256 && Subtarget.hasAVX()) ||

47013

(OpSize == 512 && Subtarget.useAVX512Regs())) {

47014

bool HasPT = Subtarget.hasSSE41();

47015

47016

// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened

47017

// vector registers are essentially free. (Technically, widening registers

47018

// prevents load folding, but the tradeoff is worth it.)

47019

bool PreferKOT = Subtarget.preferMaskRegisters();

47020

bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;

47021

47022

EVT VecVT = MVT::v16i8;

47023

EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;

47024

if (OpSize == 256) {

47025

VecVT = MVT::v32i8;

47026

CmpVT = PreferKOT ? MVT::v32i1 : VecVT;

47027

}

47028

EVT CastVT = VecVT;

47029

bool NeedsAVX512FCast = false;

47030

if (OpSize == 512 || NeedZExt) {

47031

if (Subtarget.hasBWI()) {

47032

VecVT = MVT::v64i8;

47033

CmpVT = MVT::v64i1;

47034

if (OpSize == 512)

47035

CastVT = VecVT;

47036

} else {

47037

VecVT = MVT::v16i32;

47038

CmpVT = MVT::v16i1;

47039

CastVT = OpSize == 512 ? VecVT :

47040

OpSize == 256 ? MVT::v8i32 : MVT::v4i32;

47041

NeedsAVX512FCast = true;

47042

}

47043

}

47044

47045

auto ScalarToVector = [&](SDValue X) -> SDValue {

47046

bool TmpZext = false;

47047

EVT TmpCastVT = CastVT;

47048

if (X.getOpcode() == ISD::ZERO_EXTEND) {

47049

SDValue OrigX = X.getOperand(0);

47050

unsigned OrigSize = OrigX.getScalarValueSizeInBits();

47051

if (OrigSize < OpSize) {

47052

if (OrigSize == 128) {

47053

TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;

47054

X = OrigX;

47055

TmpZext = true;

47056

} else if (OrigSize == 256) {

47057

TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;

47058

X = OrigX;

47059

TmpZext = true;

47060

}

47061

}

47062

}

47063

X = DAG.getBitcast(TmpCastVT, X);

47064

if (!NeedZExt && !TmpZext)

47065

return X;

47066

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,

47067

DAG.getConstant(0, DL, VecVT), X,

47068

DAG.getVectorIdxConstant(0, DL));

47069

};

47070

47071

SDValue Cmp;

47072

if (IsOrXorXorTreeCCZero) {

47073

// This is a bitwise-combined equality comparison of 2 pairs of vectors:

47074

// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne

47075

// Use 2 vector equality compares and 'and' the results before doing a

47076

// MOVMSK.

47077

Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);

47078

} else {

47079

SDValue VecX = ScalarToVector(X);

47080

SDValue VecY = ScalarToVector(Y);

47081

if (VecVT != CmpVT) {

47082

Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);

47083

} else if (HasPT) {

47084

Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);

47085

} else {

47086

Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);

47087

}

47088

}

47089

// AVX512 should emit a setcc that will lower to kortest.

47090

if (VecVT != CmpVT) {

47091

EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :

47092

CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;

47093

return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),

47094

DAG.getConstant(0, DL, KRegVT), CC);

47095

}

47096

if (HasPT) {

47097

SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,

47098

Cmp);

47099

SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);

47100

X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;

47101

SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);

47102

return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));

47103

}

47104

// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.

47105

// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq

47106

// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne

47107

assert(Cmp.getValueType() == MVT::v16i8 &&((Cmp.getValueType() == MVT::v16i8 && "Non 128-bit vector on pre-SSE41 target"
) ? static_cast<void> (0) : __assert_fail ("Cmp.getValueType() == MVT::v16i8 && \"Non 128-bit vector on pre-SSE41 target\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 47108, __PRETTY_FUNCTION__))

47108

"Non 128-bit vector on pre-SSE41 target")((Cmp.getValueType() == MVT::v16i8 && "Non 128-bit vector on pre-SSE41 target"
) ? static_cast<void> (0) : __assert_fail ("Cmp.getValueType() == MVT::v16i8 && \"Non 128-bit vector on pre-SSE41 target\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 47108, __PRETTY_FUNCTION__));

47109

SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);

47110

SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);

47111

return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);

47112

}

47113

47114

return SDValue();

47115

}

47116

47117

/// DAG combine for an ISD::SETCC node \p N.
///
/// Handles, in order: oversized integer equality compares and all-zero vector
/// tests (EQ/NE only), compares of a sign-extended vXi1 against an all-zeros
/// build_vector, pre-promotion of vXi1 results for vXi8/vXi16 operands on
/// AVX512 without BWI, and early lowering of v4f32 compares on SSE1-only
/// targets.
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  const SDValue LHS = N->getOperand(0);
  const SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT OpVT = LHS.getValueType();
  SDLoc DL(N);

  const bool IsEquality = CC == ISD::SETNE || CC == ISD::SETEQ;
  const bool IsVXi1Result =
      VT.isVector() && VT.getVectorElementType() == MVT::i1;

  if (IsEquality) {
    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;

    if (VT == MVT::i1 && isNullConstant(RHS)) {
      SDValue X86CC;
      if (SDValue Flags =
              MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC)) {
        SDValue Setcc = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, Flags);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
      }
    }
  }

  if (IsVXi1Result && (IsEquality || ISD::isSignedIntSetCC(CC))) {
    // Using temporaries to avoid messing up operand ordering for later
    // transformations if this doesn't work.
    SDValue Op0 = LHS;
    SDValue Op1 = RHS;
    ISD::CondCode TmpCC = CC;
    // Put build_vector on the right.
    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(Op0, Op1);
      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
    }

    const bool IsSEXT0 =
        Op0.getOpcode() == ISD::SIGN_EXTEND &&
        Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1;
    const bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());

    if (IsSEXT0 && IsVZero1) {
      // Each lane of sext(i1) is 0 or -1, so the compare against zero folds
      // to a constant, to the i1 source, or to its inverse.
      assert(VT == Op0.getOperand(0).getValueType() &&
             "Unexpected operand type");
      switch (TmpCC) {
      case ISD::SETGT:
        return DAG.getConstant(0, DL, VT);
      case ISD::SETLE:
        return DAG.getConstant(1, DL, VT);
      case ISD::SETEQ:
      case ISD::SETGE:
        return DAG.getNOT(DL, Op0.getOperand(0), VT);
      default:
        assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
               "Unexpected condition code!");
        return Op0.getOperand(0);
      }
    }
  }

  // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
  // pre-promote its result type since vXi1 vectors don't get promoted
  // during type legalization.
  // NOTE: The element count check is to ignore operand types that need to
  // go through type promotion to a 128-bit vector.
  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && IsVXi1Result) {
    EVT OpEltVT = OpVT.getVectorElementType();
    if (OpEltVT == MVT::i8 || OpEltVT == MVT::i16) {
      SDValue WideSetcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, WideSetcc);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (IsSSE1Only && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

47194

47195

/// DAG combine for an X86ISD::MOVMSK node \p N.
///
/// Constant-folds a build_vector of constants, looks through bitcasts that
/// keep the element width (SSE2 and later), rewrites movmsk(not(x)) as
/// not(movmsk(x)), and finally lets SimplifyDemandedBits work on the node.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = N->getSimpleValueType(0);
  const unsigned NumBits = VT.getScalarSizeInBits();
  const unsigned NumElts = SrcVT.getVectorNumElements();

  // Perform constant folding: result bit i is set when element i is negative.
  if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
    assert(VT == MVT::i32 && "Unexpected result type");
    APInt Imm(32, 0);
    const unsigned NumOps = Src.getNumOperands();
    for (unsigned Idx = 0; Idx < NumOps; ++Idx) {
      // Undef elements leave their bit cleared.
      if (Src.getOperand(Idx).isUndef())
        continue;
      if (Src.getConstantOperandAPInt(Idx).isNegative())
        Imm.setBit(Idx);
    }
    return DAG.getConstant(Imm, SDLoc(N), VT);
  }

  // Look through int->fp bitcasts that don't change the element width.
  const unsigned EltWidth = SrcVT.getScalarSizeInBits();
  if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST) {
    SDValue Inner = Src.getOperand(0);
    if (Inner.getScalarValueSizeInBits() == EltWidth)
      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Inner);
  }

  // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
  // with scalar comparisons.
  if (SDValue NotSrc = IsNOT(Src, DAG)) {
    SDLoc DL(N);
    // Only the low NumElts bits of the result are inverted.
    APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
    NotSrc = DAG.getBitcast(SrcVT, NotSrc);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc);
    SDValue MaskC = DAG.getConstant(NotMask, DL, VT);
    return DAG.getNode(ISD::XOR, DL, VT, MovMsk, MaskC);
  }

  // Simplify the inputs.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask(APInt::getAllOnesValue(NumBits));
  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

47241

47242

/// DAG combine for an X86 masked gather/scatter node \p N: when the mask
/// elements are wider than one bit, only their sign bit is demanded, so try
/// to simplify the mask on that basis.
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
  const unsigned MaskEltBits = Mask.getScalarValueSizeInBits();

  // i1 masks have nothing to simplify.
  if (MaskEltBits == 1)
    return SDValue();

  // With vector masks we only demand the upper bit of the mask.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask(APInt::getSignMask(MaskEltBits));
  if (!TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
    return SDValue();

  // Revisit N unless the simplification deleted it.
  if (N->getOpcode() != ISD::DELETED_NODE)
    DCI.AddToWorklist(N);
  return SDValue(N, 0);
}

47258

47259

/// Create a copy of the masked gather or scatter \p GorS that uses the given
/// \p Index, \p Base and \p Scale. The chain, pass-through/stored value, mask,
/// memory VT, memory operand and index type are taken from \p GorS.
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
                                    SDValue Index, SDValue Base, SDValue Scale,
                                    SelectionDAG &DAG) {
  SDLoc DL(GorS);

  auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS);
  if (!Gather) {
    // Not a gather, so it has to be a scatter.
    auto *Scatter = cast<MaskedScatterSDNode>(GorS);
    SDValue ScatterOps[] = {Scatter->getChain(), Scatter->getValue(),
                            Scatter->getMask(),  Base,
                            Index,               Scale};
    return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(),
                                DL, ScatterOps, Scatter->getMemOperand(),
                                Scatter->getIndexType());
  }

  SDValue GatherOps[] = {Gather->getChain(), Gather->getPassThru(),
                         Gather->getMask(),  Base,
                         Index,              Scale};
  return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL,
                             GatherOps, Gather->getMemOperand(),
                             Gather->getIndexType());
}

47280

47281

/// DAG combine for a generic masked gather/scatter node \p N.
///
/// Before type legalization, 64-bit-or-wider indices that provably fit in 32
/// bits are truncated to i32 elements. Before operation legalization, indices
/// that are neither 32 nor 64 bits wide are sign-extended or truncated to one
/// of those widths. Finally, for masks wider than i1, only the sign bit is
/// demanded and the mask is simplified accordingly.
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDLoc DL(N);
  auto *GorS = cast<MaskedGatherScatterSDNode>(N);
  SDValue Index = GorS->getIndex();
  SDValue Base = GorS->getBasePtr();
  SDValue Scale = GorS->getScale();
  const unsigned IndexWidth = Index.getScalarValueSizeInBits();

  // Rebuilds the node with the index truncated to i32 elements.
  auto RebuildWithI32Index = [&]() -> SDValue {
    unsigned NumElts = Index.getValueType().getVectorNumElements();
    EVT NarrowVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
    SDValue NarrowIndex = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Index);
    return rebuildGatherScatter(GorS, NarrowIndex, Base, Scale, DAG);
  };

  if (DCI.isBeforeLegalize()) {
    // Shrink constant indices if they are larger than 32-bits.
    // Only do this before legalize types since v2i64 could become v2i32.
    // FIXME: We could check that the type is legal if we're after legalize
    // types, but then we would need to construct test cases where that happens.
    // FIXME: We could support more than just constant vectors, but we need to
    // careful with costing. A truncate that can be optimized out would be fine.
    // Otherwise we might only want to create a truncate if it avoids a split.
    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
      if (BV->isConstant() && IndexWidth > 32 &&
          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32))
        return RebuildWithI32Index();
    }

    // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
    // there are sufficient sign bits. Only do this before legalize types to
    // avoid creating illegal types in truncate.
    const unsigned IndexOpc = Index.getOpcode();
    const bool IsExtend =
        IndexOpc == ISD::SIGN_EXTEND || IndexOpc == ISD::ZERO_EXTEND;
    if (IsExtend && IndexWidth > 32 &&
        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32))
      return RebuildWithI32Index();
  }

  if (DCI.isBeforeLegalizeOps()) {
    // Make sure the index is either i32 or i64
    const bool IsSupportedWidth = IndexWidth == 32 || IndexWidth == 64;
    if (!IsSupportedWidth) {
      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
      unsigned NumElts = Index.getValueType().getVectorNumElements();
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
    }
  }

  // With vector masks we only demand the upper bit of the mask.
  SDValue Mask = GorS->getMask();
  const unsigned MaskEltBits = Mask.getScalarValueSizeInBits();
  if (MaskEltBits != 1) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    APInt DemandedMask(APInt::getSignMask(MaskEltBits));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
      // Revisit N unless the simplification deleted it.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

47351

47352

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT

47353

static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,

47354

const X86Subtarget &Subtarget) {

47355

SDLoc DL(N);

47356

X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));

47357

SDValue EFLAGS = N->getOperand(1);

47358

47359

// Try to simplify the EFLAGS and condition code operands.

47360

if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))

47361

return getSETCC(CC, Flags, DL, DAG);

47362

47363

return SDValue();

47364

}

47365

47366

/// Optimize branch condition evaluation.

47367

static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,

47368

const X86Subtarget &Subtarget) {

47369

SDLoc DL(N);

47370

SDValue EFLAGS = N->getOperand(3);

47371

X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

47372

47373

// Try to simplify the EFLAGS and condition code operands.

47374

// Make sure to not keep references to operands, as combineSetCCEFLAGS can

47375

// RAUW them under us.

47376

if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {

47377

SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);

47378

return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),

47379

N->getOperand(1), Cond, Flags);

47380

}

47381

47382

return SDValue();

47383

}

47384

47385

// TODO: Could we move this to DAGCombine?

47386

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,

47387

SelectionDAG &DAG) {

47388

// Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane

47389

// to optimize away operation when it's from a constant.

47390

//

47391

// The general transformation is:

47392

// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->

47393

// AND(VECTOR_CMP(x,y), constant2)

47394

// constant2 = UNARYOP(constant)

47395

47396

// Early exit if this isn't a vector operation, the operand of the

47397

// unary operation isn't a bitwise AND, or if the sizes of the operations

47398

// aren't the same.

47399

EVT VT = N->getValueType(0);

47400

bool IsStrict = N->isStrictFPOpcode();

47401

unsigned NumEltBits = VT.getScalarSizeInBits();

47402

SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);

47403

if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||

47404

DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||

47405

VT.getSizeInBits() != Op0.getValueSizeInBits())

47406

return SDValue();

47407

47408

// Now check that the other operand of the AND is a constant. We could

47409

// make the transformation for non-constant splats as well, but it's unclear

47410

// that would be a benefit as it would not eliminate any operations, just

47411

// perform one more step in scalar code before moving to the vector unit.

47412

if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {

47413

// Bail out if the vector isn't a constant.

47414

if (!BV->isConstant())

47415

return SDValue();

47416

47417

// Everything checks out. Build up the new and improved node.

47418

SDLoc DL(N);

47419

EVT IntVT = BV->getValueType(0);

47420

// Create a new constant of the appropriate type for the transformed

47421

// DAG.

47422

SDValue SourceConst;

47423

if (IsStrict)

47424

SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},

47425

{N->getOperand(0), SDValue(BV, 0)});

47426

else

47427

SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));

47428

// The AND node needs bitcasts to/from an integer vector type around it.

47429

SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);

47430

SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),

47431

MaskConst);

47432

SDValue Res = DAG.getBitcast(VT, NewAnd);

47433

if (IsStrict)

47434

return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);

47435

return Res;

47436

}

47437

47438

return SDValue();

47439

}

47440

47441

/// If we are converting a value to floating-point, try to replace scalar

47442

/// truncate of an extracted vector element with a bitcast. This tries to keep

47443

/// the sequence on XMM registers rather than moving between vector and GPRs.

47444

static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {

47445

// TODO: This is currently only used by combineSIntToFP, but it is generalized

47446

// to allow being called by any similar cast opcode.

47447

// TODO: Consider merging this into lowering: vectorizeExtractedCast().

47448

SDValue Trunc = N->getOperand(0);

47449

if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)

47450

return SDValue();

47451

47452

SDValue ExtElt = Trunc.getOperand(0);

47453

if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

47454

!isNullConstant(ExtElt.getOperand(1)))

47455

return SDValue();

47456

47457

EVT TruncVT = Trunc.getValueType();

47458

EVT SrcVT = ExtElt.getValueType();

47459

unsigned DestWidth = TruncVT.getSizeInBits();

47460

unsigned SrcWidth = SrcVT.getSizeInBits();

47461

if (SrcWidth % DestWidth != 0)

47462

return SDValue();

47463

47464

// inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)

47465

EVT SrcVecVT = ExtElt.getOperand(0).getValueType();

47466

unsigned VecWidth = SrcVecVT.getSizeInBits();

47467

unsigned NumElts = VecWidth / DestWidth;

47468

EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);

47469

SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));

47470

SDLoc DL(N);

47471

SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,

47472

BitcastVec, ExtElt.getOperand(1));

47473

return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);

47474

}

47475

47476

/// DAG combine for (STRICT_)UINT_TO_FP nodes.
///
/// Two rewrites are attempted, both of which replace the unsigned conversion
/// with a signed one:
///  * vector inputs whose elements are narrower than 32 bits are
///    zero-extended to vXi32 first;
///  * inputs whose sign bit is known to be zero are converted directly.
///
/// \param N         The conversion node (strict or non-strict form).
/// \param DAG       The DAG used to create replacement nodes.
/// \param Subtarget Not consulted by this function.
/// \returns The replacement node, or an empty SDValue if nothing applies.
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  const bool Strict = N->isStrictFPOpcode();
  // Strict nodes carry the chain in operand 0, so the value is operand 1.
  SDValue Src = N->getOperand(Strict ? 1 : 0);
  EVT ResVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  SDLoc DL(N);

  // Emit the signed conversion of Input, preserving strictness and chain.
  auto BuildSignedConvert = [&](SDValue Input) -> SDValue {
    if (!Strict)
      return DAG.getNode(ISD::SINT_TO_FP, DL, ResVT, Input);
    return DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {ResVT, MVT::Other},
                       {N->getOperand(0), Input});
  };

  // UINT_TO_FP(vXi1)  -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
  // UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (SrcVT.isVector() && SrcVT.getScalarSizeInBits() < 32) {
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Src);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return BuildSignedConvert(Widened);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Src))
    return BuildSignedConvert(Src);

  return SDValue();
}

47511

47512

/// DAG combine for (STRICT_)SINT_TO_FP nodes.
///
/// Tried in order:
///  1. combineVectorCompareAndMaskUnaryOp on the node.
///  2. Sign-extend sub-32-bit vector inputs to vXi32 before converting.
///  3. Without AVX512DQ, truncate >32-bit inputs to i32 when enough sign bits
///     are known (using a shuffle + CVTSI2P for v2i64 after legalization).
///  4. Turn a simple, single-use i64 load on a 32-bit x87 target into a FILD.
///  5. For non-strict nodes only, combineToFPTruncExtElt.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  const bool Strict = N->isStrictFPOpcode();

  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Folded;

  // Now move on to more general possibilities.
  SDValue Src = N->getOperand(Strict ? 1 : 0);
  EVT ResVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  SDLoc DL(N);

  // Emit a signed conversion of Input, preserving strictness and chain.
  auto BuildSignedConvert = [&](SDValue Input) -> SDValue {
    if (!Strict)
      return DAG.getNode(ISD::SINT_TO_FP, DL, ResVT, Input);
    return DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {ResVT, MVT::Other},
                       {N->getOperand(0), Input});
  };

  // SINT_TO_FP(vXi1)  -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8)  -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (SrcVT.isVector() && SrcVT.getScalarSizeInBits() < 32) {
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, Src);
    return BuildSignedConvert(Widened);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (SrcVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned EltBits = SrcVT.getScalarSizeInBits();
    unsigned KnownSignBits = DAG.ComputeNumSignBits(Src);
    if (KnownSignBits >= (EltBits - 31)) {
      EVT NarrowVT =
          SrcVT.isVector()
              ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 SrcVT.getVectorNumElements())
              : EVT(MVT::i32);

      if (DCI.isBeforeLegalize() || NarrowVT != MVT::v2i32) {
        SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Src);
        return BuildSignedConvert(Narrowed);
      }

      // If we're after legalize and the type is v2i32 we need to shuffle and
      // use CVTSI2P.
      assert(SrcVT == MVT::v2i64 && "Unexpected VT!");
      SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, Src);
      SDValue LowHalves = DAG.getVectorShuffle(MVT::v4i32, DL, AsV4I32,
                                               AsV4I32, {0, 2, -1, -1});
      if (Strict)
        return DAG.getNode(X86ISD::STRICT_CVTSI2P, DL, {ResVT, MVT::Other},
                           {N->getOperand(0), LowHalves});
      return DAG.getNode(X86ISD::CVTSI2P, DL, ResVT, LowHalves);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  const bool X87Usable = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  if (X87Usable && Src.getOpcode() == ISD::LOAD) {
    LoadSDNode *Load = cast<LoadSDNode>(Src.getNode());

    // This transformation is not supported if the result type is f16 or f128.
    if (ResVT == MVT::f16 || ResVT == MVT::f128)
      return SDValue();

    // If we have AVX512DQ we can use packed conversion instructions unless
    // the VT is f80.
    if (Subtarget.hasDQI() && ResVT != MVT::f80)
      return SDValue();

    const bool IsPlainSingleUseLoad = Load->isSimple() &&
                                      ISD::isNormalLoad(Src.getNode()) &&
                                      Src.hasOneUse();
    if (IsPlainSingleUseLoad && !ResVT.isVector() && !Subtarget.is64Bit() &&
        SrcVT == MVT::i64) {
      std::pair<SDValue, SDValue> FILD =
          Subtarget.getTargetLowering()->BuildFILD(
              ResVT, SrcVT, DL, Load->getChain(), Load->getBasePtr(),
              Load->getPointerInfo(), Load->getOriginalAlign(), DAG);
      // Redirect users of the load's chain to the chain produced by the FILD.
      DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILD.second);
      return FILD.first;
    }
  }

  // The remaining fold is only attempted for non-strict nodes.
  return Strict ? SDValue() : combineToFPTruncExtElt(N, DAG);
}

47606

47607

/// Return true if some user of the node producing \p Flags might read the
/// carry or overflow flag.
///
/// SETCC, SETCC_CARRY, BRCOND and CMOV users are inspected through their
/// condition code; any other kind of user is treated conservatively as
/// needing the flags.
static bool needCarryOrOverflowFlag(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode *User : Flags->uses()) {
    // Find which operand of this user holds the condition code.
    unsigned CCOpNo;
    switch (User->getOpcode()) {
    case X86ISD::SETCC:
    case X86ISD::SETCC_CARRY:
      CCOpNo = 0;
      break;
    case X86ISD::BRCOND:
    case X86ISD::CMOV:
      CCOpNo = 2;
      break;
    default:
      // Be conservative.
      return true;
    }

    // These condition codes are the ones treated as reading CF or OF.
    switch (static_cast<X86::CondCode>(User->getConstantOperandVal(CCOpNo))) {
    case X86::COND_A:
    case X86::COND_AE:
    case X86::COND_B:
    case X86::COND_BE:
    case X86::COND_O:
    case X86::COND_NO:
    case X86::COND_G:
    case X86::COND_GE:
    case X86::COND_L:
    case X86::COND_LE:
      return true;
    default:
      break;
    }
  }

  return false;
}

47644

47645

/// Return true if every user of the node producing \p Flags is a SETCC,
/// SETCC_CARRY, BRCOND or CMOV whose condition code is COND_E or COND_NE.
/// Any other user, or any other condition code, yields false.
static bool onlyZeroFlagUsed(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode *User : Flags->uses()) {
    // Find which operand of this user holds the condition code.
    unsigned CCOpNo;
    switch (User->getOpcode()) {
    case X86ISD::SETCC:
    case X86ISD::SETCC_CARRY:
      CCOpNo = 0;
      break;
    case X86ISD::BRCOND:
    case X86ISD::CMOV:
      CCOpNo = 2;
      break;
    default:
      // Be conservative.
      return false;
    }

    const auto CC =
        static_cast<X86::CondCode>(User->getConstantOperandVal(CCOpNo));
    const bool TestsZeroFlagOnly = CC == X86::COND_E || CC == X86::COND_NE;
    if (!TestsZeroFlagOnly)
      return false;
  }

  return true;
}

47670

47671

/// DAG combine for X86ISD::CMP nodes that compare against zero.
///
/// Two rewrites are attempted:
///  * a single-use constant SRL/SHL whose compare only feeds zero-flag tests
///    becomes an AND with the equivalent mask (so it can later become TEST);
///  * a compare of a single-use truncate of a single-use AND/OR/XOR/ADD/SUB
///    becomes the narrow X86-specific binop, whose flags result is reused.
///
/// \returns The replacement flags value, or an empty SDValue.
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
  // Only handle test patterns.
  if (!isNullConstant(N->getOperand(1)))
    return SDValue();

  // If we have a CMP of a truncated binop, see if we can make a smaller binop
  // and use its flags directly.
  // TODO: Maybe we should try promoting compares that only use the zero flag
  // first if we can prove the upper bits with computeKnownBits?
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  EVT VT = Src.getValueType();

  // If we have a constant logical shift that's only used in a comparison
  // against zero turn it into an equivalent AND. This allows turning it into
  // a TEST instruction later.
  const unsigned SrcOpc = Src.getOpcode();
  const bool IsSingleUseConstShift =
      (SrcOpc == ISD::SRL || SrcOpc == ISD::SHL) && Src.hasOneUse() &&
      isa<ConstantSDNode>(Src.getOperand(1));
  if (IsSingleUseConstShift && onlyZeroFlagUsed(SDValue(N, 0))) {
    unsigned BitWidth = VT.getSizeInBits();
    const APInt &ShAmt = Src.getConstantOperandAPInt(1);
    if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
      // The bits that survive the shift are the ones that must be tested.
      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
      APInt Mask = SrcOpc == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
                       : APInt::getLowBitsSet(BitWidth, MaskBits);
      if (Mask.isSignedIntN(32)) {
        SDValue Masked = DAG.getNode(ISD::AND, DL, VT, Src.getOperand(0),
                                     DAG.getConstant(Mask, DL, VT));
        return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Masked,
                           DAG.getConstant(0, DL, VT));
      }
    }
  }

  // Look for a truncate with a single use.
  if (Src.getOpcode() != ISD::TRUNCATE || !Src.hasOneUse())
    return SDValue();

  SDValue Wide = Src.getOperand(0);

  // Arithmetic op can only have one use.
  if (!Wide.hasOneUse())
    return SDValue();

  unsigned NarrowOpc;
  switch (Wide.getOpcode()) {
  case ISD::AND:
    // Skip and with constant. We have special handling for and with immediate
    // during isel to generate test instructions.
    if (isa<ConstantSDNode>(Wide.getOperand(1)))
      return SDValue();
    NarrowOpc = X86ISD::AND;
    break;
  case ISD::OR:
    NarrowOpc = X86ISD::OR;
    break;
  case ISD::XOR:
    NarrowOpc = X86ISD::XOR;
    break;
  case ISD::ADD:
  case ISD::SUB:
    // If the carry or overflow flag is used, we can't truncate.
    if (needCarryOrOverflowFlag(SDValue(N, 0)))
      return SDValue();
    NarrowOpc = Wide.getOpcode() == ISD::ADD ? X86ISD::ADD : X86ISD::SUB;
    break;
  default:
    return SDValue();
  }

  // We found an op we can narrow. Truncate its inputs.
  SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, DL, VT, Wide.getOperand(0));
  SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, Wide.getOperand(1));

  // Use a X86 specific opcode to avoid DAG combine messing with it.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue NarrowOp = DAG.getNode(NarrowOpc, DL, VTs, NarrowLHS, NarrowRHS);

  // For AND, keep a CMP so that we can match the test pattern.
  if (NarrowOpc == X86ISD::AND)
    return DAG.getNode(X86ISD::CMP, DL, MVT::i32, NarrowOp,
                       DAG.getConstant(0, DL, VT));

  // Return the flags.
  return NarrowOp.getValue(1);
}

47758

47759

/// DAG combine for X86ISD::ADD / X86ISD::SUB (the flag-producing forms).
///
/// If the flags result is unused, the node is replaced by the generic ADD/SUB
/// plus a dummy zero flags value. Otherwise any already-existing generic
/// ADD/SUB with the same (or commuted) operands is replaced by this node's
/// value result, negated when a SUB's operands were commuted.
///
/// \returns Merged replacement values in the first case; an empty SDValue
///          otherwise (replacements of other nodes go through \p DCI).
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
         "Expected X86ISD::ADD or X86ISD::SUB");

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  MVT VT = LHS.getSimpleValueType();
  const bool IsX86Add = N->getOpcode() == X86ISD::ADD;
  unsigned GenericOpc = IsX86Add ? ISD::ADD : ISD::SUB;

  // If we don't use the flag result, simplify back to a generic ADD/SUB.
  if (!N->hasAnyUseOfValue(1)) {
    SDValue Generic = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
    return DAG.getMergeValues({Generic, DAG.getConstant(0, DL, MVT::i32)}, DL);
  }

  // Fold any similar generic ADD/SUB opcodes to reuse this node.
  auto ReplaceGenericTwin = [&](SDValue A, SDValue B, bool NeedsNegation) {
    SDValue Operands[] = {A, B};
    SDVTList ValueOnlyVTs = DAG.getVTList(N->getValueType(0));
    SDNode *Twin = DAG.getNodeIfExists(GenericOpc, ValueOnlyVTs, Operands);
    if (!Twin)
      return;

    SDValue Replacement(N, 0);
    if (NeedsNegation)
      Replacement = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                Replacement);
    DCI.CombineTo(Twin, Replacement);
  };
  ReplaceGenericTwin(LHS, RHS, false);
  // With commuted operands a SUB computes the negated value.
  ReplaceGenericTwin(RHS, LHS, N->getOpcode() == X86ISD::SUB);

  return SDValue();
}

47792

47793

/// DAG combine for X86ISD::SBB.
///
/// First tries to simplify the carry-in operand via combineCarryThroughADD;
/// failing that, folds SBB(SUB(X,Y),0,Carry) into SBB(X,Y,Carry) when the
/// flags result of \p N is unused.
///
/// \returns The replacement SBB node, or an empty SDValue.
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
  }

  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
  // iff the flag result is dead.
  const bool IsSubMinusZero =
      LHS.getOpcode() == ISD::SUB && isNullConstant(RHS);
  if (IsSubMinusZero && !N->hasAnyUseOfValue(1))
    return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(),
                       LHS.getOperand(0), LHS.getOperand(1), CarryIn);

  return SDValue();
}

47813

47814

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS

47815

static SDValue combineADC(SDNode *N, SelectionDAG &DAG,

47816

TargetLowering::DAGCombinerInfo &DCI) {

47817

// If the LHS and RHS of the ADC node are zero, then it can't overflow and

47818

// the result is either zero or one (depending on the input carry bit).

47819

// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.

47820

if (X86::isZeroNode(N->getOperand(0)) &&

47821

X86::isZeroNode(N->getOperand(1)) &&

47822

// We don't have a good way to replace an EFLAGS use, so only do this when

47823

// dead right now.

47824

SDValue(N, 1).use_empty()) {

47825

SDLoc DL(N);

47826

EVT VT = N->getValueType(0);

47827

SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));

47828

SDValue Res1 =

47829

DAG.getNode(ISD::AND, DL, VT,

47830

DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

47831

DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),

47832

N->getOperand(2)),

47833

DAG.getConstant(1, DL, VT));

47834

return DCI.CombineTo(N, Res1, CarryOut);

47835

}

47836

47837

if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {

47838

MVT VT = N->getSimpleValueType(0);

47839

SDVTList VTs = DAG.getVTList(VT, MVT::i32);

47840

return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,

47841

N->getOperand(0), N->getOperand(1),

47842

Flags);

47843

}

47844

47845

return SDValue();

47846

}

47847

47848

/// If this is an add or subtract where one operand is produced by a cmp+setcc,

47849

/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}

47850

/// with CMP+{ADC, SBB}.

47851

static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {

47852

bool IsSub = N->getOpcode() == ISD::SUB;

47853

SDValue X = N->getOperand(0);

47854

SDValue Y = N->getOperand(1);

47855

47856

// If this is an add, canonicalize a zext operand to the RHS.

47857

// TODO: Incomplete? What if both sides are zexts?

47858

if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&

47859

Y.getOpcode() != ISD::ZERO_EXTEND)

47860

std::swap(X, Y);

47861

47862

// Look through a one-use zext.

47863

bool PeekedThroughZext = false;

47864

if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {

47865

Y = Y.getOperand(0);

47866

PeekedThroughZext = true;

47867

}

47868

47869

// If this is an add, canonicalize a setcc operand to the RHS.

47870

// TODO: Incomplete? What if both sides are setcc?

47871

// TODO: Should we allow peeking through a zext of the other operand?

47872

if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&

47873

Y.getOpcode() != X86ISD::SETCC)

47874

std::swap(X, Y);

47875

47876

if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())

47877

return SDValue();

47878

47879

SDLoc DL(N);

47880

EVT VT = N->getValueType(0);

47881

X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

47882

47883

// If X is -1 or 0, then we have an opportunity to avoid constants required in

47884

// the general case below.

47885

auto *ConstantX = dyn_cast<ConstantSDNode>(X);

47886

if (ConstantX) {

47887

if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||

47888

(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {

47889

// This is a complicated way to get -1 or 0 from the carry flag:

47890

// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax

47891

// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax

47892

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

47893

DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),

47894

Y.getOperand(1));

47895

}

47896

47897

if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||

47898

(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {

47899

SDValue EFLAGS = Y->getOperand(1);

47900

if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&

47901

EFLAGS.getValueType().isInteger() &&

47902

!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {

47903

// Swap the operands of a SUB, and we have the same pattern as above.

47904

// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB

47905

// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB

47906

SDValue NewSub = DAG.getNode(

47907

X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),

47908

EFLAGS.getOperand(1), EFLAGS.getOperand(0));

47909

SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());

47910

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

47911

DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),

47912

NewEFLAGS);

47913

}

47914

}

47915

}

47916

47917

if (CC == X86::COND_B) {

47918

// X + SETB Z --> adc X, 0

47919

// X - SETB Z --> sbb X, 0

47920

return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,

47921

DAG.getVTList(VT, MVT::i32), X,

47922

DAG.getConstant(0, DL, VT), Y.getOperand(1));

47923

}

47924

47925

if (CC == X86::COND_A) {

47926

SDValue EFLAGS = Y.getOperand(1);

47927

// Try to convert COND_A into COND_B in an attempt to facilitate

47928

// materializing "setb reg".

47929

//

47930

// Do not flip "e > c", where "c" is a constant, because Cmp instruction

47931

// cannot take an immediate as its first operand.

47932

//

47933

if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&

47934

EFLAGS.getValueType().isInteger() &&

47935

!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {

47936

SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),

47937

EFLAGS.getNode()->getVTList(),

47938

EFLAGS.getOperand(1), EFLAGS.getOperand(0));

47939

SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());

47940

return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,

47941

DAG.getVTList(VT, MVT::i32), X,

47942

DAG.getConstant(0, DL, VT), NewEFLAGS);

47943

}

47944

}

47945

47946

if (CC == X86::COND_AE) {

47947

// X + SETAE --> sbb X, -1

47948

// X - SETAE --> adc X, -1

47949

return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,

47950

DAG.getVTList(VT, MVT::i32), X,

47951

DAG.getConstant(-1, DL, VT), Y.getOperand(1));

47952

}

47953

47954

if (CC == X86::COND_BE) {

47955

// X + SETBE --> sbb X, -1

47956

// X - SETBE --> adc X, -1

47957

SDValue EFLAGS = Y.getOperand(1);

47958

// Try to convert COND_BE into COND_AE in an attempt to facilitate

47959

// materializing "setae reg".

47960

//

47961

// Do not flip "e <= c", where "c" is a constant, because Cmp instruction

47962

// cannot take an immediate as its first operand.

47963

//

47964

if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&

47965

EFLAGS.getValueType().isInteger() &&

47966

!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {

47967

SDValue NewSub = DAG.getNode(

47968

X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),

47969

EFLAGS.getOperand(1), EFLAGS.getOperand(0));

47970

SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());

47971

return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,

47972

DAG.getVTList(VT, MVT::i32), X,

47973

DAG.getConstant(-1, DL, VT), NewEFLAGS);

47974

}

47975

}

47976

47977

if (CC != X86::COND_E && CC != X86::COND_NE)

47978

return SDValue();

47979

47980

SDValue Cmp = Y.getOperand(1);

47981

if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||

47982

!X86::isZeroNode(Cmp.getOperand(1)) ||

47983

!Cmp.getOperand(0).getValueType().isInteger())

47984

return SDValue();

47985

47986

SDValue Z = Cmp.getOperand(0);

47987

EVT ZVT = Z.getValueType();

47988

47989

// If X is -1 or 0, then we have an opportunity to avoid constants required in

47990

// the general case below.

47991

if (ConstantX) {

47992

// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with

47993

// fake operands:

47994

// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)

47995

// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)

47996

if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||

47997

(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {

47998

SDValue Zero = DAG.getConstant(0, DL, ZVT);

47999

SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);

48000

SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);

48001

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

48002

DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),

48003

SDValue(Neg.getNode(), 1));

48004

}

48005

48006

// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'

48007

// with fake operands:

48008

// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)

48009

// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)

48010

if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||

48011

(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {

48012

SDValue One = DAG.getConstant(1, DL, ZVT);

48013

SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);

48014

SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);

48015

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

48016

DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),

48017

Cmp1.getValue(1));

48018

}

48019

}

48020

48021

// (cmp Z, 1) sets the carry flag if Z is 0.

48022

SDValue One = DAG.getConstant(1, DL, ZVT);

48023

SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);

48024

SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);

48025

48026

// Add the flags type for ADC/SBB nodes.

48027

SDVTList VTs = DAG.getVTList(VT, MVT::i32);

48028

48029

// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)

48030

// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)

48031

if (CC == X86::COND_NE)

48032

return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,

48033

DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));

48034

48035

// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)

48036

// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)

48037

return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,

48038

DAG.getConstant(0, DL, VT), Cmp1.getValue(1));

48039

}

48040

48041

/// Try to match an add of two BUILD_VECTORs that together form a pairwise
/// horizontal add of a vector multiply, and emit X86ISD::VPMADDWD for it.
///
/// Example of pattern we try to detect:
///   t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
///   (add (build_vector (extract_elt t, 0),
///                      (extract_elt t, 2),
///                      (extract_elt t, 4),
///                      (extract_elt t, 6)),
///        (build_vector (extract_elt t, 1),
///                      (extract_elt t, 3),
///                      (extract_elt t, 5),
///                      (extract_elt t, 7)))
///
/// Result lane k of the add must be t[2k] + t[2k+1]; the two addends of a lane
/// may come from either BUILD_VECTOR, but lanes themselves may not be
/// permuted.
///
/// \param DAG       The DAG used to create replacement nodes.
/// \param Op0,Op1   The two operands of the add.
/// \param DL        Debug location for the new nodes.
/// \param VT        Result type of the add; must be vXi32 with a power-of-two
///                  element count of at least 4.
/// \param Subtarget Must have SSE2.
/// \returns The VPMADDWD-based replacement, or an empty SDValue.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                            const SDLoc &DL, EVT VT,
                            const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
      Op1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
      VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Check that, for every result lane k, one of Op0[k]/Op1[k] is
  //   (extract_elt Mul, 2k)
  // and the other is
  //   (extract_elt Mul, 2k+1)
  // and identify Mul. Lanes are examined two at a time (i and i + 1).
  SDValue Mul;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
    // "L" is result lane i, "H" is result lane i + 1.
    SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
            Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
    // TODO: Be more tolerant to undefs.
    if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
    auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
    auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
    auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
    if (!Const0L || !Const1L || !Const0H || !Const1H)
      return SDValue();
    unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
             Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
    // Commutativity of the add allows the two addends of a single result lane
    // to appear in either order.
    if (Idx0L > Idx1L)
      std::swap(Idx0L, Idx1L);
    if (Idx0H > Idx1H)
      std::swap(Idx0H, Idx1H);
    // Do NOT exchange the (L) and (H) index pairs with each other: they belong
    // to two different result lanes, so accepting them in swapped order would
    // match an input whose lanes i and i + 1 are permuted relative to what
    // VPMADDWD produces.
    if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
        Idx1H != 2 * i + 3)
      return SDValue();
    if (!Mul) {
      // First time an extract_elt's source vector is visited. Must be a MUL
      // with 2X number of vector elements than the BUILD_VECTOR.
      // Both extracts must be from same MUL.
      Mul = Op0L->getOperand(0);
      if (Mul->getOpcode() != ISD::MUL ||
          Mul.getValueType().getVectorNumElements() != 2 * e)
        return SDValue();
    }
    // Check that the extract is from the same MUL previously seen.
    if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
        Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
      return SDValue();
  }

  // Check if the Mul source can be safely shrunk.
  ShrinkMode Mode;
  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
      Mode == ShrinkMode::MULU16)
    return SDValue();

  // Truncate both multiplicands to i16 lanes (twice as many as the result).
  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                 VT.getVectorNumElements() * 2);
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));

  // Builds one VPMADDWD over a pair of equally-typed i16 vectors; the result
  // has half as many (i32) lanes as the inputs.
  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    EVT InVT = Ops[0].getValueType();
    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}

48145

48146

// Attempt to turn this pattern into PMADDWD.

48147

// (add (mul (sext (build_vector)), (sext (build_vector))),

48148

// (mul (sext (build_vector)), (sext (build_vector)))

48149

static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,

48150

const SDLoc &DL, EVT VT,

48151

const X86Subtarget &Subtarget) {

48152

if (!Subtarget.hasSSE2())

48153

return SDValue();

48154

48155

if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)

48156

return SDValue();

48157

48158

if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||

48159

VT.getVectorNumElements() < 4 ||

48160

!isPowerOf2_32(VT.getVectorNumElements()))

48161

return SDValue();

48162

48163

SDValue N00 = N0.getOperand(0);

48164

SDValue N01 = N0.getOperand(1);

48165

SDValue N10 = N1.getOperand(0);

48166

SDValue N11 = N1.getOperand(1);

48167

48168

// All inputs need to be sign extends.

48169

// TODO: Support ZERO_EXTEND from known positive?

48170

if (N00.getOpcode() != ISD::SIGN_EXTEND ||

48171

N01.getOpcode() != ISD::SIGN_EXTEND ||

48172

N10.getOpcode() != ISD::SIGN_EXTEND ||

48173

N11.getOpcode() != ISD::SIGN_EXTEND)

48174

return SDValue();

48175

48176

// Peek through the extends.

48177

N00 = N00.getOperand(0);

48178

N01 = N01.getOperand(0);

48179

N10 = N10.getOperand(0);

48180

N11 = N11.getOperand(0);

48181

48182

// Must be extending from vXi16.

48183

EVT InVT = N00.getValueType();

48184

if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||

48185

N10.getValueType() != InVT || N11.getValueType() != InVT)

48186

return SDValue();

48187

48188

// All inputs should be build_vectors.

48189

if (N00.getOpcode() != ISD::BUILD_VECTOR ||

48190

N01.getOpcode() != ISD::BUILD_VECTOR ||

48191

N10.getOpcode() != ISD::BUILD_VECTOR ||

48192

N11.getOpcode() != ISD::BUILD_VECTOR)

48193

return SDValue();

48194

48195

// For each element, we need to ensure we have an odd element from one vector

48196

// multiplied by the odd element of another vector and the even element from

48197

// one of the same vectors being multiplied by the even element from the

48198

// other vector. So we need to make sure for each element i, this operator

48199

// is being performed:

48200

// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]

48201

SDValue In0, In1;

48202

for (unsigned i = 0; i != N00.getNumOperands(); ++i) {

48203

SDValue N00Elt = N00.getOperand(i);

48204

SDValue N01Elt = N01.getOperand(i);

48205

SDValue N10Elt = N10.getOperand(i);

48206

SDValue N11Elt = N11.getOperand(i);

48207

// TODO: Be more tolerant to undefs.

48208

if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

48209

N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

48210

N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

48211

N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

48212

return SDValue();

48213

auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));

48214

auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));

48215

auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));

48216

auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));

48217

if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)

48218

return SDValue();

48219

unsigned IdxN00 = ConstN00Elt->getZExtValue();

48220

unsigned IdxN01 = ConstN01Elt->getZExtValue();

48221

unsigned IdxN10 = ConstN10Elt->getZExtValue();

48222

unsigned IdxN11 = ConstN11Elt->getZExtValue();

48223

// Add is commutative so indices can be reordered.

48224

if (IdxN00 > IdxN10) {

48225

std::swap(IdxN00, IdxN10);

48226

std::swap(IdxN01, IdxN11);

48227

}

48228

// N0 indices be the even element. N1 indices must be the next odd element.

48229

if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||

48230

IdxN01 != 2 * i || IdxN11 != 2 * i + 1)

48231

return SDValue();

48232

SDValue N00In = N00Elt.getOperand(0);

48233

SDValue N01In = N01Elt.getOperand(0);

48234

SDValue N10In = N10Elt.getOperand(0);

48235

SDValue N11In = N11Elt.getOperand(0);

48236

// First time we find an input capture it.

48237

if (!In0) {

48238

In0 = N00In;

48239

In1 = N01In;

48240

}

48241

// Mul is commutative so the input vectors can be in any order.

48242

// Canonicalize to make the compares easier.

48243

if (In0 != N00In)

48244

std::swap(N00In, N01In);

48245

if (In0 != N10In)

48246

std::swap(N10In, N11In);

48247

if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)

48248

return SDValue();

48249

}

48250

48251

auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

48252

ArrayRef<SDValue> Ops) {

48253

// Shrink by adding truncate nodes and let DAGCombine fold with the

48254

// sources.

48255

EVT OpVT = Ops[0].getValueType();

48256

assert(OpVT.getScalarType() == MVT::i16 &&((OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"
) ? static_cast<void> (0) : __assert_fail ("OpVT.getScalarType() == MVT::i16 && \"Unexpected scalar element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48257, __PRETTY_FUNCTION__))

48257

"Unexpected scalar element type")((OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"
) ? static_cast<void> (0) : __assert_fail ("OpVT.getScalarType() == MVT::i16 && \"Unexpected scalar element type\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48257, __PRETTY_FUNCTION__));

48258

assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch")((OpVT == Ops[1].getValueType() && "Operands' types mismatch"
) ? static_cast<void> (0) : __assert_fail ("OpVT == Ops[1].getValueType() && \"Operands' types mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48258, __PRETTY_FUNCTION__));

48259

EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,

48260

OpVT.getVectorNumElements() / 2);

48261

return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);

48262

};

48263

return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },

48264

PMADDBuilder);

48265

}

48266

48267

/// Try to rewrite an integer vector ADD/SUB node as X86ISD::HADD/HSUB.
/// Only v8i16, v4i32, v16i16 and v8i32 results are considered, and SSSE3 is
/// required. If isHorizontalBinOp fills in a post-shuffle mask, the horizontal
/// op is followed by that shuffle. Returns an empty SDValue otherwise.
static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  bool IsAdd = N->getOpcode() == ISD::ADD;
  assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");

  EVT VT = N->getValueType(0);
  const bool IsCandidateVT = VT == MVT::v8i16 || VT == MVT::v4i32 ||
                             VT == MVT::v16i16 || VT == MVT::v8i32;
  if (!IsCandidateVT || !Subtarget.hasSSSE3())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SmallVector<int, 8> PostShuffleMask;
  if (!isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsAdd, PostShuffleMask))
    return SDValue();

  // Builds one horizontal op over (possibly split) operands.
  auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
    unsigned HOpc = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
    return DAG.getNode(HOpc, DL, Ops[0].getValueType(), Ops);
  };
  SDValue HorizBinOp =
      SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {LHS, RHS}, HOpBuilder);

  if (PostShuffleMask.empty())
    return HorizBinOp;

  return DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
                              DAG.getUNDEF(VT), PostShuffleMask);
}

48295

48296

/// Target DAG combine for ISD::ADD. Tries, in order: the two PMADDWD
/// matchers, horizontal-add synthesis, the (add (zext vXi1), Y) fold, and
/// finally the ADC/SBB combine.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDLoc DL(N);

  if (SDValue MAdd = matchPMADDWD(DAG, LHS, RHS, DL, VT, Subtarget))
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_2(DAG, LHS, RHS, DL, VT, Subtarget))
    return MAdd;

  // Try to synthesize horizontal adds from adds of shuffles.
  if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
    return V;

  // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
  // (sub Y, (sext (vXi1 X))).
  // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
  // generic DAG combine without a legal type check, but adding this there
  // caused regressions.
  if (VT.isVector()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    // True if V is a zero extend whose source is a legal vXi1 vector.
    auto IsZExtOfLegalI1Vec = [&TLI](SDValue V) {
      if (V.getOpcode() != ISD::ZERO_EXTEND)
        return false;
      EVT SrcVT = V.getOperand(0).getValueType();
      return SrcVT.getVectorElementType() == MVT::i1 &&
             TLI.isTypeLegal(SrcVT);
    };
    // Emits (sub Other, (sext X)) for ZExt == (zext X).
    auto EmitSubOfSExt = [&](SDValue ZExt, SDValue Other) {
      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, ZExt.getOperand(0));
      return DAG.getNode(ISD::SUB, DL, VT, Other, SExt);
    };

    if (IsZExtOfLegalI1Vec(LHS))
      return EmitSubOfSExt(LHS, RHS);
    if (IsZExtOfLegalI1Vec(RHS))
      return EmitSubOfSExt(RHS, LHS);
  }

  return combineAddOrSubToADCOrSBB(N, DAG);
}

48338

48339

/// Try to turn a vector SUB whose operand is an unsigned max/min into an
/// unsigned saturating subtract (ISD::USUBSAT):
///   umax(a, b) - b  and  a - umin(a, b)  -> usubsat(a, b)
/// For i32/i64 element types the subtract is performed on a narrower type when
/// the left-hand side is known to have enough leading zero bits.
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  // PSUBUS is supported, starting from SSE2, but truncation for v8i32
  // is only worth it with SSSE3 (PSHUFB).
  EVT EltVT = VT.getVectorElementType();
  const bool IsByteOrWordElt = EltVT == MVT::i8 || EltVT == MVT::i16;
  const bool HasNativeSubus = Subtarget.hasSSE2() && IsByteOrWordElt;
  const bool CanShrink256 =
      Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64);
  const bool CanShrink512 = Subtarget.useBWIRegs() && (VT == MVT::v16i32);
  if (!HasNativeSubus && !CanShrink256 && !CanShrink512)
    return SDValue();

  SDValue SubusLHS, SubusRHS;
  // Try to find umax(a,b) - b or a - umin(a,b) patterns
  // they may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  if (Op0.getOpcode() == ISD::UMAX) {
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
    SubusRHS = Op1;
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
    SubusLHS = Op0;
  } else if (Op1.getOpcode() == ISD::TRUNCATE &&
             Op1.getOperand(0).getOpcode() == ISD::UMIN && IsByteOrWordElt) {
    // Special case where the UMIN has been truncated. Try to push the truncate
    // further up. This is similar to the i32/i64 special processing.
    SDValue WideMin = Op1.getOperand(0);
    SDValue MinLHS = WideMin.getOperand(0);
    SDValue MinRHS = WideMin.getOperand(1);
    EVT TruncVT = WideMin.getValueType();

    const bool Wide256OK = Subtarget.hasSSSE3() &&
                           (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64);
    const bool Wide512OK = Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32);
    if (!Wide256OK && !Wide512OK)
      return SDValue();

    // One UMIN operand must be the zero-extended Op0; the other is the value
    // to saturate.
    SDValue OpToSaturate;
    if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
      OpToSaturate = MinRHS;
    else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
             MinRHS.getOperand(0) == Op0)
      OpToSaturate = MinLHS;
    else
      return SDValue();

    // Saturate the non-extended input and then truncate it.
    SDLoc DL(N);
    APInt LowMask = APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
                                         VT.getScalarSizeInBits());
    SDValue SaturationConst = DAG.getConstant(LowMask, DL, TruncVT);
    SDValue UMin =
        DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate, SaturationConst);
    SubusLHS = Op0;
    SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
  } else {
    return SDValue();
  }

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases.
  if (IsByteOrWordElt)
    return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);

  assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
         "Unexpected VT!");

  // Special preprocessing case can be only applied
  // if the value was zero extended from 16 bit,
  // so we require first 16 bits to be zeros for 32 bit
  // values, or first 48 bits for 64 bit values.
  KnownBits Known = DAG.computeKnownBits(SubusLHS);
  unsigned NumZeros = Known.countMinLeadingZeros();
  const unsigned RequiredZeros = (VT == MVT::v8i64) ? 48 : 16;
  if (NumZeros < RequiredZeros)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else if (NumZeros >= 24)
    ShrinkedType = MVT::v16i8;
  else
    ShrinkedType = MVT::v16i16;

  // If SubusLHS is zeroextended - truncate SubusRHS to it's
  // size SubusRHS = umin(0xFFF.., SubusRHS).
  APInt ShrunkMask = APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                          ShrinkedType.getScalarSizeInBits());
  SDValue SaturationConst =
      DAG.getConstant(ShrunkMask, SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
                               NewSubusLHS, NewSubusRHS);

  // Zero extend the result, it may be used somewhere as 32 bit,
  // if not zext and following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

48457

48458

/// Target DAG combine for ISD::SUB. Tries, in order: folding a constant LHS
/// into a one-use XOR-with-constant RHS, horizontal-sub synthesis, the
/// saturating-subtract combine, and finally the ADC/SBB combine.
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  // If the RHS of the sub is a XOR with one use and a constant, invert the
  // immediate. Then add one to the LHS of the sub so we can turn
  // X-Y -> X+~Y+1, saving one register.
  auto *C = dyn_cast<ConstantSDNode>(Op0);
  if (C && Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
      isa<ConstantSDNode>(Op1.getOperand(1))) {
    const APInt &XorC = Op1.getConstantOperandAPInt(1);
    EVT VT = Op0.getValueType();
    SDValue InvertedImm = DAG.getConstant(~XorC, SDLoc(Op1), VT);
    SDValue NewXor =
        DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), InvertedImm);
    SDValue BumpedLHS = DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT);
    return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, BumpedLHS);
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
    return V;

  // Try to create PSUBUS if SUB's argument is max/min
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

48492

48493

/// Fold a vector compare of a value against itself: PCMPEQ(x, x) becomes an
/// all-ones constant and PCMPGT(x, x) becomes zero. Any other node is left
/// alone (empty SDValue).
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) != N->getOperand(1))
    return SDValue();

  switch (N->getOpcode()) {
  case X86ISD::PCMPEQ:
    return DAG.getConstant(-1, DL, VT);
  case X86ISD::PCMPGT:
    return DAG.getConstant(0, DL, VT);
  default:
    return SDValue();
  }
}

48507

48508

/// Helper that combines an array of subvector ops as if they were the operands

48509

/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.

48510

/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.

48511

static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,

48512

ArrayRef<SDValue> Ops, SelectionDAG &DAG,

48513

TargetLowering::DAGCombinerInfo &DCI,

48514

const X86Subtarget &Subtarget) {

48515

assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors")((Subtarget.hasAVX() && "AVX assumed for concat_vectors"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX() && \"AVX assumed for concat_vectors\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48515, __PRETTY_FUNCTION__));

48516

unsigned EltSizeInBits = VT.getScalarSizeInBits();

48517

48518

if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))

48519

return DAG.getUNDEF(VT);

48520

48521

if (llvm::all_of(Ops, [](SDValue Op) {

48522

return ISD::isBuildVectorAllZeros(Op.getNode());

48523

}))

48524

return getZeroVector(VT, Subtarget, DAG, DL);

48525

48526

SDValue Op0 = Ops[0];

48527

bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });

48528

48529

// Fold subvector loads into one.

48530

// If needed, look through bitcasts to get to the load.

48531

if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {

48532

bool Fast;

48533

const X86TargetLowering *TLI = Subtarget.getTargetLowering();

48534

if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,

48535

*FirstLd->getMemOperand(), &Fast) &&

48536

Fast) {

48537

if (SDValue Ld =

48538

EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))

48539

return Ld;

48540

}

48541

}

48542

48543

// Repeated subvectors.

48544

if (IsSplat) {

48545

// If this broadcast/subv_broadcast is inserted into both halves, use a

48546

// larger broadcast/subv_broadcast.

48547

if (Op0.getOpcode() == X86ISD::VBROADCAST ||

48548

Op0.getOpcode() == X86ISD::SUBV_BROADCAST)

48549

return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));

48550

48551

// If this broadcast_load is inserted into both halves, use a larger

48552

// broadcast_load. Update other uses to use an extracted subvector.

48553

if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {

48554

auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);

48555

SDVTList Tys = DAG.getVTList(VT, MVT::Other);

48556

SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};

48557

SDValue BcastLd = DAG.getMemIntrinsicNode(

48558

X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),

48559

MemIntr->getMemOperand());

48560

DAG.ReplaceAllUsesOfValueWith(

48561

Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));

48562

DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));

48563

return BcastLd;

48564

}

48565

48566

// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)

48567

if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&

48568

(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))

48569

return DAG.getNode(X86ISD::VBROADCAST, DL, VT,

48570

DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,

48571

Op0.getOperand(0),

48572

DAG.getIntPtrConstant(0, DL)));

48573

48574

// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)

48575

if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&

48576

(Subtarget.hasAVX2() ||

48577

(EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&

48578

Op0.getOperand(0).getValueType() == VT.getScalarType())

48579

return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));

48580

48581

// concat_vectors(extract_subvector(broadcast(x)),

48582

// extract_subvector(broadcast(x))) -> broadcast(x)

48583

if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

48584

Op0.getOperand(0).getValueType() == VT) {

48585

if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||

48586

Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)

48587

return Op0.getOperand(0);

48588

}

48589

}

48590

48591

// Repeated opcode.

48592

// TODO - combineX86ShufflesRecursively should handle shuffle concatenation

48593

// but it currently struggles with different vector widths.

48594

if (llvm::all_of(Ops, [Op0](SDValue Op) {

48595

return Op.getOpcode() == Op0.getOpcode();

48596

})) {

48597

unsigned NumOps = Ops.size();

48598

switch (Op0.getOpcode()) {

48599

case X86ISD::SHUFP: {

48600

// Add SHUFPD support if/when necessary.

48601

if (!IsSplat && VT.getScalarType() == MVT::f32 &&

48602

llvm::all_of(Ops, [Op0](SDValue Op) {

48603

return Op.getOperand(2) == Op0.getOperand(2);

48604

})) {

48605

SmallVector<SDValue, 2> LHS, RHS;

48606

for (unsigned i = 0; i != NumOps; ++i) {

48607

LHS.push_back(Ops[i].getOperand(0));

48608

RHS.push_back(Ops[i].getOperand(1));

48609

}

48610

return DAG.getNode(Op0.getOpcode(), DL, VT,

48611

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),

48612

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),

48613

Op0.getOperand(2));

48614

}

48615

break;

48616

}

48617

case X86ISD::PSHUFHW:

48618

case X86ISD::PSHUFLW:

48619

case X86ISD::PSHUFD:

48620

if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&

48621

Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {

48622

SmallVector<SDValue, 2> Src;

48623

for (unsigned i = 0; i != NumOps; ++i)

48624

Src.push_back(Ops[i].getOperand(0));

48625

return DAG.getNode(Op0.getOpcode(), DL, VT,

48626

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),

48627

Op0.getOperand(1));

48628

}

48629

LLVM_FALLTHROUGH[[gnu::fallthrough]];

48630

case X86ISD::VPERMILPI:

48631

// TODO - add support for vXf64/vXi64 shuffles.

48632

if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&

48633

Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {

48634

SmallVector<SDValue, 2> Src;

48635

for (unsigned i = 0; i != NumOps; ++i)

48636

Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));

48637

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);

48638

Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,

48639

Op0.getOperand(1));

48640

return DAG.getBitcast(VT, Res);

48641

}

48642

break;

48643

case X86ISD::VSHLI:

48644

case X86ISD::VSRAI:

48645

case X86ISD::VSRLI:

48646

if (((VT.is256BitVector() && Subtarget.hasInt256()) ||

48647

(VT.is512BitVector() && Subtarget.useAVX512Regs() &&

48648

(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&

48649

llvm::all_of(Ops, [Op0](SDValue Op) {

48650

return Op0.getOperand(1) == Op.getOperand(1);

48651

})) {

48652

SmallVector<SDValue, 2> Src;

48653

for (unsigned i = 0; i != NumOps; ++i)

48654

Src.push_back(Ops[i].getOperand(0));

48655

return DAG.getNode(Op0.getOpcode(), DL, VT,

48656

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),

48657

Op0.getOperand(1));

48658

}

48659

break;

48660

case X86ISD::VPERMI:

48661

case X86ISD::VROTLI:

48662

case X86ISD::VROTRI:

48663

if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&

48664

llvm::all_of(Ops, [Op0](SDValue Op) {

48665

return Op0.getOperand(1) == Op.getOperand(1);

48666

})) {

48667

SmallVector<SDValue, 2> Src;

48668

for (unsigned i = 0; i != NumOps; ++i)

48669

Src.push_back(Ops[i].getOperand(0));

48670

return DAG.getNode(Op0.getOpcode(), DL, VT,

48671

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),

48672

Op0.getOperand(1));

48673

}

48674

break;

48675

case ISD::AND:

48676

case ISD::OR:

48677

case ISD::XOR:

48678

case X86ISD::ANDNP:

48679

// TODO: Add 256-bit support.

48680

if (!IsSplat && VT.is512BitVector()) {

48681

SmallVector<SDValue, 2> LHS, RHS;

48682

for (unsigned i = 0; i != NumOps; ++i) {

48683

LHS.push_back(Ops[i].getOperand(0));

48684

RHS.push_back(Ops[i].getOperand(1));

48685

}

48686

MVT SrcVT = Op0.getOperand(0).getSimpleValueType();

48687

SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),

48688

NumOps * SrcVT.getVectorNumElements());

48689

return DAG.getNode(Op0.getOpcode(), DL, VT,

48690

DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),

48691

DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));

48692

}

48693

break;

48694

case X86ISD::HADD:

48695

case X86ISD::HSUB:

48696

case X86ISD::FHADD:

48697

case X86ISD::FHSUB:

48698

case X86ISD::PACKSS:

48699

case X86ISD::PACKUS:

48700

if (!IsSplat && VT.is256BitVector() &&

48701

(VT.isFloatingPoint() || Subtarget.hasInt256())) {

48702

SmallVector<SDValue, 2> LHS, RHS;

48703

for (unsigned i = 0; i != NumOps; ++i) {

48704

LHS.push_back(Ops[i].getOperand(0));

48705

RHS.push_back(Ops[i].getOperand(1));

48706

}

48707

MVT SrcVT = Op0.getOperand(0).getSimpleValueType();

48708

SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),

48709

NumOps * SrcVT.getVectorNumElements());

48710

return DAG.getNode(Op0.getOpcode(), DL, VT,

48711

DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),

48712

DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));

48713

}

48714

break;

48715

case X86ISD::PALIGNR:

48716

if (!IsSplat &&

48717

((VT.is256BitVector() && Subtarget.hasInt256()) ||

48718

(VT.is512BitVector() && Subtarget.useBWIRegs())) &&

48719

llvm::all_of(Ops, [Op0](SDValue Op) {

48720

return Op0.getOperand(2) == Op.getOperand(2);

48721

})) {

48722

SmallVector<SDValue, 2> LHS, RHS;

48723

for (unsigned i = 0; i != NumOps; ++i) {

48724

LHS.push_back(Ops[i].getOperand(0));

48725

RHS.push_back(Ops[i].getOperand(1));

48726

}

48727

return DAG.getNode(Op0.getOpcode(), DL, VT,

48728

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),

48729

DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),

48730

Op0.getOperand(2));

48731

}

48732

break;

48733

}

48734

}

48735

48736

return SDValue();

48737

}

48738

48739

/// Target DAG combine for ISD::CONCAT_VECTORS. Forwards the node's operands to
/// combineConcatVectorOps when AVX is available and both the result and the
/// first operand's types are legal; vectors of i1 are never touched.
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  EVT SrcVT = N->getOperand(0).getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Don't do anything for i1 vectors.
  if (VT.getVectorElementType() == MVT::i1)
    return SDValue();

  if (!Subtarget.hasAVX() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
    return SDValue();

  SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
  if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
                                         DCI, Subtarget))
    return R;

  return SDValue();
}

48759

48760

/// Target DAG combine for ISD::INSERT_SUBVECTOR. Does nothing before operation
/// legalization. Handles undef/zero operands, nested inserts into zero
/// vectors, insert-of-extract (as a shuffle), concat-style patterns, and
/// widening of broadcasts inserted into an upper undef.
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);
  bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);

  uint64_t IdxVal = N->getConstantOperandVal(2);
  MVT SubVecVT = SubVec.getSimpleValueType();

  const bool VecIsUndef = Vec.isUndef();
  const bool SubVecIsUndef = SubVec.isUndef();
  const bool VecIsZero = ISD::isBuildVectorAllZeros(Vec.getNode());
  const bool SubVecIsZero = ISD::isBuildVectorAllZeros(SubVec.getNode());

  if (VecIsUndef && SubVecIsUndef)
    return DAG.getUNDEF(OpVT);

  // Inserting undefs/zeros into zeros/undefs is a zero vector.
  if ((VecIsUndef || VecIsZero) && (SubVecIsUndef || SubVecIsZero))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  if (VecIsZero) {
    // If we're inserting into a zero vector and then into a larger zero vector,
    // just insert into the larger zero vector directly.
    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
      uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                         getZeroVector(OpVT, Subtarget, DAG, dl),
                         SubVec.getOperand(1),
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    }

    // If we're inserting into a zero vector and our input was extracted from an
    // insert into a zero vector of the same type and the extraction was at
    // least as large as the original insertion. Just insert the original
    // subvector into a zero vector.
    if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
        isNullConstant(SubVec.getOperand(1)) &&
        SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Ins = SubVec.getOperand(0);
      if (isNullConstant(Ins.getOperand(2)) &&
          ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
          Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                           getZeroVector(OpVT, Subtarget, DAG, dl),
                           Ins.getOperand(1), N->getOperand(2));
    }
  }

  // Stop here if this is an i1 vector.
  if (IsI1Vector)
    return SDValue();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subregister operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !(VecIsUndef || VecIsZero))) {
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int Elt = 0; Elt != VecNumElts; ++Elt)
        Mask[Elt] = Elt;
      // Now insert the extracted portion.
      for (int Elt = 0; Elt != SubVecNumElts; ++Elt)
        Mask[Elt + IdxVal] = Elt + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Match concat_vector style patterns.
  SmallVector<SDValue, 2> SubVectorOps;
  if (collectConcatOps(N, SubVectorOps)) {
    if (SDValue Fold =
            combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
      return Fold;

    // If we're inserting all zeros into the upper half, change this to
    // a concat with zero. We will match this to a move
    // with implicit upper bit zeroing during isel.
    // We do this here because we don't want combineConcatVectorOps to
    // create INSERT_SUBVECTOR from CONCAT_VECTORS.
    if (SubVectorOps.size() == 2 &&
        ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                         getZeroVector(OpVT, Subtarget, DAG, dl),
                         SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
  }

  // The remaining folds only apply to an insert into an upper undef.
  const bool InsertsIntoUpperUndef = VecIsUndef && IdxVal != 0;

  // If this is a broadcast insert into an upper undef, use a larger broadcast.
  if (InsertsIntoUpperUndef && SubVec.getOpcode() == X86ISD::VBROADCAST)
    return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));

  // If this is a broadcast load inserted into an upper undef, use a larger
  // broadcast load.
  if (InsertsIntoUpperUndef && SubVec.hasOneUse() &&
      SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
    auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
    SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
    SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
    SDValue BcastLd =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                MemIntr->getMemoryVT(),
                                MemIntr->getMemOperand());
    // Redirect users of the old load's chain result to the new load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
    return BcastLd;
  }

  return SDValue();
}

48880

48881

/// If we are extracting a subvector of a vector select and the select condition

48882

/// is composed of concatenated vectors, try to narrow the select width. This

48883

/// is a common pattern for AVX1 integer code because 256-bit selects may be

48884

/// legal, but there is almost no integer math/logic available for 256-bit.

48885

/// This function should only be called with legal types (otherwise, the calls

48886

/// to get simple value types will assert).

48887

static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {

48888

SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));

48889

SmallVector<SDValue, 4> CatOps;

48890

if (Sel.getOpcode() != ISD::VSELECT ||

48891

!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))

48892

return SDValue();

48893

48894

// Note: We assume simple value types because this should only be called with

48895

// legal operations/types.

48896

// TODO: This can be extended to handle extraction to 256-bits.

48897

MVT VT = Ext->getSimpleValueType(0);

48898

if (!VT.is128BitVector())

48899

return SDValue();

48900

48901

MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();

48902

if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())

48903

return SDValue();

48904

48905

MVT WideVT = Ext->getOperand(0).getSimpleValueType();

48906

MVT SelVT = Sel.getSimpleValueType();

48907

assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&(((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations") ? static_cast
<void> (0) : __assert_fail ("(SelVT.is256BitVector() || SelVT.is512BitVector()) && \"Unexpected vector type with legal operations\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48908, __PRETTY_FUNCTION__))

48908

"Unexpected vector type with legal operations")(((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations") ? static_cast
<void> (0) : __assert_fail ("(SelVT.is256BitVector() || SelVT.is512BitVector()) && \"Unexpected vector type with legal operations\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48908, __PRETTY_FUNCTION__));

48909

48910

unsigned SelElts = SelVT.getVectorNumElements();

48911

unsigned CastedElts = WideVT.getVectorNumElements();

48912

unsigned ExtIdx = Ext->getConstantOperandVal(1);

48913

if (SelElts % CastedElts == 0) {

48914

// The select has the same or more (narrower) elements than the extract

48915

// operand. The extraction index gets scaled by that factor.

48916

ExtIdx *= (SelElts / CastedElts);

48917

} else if (CastedElts % SelElts == 0) {

48918

// The select has less (wider) elements than the extract operand. Make sure

48919

// that the extraction index can be divided evenly.

48920

unsigned IndexDivisor = CastedElts / SelElts;

48921

if (ExtIdx % IndexDivisor != 0)

48922

return SDValue();

48923

ExtIdx /= IndexDivisor;

48924

} else {

48925

llvm_unreachable("Element count of simple vector types are not divisible?")::llvm::llvm_unreachable_internal("Element count of simple vector types are not divisible?"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 48925);

48926

}

48927

48928

unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();

48929

unsigned NarrowElts = SelElts / NarrowingFactor;

48930

MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);

48931

SDLoc DL(Ext);

48932

SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);

48933

SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);

48934

SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);

48935

SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);

48936

return DAG.getBitcast(VT, NarrowSel);

48937

}

48938

48939

/// DAG combine for EXTRACT_SUBVECTOR nodes.
///
/// Tries, in order: splitting an AVX1 256-bit and+not of a concatenation,
/// narrowing a vector select, folding extracts of constant/build vectors,
/// shrinking inserts-into-zero and broadcasts, extracting directly from a
/// target shuffle's source, and finally re-creating conversions, extensions,
/// selects and truncates at the narrower width when the lowest subvector of a
/// single-use node is extracted.
///
/// \returns the replacement value, or an empty SDValue if no fold applies.
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  // Everything below relies on a simple (MVT) result type.
  if (!N->getValueType(0).isSimple())
    return SDValue();

  SDLoc DL(N);
  MVT VT = N->getSimpleValueType(0);
  SDValue InVec = N->getOperand(0);
  SDValue InVecBC = peekThroughBitcasts(InVec);
  // Remember the original wide type; several folds bitcast back to it.
  EVT InVecVT = InVec.getValueType();
  unsigned IdxVal = N->getConstantOperandVal(1);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  unsigned InSizeInBits = InVecVT.getSizeInBits();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
  // eventually get combined/lowered into ANDNP) with a concatenated operand,
  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
  // Generic combining then simplifies the insert/extract and the 'not'.
  // This pattern emerges during AVX1 legalization; it is handled before
  // lowering to avoid complications like splitting constant vector loads.
  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
      InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
    auto IsNotOfConcat = [](SDValue V) {
      V = peekThroughBitcasts(V);
      if (!isBitwiseNot(V))
        return false;
      SDValue Inverted = V->getOperand(0);
      return peekThroughBitcasts(Inverted).getOpcode() == ISD::CONCAT_VECTORS;
    };
    if (IsNotOfConcat(InVecBC.getOperand(0)) ||
        IsNotOfConcat(InVecBC.getOperand(1))) {
      // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
      SDValue Split = splitVectorIntBinary(InVecBC, DAG);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
                         DAG.getBitcast(InVecVT, Split), N->getOperand(1));
    }
  }

  // The remaining folds only run once operations have been legalized.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Narrowed = narrowExtractedVectorSelect(N, DAG))
    return Narrowed;

  // Extracting from an all-zeros or all-ones vector yields the same constant
  // at the narrower type.
  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
    return getZeroVector(VT, Subtarget, DAG, DL);

  if (ISD::isBuildVectorAllOnes(InVec.getNode()))
    return VT.getScalarType() == MVT::i1 ? DAG.getConstant(1, DL, VT)
                                         : getOnesVector(VT, DAG, DL);

  // Any other build vector: take the matching slice of its operands.
  if (InVec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(VT, DL,
                              InVec.getNode()->ops().slice(IdxVal, NumElts));

  // If we are extracting from an insert into a zero vector, replace with a
  // smaller insert into zero if we don't access less than the original
  // subvector. Don't do this for i1 vectors.
  if (VT.getVectorElementType() != MVT::i1 &&
      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
      InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
      ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
      InVec.getOperand(1).getValueSizeInBits() <= SizeInBits)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL),
                       InVec.getOperand(1), InVec.getOperand(2));

  // If we're extracting from a broadcast then we're better off just
  // broadcasting to the smaller type directly, assuming this is the only use.
  // As it's a broadcast we don't care about the extraction index.
  if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
      InVec.getOperand(0).getValueSizeInBits() <= SizeInBits)
    return DAG.getNode(X86ISD::VBROADCAST, DL, VT, InVec.getOperand(0));

  // Same idea for a single-use broadcast load: re-issue it at the narrow type
  // and reroute users of the old chain result to the new load's chain.
  if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
    auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
    if (MemIntr->getMemoryVT().getSizeInBits() <= SizeInBits) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
      SDValue NarrowBcast = DAG.getMemIntrinsicNode(
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
          MemIntr->getMemOperand());
      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
                                    NarrowBcast.getValue(1));
      return NarrowBcast;
    }
  }

  // If we're extracting an upper subvector from a broadcast we should just
  // extract the lowest subvector instead, which should allow
  // SimplifyDemandedVectorElts to do more simplifications.
  if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
    return extractSubVector(InVec, 0, DAG, DL, SizeInBits);

  // If we're extracting a broadcasted subvector, just use the source.
  if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
      InVec.getOperand(0).getValueType() == VT)
    return InVec.getOperand(0);

  // Attempt to extract from the source of a shuffle vector.
  if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumElts) == 0) {
    SmallVector<int, 32> ShuffleMask;
    SmallVector<int, 32> ScaledMask;
    SmallVector<SDValue, 2> ShuffleInputs;
    unsigned NumSubVecs = InSizeInBits / SizeInBits;
    // Decode the shuffle mask and scale it so it's shuffling subvectors.
    if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
        scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
      int M = ScaledMask[IdxVal / NumElts];
      if (M == SM_SentinelUndef)
        return DAG.getUNDEF(VT);
      if (M == SM_SentinelZero)
        return getZeroVector(VT, Subtarget, DAG, DL);
      // M selects subvector (M % NumSubVecs) of input (M / NumSubVecs).
      SDValue Src = ShuffleInputs[M / NumSubVecs];
      if (Src.getValueSizeInBits() == InSizeInBits) {
        unsigned SrcSubVecIdx = M % NumSubVecs;
        return extractSubVector(DAG.getBitcast(InVecVT, Src),
                                SrcSubVecIdx * NumElts, DAG, DL, SizeInBits);
      }
    }
  }

  // If we're extracting the lowest subvector and we're the only user,
  // we may be able to perform this with a smaller vector width.
  if (IdxVal != 0 || !InVec.hasOneUse())
    return SDValue();

  unsigned InOpcode = InVec.getOpcode();

  if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
    // v2f64 CVTDQ2PD(v4i32).
    if (InOpcode == ISD::SINT_TO_FP &&
        InVec.getOperand(0).getValueType() == MVT::v4i32)
      return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));

    // v2f64 CVTUDQ2PD(v4i32).
    if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
        InVec.getOperand(0).getValueType() == MVT::v4i32)
      return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));

    // v2f64 CVTPS2PD(v4f32).
    if (InOpcode == ISD::FP_EXTEND &&
        InVec.getOperand(0).getValueType() == MVT::v4f32)
      return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
  }

  // Any/zero/sign extension (plain or *_VECTOR_INREG): redo it as an
  // in-register extension from the low part of the source.
  bool IsExtension = false;
  switch (InOpcode) {
  case ISD::ANY_EXTEND:
  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    IsExtension = true;
    break;
  default:
    break;
  }
  if (IsExtension && (SizeInBits == 128 || SizeInBits == 256) &&
      InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
    SDValue ExtSrc = InVec.getOperand(0);
    if (ExtSrc.getValueSizeInBits() > SizeInBits)
      ExtSrc = extractSubVector(ExtSrc, 0, DAG, DL, SizeInBits);
    unsigned InRegOpcode = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
    return DAG.getNode(InRegOpcode, DL, VT, ExtSrc);
  }

  // A select whose three operands are all 256 bits wide: select on the low
  // 128 bits of each operand instead.
  if (InOpcode == ISD::VSELECT &&
      InVec.getOperand(0).getValueType().is256BitVector() &&
      InVec.getOperand(1).getValueType().is256BitVector() &&
      InVec.getOperand(2).getValueType().is256BitVector()) {
    SDValue LoCond = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
    SDValue LoTrue = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
    SDValue LoFalse = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
    return DAG.getNode(InOpcode, DL, VT, LoCond, LoTrue, LoFalse);
  }

  // With VLX, truncate only the part of the source that produces the
  // extracted elements.
  if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
      (VT.is128BitVector() || VT.is256BitVector())) {
    SDValue TruncSrc = InVec.getOperand(0);
    unsigned Scale = TruncSrc.getValueSizeInBits() / InSizeInBits;
    SDValue LoSrc = extractSubVector(TruncSrc, 0, DAG, DL, Scale * SizeInBits);
    return DAG.getNode(InOpcode, DL, VT, LoSrc);
  }

  return SDValue();
}

49133

49134

/// DAG combine for SCALAR_TO_VECTOR nodes.
///
/// Handles three cases: simplifying v1i1 results built from a single-use
/// `and X, 1` or from element 0 of an i1 vector, shrinking v2i64/v2f64 results
/// whose scalar only carries 32 meaningful bits to a v4i32 scalar_to_vector,
/// and turning a v2i64 built from a bitcast MMX value into MOVQ2DQ.
///
/// \returns the replacement value, or an empty SDValue if no fold applies.
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  if (VT == MVT::v1i1 && Src.hasOneUse()) {
    unsigned SrcOpc = Src.getOpcode();
    if (SrcOpc == ISD::AND) {
      // If this is a scalar to vector to v1i1 from an AND with 1, bypass the
      // and. This occurs frequently in our masked scalar intrinsic code and our
      // floating point select lowering with AVX512.
      // TODO: SimplifyDemandedBits instead?
      if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
        if (C->getAPIntValue().isOneValue())
          return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
                             Src.getOperand(0));
    } else if (SrcOpc == ISD::EXTRACT_VECTOR_ELT) {
      // Combine scalar_to_vector of an extract_vector_elt (element 0 of an i1
      // vector) into an extract_subvector.
      EVT ExtractedFromVT = Src.getOperand(0).getValueType();
      if (ExtractedFromVT.isVector() &&
          ExtractedFromVT.getVectorElementType() == MVT::i1)
        if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
          if (C->isNullValue())
            return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
                               Src.getOperand(0), Src.getOperand(1));
    }
  }

  // Reduce v2i64 to v4i32 if we don't need the upper bits.
  // TODO: Move to DAGCombine/SimplifyDemandedBits?
  if (VT == MVT::v2i64 || VT == MVT::v2f64) {
    // Returns the value to re-materialize as i32 when Op is a single-use i64
    // whose upper half is unspecified: either the operand of an any_extend
    // from <= 32 bits, or Op itself when it is an any-extending load of
    // <= 32 bits. Returns an empty SDValue otherwise.
    auto GetNarrowSource = [](SDValue Op) {
      if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
        return SDValue();
      if (Op.getOpcode() == ISD::ANY_EXTEND &&
          Op.getOperand(0).getScalarValueSizeInBits() <= 32)
        return Op.getOperand(0);
      if (auto *Ld = dyn_cast<LoadSDNode>(Op))
        if (Ld->getExtensionType() == ISD::EXTLOAD &&
            Ld->getMemoryVT().getScalarSizeInBits() <= 32)
          return Op;
      return SDValue();
    };

    if (SDValue Narrow = GetNarrowSource(peekThroughOneUseBitcasts(Src))) {
      SDValue Scalar32 = DAG.getAnyExtOrTrunc(Narrow, DL, MVT::i32);
      SDValue Vec32 =
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Scalar32);
      return DAG.getBitcast(VT, Vec32);
    }
  }

  // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
      Src.getOperand(0).getValueType() == MVT::x86mmx)
    return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));

  return SDValue();
}

49186

49187

// Simplify PMULDQ and PMULUDQ operations.

49188

static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,

49189

TargetLowering::DAGCombinerInfo &DCI,

49190

const X86Subtarget &Subtarget) {

49191

SDValue LHS = N->getOperand(0);

49192

SDValue RHS = N->getOperand(1);

49193

49194

// Canonicalize constant to RHS.

49195

if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&

49196

!DAG.isConstantIntBuildVectorOrConstantInt(RHS))

49197

return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);

49198

49199

// Multiply by zero.

49200

// Don't return RHS as it may contain UNDEFs.

49201

if (ISD::isBuildVectorAllZeros(RHS.getNode()))

49202

return DAG.getConstant(0, SDLoc(N), N->getValueType(0));

49203

49204

// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.

49205

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

49206

if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))

49207

return SDValue(N, 0);

49208

49209

// If the input is an extend_invec and the SimplifyDemandedBits call didn't

49210

// convert it to any_extend_invec, due to the LegalOperations check, do the

49211

// conversion directly to a vector shuffle manually. This exposes combine

49212

// opportunities missed by combineExtInVec not calling

49213

// combineX86ShufflesRecursively on SSE4.1 targets.

49214

// FIXME: This is basically a hack around several other issues related to

49215

// ANY_EXTEND_VECTOR_INREG.

49216

if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&

49217

(LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||

49218

LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&

49219

LHS.getOperand(0).getValueType() == MVT::v4i32) {

49220

SDLoc dl(N);

49221

LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),

49222

LHS.getOperand(0), { 0, -1, 1, -1 });

49223

LHS = DAG.getBitcast(MVT::v2i64, LHS);

49224

return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);

49225

}

49226

if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&

49227

(RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||

49228

RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&

49229

RHS.getOperand(0).getValueType() == MVT::v4i32) {

49230

SDLoc dl(N);

49231

RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),

49232

RHS.getOperand(0), { 0, -1, 1, -1 });

49233

RHS = DAG.getBitcast(MVT::v2i64, RHS);

49234

return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);

49235

}

49236

49237

return SDValue();

49238

}

49239

49240

/// DAG combine for the *_EXTEND_VECTOR_INREG nodes.
///
/// After operation legalization, an extension of a single-use simple normal
/// load is merged into one extending load when the target reports that
/// extload as legal. Otherwise, for any-extends (and zero-extends when AVX is
/// available) with legal types, the node is offered to the recursive shuffle
/// combiner.
///
/// \returns the replacement value, or an empty SDValue if no fold applies.
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  SDValue In = N->getOperand(0);

  // Try to merge vector loads and extend_inreg to an extload.
  bool IsFoldableLoad = !DCI.isBeforeLegalizeOps() &&
                        ISD::isNormalLoad(In.getNode()) && In.hasOneUse();
  if (IsFoldableLoad && cast<LoadSDNode>(In)->isSimple()) {
    auto *Ld = cast<LoadSDNode>(In);
    // Sign extension maps to SEXTLOAD; every other opcode uses ZEXTLOAD.
    ISD::LoadExtType Ext = ISD::ZEXTLOAD;
    if (Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
      Ext = ISD::SEXTLOAD;

    // Memory type: the source element type with the result's element count.
    MVT SVT = In.getSimpleValueType().getVectorElementType();
    EVT MemVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
    if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
      SDValue ExtLoad =
          DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
                         Ld->getMemOperand()->getFlags());
      // Users of the old load's chain now hang off the new load's chain.
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), ExtLoad.getValue(1));
      return ExtLoad;
    }
  }

  // Attempt to combine as a shuffle.
  // TODO: SSE ZERO_EXTEND_VECTOR_INREG support.
  bool IsShuffleCandidate =
      Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
      (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasAVX());
  if (IsShuffleCandidate && TLI.isTypeLegal(VT) &&
      TLI.isTypeLegal(In.getValueType())) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}

49282

49283

/// DAG combine for the KSHIFT nodes.
///
/// Shifting an all-zeros build vector folds to a zero constant; otherwise the
/// node is handed to SimplifyDemandedVectorElts with every element demanded.
///
/// \returns the zero constant, N itself when it was simplified in place, or an
/// empty SDValue if nothing changed.
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // Shifting zero gives zero.
  SDValue ShiftedVec = N->getOperand(0);
  if (ISD::isBuildVectorAllZeros(ShiftedVec.getNode()))
    return DAG.getConstant(0, SDLoc(N), VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt KnownUndef, KnownZero;
  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  bool Simplified = TLI.SimplifyDemandedVectorElts(
      SDValue(N, 0), AllElts, KnownUndef, KnownZero, DCI);
  return Simplified ? SDValue(N, 0) : SDValue();
}

49299

49300

// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.

49301

// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce

49302

// extra instructions between the conversion due to going to scalar and back.

49303

static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,

49304

const X86Subtarget &Subtarget) {

49305

if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())

49306

return SDValue();

49307

49308

if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)

49309

return SDValue();

49310

49311

if (N->getValueType(0) != MVT::f32 ||

49312

N->getOperand(0).getOperand(0).getValueType() != MVT::f32)

49313

return SDValue();

49314

49315

SDLoc dl(N);

49316

SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,

49317

N->getOperand(0).getOperand(0));

49318

Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,

49319

DAG.getTargetConstant(4, dl, MVT::i32));

49320

Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);

49321

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,

49322

DAG.getIntPtrConstant(0, dl));

49323

}

49324

49325

/// DAG combine for (STRICT_)FP_EXTEND from a vector of f16.
///
/// With F16C and no soft-float, a vXf16 -> vXf32/vXf64 extension (X a power of
/// two greater than one) is rewritten as: bitcast to vXi16, widen to at least
/// eight i16 lanes, CVTPH2PS, trim back to the real element count, and a final
/// extension to the requested type. The strict form threads the chain through
/// each step and returns {value, chain} merged.
///
/// \returns the replacement value, or an empty SDValue if the pattern does not
/// apply.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
    return SDValue();

  // Strict nodes carry the chain as operand 0 and the value as operand 1.
  bool IsStrict = N->isStrictFPOpcode();
  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
    return SDValue();

  EVT DstEltVT = VT.getVectorElementType();
  if (DstEltVT != MVT::f32 && DstEltVT != MVT::f64)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 1 || !isPowerOf2_32(NumElts))
    return SDValue();

  SDLoc dl(N);

  // Convert the input to vXi16.
  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  Src = DAG.getBitcast(IntVT, Src);

  // Widen to at least 8 input elements. A 4-element source is padded with
  // undef, narrower sources with zeros.
  if (NumElts < 8) {
    unsigned NumConcats = 8 / NumElts;
    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
                                : DAG.getConstant(0, dl, IntVT);
    SmallVector<SDValue, 4> Ops;
    Ops.push_back(Src);
    Ops.append(NumConcats - 1, Fill);
    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
  }

  // Destination is vXf32 with at least 4 elements.
  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
                               std::max(4U, NumElts));
  SDValue Cvt, Chain;
  if (IsStrict) {
    Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
                      {N->getOperand(0), Src});
    Chain = Cvt.getValue(1);
  } else {
    Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
  }

  // The conversion produced four lanes; keep the low two if that is all that
  // was asked for.
  if (NumElts < 4) {
    assert(NumElts == 2 && "Unexpected size");
    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Extend to the original VT if necessary.
  if (!IsStrict)
    return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);

  if (Cvt.getValueType() != VT) {
    Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
                      {Chain, Cvt});
    Chain = Cvt.getValue(1);
  }
  return DAG.getMergeValues({Cvt, Chain}, dl);
}

49393

49394

// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to

49395

// cases where the loads have the same input chain and the output chains are

49396

// unused. This avoids any memory ordering issues.

49397

static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,

49398

TargetLowering::DAGCombinerInfo &DCI) {

49399

// Only do this if the chain result is unused.

49400

if (N->hasAnyUseOfValue(1))

49401

return SDValue();

49402

49403

auto *MemIntrin = cast<MemIntrinsicSDNode>(N);

49404

49405

SDValue Ptr = MemIntrin->getBasePtr();

49406

SDValue Chain = MemIntrin->getChain();

49407

EVT VT = N->getSimpleValueType(0);

49408

EVT MemVT = MemIntrin->getMemoryVT();

49409

49410

// Look at other users of our base pointer and try to find a wider broadcast.

49411

// The input chain and the size of the memory VT must match.

49412

for (SDNode *User : Ptr->uses())

49413

if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&

49414

cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&

49415

cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&

49416

cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==

49417

MemVT.getSizeInBits() &&

49418

!User->hasAnyUseOfValue(1) &&

49419

User->getValueSizeInBits(0) > VT.getSizeInBits()) {

49420

SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),

49421

VT.getSizeInBits());

49422

Extract = DAG.getBitcast(VT, Extract);

49423

return DCI.CombineTo(N, Extract, SDValue(User, 1));

49424

}

49425

49426

return SDValue();

49427

}

49428

49429

/// DAG combine for FP_ROUND from a vector of f32 to a vector of f16.
///
/// With F16C and no soft-float, a vXf32 -> vXf16 round (X a power of two
/// greater than one) becomes: widen the source to at least four f32 lanes,
/// CVTPS2PH into at least eight i16 lanes, extract the real number of lanes,
/// and bitcast to the f16 result type.
///
/// \returns the replacement value, or an empty SDValue if the pattern does not
/// apply.
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
    return SDValue();

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT VT = N->getValueType(0);

  bool IsVecF32ToF16 = VT.isVector() &&
                       VT.getVectorElementType() == MVT::f16 &&
                       SrcVT.getVectorElementType() == MVT::f32;
  if (!IsVecF32ToF16)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 1 || !isPowerOf2_32(NumElts))
    return SDValue();

  SDLoc dl(N);

  // Widen to at least 4 input elements, padding with a zero vector.
  if (NumElts < 4) {
    SDValue ZeroPad = DAG.getConstantFP(0.0, dl, SrcVT);
    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, ZeroPad);
  }

  // Destination is vXi16 with at least 8 elements.
  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                               std::max(8U, NumElts));
  // NOTE(review): immediate 4 presumably selects the MXCSR rounding mode --
  // confirm against the VCVTPS2PH documentation.
  SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
                            DAG.getTargetConstant(4, dl, MVT::i32));

  // Extract down to real number of elements.
  if (NumElts < 8) {
    EVT IntVT = VT.changeVectorElementTypeToInteger();
    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
                      DAG.getIntPtrConstant(0, dl));
  }

  return DAG.getBitcast(VT, Cvt);
}

49468

49469

/// DAG combine for MOVDQ2Q: turn MOVDQ2Q of a single-use, simple, normal load
/// into a direct x86mmx load from the same address.
///
/// Users of the original load's chain are redirected to the new load's chain.
///
/// \returns the new load, or an empty SDValue if the operand does not qualify.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
  SDValue Src = N->getOperand(0);

  // Only a normal load with no other users can be replaced.
  if (!ISD::isNormalLoad(Src.getNode()) || !Src.hasOneUse())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
  if (!LN->isSimple())
    return SDValue();

  SDValue MMXLoad =
      DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
                  LN->getPointerInfo(), LN->getOriginalAlign(),
                  LN->getMemOperand()->getFlags());
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), MMXLoad.getValue(1));
  return MMXLoad;
}

49489

49490

/// Target hook for DAG combining: dispatch on the opcode of \p N to the
/// matching file-local combine helper. Opcodes without a helper produce an
/// empty SDValue.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // --- Vector build / extract / insert ---
  case ISD::SCALAR_TO_VECTOR:
    return combineScalarToVector(N, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::CONCAT_VECTORS:
    return combineConcatVectors(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::EXTRACT_SUBVECTOR:
    return combineExtractSubvector(N, DAG, DCI, Subtarget);

  // --- Selects, bitcasts, flag users ---
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::BLENDV:
    return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:
    return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:
    return combineCMov(N, DAG, DCI, Subtarget);
  case X86ISD::CMP:
    return combineCMP(N, DAG);

  // --- Integer arithmetic, shifts and logic ---
  case ISD::ADD:
    return combineAdd(N, DAG, DCI, Subtarget);
  case ISD::SUB:
    return combineSub(N, DAG, DCI, Subtarget);
  case X86ISD::ADD:
  case X86ISD::SUB:
    return combineX86AddSub(N, DAG, DCI);
  case X86ISD::SBB:
    return combineSBB(N, DAG);
  case X86ISD::ADC:
    return combineADC(N, DAG, DCI);
  case ISD::MUL:
    return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
    return combineShiftLeft(N, DAG);
  case ISD::SRA:
    return combineShiftRightArithmetic(N, DAG, Subtarget);
  case ISD::SRL:
    return combineShiftRightLogical(N, DAG, DCI, Subtarget);
  case ISD::AND:
    return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:
    return combineXor(N, DAG, DCI, Subtarget);
  case X86ISD::BEXTR:
    return combineBEXTR(N, DAG, DCI, Subtarget);

  // --- Memory operations ---
  case ISD::LOAD:
    return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:
    return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:
    return combineStore(N, DAG, DCI, Subtarget);
  case ISD::MSTORE:
    return combineMaskedStore(N, DAG, DCI, Subtarget);
  case X86ISD::VEXTRACT_STORE:
    return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);

  // --- Integer <-> floating point and FP arithmetic ---
  case ISD::SINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
    return combineSIntToFP(N, DAG, DCI, Subtarget);
  case ISD::UINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
    return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:
    return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:
    return combineFneg(N, DAG, DCI, Subtarget);
  case ISD::TRUNCATE:
    return combineTruncate(N, DAG, Subtarget);
  case X86ISD::VTRUNC:
    return combineVTRUNC(N, DAG, DCI);
  case X86ISD::ANDNP:
    return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:
    return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:
    return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:
    return combineFOr(N, DAG, DCI, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:
    return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::CVTSI2P:
  case X86ISD::CVTUI2P:
    return combineX86INT_TO_FP(N, DAG, DCI);
  case X86ISD::CVTP2SI:
  case X86ISD::CVTP2UI:
  case X86ISD::STRICT_CVTTP2SI:
  case X86ISD::CVTTP2SI:
  case X86ISD::STRICT_CVTTP2UI:
  case X86ISD::CVTTP2UI:
    return combineCVTP2I_CVTTP2I(N, DAG, DCI);
  case X86ISD::STRICT_CVTPH2PS:
  case X86ISD::CVTPH2PS:
    return combineCVTPH2PS(N, DAG, DCI);

  // --- Bit tests, extensions, comparisons and branches ---
  case X86ISD::BT:
    return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    return combineExtInVec(N, DAG, DCI, Subtarget);
  case ISD::SETCC:
    return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:
    return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:
    return combineBrCond(N, DAG, Subtarget);

  // --- Packs, horizontal ops, vector shifts and element inserts ---
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
    return combineVectorPack(N, DAG, DCI, Subtarget);
  case X86ISD::HADD:
  case X86ISD::HSUB:
  case X86ISD::FHADD:
  case X86ISD::FHSUB:
    return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
  case X86ISD::VSHL:
  case X86ISD::VSRA:
  case X86ISD::VSRL:
    return combineVectorShiftVar(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT:
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
    return combineVectorInsert(N, DAG, DCI, Subtarget);

  // --- All target-specific shuffles plus the generic vector shuffle ---
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::VALIGN:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VBROADCAST:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::SHUF128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE:
    return combineShuffle(N, DAG, DCI, Subtarget);

  // --- Fused multiply-add family ---
  case X86ISD::FMADD_RND:
  case X86ISD::FMSUB:
  case X86ISD::STRICT_FMSUB:
  case X86ISD::FMSUB_RND:
  case X86ISD::FNMADD:
  case X86ISD::STRICT_FNMADD:
  case X86ISD::FNMADD_RND:
  case X86ISD::FNMSUB:
  case X86ISD::STRICT_FNMSUB:
  case X86ISD::FNMSUB_RND:
  case ISD::FMA:
  case ISD::STRICT_FMA:
    return combineFMA(N, DAG, DCI, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:
  case X86ISD::FMSUBADD:
    return combineFMADDSUB(N, DAG, DCI);

  // --- Masks, gathers/scatters, compares, multiplies, mask shifts ---
  case X86ISD::MOVMSK:
    return combineMOVMSK(N, DAG, DCI, Subtarget);
  case X86ISD::MGATHER:
  case X86ISD::MSCATTER:
    return combineX86GatherScatter(N, DAG, DCI);
  case ISD::MGATHER:
  case ISD::MSCATTER:
    return combineGatherScatter(N, DAG, DCI);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:
    return combineVectorCompare(N, DAG, Subtarget);
  case X86ISD::PMULDQ:
  case X86ISD::PMULUDQ:
    return combinePMULDQ(N, DAG, DCI, Subtarget);
  case X86ISD::KSHIFTL:
  case X86ISD::KSHIFTR:
    return combineKSHIFT(N, DAG, DCI);

  // --- Half-precision and remaining conversions ---
  case ISD::FP16_TO_FP:
    return combineFP16_TO_FP(N, DAG, Subtarget);
  case ISD::STRICT_FP_EXTEND:
  case ISD::FP_EXTEND:
    return combineFP_EXTEND(N, DAG, Subtarget);
  case ISD::FP_ROUND:
    return combineFP_ROUND(N, DAG, Subtarget);
  case X86ISD::VBROADCAST_LOAD:
    return combineVBROADCAST_LOAD(N, DAG, DCI);
  case X86ISD::MOVDQ2Q:
    return combineMOVDQ2Q(N, DAG);

  default:
    break;
  }

  // No combine is registered for this opcode.
  return SDValue();
}

49664

49665

bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {

49666

if (!isTypeLegal(VT))

49667

return false;

49668

49669

// There are no vXi8 shifts.

49670

if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)

49671

return false;

49672

49673

// TODO: Almost no 8-bit ops are desirable because they have no actual

49674

// size/speed advantages vs. 32-bit ops, but they do have a major

49675

// potential disadvantage by causing partial register stalls.

49676

//

49677

// 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and

49678

// we have specializations to turn 32-bit multiply/shl into LEA or other ops.

49679

// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally

49680

// check for a constant operand to the multiply.

49681

if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)

49682

return false;

49683

49684

// i16 instruction encodings are longer and some i16 instructions are slow,

49685

// so those are not desirable.

49686

if (VT == MVT::i16) {

49687

switch (Opc) {

49688

default:

49689

break;

49690

case ISD::LOAD:

49691

case ISD::SIGN_EXTEND:

49692

case ISD::ZERO_EXTEND:

49693

case ISD::ANY_EXTEND:

49694

case ISD::SHL:

49695

case ISD::SRA:

49696

case ISD::SRL:

49697

case ISD::SUB:

49698

case ISD::ADD:

49699

case ISD::MUL:

49700

case ISD::AND:

49701

case ISD::OR:

49702

case ISD::XOR:

49703

return false;

49704

}

49705

}

49706

49707

// Any legal type not explicitly accounted for above here is desirable.

49708

return true;

49709

}

49710

49711

/// Expand an indirect jump-table branch. When the module carries the
/// "cf-protection-branch" flag an X86ISD::NT_BRIND node is emitted; otherwise
/// the generic TargetLowering expansion is used.
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
                                                  SDValue Value, SDValue Addr,
                                                  SelectionDAG &DAG) const {
  const Module *Mod = DAG.getMachineFunction().getMMI().getModule();

  // Without the control-flow protection flag there is nothing special to do.
  if (!Mod->getModuleFlag("cf-protection-branch"))
    return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);

  // With control-flow branch protection enabled the indirect branch needs a
  // notrack prefix. Create an NT_BRIND node; instruction selection turns it
  // into a jmp carrying the NoTrack prefix.
  return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}

49726

49727

/// Decide whether \p Op should be promoted to a wider type. On success
/// \p PVT is set to MVT::i32 and true is returned; \p PVT is left untouched
/// when false is returned.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  const EVT VT = Op.getValueType();
  const unsigned Opc = Op.getOpcode();

  // i16 is legal, but undesirable since i16 instruction encodings are longer
  // and some i16 instructions are slow.
  // 8-bit multiply-by-constant can usually be expanded to something cheaper
  // using LEA and/or other ALU ops.
  const bool IsI8MulByImm = VT == MVT::i8 && Opc == ISD::MUL &&
                            isa<ConstantSDNode>(Op.getOperand(1));
  if (!IsI8MulByImm && VT != MVT::i16)
    return false;

  // True when the single user of Op is a normal store to the same base
  // pointer that Load reads from.
  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
    if (!Op.hasOneUse())
      return false;
    SDNode *OnlyUser = *Op->use_begin();
    if (!ISD::isNormalStore(OnlyUser))
      return false;
    auto *LoadNode = cast<LoadSDNode>(Load);
    auto *StoreNode = cast<StoreSDNode>(OnlyUser);
    return LoadNode->getBasePtr() == StoreNode->getBasePtr();
  };

  // Same idea for a single-use ATOMIC_LOAD feeding, through Op, a single
  // ATOMIC_STORE to the same base pointer.
  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
      return false;
    if (!Op.hasOneUse())
      return false;
    SDNode *OnlyUser = *Op->use_begin();
    if (OnlyUser->getOpcode() != ISD::ATOMIC_STORE)
      return false;
    auto *LoadNode = cast<AtomicSDNode>(Load);
    auto *StoreNode = cast<AtomicSDNode>(OnlyUser);
    return LoadNode->getBasePtr() == StoreNode->getBasePtr();
  };

  switch (Opc) {
  default:
    return false;

  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    break;

  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Look out for (store (shl (load), x)).
    SDValue Shifted = Op.getOperand(0);
    if (MayFoldLoad(Shifted) && IsFoldableRMW(Shifted, Op))
      return false;
    break;
  }

  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SUB: {
    // Every opcode in this group except SUB is treated as commutable.
    const bool Commutable = Opc != ISD::SUB;
    const bool IsMul = Opc == ISD::MUL;
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);

    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(RHS) &&
        (!Commutable || !isa<ConstantSDNode>(LHS) ||
         (!IsMul && IsFoldableRMW(RHS, Op))))
      return false;
    if (MayFoldLoad(LHS) &&
        ((Commutable && !isa<ConstantSDNode>(RHS)) ||
         (!IsMul && IsFoldableRMW(LHS, Op))))
      return false;
    if (IsFoldableAtomicRMW(LHS, Op) ||
        (Commutable && IsFoldableAtomicRMW(RHS, Op)))
      return false;
    break;
  }
  }

  PVT = MVT::i32;
  return true;
}

49807

49808

//===----------------------------------------------------------------------===//

49809

// X86 Inline Assembly Support

49810

//===----------------------------------------------------------------------===//

49811

49812

// Helper to match a string separated by whitespace.

49813

static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {

49814

S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

49815

49816

for (StringRef Piece : Pieces) {

49817

if (!S.startswith(Piece)) // Check if the piece matches.

49818

return false;

49819

49820

S = S.substr(Piece.size());

49821

StringRef::size_type Pos = S.find_first_not_of(" \t");

49822

if (Pos == 0) // We matched a prefix.

49823

return false;

49824

49825

S = S.substr(Pos);

49826

}

49827

49828

return S.empty();

49829

}

49830

49831

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

49832

49833

if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {

49834

if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&

49835

std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&

49836

std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

49837

49838

if (AsmPieces.size() == 3)

49839

return true;

49840

else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))

49841

return true;

49842

}

49843

}

49844

return false;

49845

}

49846

49847

/// Recognise a few hand-written byte-swap inline-asm idioms and replace the
/// call \p CI through IntrinsicLowering::LowerToByteSwap. Returns that
/// call's result when an idiom is recognised, and false otherwise.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  const std::string &AsmStr = IA->getAsmString();

  // Only integer results whose width is a multiple of 16 bits qualify.
  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  // True when the constraint string starts with "=r,0,".
  auto HasRegInOutPrefix = [IA]() {
    return IA->getConstraintString().compare(0, 5, "=r,0,") == 0;
  };

  // Replace AsmPieces with the sorted constraints following the "=r,0,"
  // prefix and test whether they are the expected flag clobbers.
  auto RemainingConstraintsClobberFlags = [&]() {
    AsmPieces.clear();
    StringRef ConstraintsStr = IA->getConstraintString();
    SplitString(ConstraintsStr.substr(5), AsmPieces, ",");
    array_pod_sort(AsmPieces.begin(), AsmPieces.end());
    return clobbersFlagRegisters(AsmPieces);
  };

  const size_t NumInsns = AsmPieces.size();

  if (NumInsns == 1) {
    StringRef Insn = AsmPieces[0];

    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    for (const char *Operand : {"$0", "${0:q}"})
      for (const char *Mnemonic : {"bswap", "bswapl", "bswapq"})
        if (matchAsm(Insn, {Mnemonic, Operand})) {
          // No need to check constraints, nothing other than the equivalent
          // of "=r,0" would be valid here.
          return IntrinsicLowering::LowerToByteSwap(CI);
        }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) && HasRegInOutPrefix() &&
        (matchAsm(Insn, {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(Insn, {"rolw", "$$8,", "${0:w}"})) &&
        RemainingConstraintsClobberFlags())
      return IntrinsicLowering::LowerToByteSwap(CI);

    return false;
  }

  if (NumInsns != 3)
    return false;

  // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} on a 32-bit value.
  if (CI->getType()->isIntegerTy(32) && HasRegInOutPrefix() &&
      matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
      matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
      matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"}) &&
      RemainingConstraintsClobberFlags())
    return IntrinsicLowering::LowerToByteSwap(CI);

  if (!CI->getType()->isIntegerTy(64))
    return false;

  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
  const bool IsAInOut =
      Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 &&
      Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 &&
      Constraints[1].Codes[0] == "0";
  if (!IsAInOut)
    return false;

  // bswap %eax / bswap %edx / xchgl %eax, %edx  ->  llvm.bswap.i64
  if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
      matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
      matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
    return IntrinsicLowering::LowerToByteSwap(CI);

  return false;
}

49922

49923

/// Translate a flag-output constraint of the form "{@cc<cond>}" into the
/// X86 condition code it names.
///
/// Synonymous spellings map to the same code (e.g. "c", "b" and "nae" all
/// yield COND_B). Every negated spelling maps to the opposite condition of
/// its positive form; in particular "{@ccnp}" yields COND_NP, not COND_P.
///
/// \returns the matching condition, or X86::COND_INVALID if \p Constraint is
/// not one of the recognised flag-output constraints.
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
  return StringSwitch<X86::CondCode>(Constraint)
      .Case("{@cca}", X86::COND_A)
      .Case("{@ccae}", X86::COND_AE)
      .Case("{@ccb}", X86::COND_B)
      .Case("{@ccbe}", X86::COND_BE)
      .Case("{@ccc}", X86::COND_B)
      .Case("{@cce}", X86::COND_E)
      .Case("{@ccz}", X86::COND_E)
      .Case("{@ccg}", X86::COND_G)
      .Case("{@ccge}", X86::COND_GE)
      .Case("{@ccl}", X86::COND_L)
      .Case("{@ccle}", X86::COND_LE)
      .Case("{@ccna}", X86::COND_BE)
      .Case("{@ccnae}", X86::COND_B)
      .Case("{@ccnb}", X86::COND_AE)
      .Case("{@ccnbe}", X86::COND_A)
      .Case("{@ccnc}", X86::COND_AE)
      .Case("{@ccne}", X86::COND_NE)
      .Case("{@ccnz}", X86::COND_NE)
      .Case("{@ccng}", X86::COND_LE)
      .Case("{@ccnge}", X86::COND_L)
      .Case("{@ccnl}", X86::COND_GE)
      .Case("{@ccnle}", X86::COND_G)
      .Case("{@ccno}", X86::COND_NO)
      // "np" is the negation of "p": it must not share COND_P with "{@ccp}".
      .Case("{@ccnp}", X86::COND_NP)
      .Case("{@ccns}", X86::COND_NS)
      .Case("{@cco}", X86::COND_O)
      .Case("{@ccp}", X86::COND_P)
      .Case("{@ccs}", X86::COND_S)
      .Default(X86::COND_INVALID);
}

49956

49957

/// Given a constraint letter, return the type of constraint for this target.

49958

X86TargetLowering::ConstraintType

49959

X86TargetLowering::getConstraintType(StringRef Constraint) const {

49960

if (Constraint.size() == 1) {

49961

switch (Constraint[0]) {

49962

case 'R':

49963

case 'q':

49964

case 'Q':

49965

case 'f':

49966

case 't':

49967

case 'u':

49968

case 'y':

49969

case 'x':

49970

case 'v':

49971

case 'l':

49972

case 'k': // AVX512 masking registers.

49973

return C_RegisterClass;

49974

case 'a':

49975

case 'b':

49976

case 'c':

49977

case 'd':

49978

case 'S':

49979

case 'D':

49980

case 'A':

49981

return C_Register;

49982

case 'I':

49983

case 'J':

49984

case 'K':

49985

case 'N':

49986

case 'G':

49987

case 'L':

49988

case 'M':

49989

return C_Immediate;

49990

case 'C':

49991

case 'e':

49992

case 'Z':

49993

return C_Other;

49994

default:

49995

break;

49996

}

49997

}

49998

else if (Constraint.size() == 2) {

49999

switch (Constraint[0]) {

50000

default:

50001

break;

50002

case 'Y':

50003

switch (Constraint[1]) {

50004

default:

50005

break;

50006

case 'z':

50007

return C_Register;

50008

case 'i':

50009

case 'm':

50010

case 'k':

50011

case 't':

50012

case '2':

50013

return C_RegisterClass;

50014

}

50015

}

50016

} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)

50017

return C_Other;

50018

return TargetLowering::getConstraintType(Constraint);

50019

}

50020

50021

/// Examine constraint type and operand type and determine a weight value.

50022

/// This object must already have been set up with the operand type

50023

/// and the current alternative constraint selected.

50024

TargetLowering::ConstraintWeight

50025

X86TargetLowering::getSingleConstraintMatchWeight(

50026

AsmOperandInfo &info, const char *constraint) const {

50027

ConstraintWeight weight = CW_Invalid;

50028

Value *CallOperandVal = info.CallOperandVal;

50029

// If we don't have a value, we can't do a match,

50030

// but allow it at the lowest weight.

50031

if (!CallOperandVal)

50032

return CW_Default;

50033

Type *type = CallOperandVal->getType();

50034

// Look at the constraint type.

50035

switch (*constraint) {

50036

default:

50037

weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);

50038

LLVM_FALLTHROUGH[[gnu::fallthrough]];

50039

case 'R':

50040

case 'q':

50041

case 'Q':

50042

case 'a':

50043

case 'b':

50044

case 'c':

50045

case 'd':

50046

case 'S':

50047

case 'D':

50048

case 'A':

50049

if (CallOperandVal->getType()->isIntegerTy())

50050

weight = CW_SpecificReg;

50051

break;

50052

case 'f':

50053

case 't':

50054

case 'u':

50055

if (type->isFloatingPointTy())

50056

weight = CW_SpecificReg;

50057

break;

50058

case 'y':

50059

if (type->isX86_MMXTy() && Subtarget.hasMMX())

50060

weight = CW_SpecificReg;

50061

break;

50062

case 'Y':

50063

if (StringRef(constraint).size() != 2)

50064

break;

50065

switch (constraint[1]) {

50066

default:

50067

return CW_Invalid;

50068

// XMM0

50069

case 'z':

50070

if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||

50071

((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||

50072

((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))

50073

return CW_SpecificReg;

50074

return CW_Invalid;

50075

// Conditional OpMask regs (AVX512)

50076

case 'k':

50077

if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())

50078

return CW_Register;

50079

return CW_Invalid;

50080

// Any MMX reg

50081

case 'm':

50082

if (type->isX86_MMXTy() && Subtarget.hasMMX())

50083

return weight;

50084

return CW_Invalid;

50085

// Any SSE reg when ISA >= SSE2, same as 'x'

50086

case 'i':

50087

case 't':

50088

case '2':

50089

if (!Subtarget.hasSSE2())

50090

return CW_Invalid;

50091

break;

50092

}

50093

break;

50094

case 'v':

50095

if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())

50096

weight = CW_Register;

50097

LLVM_FALLTHROUGH[[gnu::fallthrough]];

50098

case 'x':

50099

if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||

50100

((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))

50101

weight = CW_Register;

50102

break;

50103

case 'k':

50104

// Enable conditional vector operations using %k<#> registers.

50105

if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())

50106

weight = CW_Register;

50107

break;

50108

case 'I':

50109

if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {

50110

if (C->getZExtValue() <= 31)

50111

weight = CW_Constant;

50112

}

50113

break;

50114

case 'J':

50115

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50116

if (C->getZExtValue() <= 63)

50117

weight = CW_Constant;

50118

}

50119

break;

50120

case 'K':

50121

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50122

if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))

50123

weight = CW_Constant;

50124

}

50125

break;

50126

case 'L':

50127

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50128

if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))

50129

weight = CW_Constant;

50130

}

50131

break;

50132

case 'M':

50133

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50134

if (C->getZExtValue() <= 3)

50135

weight = CW_Constant;

50136

}

50137

break;

50138

case 'N':

50139

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50140

if (C->getZExtValue() <= 0xff)

50141

weight = CW_Constant;

50142

}

50143

break;

50144

case 'G':

50145

case 'C':

50146

if (isa<ConstantFP>(CallOperandVal)) {

50147

weight = CW_Constant;

50148

}

50149

break;

50150

case 'e':

50151

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50152

if ((C->getSExtValue() >= -0x80000000LL) &&

50153

(C->getSExtValue() <= 0x7fffffffLL))

50154

weight = CW_Constant;

50155

}

50156

break;

50157

case 'Z':

50158

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

50159

if (C->getZExtValue() <= 0xffffffff)

50160

weight = CW_Constant;

50161

}

50162

break;

50163

}

50164

return weight;

50165

}

50166

50167

/// Try to replace an X constraint, which matches anything, with another that

50168

/// has more specific requirements based on the type of the corresponding

50169

/// operand.

50170

const char *X86TargetLowering::

50171

LowerXConstraint(EVT ConstraintVT) const {

50172

// FP X constraints get lowered to SSE1/2 registers if available, otherwise

50173

// 'f' like normal targets.

50174

if (ConstraintVT.isFloatingPoint()) {

50175

if (Subtarget.hasSSE1())

50176

return "x";

50177

}

50178

50179

return TargetLowering::LowerXConstraint(ConstraintVT);

50180

}

50181

50182

// Lower @cc targets via setcc.

50183

SDValue X86TargetLowering::LowerAsmOutputForConstraint(

50184

SDValue &Chain, SDValue &Flag, const SDLoc &DL,

50185

const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {

50186

X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);

50187

if (Cond == X86::COND_INVALID)

50188

return SDValue();

50189

// Check that return type is valid.

50190

if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||

50191

OpInfo.ConstraintVT.getSizeInBits() < 8)

50192

report_fatal_error("Flag output operand is of invalid type");

50193

50194

// Get EFLAGS register. Only update chain when copyfrom is glued.

50195

if (Flag.getNode()) {

50196

Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);

50197

Chain = Flag.getValue(1);

50198

} else

50199

Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);

50200

// Extract CC code.

50201

SDValue CC = getSETCC(Cond, Flag, DL, DAG);

50202

// Extend to 32-bits

50203

SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);

50204

50205

return Result;

50206

}

50207

50208

/// Lower the specified operand into the Ops vector.

50209

/// If it is invalid, don't add anything to Ops.

50210

void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,

50211

std::string &Constraint,

50212

std::vector<SDValue>&Ops,

50213

SelectionDAG &DAG) const {

50214

SDValue Result;

50215

50216

// Only support length 1 constraints for now.

50217

if (Constraint.length() > 1) return;

50218

50219

char ConstraintLetter = Constraint[0];

50220

switch (ConstraintLetter) {

50221

default: break;

50222

case 'I':

50223

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50224

if (C->getZExtValue() <= 31) {

50225

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50226

Op.getValueType());

50227

break;

50228

}

50229

}

50230

return;

50231

case 'J':

50232

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50233

if (C->getZExtValue() <= 63) {

50234

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50235

Op.getValueType());

50236

break;

50237

}

50238

}

50239

return;

50240

case 'K':

50241

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50242

if (isInt<8>(C->getSExtValue())) {

50243

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50244

Op.getValueType());

50245

break;

50246

}

50247

}

50248

return;

50249

case 'L':

50250

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50251

if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||

50252

(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {

50253

Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),

50254

Op.getValueType());

50255

break;

50256

}

50257

}

50258

return;

50259

case 'M':

50260

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50261

if (C->getZExtValue() <= 3) {

50262

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50263

Op.getValueType());

50264

break;

50265

}

50266

}

50267

return;

50268

case 'N':

50269

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50270

if (C->getZExtValue() <= 255) {

50271

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50272

Op.getValueType());

50273

break;

50274

}

50275

}

50276

return;

50277

case 'O':

50278

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50279

if (C->getZExtValue() <= 127) {

50280

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50281

Op.getValueType());

50282

break;

50283

}

50284

}

50285

return;

50286

case 'e': {

50287

// 32-bit signed value

50288

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50289

if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),

50290

C->getSExtValue())) {

50291

// Widen to 64 bits here to get it sign extended.

50292

Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);

50293

break;

50294

}

50295

// FIXME gcc accepts some relocatable values here too, but only in certain

50296

// memory models; it's complicated.

50297

}

50298

return;

50299

}

50300

case 'Z': {

50301

// 32-bit unsigned value

50302

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

50303

if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),

50304

C->getZExtValue())) {

50305

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

50306

Op.getValueType());

50307

break;

50308

}

50309

}

50310

// FIXME gcc accepts some relocatable values here too, but only in certain

50311

// memory models; it's complicated.

50312

return;

50313

}

50314

case 'i': {

50315

// Literal immediates are always ok.

50316

if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {

50317

bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;

50318

BooleanContent BCont = getBooleanContents(MVT::i64);

50319

ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)

50320

: ISD::SIGN_EXTEND;

50321

int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()

50322

: CST->getSExtValue();

50323

Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);

50324

break;

50325

}

50326

50327

// In any sort of PIC mode addresses need to be computed at runtime by

50328

// adding in a register or some sort of table lookup. These can't

50329

// be used as immediates.

50330

if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())

50331

return;

50332

50333

// If we are in non-pic codegen mode, we allow the address of a global (with

50334

// an optional displacement) to be used with 'i'.

50335

if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))

50336

// If we require an extra load to get this address, as in PIC mode, we

50337

// can't accept it.

50338

if (isGlobalStubReference(

50339

Subtarget.classifyGlobalReference(GA->getGlobal())))

50340

return;

50341

break;

50342

}

50343

}

50344

50345

if (Result.getNode()) {

50346

Ops.push_back(Result);

50347

return;

50348

}

50349

return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);

50350

}

50351

50352

/// Check if \p RC is a general purpose register class.

50353

/// I.e., GR* or one of their variant.

50354

static bool isGRClass(const TargetRegisterClass &RC) {

50355

return RC.hasSuperClassEq(&X86::GR8RegClass) ||

50356

RC.hasSuperClassEq(&X86::GR16RegClass) ||

50357

RC.hasSuperClassEq(&X86::GR32RegClass) ||

50358

RC.hasSuperClassEq(&X86::GR64RegClass) ||

50359

RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);

50360

}

50361

50362

/// Check if \p RC is a vector register class.

50363

/// I.e., FR* / VR* or one of their variant.

50364

static bool isFRClass(const TargetRegisterClass &RC) {

50365

return RC.hasSuperClassEq(&X86::FR32XRegClass) ||

50366

RC.hasSuperClassEq(&X86::FR64XRegClass) ||

50367

RC.hasSuperClassEq(&X86::VR128XRegClass) ||

50368

RC.hasSuperClassEq(&X86::VR256XRegClass) ||

50369

RC.hasSuperClassEq(&X86::VR512RegClass);

50370

}

50371

50372

/// Check if \p RC is a mask register class.

50373

/// I.e., VK* or one of their variant.

50374

static bool isVKClass(const TargetRegisterClass &RC) {

50375

return RC.hasSuperClassEq(&X86::VK1RegClass) ||

50376

RC.hasSuperClassEq(&X86::VK2RegClass) ||

50377

RC.hasSuperClassEq(&X86::VK4RegClass) ||

50378

RC.hasSuperClassEq(&X86::VK8RegClass) ||

50379

RC.hasSuperClassEq(&X86::VK16RegClass) ||

50380

RC.hasSuperClassEq(&X86::VK32RegClass) ||

50381

RC.hasSuperClassEq(&X86::VK64RegClass);

50382

}

50383

50384

std::pair<unsigned, const TargetRegisterClass *>

50385

X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

50386

StringRef Constraint,

50387

MVT VT) const {

50388

// First, see if this is a constraint that directly corresponds to an LLVM

50389

// register class.

50390

if (Constraint.size() == 1) {

50391

// GCC Constraint Letters

50392

switch (Constraint[0]) {

50393

default: break;

50394

// 'A' means [ER]AX + [ER]DX.

50395

case 'A':

50396

if (Subtarget.is64Bit())

50397

return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);

50398

assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&(((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.is32Bit() || Subtarget.is16Bit()) && \"Expecting 64, 32 or 16 bit subtarget\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50399, __PRETTY_FUNCTION__))

50399

"Expecting 64, 32 or 16 bit subtarget")(((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"
) ? static_cast<void> (0) : __assert_fail ("(Subtarget.is32Bit() || Subtarget.is16Bit()) && \"Expecting 64, 32 or 16 bit subtarget\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50399, __PRETTY_FUNCTION__));

50400

return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);

50401

50402

// TODO: Slight differences here in allocation order and leaving

50403

// RIP in the class. Do they matter any more here than they do

50404

// in the normal allocation?

50405

case 'k':

50406

if (Subtarget.hasAVX512()) {

50407

if (VT == MVT::i1)

50408

return std::make_pair(0U, &X86::VK1RegClass);

50409

if (VT == MVT::i8)

50410

return std::make_pair(0U, &X86::VK8RegClass);

50411

if (VT == MVT::i16)

50412

return std::make_pair(0U, &X86::VK16RegClass);

50413

}

50414

if (Subtarget.hasBWI()) {

50415

if (VT == MVT::i32)

50416

return std::make_pair(0U, &X86::VK32RegClass);

50417

if (VT == MVT::i64)

50418

return std::make_pair(0U, &X86::VK64RegClass);

50419

}

50420

break;

50421

case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.

50422

if (Subtarget.is64Bit()) {

50423

if (VT == MVT::i8 || VT == MVT::i1)

50424

return std::make_pair(0U, &X86::GR8RegClass);

50425

if (VT == MVT::i16)

50426

return std::make_pair(0U, &X86::GR16RegClass);

50427

if (VT == MVT::i32 || VT == MVT::f32)

50428

return std::make_pair(0U, &X86::GR32RegClass);

50429

if (VT != MVT::f80)

50430

return std::make_pair(0U, &X86::GR64RegClass);

50431

break;

50432

}

50433

LLVM_FALLTHROUGH[[gnu::fallthrough]];

50434

// 32-bit fallthrough

50435

case 'Q': // Q_REGS

50436

if (VT == MVT::i8 || VT == MVT::i1)

50437

return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);

50438

if (VT == MVT::i16)

50439

return std::make_pair(0U, &X86::GR16_ABCDRegClass);

50440

if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())

50441

return std::make_pair(0U, &X86::GR32_ABCDRegClass);

50442

if (VT != MVT::f80)

50443

return std::make_pair(0U, &X86::GR64_ABCDRegClass);

50444

break;

50445

case 'r': // GENERAL_REGS

50446

case 'l': // INDEX_REGS

50447

if (VT == MVT::i8 || VT == MVT::i1)

50448

return std::make_pair(0U, &X86::GR8RegClass);

50449

if (VT == MVT::i16)

50450

return std::make_pair(0U, &X86::GR16RegClass);

50451

if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())

50452

return std::make_pair(0U, &X86::GR32RegClass);

50453

if (VT != MVT::f80)

50454

return std::make_pair(0U, &X86::GR64RegClass);

50455

break;

50456

case 'R': // LEGACY_REGS

50457

if (VT == MVT::i8 || VT == MVT::i1)

50458

return std::make_pair(0U, &X86::GR8_NOREXRegClass);

50459

if (VT == MVT::i16)

50460

return std::make_pair(0U, &X86::GR16_NOREXRegClass);

50461

if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())

50462

return std::make_pair(0U, &X86::GR32_NOREXRegClass);

50463

if (VT != MVT::f80)

50464

return std::make_pair(0U, &X86::GR64_NOREXRegClass);

50465

break;

50466

case 'f': // FP Stack registers.

50467

// If SSE is enabled for this VT, use f80 to ensure the isel moves the

50468

// value to the correct fpstack register class.

50469

if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))

50470

return std::make_pair(0U, &X86::RFP32RegClass);

50471

if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))

50472

return std::make_pair(0U, &X86::RFP64RegClass);

50473

if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)

50474

return std::make_pair(0U, &X86::RFP80RegClass);

50475

break;

50476

case 'y': // MMX_REGS if MMX allowed.

50477

if (!Subtarget.hasMMX()) break;

50478

return std::make_pair(0U, &X86::VR64RegClass);

50479

case 'v':

50480

case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed

50481

if (!Subtarget.hasSSE1()) break;

50482

bool VConstraint = (Constraint[0] == 'v');

50483

50484

switch (VT.SimpleTy) {

50485

default: break;

50486

// Scalar SSE types.

50487

case MVT::f32:

50488

case MVT::i32:

50489

if (VConstraint && Subtarget.hasVLX())

50490

return std::make_pair(0U, &X86::FR32XRegClass);

50491

return std::make_pair(0U, &X86::FR32RegClass);

50492

case MVT::f64:

50493

case MVT::i64:

50494

if (VConstraint && Subtarget.hasVLX())

50495

return std::make_pair(0U, &X86::FR64XRegClass);

50496

return std::make_pair(0U, &X86::FR64RegClass);

50497

case MVT::i128:

50498

if (Subtarget.is64Bit()) {

50499

if (VConstraint && Subtarget.hasVLX())

50500

return std::make_pair(0U, &X86::VR128XRegClass);

50501

return std::make_pair(0U, &X86::VR128RegClass);

50502

}

50503

break;

50504

// Vector types and fp128.

50505

case MVT::f128:

50506

case MVT::v16i8:

50507

case MVT::v8i16:

50508

case MVT::v4i32:

50509

case MVT::v2i64:

50510

case MVT::v4f32:

50511

case MVT::v2f64:

50512

if (VConstraint && Subtarget.hasVLX())

50513

return std::make_pair(0U, &X86::VR128XRegClass);

50514

return std::make_pair(0U, &X86::VR128RegClass);

50515

// AVX types.

50516

case MVT::v32i8:

50517

case MVT::v16i16:

50518

case MVT::v8i32:

50519

case MVT::v4i64:

50520

case MVT::v8f32:

50521

case MVT::v4f64:

50522

if (VConstraint && Subtarget.hasVLX())

50523

return std::make_pair(0U, &X86::VR256XRegClass);

50524

if (Subtarget.hasAVX())

50525

return std::make_pair(0U, &X86::VR256RegClass);

50526

break;

50527

case MVT::v64i8:

50528

case MVT::v32i16:

50529

case MVT::v8f64:

50530

case MVT::v16f32:

50531

case MVT::v16i32:

50532

case MVT::v8i64:

50533

if (!Subtarget.hasAVX512()) break;

50534

if (VConstraint)

50535

return std::make_pair(0U, &X86::VR512RegClass);

50536

return std::make_pair(0U, &X86::VR512_0_15RegClass);

50537

}

50538

break;

50539

}

50540

} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {

50541

switch (Constraint[1]) {

50542

default:

50543

break;

50544

case 'i':

50545

case 't':

50546

case '2':

50547

return getRegForInlineAsmConstraint(TRI, "x", VT);

50548

case 'm':

50549

if (!Subtarget.hasMMX()) break;

50550

return std::make_pair(0U, &X86::VR64RegClass);

50551

case 'z':

50552

if (!Subtarget.hasSSE1()) break;

50553

switch (VT.SimpleTy) {

50554

default: break;

50555

// Scalar SSE types.

50556

case MVT::f32:

50557

case MVT::i32:

50558

return std::make_pair(X86::XMM0, &X86::FR32RegClass);

50559

case MVT::f64:

50560

case MVT::i64:

50561

return std::make_pair(X86::XMM0, &X86::FR64RegClass);

50562

case MVT::f128:

50563

case MVT::v16i8:

50564

case MVT::v8i16:

50565

case MVT::v4i32:

50566

case MVT::v2i64:

50567

case MVT::v4f32:

50568

case MVT::v2f64:

50569

return std::make_pair(X86::XMM0, &X86::VR128RegClass);

50570

// AVX types.

50571

case MVT::v32i8:

50572

case MVT::v16i16:

50573

case MVT::v8i32:

50574

case MVT::v4i64:

50575

case MVT::v8f32:

50576

case MVT::v4f64:

50577

if (Subtarget.hasAVX())

50578

return std::make_pair(X86::YMM0, &X86::VR256RegClass);

50579

break;

50580

case MVT::v64i8:

50581

case MVT::v32i16:

50582

case MVT::v8f64:

50583

case MVT::v16f32:

50584

case MVT::v16i32:

50585

case MVT::v8i64:

50586

if (Subtarget.hasAVX512())

50587

return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);

50588

break;

50589

}

50590

break;

50591

case 'k':

50592

// This register class doesn't allocate k0 for masked vector operation.

50593

if (Subtarget.hasAVX512()) {

50594

if (VT == MVT::i1)

50595

return std::make_pair(0U, &X86::VK1WMRegClass);

50596

if (VT == MVT::i8)

50597

return std::make_pair(0U, &X86::VK8WMRegClass);

50598

if (VT == MVT::i16)

50599

return std::make_pair(0U, &X86::VK16WMRegClass);

50600

}

50601

if (Subtarget.hasBWI()) {

50602

if (VT == MVT::i32)

50603

return std::make_pair(0U, &X86::VK32WMRegClass);

50604

if (VT == MVT::i64)

50605

return std::make_pair(0U, &X86::VK64WMRegClass);

50606

}

50607

break;

50608

}

50609

}

50610

50611

if (parseConstraintCode(Constraint) != X86::COND_INVALID)

50612

return std::make_pair(0U, &X86::GR32RegClass);

50613

50614

// Use the default implementation in TargetLowering to convert the register

50615

// constraint into a member of a register class.

50616

std::pair<Register, const TargetRegisterClass*> Res;

50617

Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

50618

50619

// Not found as a standard register?

50620

if (!Res.second) {

50621

// Map st(0) -> st(7) -> ST0

50622

if (Constraint.size() == 7 && Constraint[0] == '{' &&

50623

tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&

50624

Constraint[3] == '(' &&

50625

(Constraint[4] >= '0' && Constraint[4] <= '7') &&

50626

Constraint[5] == ')' && Constraint[6] == '}') {

50627

// st(7) is not allocatable and thus not a member of RFP80. Return

50628

// singleton class in cases where we have a reference to it.

50629

if (Constraint[4] == '7')

50630

return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);

50631

return std::make_pair(X86::FP0 + Constraint[4] - '0',

50632

&X86::RFP80RegClass);

50633

}

50634

50635

// GCC allows "st(0)" to be called just plain "st".

50636

if (StringRef("{st}").equals_lower(Constraint))

50637

return std::make_pair(X86::FP0, &X86::RFP80RegClass);

50638

50639

// flags -> EFLAGS

50640

if (StringRef("{flags}").equals_lower(Constraint))

50641

return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);

50642

50643

// dirflag -> DF

50644

// Only allow for clobber.

50645

if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)

50646

return std::make_pair(X86::DF, &X86::DFCCRRegClass);

50647

50648

// fpsr -> FPSW

50649

if (StringRef("{fpsr}").equals_lower(Constraint))

50650

return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);

50651

50652

return Res;

50653

}

50654

50655

// Make sure it isn't a register that requires 64-bit mode.

50656

if (!Subtarget.is64Bit() &&

50657

(isFRClass(*Res.second) || isGRClass(*Res.second)) &&

50658

TRI->getEncodingValue(Res.first) >= 8) {

50659

// Register requires REX prefix, but we're in 32-bit mode.

50660

return std::make_pair(0, nullptr);

50661

}

50662

50663

// Make sure it isn't a register that requires AVX512.

50664

if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&

50665

TRI->getEncodingValue(Res.first) & 0x10) {

50666

// Register requires EVEX prefix.

50667

return std::make_pair(0, nullptr);

50668

}

50669

50670

// Otherwise, check to see if this is a register class of the wrong value

50671

// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to

50672

// turn into {ax},{dx}.

50673

// MVT::Other is used to specify clobber names.

50674

if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)

50675

return Res; // Correct type already, nothing to do.

50676

50677

// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should

50678

// return "eax". This should even work for things like getting 64bit integer

50679

// registers when given an f64 type.

50680

const TargetRegisterClass *Class = Res.second;

50681

// The generic code will match the first register class that contains the

50682

// given register. Thus, based on the ordering of the tablegened file,

50683

// the "plain" GR classes might not come first.

50684

// Therefore, use a helper method.

50685

if (isGRClass(*Class)) {

50686

unsigned Size = VT.getSizeInBits();

50687

if (Size == 1) Size = 8;

50688

Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);

50689

if (DestReg > 0) {

50690

bool is64Bit = Subtarget.is64Bit();

50691

const TargetRegisterClass *RC =

50692

Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)

50693

: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)

50694

: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)

50695

: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)

50696

: nullptr;

50697

if (Size == 64 && !is64Bit) {

50698

// Model GCC's behavior here and select a fixed pair of 32-bit

50699

// registers.

50700

switch (DestReg) {

50701

case X86::RAX:

50702

return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);

50703

case X86::RDX:

50704

return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);

50705

case X86::RCX:

50706

return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);

50707

case X86::RBX:

50708

return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);

50709

case X86::RSI:

50710

return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);

50711

case X86::RDI:

50712

return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);

50713

case X86::RBP:

50714

return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);

50715

default:

50716

return std::make_pair(0, nullptr);

50717

}

50718

}

50719

if (RC && RC->contains(DestReg))

50720

return std::make_pair(DestReg, RC);

50721

return Res;

50722

}

50723

// No register found/type mismatch.

50724

return std::make_pair(0, nullptr);

50725

} else if (isFRClass(*Class)) {

50726

// Handle references to XMM physical registers that got mapped into the

50727

// wrong class. This can happen with constraints like {xmm0} where the

50728

// target independent register mapper will just pick the first match it can

50729

// find, ignoring the required type.

50730

50731

// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.

50732

if (VT == MVT::f32 || VT == MVT::i32)

50733

Res.second = &X86::FR32XRegClass;

50734

else if (VT == MVT::f64 || VT == MVT::i64)

50735

Res.second = &X86::FR64XRegClass;

50736

else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))

50737

Res.second = &X86::VR128XRegClass;

50738

else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))

50739

Res.second = &X86::VR256XRegClass;

50740

else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))

50741

Res.second = &X86::VR512RegClass;

50742

else {

50743

// Type mismatch and not a clobber: Return an error;

50744

Res.first = 0;

50745

Res.second = nullptr;

50746

}

50747

} else if (isVKClass(*Class)) {

50748

if (VT == MVT::i1)

50749

Res.second = &X86::VK1RegClass;

50750

else if (VT == MVT::i8)

50751

Res.second = &X86::VK8RegClass;

50752

else if (VT == MVT::i16)

50753

Res.second = &X86::VK16RegClass;

50754

else if (VT == MVT::i32)

50755

Res.second = &X86::VK32RegClass;

50756

else if (VT == MVT::i64)

50757

Res.second = &X86::VK64RegClass;

50758

else {

50759

// Type mismatch and not a clobber: Return an error;

50760

Res.first = 0;

50761

Res.second = nullptr;

50762

}

50763

}

50764

50765

return Res;

50766

}

50767

50768

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,

50769

const AddrMode &AM, Type *Ty,

50770

unsigned AS) const {

50771

// Scaling factors are not free at all.

50772

// An indexed folded instruction, i.e., inst (reg1, reg2, scale),

50773

// will take 2 allocations in the out of order engine instead of 1

50774

// for plain addressing mode, i.e. inst (reg1).

50775

// E.g.,

50776

// vaddps (%rsi,%rdx), %ymm0, %ymm1

50777

// Requires two allocations (one for the load, one for the computation)

50778

// whereas:

50779

// vaddps (%rsi), %ymm0, %ymm1

50780

// Requires just 1 allocation, i.e., freeing allocations for other operations

50781

// and having less micro operations to execute.

50782

//

50783

// For some X86 architectures, this is even worse because for instance for

50784

// stores, the complex addressing mode forces the instruction to use the

50785

// "load" ports instead of the dedicated "store" port.

50786

// E.g., on Haswell:

50787

// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.

50788

// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.

50789

if (isLegalAddressingMode(DL, AM, Ty, AS))

50790

// Scale represents reg2 * scale, thus account for 1

50791

// as soon as we use a second register.

50792

return AM.Scale != 0;

50793

return -1;

50794

}

50795

50796

/// Report whether an integer division of type \p VT should be treated as
/// cheap given the function attributes \p Attr.
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // x86 has no vector integer division: keeping a vector divide means
  // scalarizing it, which loses even on size, whereas the alternative
  // sequence can stay in vector form.
  if (VT.isVector())
    return false;

  // Scalar integer division is expensive, but under aggressive size
  // optimization (minsize) the div instruction is preferred because it is
  // usually smaller than the alternative sequence.
  return Attr.hasFnAttribute(Attribute::MinSize);
}

50807

50808

/// Flag the function containing \p Entry as using split callee-saved
/// registers. Nothing is done on subtargets that are not 64-bit.
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget.is64Bit()) {
    // Record IsSplitCSR in the function's X86MachineFunctionInfo.
    MachineFunction *MF = Entry->getParent();
    MF->getInfo<X86MachineFunctionInfo>()->setIsSplitCSR(true);
  }
}

50817

50818

void X86TargetLowering::insertCopiesSplitCSR(

50819

MachineBasicBlock *Entry,

50820

const SmallVectorImpl<MachineBasicBlock *> &Exits) const {

50821

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

50822

const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

50823

if (!IStart)

50824

return;

50825

50826

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

50827

MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();

50828

MachineBasicBlock::iterator MBBI = Entry->begin();

50829

for (const MCPhysReg *I = IStart; *I; ++I) {

50830

const TargetRegisterClass *RC = nullptr;

50831

if (X86::GR64RegClass.contains(*I))

50832

RC = &X86::GR64RegClass;

50833

else

50834

llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50834);

50835

50836

Register NewVR = MRI->createVirtualRegister(RC);

50837

// Create copy from CSR to a virtual register.

50838

// FIXME: this currently does not emit CFI pseudo-instructions, it works

50839

// fine for CXX_FAST_TLS since the C++-style TLS access functions should be

50840

// nounwind. If we want to generalize this later, we may need to emit

50841

// CFI pseudo-instructions.

50842

assert(((Entry->getParent()->getFunction().hasFnAttribute(Attribute
::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? static_cast<void> (0) : __assert_fail ("Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50844, __PRETTY_FUNCTION__))

50843

Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&((Entry->getParent()->getFunction().hasFnAttribute(Attribute
::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? static_cast<void> (0) : __assert_fail ("Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50844, __PRETTY_FUNCTION__))

50844

"Function should be nounwind in insertCopiesSplitCSR!")((Entry->getParent()->getFunction().hasFnAttribute(Attribute
::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? static_cast<void> (0) : __assert_fail ("Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-12~++20200917111122+b03c2b8395b/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 50844, __PRETTY_FUNCTION__));

50845

Entry->addLiveIn(*I);

50846

BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)

50847

.addReg(*I);

50848

50849

// Insert the copy-back instructions right before the terminator.

50850

for (auto *Exit : Exits)

50851

BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),

50852

TII->get(TargetOpcode::COPY), *I)

50853

.addReg(NewVR);

50854

}

50855

}

50856

50857

bool X86TargetLowering::supportSwiftError() const {

50858

return Subtarget.is64Bit();

50859

}

50860

50861

/// Returns true if stack probing through a function call is requested.

50862

bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {

50863

return !getStackProbeSymbolName(MF).empty();

50864

}

50865

50866

/// Returns true if stack probing through inline assembly is requested.

50867

bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {

50868

50869

// No inline stack probe for Windows, they have their own mechanism.

50870

if (Subtarget.isOSWindows() ||

50871

MF.getFunction().hasFnAttribute("no-stack-arg-probe"))

50872

return false;

50873

50874

// If the function specifically requests inline stack probes, emit them.

50875

if (MF.getFunction().hasFnAttribute("probe-stack"))

50876

return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==

50877

"inline-asm";

50878

50879

return false;

50880

}

50881

50882

/// Returns the name of the symbol used to emit stack probes or the empty

50883

/// string if not applicable.

50884

StringRef

50885

X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {

50886

// Inline Stack probes disable stack probe call

50887

if (hasInlineStackProbe(MF))

50888

return "";

50889

50890

// If the function specifically requests stack probes, emit them.

50891

if (MF.getFunction().hasFnAttribute("probe-stack"))

50892

return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

50893

50894

// Generally, if we aren't on Windows, the platform ABI does not include

50895

// support for stack probes, so don't emit them.

50896

if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||

50897

MF.getFunction().hasFnAttribute("no-stack-arg-probe"))

50898

return "";

50899

50900

// We need a stack probe to conform to the Windows ABI. Choose the right

50901

// symbol.

50902

if (Subtarget.is64Bit())

50903

return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";

50904

return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";

50905

}

50906

50907

unsigned

50908

X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {

50909

// The default stack probe size is 4096 if the function has no stackprobesize

50910

// attribute.

50911

unsigned StackProbeSize = 4096;

50912

const Function &Fn = MF.getFunction();

50913

if (Fn.hasFnAttribute("stack-probe-size"))

50914

Fn.getFnAttribute("stack-probe-size")

50915

.getValueAsString()

50916

.getAsInteger(0, StackProbeSize);

50917

return StackProbeSize;

50918

}

File:	llvm/lib/Target/X86/X86ISelLowering.cpp
Warning:	line 36143, column 7 Moved-from object 'WidenedMask' is moved

Bug Summary

Annotated Source Code