/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp

Bug Summary

File:	lib/Target/X86/X86ISelLowering.cpp
Warning:	line 6665, column 1 Potential leak of memory pointed to by 'ZeroMask.X'

Annotated Source Code

/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp

→

//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//

// The LLVM Compiler Infrastructure

// This file is distributed under the University of Illinois Open Source

// License. See LICENSE.TXT for details.

//===----------------------------------------------------------------------===//

// This file defines the interfaces that X86 uses to lower LLVM code into a

// selection DAG.

//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"

#include "Utils/X86ShuffleDecode.h"

#include "X86CallingConv.h"

#include "X86FrameLowering.h"

#include "X86InstrBuilder.h"

#include "X86IntrinsicsInfo.h"

#include "X86MachineFunctionInfo.h"

#include "X86ShuffleDecodeConstantPool.h"

#include "X86TargetMachine.h"

#include "X86TargetObjectFile.h"

#include "llvm/ADT/SmallBitVector.h"

#include "llvm/ADT/SmallSet.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/ADT/StringExtras.h"

#include "llvm/ADT/StringSwitch.h"

#include "llvm/Analysis/EHPersonalities.h"

#include "llvm/CodeGen/IntrinsicLowering.h"

#include "llvm/CodeGen/MachineFrameInfo.h"

#include "llvm/CodeGen/MachineFunction.h"

#include "llvm/CodeGen/MachineInstrBuilder.h"

#include "llvm/CodeGen/MachineJumpTableInfo.h"

#include "llvm/CodeGen/MachineModuleInfo.h"

#include "llvm/CodeGen/MachineRegisterInfo.h"

#include "llvm/CodeGen/TargetLowering.h"

#include "llvm/CodeGen/WinEHFuncInfo.h"

#include "llvm/IR/CallSite.h"

#include "llvm/IR/CallingConv.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/GlobalAlias.h"

#include "llvm/IR/GlobalVariable.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/MC/MCAsmInfo.h"

#include "llvm/MC/MCContext.h"

#include "llvm/MC/MCExpr.h"

#include "llvm/MC/MCSymbol.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Debug.h"

#include "llvm/Support/ErrorHandling.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Support/MathExtras.h"

#include "llvm/Target/TargetOptions.h"

#include <algorithm>

#include <bitset>

#include <cctype>

#include <numeric>

using namespace llvm;

// Tag naming this file's component; the STATISTIC definition below records it
// as the first field of its llvm::Statistic entry.
#define DEBUG_TYPE "x86-isel"

// Counter named NumTailCalls, described as "Number of tail calls".
// NOTE(review): the sites that increment it are not visible in this part of
// the file.
STATISTIC(NumTailCalls, "Number of tail calls");

// Boolean command-line option "x86-experimental-vector-widening-legalization".
// It defaults to false and is marked cl::Hidden. Per its description, turning
// it on selects widening rather than promotion for vector type legalization.
// NOTE(review): the code that reads this flag is not visible in this part of
// the file.
static cl::opt<bool> ExperimentalVectorWideningLegalization(

"x86-experimental-vector-widening-legalization", cl::init(false),

cl::desc("Enable an experimental vector type legalization through widening "

"rather than promotion."),

cl::Hidden);

// Integer command-line option "x86-experimental-pref-loop-alignment".
// It defaults to 4 and is marked cl::Hidden. Per its description the value is
// a bit count: that many low bits of the loop header PC will be zero.
// NOTE(review): the code that consumes this value is not visible in this part
// of the file.
static cl::opt<int> ExperimentalPrefLoopAlignment(

"x86-experimental-pref-loop-alignment", cl::init(4),

cl::desc("Sets the preferable loop alignment for experiments "

"(the last x86-experimental-pref-loop-alignment bits"

" of the loop header PC will be 0)."),

cl::Hidden);

// Boolean command-line option "mul-constant-optimization".
// It defaults to true and is marked cl::Hidden. Per its description it
// controls replacing 'mul x, Const' with instructions such as SHIFT and LEA.
// NOTE(review): the code that reads this flag is not visible in this part of
// the file.
static cl::opt<bool> MulConstantOptimization(

"mul-constant-optimization", cl::init(true),

cl::desc("Replace 'mul x, Const' with more effective instructions like "

"SHIFT, LEA, etc."),

cl::Hidden);

/// Report that the user asked for something this target cannot do (for
/// example, returning a double without SSE2 enabled on x86_64).
///
/// Unlike report_fatal_error this does not abort: the problem is handed to
/// the context's diagnostic machinery, and the caller is expected to recover
/// and carry on without crashing.
///
/// \param DAG  Selection DAG supplying the machine function and the context.
/// \param dl   Source location to attach to the diagnostic.
/// \param Msg  Text describing the unsupported construct.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  auto &Ctx = *DAG.getContext();
  const auto &Fn = *DAG.getMachineFunction().getFunction();

  // The diagnostic object is built as a temporary inside the call itself.
  // NOTE(review): keep it that way unless DiagnosticInfoUnsupported is
  // confirmed to own its message rather than refer to a temporary.
  Ctx.diagnose(DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
}

100

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,

101

const X86Subtarget &STI)

102

: TargetLowering(TM), Subtarget(STI) {

103

bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();

104

X86ScalarSSEf64 = Subtarget.hasSSE2();

105

X86ScalarSSEf32 = Subtarget.hasSSE1();

106

MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

107

108

// Set up the TargetLowering object.

109

110

// X86 is weird. It always uses i8 for shift amounts and setcc results.

111

setBooleanContents(ZeroOrOneBooleanContent);

112

// X86-SSE is even stranger. It uses -1 or 0 for vector masks.

113

setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

114

115

// For 64-bit, since we have so many registers, use the ILP scheduler.

116

// For 32-bit, use the register pressure specific scheduling.

117

// For Atom, always use ILP scheduling.

118

if (Subtarget.isAtom())

119

setSchedulingPreference(Sched::ILP);

120

else if (Subtarget.is64Bit())

121

setSchedulingPreference(Sched::ILP);

122

else

123

setSchedulingPreference(Sched::RegPressure);

124

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

125

setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

126

127

// Bypass expensive divides and use cheaper ones.

128

if (TM.getOptLevel() >= CodeGenOpt::Default) {

129

if (Subtarget.hasSlowDivide32())

130

addBypassSlowDiv(32, 8);

131

if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())

132

addBypassSlowDiv(64, 32);

133

}

134

135

if (Subtarget.isTargetKnownWindowsMSVC() ||

136

Subtarget.isTargetWindowsItanium()) {

137

// Setup Windows compiler runtime calls.

138

setLibcallName(RTLIB::SDIV_I64, "_alldiv");

139

setLibcallName(RTLIB::UDIV_I64, "_aulldiv");

140

setLibcallName(RTLIB::SREM_I64, "_allrem");

141

setLibcallName(RTLIB::UREM_I64, "_aullrem");

142

setLibcallName(RTLIB::MUL_I64, "_allmul");

143

setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);

144

setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);

145

setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);

146

setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);

147

setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

148

}

149

150

if (Subtarget.isTargetDarwin()) {

151

// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.

152

setUseUnderscoreSetJmp(false);

153

setUseUnderscoreLongJmp(false);

154

} else if (Subtarget.isTargetWindowsGNU()) {

155

// MS runtime is weird: it exports _setjmp, but longjmp!

156

setUseUnderscoreSetJmp(true);

157

setUseUnderscoreLongJmp(false);

158

} else {

159

setUseUnderscoreSetJmp(true);

160

setUseUnderscoreLongJmp(true);

161

}

162

163

// Set up the register classes.

164

addRegisterClass(MVT::i8, &X86::GR8RegClass);

165

addRegisterClass(MVT::i16, &X86::GR16RegClass);

166

addRegisterClass(MVT::i32, &X86::GR32RegClass);

167

if (Subtarget.is64Bit())

168

addRegisterClass(MVT::i64, &X86::GR64RegClass);

169

170

for (MVT VT : MVT::integer_valuetypes())

171

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

172

173

// We don't accept any truncstore of integer registers.

174

setTruncStoreAction(MVT::i64, MVT::i32, Expand);

175

setTruncStoreAction(MVT::i64, MVT::i16, Expand);

176

setTruncStoreAction(MVT::i64, MVT::i8 , Expand);

177

setTruncStoreAction(MVT::i32, MVT::i16, Expand);

178

setTruncStoreAction(MVT::i32, MVT::i8 , Expand);

179

setTruncStoreAction(MVT::i16, MVT::i8, Expand);

180

181

setTruncStoreAction(MVT::f64, MVT::f32, Expand);

182

183

// SETOEQ and SETUNE require checking two conditions.

184

setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);

185

setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);

186

setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);

187

setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);

188

setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);

189

setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

190

191

// Integer absolute.

192

if (Subtarget.hasCMov()) {

193

setOperationAction(ISD::ABS , MVT::i16 , Custom);

194

setOperationAction(ISD::ABS , MVT::i32 , Custom);

195

if (Subtarget.is64Bit())

196

setOperationAction(ISD::ABS , MVT::i64 , Custom);

197

}

198

199

// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this

200

// operation.

201

setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);

202

setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);

203

setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

204

205

if (Subtarget.is64Bit()) {

206

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())

207

// f32/f64 are legal, f80 is custom.

208

setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);

209

else

210

setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);

211

setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);

212

} else if (!Subtarget.useSoftFloat()) {

213

// We have an algorithm for SSE2->double, and we turn this into a

214

// 64-bit FILD followed by conditional FADD for other targets.

215

setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);

216

// We have an algorithm for SSE2, and we turn this into a 64-bit

217

// FILD or VCVTUSI2SS/SD for other targets.

218

setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);

219

}

220

221

// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have

222

// this operation.

223

setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);

224

setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

225

226

if (!Subtarget.useSoftFloat()) {

227

// SSE has no i16 to fp conversion, only i32.

228

if (X86ScalarSSEf32) {

229

setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);

230

// f32 and f64 cases are Legal, f80 case is not

231

setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);

232

} else {

233

setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);

234

setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);

235

}

236

} else {

237

setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);

238

setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);

239

}

240

241

// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have

242

// this operation.

243

setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);

244

setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

245

246

if (!Subtarget.useSoftFloat()) {

247

// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64

248

// are Legal, f80 is custom lowered.

249

setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);

250

setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

251

252

if (X86ScalarSSEf32) {

253

setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);

254

// f32 and f64 cases are Legal, f80 case is not

255

setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);

256

} else {

257

setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);

258

setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);

259

}

260

} else {

261

setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);

262

setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);

263

setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);

264

}

265

266

// Handle FP_TO_UINT by promoting the destination to a larger signed

267

// conversion.

268

setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);

269

setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);

270

setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

271

272

if (Subtarget.is64Bit()) {

273

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {

274

// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.

275

setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

276

setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);

277

} else {

278

setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);

279

setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);

280

}

281

} else if (!Subtarget.useSoftFloat()) {

282

// Since AVX is a superset of SSE3, only check for SSE here.

283

if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())

284

// Expand FP_TO_UINT into a select.

285

// FIXME: We would like to use a Custom expander here eventually to do

286

// the optimal thing for SSE vs. the default expansion in the legalizer.

287

setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);

288

else

289

// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.

290

// With SSE3 we can use fisttpll to convert to a signed i64; without

291

// SSE, we're stuck with a fistpll.

292

setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

293

294

setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);

295

}

296

297

// TODO: when we have SSE, these could be more efficient, by using movd/movq.

298

if (!X86ScalarSSEf64) {

299

setOperationAction(ISD::BITCAST , MVT::f32 , Expand);

300

setOperationAction(ISD::BITCAST , MVT::i32 , Expand);

301

if (Subtarget.is64Bit()) {

302

setOperationAction(ISD::BITCAST , MVT::f64 , Expand);

303

// Without SSE, i64->f64 goes through memory.

304

setOperationAction(ISD::BITCAST , MVT::i64 , Expand);

305

}

306

} else if (!Subtarget.is64Bit())

307

setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

308

309

// Scalar integer divide and remainder are lowered to use operations that

310

// produce two results, to match the available instructions. This exposes

311

// the two-result form to trivial CSE, which is able to combine x/y and x%y

312

// into a single instruction.

313

314

// Scalar integer multiply-high is also lowered to use two-result

315

// operations, to match the available instructions. However, plain multiply

316

// (low) operations are left as Legal, as there are single-result

317

// instructions for this in x86. Using the two-result multiply instructions

318

// when both high and low results are needed must be arranged by dagcombine.

319

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

320

setOperationAction(ISD::MULHS, VT, Expand);

321

setOperationAction(ISD::MULHU, VT, Expand);

322

setOperationAction(ISD::SDIV, VT, Expand);

323

setOperationAction(ISD::UDIV, VT, Expand);

324

setOperationAction(ISD::SREM, VT, Expand);

325

setOperationAction(ISD::UREM, VT, Expand);

326

}

327

328

setOperationAction(ISD::BR_JT , MVT::Other, Expand);

329

setOperationAction(ISD::BRCOND , MVT::Other, Custom);

330

for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,

331

MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

332

setOperationAction(ISD::BR_CC, VT, Expand);

333

setOperationAction(ISD::SELECT_CC, VT, Expand);

334

}

335

if (Subtarget.is64Bit())

336

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);

337

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);

338

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);

339

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);

340

setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

341

342

setOperationAction(ISD::FREM , MVT::f32 , Expand);

343

setOperationAction(ISD::FREM , MVT::f64 , Expand);

344

setOperationAction(ISD::FREM , MVT::f80 , Expand);

345

setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

346

347

// Promote the i8 variants and force them on up to i32 which has a shorter

348

// encoding.

349

setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);

350

setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);

351

if (!Subtarget.hasBMI()) {

352

setOperationAction(ISD::CTTZ , MVT::i16 , Custom);

353

setOperationAction(ISD::CTTZ , MVT::i32 , Custom);

354

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);

355

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);

356

if (Subtarget.is64Bit()) {

357

setOperationAction(ISD::CTTZ , MVT::i64 , Custom);

358

setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);

359

}

360

}

361

362

if (Subtarget.hasLZCNT()) {

363

// When promoting the i8 variants, force them to i32 for a shorter

364

// encoding.

365

setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);

366

setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);

367

} else {

368

setOperationAction(ISD::CTLZ , MVT::i8 , Custom);

369

setOperationAction(ISD::CTLZ , MVT::i16 , Custom);

370

setOperationAction(ISD::CTLZ , MVT::i32 , Custom);

371

setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);

372

setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);

373

setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);

374

if (Subtarget.is64Bit()) {

375

setOperationAction(ISD::CTLZ , MVT::i64 , Custom);

376

setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);

377

}

378

}

379

380

// Special handling for half-precision floating point conversions.

381

// If we don't have F16C support, then lower half float conversions

382

// into library calls.

383

if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {

384

setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);

385

setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);

386

}

387

388

// There's never any support for operations beyond MVT::f32.

389

setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);

390

setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);

391

setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);

392

setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

393

394

setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);

395

setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);

396

setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);

397

setTruncStoreAction(MVT::f32, MVT::f16, Expand);

398

setTruncStoreAction(MVT::f64, MVT::f16, Expand);

399

setTruncStoreAction(MVT::f80, MVT::f16, Expand);

400

401

if (Subtarget.hasPOPCNT()) {

402

setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);

403

} else {

404

setOperationAction(ISD::CTPOP , MVT::i8 , Expand);

405

setOperationAction(ISD::CTPOP , MVT::i16 , Expand);

406

setOperationAction(ISD::CTPOP , MVT::i32 , Expand);

407

if (Subtarget.is64Bit())

408

setOperationAction(ISD::CTPOP , MVT::i64 , Expand);

409

}

410

411

setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

412

413

if (!Subtarget.hasMOVBE())

414

setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

415

416

// These should be promoted to a larger select which is supported.

417

setOperationAction(ISD::SELECT , MVT::i1 , Promote);

418

// X86 wants to expand cmov itself.

419

for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {

420

setOperationAction(ISD::SELECT, VT, Custom);

421

setOperationAction(ISD::SETCC, VT, Custom);

422

}

423

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

424

if (VT == MVT::i64 && !Subtarget.is64Bit())

425

continue;

426

setOperationAction(ISD::SELECT, VT, Custom);

427

setOperationAction(ISD::SETCC, VT, Custom);

428

}

429

430

// Custom action for SELECT MMX and expand action for SELECT_CC MMX

431

setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);

432

setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

433

434

setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

435

// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since

436

// LLVM/Clang supports zero-cost DWARF and SEH exception handling.

437

setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);

438

setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

439

setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);

440

if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)

441

setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

442

443

// Darwin ABI issue.

444

for (auto VT : { MVT::i32, MVT::i64 }) {

445

if (VT == MVT::i64 && !Subtarget.is64Bit())

446

continue;

447

setOperationAction(ISD::ConstantPool , VT, Custom);

448

setOperationAction(ISD::JumpTable , VT, Custom);

449

setOperationAction(ISD::GlobalAddress , VT, Custom);

450

setOperationAction(ISD::GlobalTLSAddress, VT, Custom);

451

setOperationAction(ISD::ExternalSymbol , VT, Custom);

452

setOperationAction(ISD::BlockAddress , VT, Custom);

453

}

454

455

// 64-bit shl, sra, srl (iff 32-bit x86)

456

for (auto VT : { MVT::i32, MVT::i64 }) {

457

if (VT == MVT::i64 && !Subtarget.is64Bit())

458

continue;

459

setOperationAction(ISD::SHL_PARTS, VT, Custom);

460

setOperationAction(ISD::SRA_PARTS, VT, Custom);

461

setOperationAction(ISD::SRL_PARTS, VT, Custom);

462

}

463

464

if (Subtarget.hasSSE1())

465

setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

466

467

setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

468

469

// Expand certain atomics

470

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

471

setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);

472

setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);

473

setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);

474

setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);

475

setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);

476

setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);

477

setOperationAction(ISD::ATOMIC_STORE, VT, Custom);

478

}

479

480

if (Subtarget.hasCmpxchg16b()) {

481

setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);

482

}

483

484

// FIXME - use subtarget debug flags

485

if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&

486

!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&

487

TM.Options.ExceptionModel != ExceptionHandling::SjLj) {

488

setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);

489

}

490

491

setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);

492

setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

493

494

setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);

495

setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

496

497

setOperationAction(ISD::TRAP, MVT::Other, Legal);

498

setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

499

500

// VASTART needs to be custom lowered to use the VarArgsFrameIndex

501

setOperationAction(ISD::VASTART , MVT::Other, Custom);

502

setOperationAction(ISD::VAEND , MVT::Other, Expand);

503

bool Is64Bit = Subtarget.is64Bit();

504

setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);

505

setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

506

507

setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);

508

setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

509

510

setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

511

512

// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.

513

setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);

514

setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

515

516

if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {

517

// f32 and f64 use SSE.

518

// Set up the FP register classes.

519

addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass

520

: &X86::FR32RegClass);

521

addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass

522

: &X86::FR64RegClass);

523

524

for (auto VT : { MVT::f32, MVT::f64 }) {

525

// Use ANDPD to simulate FABS.

526

setOperationAction(ISD::FABS, VT, Custom);

527

528

// Use XORP to simulate FNEG.

529

setOperationAction(ISD::FNEG, VT, Custom);

530

531

// Use ANDPD and ORPD to simulate FCOPYSIGN.

532

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

533

534

// We don't support sin/cos/fmod

535

setOperationAction(ISD::FSIN , VT, Expand);

536

setOperationAction(ISD::FCOS , VT, Expand);

537

setOperationAction(ISD::FSINCOS, VT, Expand);

538

}

539

540

// Lower this to MOVMSK plus an AND.

541

setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);

542

setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

543

544

// Expand FP immediates into loads from the stack, except for the special

545

// cases we handle.

546

addLegalFPImmediate(APFloat(+0.0)); // xorpd

547

addLegalFPImmediate(APFloat(+0.0f)); // xorps

548

} else if (UseX87 && X86ScalarSSEf32) {

549

// Use SSE for f32, x87 for f64.

550

// Set up the FP register classes.

551

addRegisterClass(MVT::f32, &X86::FR32RegClass);

552

addRegisterClass(MVT::f64, &X86::RFP64RegClass);

553

554

// Use ANDPS to simulate FABS.

555

setOperationAction(ISD::FABS , MVT::f32, Custom);

556

557

// Use XORP to simulate FNEG.

558

setOperationAction(ISD::FNEG , MVT::f32, Custom);

559

560

setOperationAction(ISD::UNDEF, MVT::f64, Expand);

561

562

// Use ANDPS and ORPS to simulate FCOPYSIGN.

563

setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

564

setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

565

566

// We don't support sin/cos/fmod

567

setOperationAction(ISD::FSIN , MVT::f32, Expand);

568

setOperationAction(ISD::FCOS , MVT::f32, Expand);

569

setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

570

571

// Special cases we handle for FP constants.

572

addLegalFPImmediate(APFloat(+0.0f)); // xorps

573

addLegalFPImmediate(APFloat(+0.0)); // FLD0

574

addLegalFPImmediate(APFloat(+1.0)); // FLD1

575

addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS

576

addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

577

578

// Always expand sin/cos functions even though x87 has an instruction.

579

setOperationAction(ISD::FSIN , MVT::f64, Expand);

580

setOperationAction(ISD::FCOS , MVT::f64, Expand);

581

setOperationAction(ISD::FSINCOS, MVT::f64, Expand);

582

} else if (UseX87) {

583

// f32 and f64 in x87.

584

// Set up the FP register classes.

585

addRegisterClass(MVT::f64, &X86::RFP64RegClass);

586

addRegisterClass(MVT::f32, &X86::RFP32RegClass);

587

588

for (auto VT : { MVT::f32, MVT::f64 }) {

589

setOperationAction(ISD::UNDEF, VT, Expand);

590

setOperationAction(ISD::FCOPYSIGN, VT, Expand);

591

592

// Always expand sin/cos functions even though x87 has an instruction.

593

setOperationAction(ISD::FSIN , VT, Expand);

594

setOperationAction(ISD::FCOS , VT, Expand);

595

setOperationAction(ISD::FSINCOS, VT, Expand);

596

}

597

addLegalFPImmediate(APFloat(+0.0)); // FLD0

598

addLegalFPImmediate(APFloat(+1.0)); // FLD1

599

addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS

600

addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

601

addLegalFPImmediate(APFloat(+0.0f)); // FLD0

602

addLegalFPImmediate(APFloat(+1.0f)); // FLD1

603

addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS

604

addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS

605

}

606

607

// We don't support FMA.

608

setOperationAction(ISD::FMA, MVT::f64, Expand);

609

setOperationAction(ISD::FMA, MVT::f32, Expand);

610

611

// Long double always uses X87, except f128 in MMX.

612

if (UseX87) {

613

if (Subtarget.is64Bit() && Subtarget.hasMMX()) {

614

addRegisterClass(MVT::f128, &X86::FR128RegClass);

615

ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);

616

setOperationAction(ISD::FABS , MVT::f128, Custom);

617

setOperationAction(ISD::FNEG , MVT::f128, Custom);

618

setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

619

}

620

621

addRegisterClass(MVT::f80, &X86::RFP80RegClass);

622

setOperationAction(ISD::UNDEF, MVT::f80, Expand);

623

setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);

624

{

625

APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());

626

addLegalFPImmediate(TmpFlt); // FLD0

627

TmpFlt.changeSign();

628

addLegalFPImmediate(TmpFlt); // FLD0/FCHS

629

630

bool ignored;

631

APFloat TmpFlt2(+1.0);

632

TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,

633

&ignored);

634

addLegalFPImmediate(TmpFlt2); // FLD1

635

TmpFlt2.changeSign();

636

addLegalFPImmediate(TmpFlt2); // FLD1/FCHS

637

}

638

639

// Always expand sin/cos functions even though x87 has an instruction.

640

setOperationAction(ISD::FSIN , MVT::f80, Expand);

641

setOperationAction(ISD::FCOS , MVT::f80, Expand);

642

setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

643

644

setOperationAction(ISD::FFLOOR, MVT::f80, Expand);

645

setOperationAction(ISD::FCEIL, MVT::f80, Expand);

646

setOperationAction(ISD::FTRUNC, MVT::f80, Expand);

647

setOperationAction(ISD::FRINT, MVT::f80, Expand);

648

setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);

649

setOperationAction(ISD::FMA, MVT::f80, Expand);

650

}

651

652

// Always use a library call for pow.

653

setOperationAction(ISD::FPOW , MVT::f32 , Expand);

654

setOperationAction(ISD::FPOW , MVT::f64 , Expand);

655

setOperationAction(ISD::FPOW , MVT::f80 , Expand);

656

657

setOperationAction(ISD::FLOG, MVT::f80, Expand);

658

setOperationAction(ISD::FLOG2, MVT::f80, Expand);

659

setOperationAction(ISD::FLOG10, MVT::f80, Expand);

660

setOperationAction(ISD::FEXP, MVT::f80, Expand);

661

setOperationAction(ISD::FEXP2, MVT::f80, Expand);

662

setOperationAction(ISD::FMINNUM, MVT::f80, Expand);

663

setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

664

665

// Some FP actions are always expanded for vector types.

666

for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,

667

MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {

668

setOperationAction(ISD::FSIN, VT, Expand);

669

setOperationAction(ISD::FSINCOS, VT, Expand);

670

setOperationAction(ISD::FCOS, VT, Expand);

671

setOperationAction(ISD::FREM, VT, Expand);

672

setOperationAction(ISD::FCOPYSIGN, VT, Expand);

673

setOperationAction(ISD::FPOW, VT, Expand);

674

setOperationAction(ISD::FLOG, VT, Expand);

675

setOperationAction(ISD::FLOG2, VT, Expand);

676

setOperationAction(ISD::FLOG10, VT, Expand);

677

setOperationAction(ISD::FEXP, VT, Expand);

678

setOperationAction(ISD::FEXP2, VT, Expand);

679

}

680

681

// First set operation action for all vector types to either promote

682

// (for widening) or expand (for scalarization). Then we will selectively

683

// turn on ones that can be effectively codegen'd.

684

for (MVT VT : MVT::vector_valuetypes()) {

685

setOperationAction(ISD::SDIV, VT, Expand);

686

setOperationAction(ISD::UDIV, VT, Expand);

687

setOperationAction(ISD::SREM, VT, Expand);

688

setOperationAction(ISD::UREM, VT, Expand);

689

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);

690

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);

691

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);

692

setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);

693

setOperationAction(ISD::FMA, VT, Expand);

694

setOperationAction(ISD::FFLOOR, VT, Expand);

695

setOperationAction(ISD::FCEIL, VT, Expand);

696

setOperationAction(ISD::FTRUNC, VT, Expand);

697

setOperationAction(ISD::FRINT, VT, Expand);

698

setOperationAction(ISD::FNEARBYINT, VT, Expand);

699

setOperationAction(ISD::SMUL_LOHI, VT, Expand);

700

setOperationAction(ISD::MULHS, VT, Expand);

701

setOperationAction(ISD::UMUL_LOHI, VT, Expand);

702

setOperationAction(ISD::MULHU, VT, Expand);

703

setOperationAction(ISD::SDIVREM, VT, Expand);

704

setOperationAction(ISD::UDIVREM, VT, Expand);

705

setOperationAction(ISD::CTPOP, VT, Expand);

706

setOperationAction(ISD::CTTZ, VT, Expand);

707

setOperationAction(ISD::CTLZ, VT, Expand);

708

setOperationAction(ISD::ROTL, VT, Expand);

709

setOperationAction(ISD::ROTR, VT, Expand);

710

setOperationAction(ISD::BSWAP, VT, Expand);

711

setOperationAction(ISD::SETCC, VT, Expand);

712

setOperationAction(ISD::FP_TO_UINT, VT, Expand);

713

setOperationAction(ISD::FP_TO_SINT, VT, Expand);

714

setOperationAction(ISD::UINT_TO_FP, VT, Expand);

715

setOperationAction(ISD::SINT_TO_FP, VT, Expand);

716

setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);

717

setOperationAction(ISD::TRUNCATE, VT, Expand);

718

setOperationAction(ISD::SIGN_EXTEND, VT, Expand);

719

setOperationAction(ISD::ZERO_EXTEND, VT, Expand);

720

setOperationAction(ISD::ANY_EXTEND, VT, Expand);

721

setOperationAction(ISD::SELECT_CC, VT, Expand);

722

for (MVT InnerVT : MVT::vector_valuetypes()) {

723

setTruncStoreAction(InnerVT, VT, Expand);

724

725

setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);

726

setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

727

728

// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like

729

// types, we have to deal with them whether we ask for Expansion or not.

730

// Setting Expand causes its own optimisation problems though, so leave

731

// them legal.

732

if (VT.getVectorElementType() == MVT::i1)

733

setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

734

735

// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are

736

// split/scalarized right now.

737

if (VT.getVectorElementType() == MVT::f16)

738

setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

739

}

740

}

741

742

// FIXME: In order to prevent SSE instructions being expanded to MMX ones

743

// with -msoft-float, disable use of MMX as well.

744

if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {

745

addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);

746

// No operations on x86mmx supported, everything uses intrinsics.

747

}

748

749

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {

750

addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass

751

: &X86::VR128RegClass);

752

753

setOperationAction(ISD::FNEG, MVT::v4f32, Custom);

754

setOperationAction(ISD::FABS, MVT::v4f32, Custom);

755

setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);

756

setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

757

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);

758

setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);

759

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

760

setOperationAction(ISD::SELECT, MVT::v4f32, Custom);

761

setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

762

}

763

764

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {

765

addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass

766

: &X86::VR128RegClass);

767

768

// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM

769

// registers cannot be used even for integer operations.

770

addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass

771

: &X86::VR128RegClass);

772

addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass

773

: &X86::VR128RegClass);

774

addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass

775

: &X86::VR128RegClass);

776

addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass

777

: &X86::VR128RegClass);

778

779

setOperationAction(ISD::MUL, MVT::v16i8, Custom);

780

setOperationAction(ISD::MUL, MVT::v4i32, Custom);

781

setOperationAction(ISD::MUL, MVT::v2i64, Custom);

782

setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);

783

setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);

784

setOperationAction(ISD::MULHU, MVT::v16i8, Custom);

785

setOperationAction(ISD::MULHS, MVT::v16i8, Custom);

786

setOperationAction(ISD::MULHU, MVT::v8i16, Legal);

787

setOperationAction(ISD::MULHS, MVT::v8i16, Legal);

788

setOperationAction(ISD::MUL, MVT::v8i16, Legal);

789

setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

790

setOperationAction(ISD::FABS, MVT::v2f64, Custom);

791

setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

792

793

setOperationAction(ISD::SMAX, MVT::v8i16, Legal);

794

setOperationAction(ISD::UMAX, MVT::v16i8, Legal);

795

setOperationAction(ISD::SMIN, MVT::v8i16, Legal);

796

setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

797

798

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);

799

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);

800

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

801

802

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

803

setOperationAction(ISD::SETCC, VT, Custom);

804

setOperationAction(ISD::CTPOP, VT, Custom);

805

setOperationAction(ISD::CTTZ, VT, Custom);

806

}

807

808

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {

809

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

810

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

811

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

812

setOperationAction(ISD::VSELECT, VT, Custom);

813

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

814

}

815

816

// We support custom legalizing of sext and anyext loads for specific

817

// memory vector types which we can load as a scalar (or sequence of

818

// scalars) and extend in-register to a legal 128-bit vector type. For sext

819

// loads these must work with a single scalar load.

820

for (MVT VT : MVT::integer_vector_valuetypes()) {

821

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);

822

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);

823

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);

824

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);

825

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);

826

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);

827

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);

828

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);

829

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);

830

}

831

832

for (auto VT : { MVT::v2f64, MVT::v2i64 }) {

833

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

834

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

835

setOperationAction(ISD::VSELECT, VT, Custom);

836

837

if (VT == MVT::v2i64 && !Subtarget.is64Bit())

838

continue;

839

840

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

841

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

842

}

843

844

// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.

845

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {

846

setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);

847

setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);

848

setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);

849

setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);

850

setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);

851

}

852

853

// Custom lower v2i64 and v2f64 selects.

854

setOperationAction(ISD::SELECT, MVT::v2f64, Custom);

855

setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

856

857

setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);

858

setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

859

860

setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

861

setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

862

863

setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

864

865

// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.

866

setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

867

868

setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);

869

setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

870

871

for (MVT VT : MVT::fp_vector_valuetypes())

872

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

873

874

setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);

875

setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);

876

setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

877

878

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);

879

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);

880

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

881

882

// In the customized shift lowering, the legal v4i32/v2i64 cases

883

// in AVX2 will be recognized.

884

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

885

setOperationAction(ISD::SRL, VT, Custom);

886

setOperationAction(ISD::SHL, VT, Custom);

887

setOperationAction(ISD::SRA, VT, Custom);

888

}

889

}

890

891

if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {

892

setOperationAction(ISD::ABS, MVT::v16i8, Legal);

893

setOperationAction(ISD::ABS, MVT::v8i16, Legal);

894

setOperationAction(ISD::ABS, MVT::v4i32, Legal);

895

setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);

896

setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);

897

setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);

898

setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);

899

setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

900

}

901

902

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {

903

for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {

904

setOperationAction(ISD::FFLOOR, RoundedTy, Legal);

905

setOperationAction(ISD::FCEIL, RoundedTy, Legal);

906

setOperationAction(ISD::FTRUNC, RoundedTy, Legal);

907

setOperationAction(ISD::FRINT, RoundedTy, Legal);

908

setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);

909

}

910

911

setOperationAction(ISD::SMAX, MVT::v16i8, Legal);

912

setOperationAction(ISD::SMAX, MVT::v4i32, Legal);

913

setOperationAction(ISD::UMAX, MVT::v8i16, Legal);

914

setOperationAction(ISD::UMAX, MVT::v4i32, Legal);

915

setOperationAction(ISD::SMIN, MVT::v16i8, Legal);

916

setOperationAction(ISD::SMIN, MVT::v4i32, Legal);

917

setOperationAction(ISD::UMIN, MVT::v8i16, Legal);

918

setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

919

920

// FIXME: Do we need to handle scalar-to-vector here?

921

setOperationAction(ISD::MUL, MVT::v4i32, Legal);

922

923

// We directly match byte blends in the backend as they match the VSELECT

924

// condition form.

925

setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

926

927

// SSE41 brings specific instructions for doing vector sign extend even in

928

// cases where we don't have SRA.

929

for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {

930

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);

931

setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);

932

}

933

934

for (MVT VT : MVT::integer_vector_valuetypes()) {

935

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);

936

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);

937

setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);

938

}

939

940

// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X

941

for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {

942

setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);

943

setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);

944

setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);

945

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);

946

setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);

947

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);

948

setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);

949

}

950

951

// i8 vectors are custom because the source register and source

952

// memory operand types are not the same width.

953

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);

954

}

955

956

if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {

957

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

958

MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })

959

setOperationAction(ISD::ROTL, VT, Custom);

960

961

// XOP can efficiently perform BITREVERSE with VPPERM.

962

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })

963

setOperationAction(ISD::BITREVERSE, VT, Custom);

964

965

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

966

MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })

967

setOperationAction(ISD::BITREVERSE, VT, Custom);

968

}

969

970

if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {

971

bool HasInt256 = Subtarget.hasInt256();

972

973

addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass

974

: &X86::VR256RegClass);

975

addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass

976

: &X86::VR256RegClass);

977

addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass

978

: &X86::VR256RegClass);

979

addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass

980

: &X86::VR256RegClass);

981

addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass

982

: &X86::VR256RegClass);

983

addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass

984

: &X86::VR256RegClass);

985

986

for (auto VT : { MVT::v8f32, MVT::v4f64 }) {

987

setOperationAction(ISD::FFLOOR, VT, Legal);

988

setOperationAction(ISD::FCEIL, VT, Legal);

989

setOperationAction(ISD::FTRUNC, VT, Legal);

990

setOperationAction(ISD::FRINT, VT, Legal);

991

setOperationAction(ISD::FNEARBYINT, VT, Legal);

992

setOperationAction(ISD::FNEG, VT, Custom);

993

setOperationAction(ISD::FABS, VT, Custom);

994

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

995

}

996

997

// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted

998

// even though v8i16 is a legal type.

999

setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);

1000

setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);

1001

setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

1002

1003

setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);

1004

setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

1005

1006

for (MVT VT : MVT::fp_vector_valuetypes())

1007

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

1008

1009

// In the customized shift lowering, the legal v8i32/v4i64 cases

1010

// in AVX2 will be recognized.

1011

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1012

setOperationAction(ISD::SRL, VT, Custom);

1013

setOperationAction(ISD::SHL, VT, Custom);

1014

setOperationAction(ISD::SRA, VT, Custom);

1015

}

1016

1017

setOperationAction(ISD::SELECT, MVT::v4f64, Custom);

1018

setOperationAction(ISD::SELECT, MVT::v4i64, Custom);

1019

setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

1020

1021

for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1022

setOperationAction(ISD::SIGN_EXTEND, VT, Custom);

1023

setOperationAction(ISD::ZERO_EXTEND, VT, Custom);

1024

setOperationAction(ISD::ANY_EXTEND, VT, Custom);

1025

}

1026

1027

setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);

1028

setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);

1029

setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

1030

setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

1031

1032

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1033

setOperationAction(ISD::SETCC, VT, Custom);

1034

setOperationAction(ISD::CTPOP, VT, Custom);

1035

setOperationAction(ISD::CTTZ, VT, Custom);

1036

setOperationAction(ISD::CTLZ, VT, Custom);

1037

}

1038

1039

if (Subtarget.hasAnyFMA()) {

1040

for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,

1041

MVT::v2f64, MVT::v4f64 })

1042

setOperationAction(ISD::FMA, VT, Legal);

1043

}

1044

1045

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {

1046

setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);

1047

setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);

1048

}

1049

1050

setOperationAction(ISD::MUL, MVT::v4i64, Custom);

1051

setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);

1052

setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);

1053

setOperationAction(ISD::MUL, MVT::v32i8, Custom);

1054

1055

setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);

1056

setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

1057

1058

setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);

1059

setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);

1060

setOperationAction(ISD::MULHU, MVT::v32i8, Custom);

1061

setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

1062

1063

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {

1064

setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);

1065

setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);

1066

setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);

1067

setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);

1068

setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);

1069

}

1070

1071

if (HasInt256) {

1072

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);

1073

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);

1074

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

1075

1076

// The custom lowering for UINT_TO_FP for v8i32 becomes interesting

1077

// when we have a 256bit-wide blend with immediate.

1078

setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

1079

1080

// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X

1081

for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {

1082

setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);

1083

setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);

1084

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);

1085

setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);

1086

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);

1087

setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);

1088

}

1089

}

1090

1091

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1092

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {

1093

setOperationAction(ISD::MLOAD, VT, Legal);

1094

setOperationAction(ISD::MSTORE, VT, Legal);

1095

}

1096

1097

// Extract subvector is special because the value type

1098

// (result) is 128-bit but the source is 256-bit wide.

1099

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,

1100

MVT::v4f32, MVT::v2f64 }) {

1101

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

1102

}

1103

1104

// Custom lower several nodes for 256-bit types.

1105

for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,

1106

MVT::v8f32, MVT::v4f64 }) {

1107

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1108

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1109

setOperationAction(ISD::VSELECT, VT, Custom);

1110

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1111

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1112

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

1113

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

1114

setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);

1115

}

1116

1117

if (HasInt256)

1118

setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

1119

1120

// Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.

1121

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {

1122

setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);

1123

setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);

1124

setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);

1125

setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);

1126

setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);

1127

}

1128

1129

if (HasInt256) {

1130

// Custom legalize 2x32 to get a little better code.

1131

setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);

1132

setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

1133

1134

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1135

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })

1136

setOperationAction(ISD::MGATHER, VT, Custom);

1137

}

1138

}

1139

1140

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {

1141

addRegisterClass(MVT::v16i32, &X86::VR512RegClass);

1142

addRegisterClass(MVT::v16f32, &X86::VR512RegClass);

1143

addRegisterClass(MVT::v8i64, &X86::VR512RegClass);

1144

addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

1145

1146

addRegisterClass(MVT::v1i1, &X86::VK1RegClass);

1147

addRegisterClass(MVT::v8i1, &X86::VK8RegClass);

1148

addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

1149

1150

for (MVT VT : MVT::fp_vector_valuetypes())

1151

setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

1152

1153

for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {

1154

setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);

1155

setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);

1156

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);

1157

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);

1158

setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);

1159

}

1160

1161

for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,

1162

MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,

1163

MVT::v8i64, MVT::v32i16, MVT::v64i8}) {

1164

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

1165

setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);

1166

setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);

1167

setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);

1168

setTruncStoreAction(VT, MaskVT, Custom);

1169

}

1170

1171

for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {

1172

setOperationAction(ISD::FNEG, VT, Custom);

1173

setOperationAction(ISD::FABS, VT, Custom);

1174

setOperationAction(ISD::FMA, VT, Legal);

1175

setOperationAction(ISD::FCOPYSIGN, VT, Custom);

1176

}

1177

1178

setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);

1179

setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);

1180

setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);

1181

setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);

1182

setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);

1183

setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);

1184

setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);

1185

setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);

1186

setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);

1187

setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);

1188

setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

1189

setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);

1190

setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

1191

setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);

1192

setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);

1193

setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);

1194

setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);

1195

setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);

1196

setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

1197

setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);

1198

setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

1199

1200

setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);

1201

setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);

1202

setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);

1203

setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);

1204

setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

1205

if (Subtarget.hasVLX()){

1206

setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);

1207

setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);

1208

setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);

1209

setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);

1210

setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

1211

1212

setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);

1213

setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);

1214

setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);

1215

setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);

1216

setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

1217

} else {

1218

for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1219

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {

1220

setOperationAction(ISD::MLOAD, VT, Custom);

1221

setOperationAction(ISD::MSTORE, VT, Custom);

1222

}

1223

}

1224

1225

if (Subtarget.hasDQI()) {

1226

for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {

1227

setOperationAction(ISD::SINT_TO_FP, VT, Legal);

1228

setOperationAction(ISD::UINT_TO_FP, VT, Legal);

1229

setOperationAction(ISD::FP_TO_SINT, VT, Legal);

1230

setOperationAction(ISD::FP_TO_UINT, VT, Legal);

1231

}

1232

if (Subtarget.hasVLX()) {

1233

// Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.

1234

setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);

1235

setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);

1236

setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);

1237

}

1238

}

1239

if (Subtarget.hasVLX()) {

1240

setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);

1241

setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);

1242

setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);

1243

setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

1244

}

1245

1246

setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);

1247

setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);

1248

setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);

1249

setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);

1250

setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);

1251

setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);

1252

setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);

1253

setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

1254

1255

setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);

1256

setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);

1257

setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);

1258

setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);

1259

setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

1260

setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);

1261

1262

for (auto VT : { MVT::v16f32, MVT::v8f64 }) {

1263

setOperationAction(ISD::FFLOOR, VT, Legal);

1264

setOperationAction(ISD::FCEIL, VT, Legal);

1265

setOperationAction(ISD::FTRUNC, VT, Legal);

1266

setOperationAction(ISD::FRINT, VT, Legal);

1267

setOperationAction(ISD::FNEARBYINT, VT, Legal);

1268

}

1269

1270

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);

1271

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

1272

1273

// Without BWI we need to use custom lowering to handle MVT::v64i8 input.

1274

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

1275

setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

1276

1277

setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);

1278

setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);

1279

setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);

1280

setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

1281

setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

1282

1283

setOperationAction(ISD::MUL, MVT::v8i64, Custom);

1284

setOperationAction(ISD::MUL, MVT::v16i32, Legal);

1285

1286

setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);

1287

setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

1288

1289

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);

1290

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);

1291

setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

1292

setOperationAction(ISD::SELECT, MVT::v8f64, Custom);

1293

setOperationAction(ISD::SELECT, MVT::v8i64, Custom);

1294

setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

1295

1296

1297

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1298

setOperationAction(ISD::ABS, MVT::v4i64, Legal);

1299

setOperationAction(ISD::ABS, MVT::v2i64, Legal);

1300

1301

for (auto VT : { MVT::v8i1, MVT::v16i1 }) {

1302

setOperationAction(ISD::ADD, VT, Custom);

1303

setOperationAction(ISD::SUB, VT, Custom);

1304

setOperationAction(ISD::MUL, VT, Custom);

1305

setOperationAction(ISD::SETCC, VT, Custom);

1306

setOperationAction(ISD::SELECT, VT, Custom);

1307

setOperationAction(ISD::TRUNCATE, VT, Custom);

1308

1309

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1310

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1311

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1312

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1313

setOperationAction(ISD::VSELECT, VT, Expand);

1314

}

1315

1316

for (auto VT : { MVT::v16i32, MVT::v8i64 }) {

1317

setOperationAction(ISD::SMAX, VT, Legal);

1318

setOperationAction(ISD::UMAX, VT, Legal);

1319

setOperationAction(ISD::SMIN, VT, Legal);

1320

setOperationAction(ISD::UMIN, VT, Legal);

1321

setOperationAction(ISD::ABS, VT, Legal);

1322

setOperationAction(ISD::SRL, VT, Custom);

1323

setOperationAction(ISD::SHL, VT, Custom);

1324

setOperationAction(ISD::SRA, VT, Custom);

1325

setOperationAction(ISD::CTPOP, VT, Custom);

1326

setOperationAction(ISD::CTTZ, VT, Custom);

1327

}

1328

1329

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1330

for (auto VT : { MVT::v2i64, MVT::v4i64 }) {

1331

setOperationAction(ISD::SMAX, VT, Legal);

1332

setOperationAction(ISD::UMAX, VT, Legal);

1333

setOperationAction(ISD::SMIN, VT, Legal);

1334

setOperationAction(ISD::UMIN, VT, Legal);

1335

}

1336

1337

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1338

for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,

1339

MVT::v8i64}) {

1340

setOperationAction(ISD::ROTL, VT, Custom);

1341

setOperationAction(ISD::ROTR, VT, Custom);

1342

}

1343

1344

// Need to promote to 64-bit even though we have 32-bit masked instructions

1345

// because the IR optimizers rearrange bitcasts around logic ops leaving

1346

// too many variations to handle if we don't promote them.

1347

setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);

1348

setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);

1349

setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

1350

1351

if (Subtarget.hasCDI()) {

1352

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1353

for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,

1354

MVT::v4i64, MVT::v8i64}) {

1355

setOperationAction(ISD::CTLZ, VT, Legal);

1356

setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);

1357

}

1358

} // Subtarget.hasCDI()

1359

1360

if (Subtarget.hasDQI()) {

1361

// NonVLX sub-targets extend 128/256 vectors to use the 512 version.

1362

setOperationAction(ISD::MUL, MVT::v2i64, Legal);

1363

setOperationAction(ISD::MUL, MVT::v4i64, Legal);

1364

setOperationAction(ISD::MUL, MVT::v8i64, Legal);

1365

}

1366

1367

if (Subtarget.hasVPOPCNTDQ()) {

1368

// VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512

1369

// version of popcntd/q.

1370

for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,

1371

MVT::v4i32, MVT::v2i64})

1372

setOperationAction(ISD::CTPOP, VT, Legal);

1373

}

1374

1375

// Custom lower several nodes.

1376

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,

1377

MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })

1378

setOperationAction(ISD::MSCATTER, VT, Custom);

1379

1380

setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v1i1, Legal);

1381

1382

// Extract subvector is special because the value type

1383

// (result) is 256-bit but the source is 512-bit wide.

1384

// 128-bit was made Legal under AVX1.

1385

for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,

1386

MVT::v8f32, MVT::v4f64 })

1387

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

1388

for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,

1389

MVT::v16i1, MVT::v32i1, MVT::v64i1 })

1390

setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

1391

1392

for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {

1393

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1394

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1395

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1396

setOperationAction(ISD::VSELECT, VT, Custom);

1397

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1398

setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

1399

setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

1400

setOperationAction(ISD::MLOAD, VT, Legal);

1401

setOperationAction(ISD::MSTORE, VT, Legal);

1402

setOperationAction(ISD::MGATHER, VT, Custom);

1403

setOperationAction(ISD::MSCATTER, VT, Custom);

1404

}

1405

for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {

1406

setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);

1407

setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);

1408

}

1409

}// has AVX-512

1410

1411

if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {

1412

addRegisterClass(MVT::v32i16, &X86::VR512RegClass);

1413

addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

1414

1415

addRegisterClass(MVT::v32i1, &X86::VK32RegClass);

1416

addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

1417

1418

setOperationAction(ISD::ADD, MVT::v32i1, Custom);

1419

setOperationAction(ISD::ADD, MVT::v64i1, Custom);

1420

setOperationAction(ISD::SUB, MVT::v32i1, Custom);

1421

setOperationAction(ISD::SUB, MVT::v64i1, Custom);

1422

setOperationAction(ISD::MUL, MVT::v32i1, Custom);

1423

setOperationAction(ISD::MUL, MVT::v64i1, Custom);

1424

1425

setOperationAction(ISD::SETCC, MVT::v32i1, Custom);

1426

setOperationAction(ISD::SETCC, MVT::v64i1, Custom);

1427

setOperationAction(ISD::MUL, MVT::v32i16, Legal);

1428

setOperationAction(ISD::MUL, MVT::v64i8, Custom);

1429

setOperationAction(ISD::MULHS, MVT::v32i16, Legal);

1430

setOperationAction(ISD::MULHU, MVT::v32i16, Legal);

1431

setOperationAction(ISD::MULHS, MVT::v64i8, Custom);

1432

setOperationAction(ISD::MULHU, MVT::v64i8, Custom);

1433

setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);

1434

setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);

1435

setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);

1436

setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);

1437

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);

1438

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);

1439

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);

1440

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);

1441

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);

1442

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);

1443

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);

1444

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);

1445

setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);

1446

setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);

1447

setOperationAction(ISD::SELECT, MVT::v32i1, Custom);

1448

setOperationAction(ISD::SELECT, MVT::v64i1, Custom);

1449

setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);

1450

setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);

1451

setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);

1452

setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);

1453

setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);

1454

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);

1455

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);

1456

setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);

1457

setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);

1458

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);

1459

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);

1460

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);

1461

setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);

1462

setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);

1463

setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);

1464

setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);

1465

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);

1466

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);

1467

setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);

1468

setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);

1469

setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);

1470

setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);

1471

setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

1472

1473

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

1474

1475

setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

1476

if (Subtarget.hasVLX()) {

1477

setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);

1478

setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

1479

}

1480

1481

LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;

1482

for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {

1483

setOperationAction(ISD::MLOAD, VT, Action);

1484

setOperationAction(ISD::MSTORE, VT, Action);

1485

}

1486

1487

for (auto VT : { MVT::v64i8, MVT::v32i16 }) {

1488

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1489

setOperationAction(ISD::VSELECT, VT, Custom);

1490

setOperationAction(ISD::ABS, VT, Legal);

1491

setOperationAction(ISD::SRL, VT, Custom);

1492

setOperationAction(ISD::SHL, VT, Custom);

1493

setOperationAction(ISD::SRA, VT, Custom);

1494

setOperationAction(ISD::MLOAD, VT, Legal);

1495

setOperationAction(ISD::MSTORE, VT, Legal);

1496

setOperationAction(ISD::CTPOP, VT, Custom);

1497

setOperationAction(ISD::CTTZ, VT, Custom);

1498

setOperationAction(ISD::CTLZ, VT, Custom);

1499

setOperationAction(ISD::SMAX, VT, Legal);

1500

setOperationAction(ISD::UMAX, VT, Legal);

1501

setOperationAction(ISD::SMIN, VT, Legal);

1502

setOperationAction(ISD::UMIN, VT, Legal);

1503

1504

setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);

1505

setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);

1506

setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);

1507

}

1508

1509

for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {

1510

setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);

1511

}

1512

1513

if (Subtarget.hasBITALG()) {

1514

for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v32i8,

1515

MVT::v16i16, MVT::v16i8, MVT::v8i16 })

1516

setOperationAction(ISD::CTPOP, VT, Legal);

1517

}

1518

}

1519

1520

if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {

1521

addRegisterClass(MVT::v4i1, &X86::VK4RegClass);

1522

addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

1523

1524

for (auto VT : { MVT::v2i1, MVT::v4i1 }) {

1525

setOperationAction(ISD::ADD, VT, Custom);

1526

setOperationAction(ISD::SUB, VT, Custom);

1527

setOperationAction(ISD::MUL, VT, Custom);

1528

setOperationAction(ISD::VSELECT, VT, Expand);

1529

1530

setOperationAction(ISD::TRUNCATE, VT, Custom);

1531

setOperationAction(ISD::SETCC, VT, Custom);

1532

setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);

1533

setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

1534

setOperationAction(ISD::SELECT, VT, Custom);

1535

setOperationAction(ISD::BUILD_VECTOR, VT, Custom);

1536

setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);

1537

}

1538

1539

setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);

1540

setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);

1541

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);

1542

setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

1543

}

1544

1545

// We want to custom lower some of our intrinsics.

1546

setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

1547

setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

1548

setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

1549

if (!Subtarget.is64Bit()) {

1550

setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);

1551

setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

1552

}

1553

1554

// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't

1555

// handle type legalization for these operations here.

1556

1557

// FIXME: We really should do custom legalization for addition and

1558

// subtraction on x86-32 once PR3203 is fixed. We really can't do much better

1559

// than generic legalization for 64-bit multiplication-with-overflow, though.

1560

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {

1561

if (VT == MVT::i64 && !Subtarget.is64Bit())

1562

continue;

1563

// Add/Sub/Mul with overflow operations are custom lowered.

1564

setOperationAction(ISD::SADDO, VT, Custom);

1565

setOperationAction(ISD::UADDO, VT, Custom);

1566

setOperationAction(ISD::SSUBO, VT, Custom);

1567

setOperationAction(ISD::USUBO, VT, Custom);

1568

setOperationAction(ISD::SMULO, VT, Custom);

1569

setOperationAction(ISD::UMULO, VT, Custom);

1570

1571

// Support carry in as value rather than glue.

1572

setOperationAction(ISD::ADDCARRY, VT, Custom);

1573

setOperationAction(ISD::SUBCARRY, VT, Custom);

1574

setOperationAction(ISD::SETCCCARRY, VT, Custom);

1575

}

1576

1577

if (!Subtarget.is64Bit()) {

1578

// These libcalls are not available in 32-bit.

1579

setLibcallName(RTLIB::SHL_I128, nullptr);

1580

setLibcallName(RTLIB::SRL_I128, nullptr);

1581

setLibcallName(RTLIB::SRA_I128, nullptr);

1582

setLibcallName(RTLIB::MUL_I128, nullptr);

1583

}

1584

1585

// Combine sin / cos into one node or libcall if possible.

1586

if (Subtarget.hasSinCos()) {

1587

setLibcallName(RTLIB::SINCOS_F32, "sincosf");

1588

setLibcallName(RTLIB::SINCOS_F64, "sincos");

1589

if (Subtarget.isTargetDarwin()) {

1590

// For MacOSX, we don't want the normal expansion of a libcall to sincos.

1591

// We want to issue a libcall to __sincos_stret to avoid memory traffic.

1592

setOperationAction(ISD::FSINCOS, MVT::f64, Custom);

1593

setOperationAction(ISD::FSINCOS, MVT::f32, Custom);

1594

}

1595

}

1596

1597

if (Subtarget.isTargetWin64()) {

1598

setOperationAction(ISD::SDIV, MVT::i128, Custom);

1599

setOperationAction(ISD::UDIV, MVT::i128, Custom);

1600

setOperationAction(ISD::SREM, MVT::i128, Custom);

1601

setOperationAction(ISD::UREM, MVT::i128, Custom);

1602

setOperationAction(ISD::SDIVREM, MVT::i128, Custom);

1603

setOperationAction(ISD::UDIVREM, MVT::i128, Custom);

1604

}

1605

1606

// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`

1607

// is. We should promote the value to 64-bits to solve this.

1608

// This is what the CRT headers do - `fmodf` is an inline header

1609

// function casting to f64 and calling `fmod`.

1610

if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||

1611

Subtarget.isTargetWindowsItanium()))

1612

for (ISD::NodeType Op :

1613

{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,

1614

ISD::FLOG10, ISD::FPOW, ISD::FSIN})

1615

if (isOperationExpand(Op, MVT::f32))

1616

setOperationAction(Op, MVT::f32, Promote);

1617

1618

// We have target-specific dag combine patterns for the following nodes:

1619

setTargetDAGCombine(ISD::VECTOR_SHUFFLE);

1620

setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

1621

setTargetDAGCombine(ISD::INSERT_SUBVECTOR);

1622

setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);

1623

setTargetDAGCombine(ISD::BITCAST);

1624

setTargetDAGCombine(ISD::VSELECT);

1625

setTargetDAGCombine(ISD::SELECT);

1626

setTargetDAGCombine(ISD::SHL);

1627

setTargetDAGCombine(ISD::SRA);

1628

setTargetDAGCombine(ISD::SRL);

1629

setTargetDAGCombine(ISD::OR);

1630

setTargetDAGCombine(ISD::AND);

1631

setTargetDAGCombine(ISD::ADD);

1632

setTargetDAGCombine(ISD::FADD);

1633

setTargetDAGCombine(ISD::FSUB);

1634

setTargetDAGCombine(ISD::FNEG);

1635

setTargetDAGCombine(ISD::FMA);

1636

setTargetDAGCombine(ISD::FMINNUM);

1637

setTargetDAGCombine(ISD::FMAXNUM);

1638

setTargetDAGCombine(ISD::SUB);

1639

setTargetDAGCombine(ISD::LOAD);

1640

setTargetDAGCombine(ISD::MLOAD);

1641

setTargetDAGCombine(ISD::STORE);

1642

setTargetDAGCombine(ISD::MSTORE);

1643

setTargetDAGCombine(ISD::TRUNCATE);

1644

setTargetDAGCombine(ISD::ZERO_EXTEND);

1645

setTargetDAGCombine(ISD::ANY_EXTEND);

1646

setTargetDAGCombine(ISD::SIGN_EXTEND);

1647

setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);

1648

setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);

1649

setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);

1650

setTargetDAGCombine(ISD::SINT_TO_FP);

1651

setTargetDAGCombine(ISD::UINT_TO_FP);

1652

setTargetDAGCombine(ISD::SETCC);

1653

setTargetDAGCombine(ISD::MUL);

1654

setTargetDAGCombine(ISD::XOR);

1655

setTargetDAGCombine(ISD::MSCATTER);

1656

setTargetDAGCombine(ISD::MGATHER);

1657

1658

computeRegisterProperties(Subtarget.getRegisterInfo());

1659

1660

MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores

1661

MaxStoresPerMemsetOptSize = 8;

1662

MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores

1663

MaxStoresPerMemcpyOptSize = 4;

1664

MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores

1665

MaxStoresPerMemmoveOptSize = 4;

1666

1667

// TODO: These control memcmp expansion in CGP and could be raised higher, but

1668

// that needs to benchmarked and balanced with the potential use of vector

1669

// load/store types (PR33329, PR33914).

1670

MaxLoadsPerMemcmp = 2;

1671

MaxLoadsPerMemcmpOptSize = 2;

1672

1673

// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).

1674

setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

1675

1676

// An out-of-order CPU can speculatively execute past a predictable branch,

1677

// but a conditional move could be stalled by an expensive earlier operation.

1678

PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();

1679

EnableExtLdPromotion = true;

1680

setPrefFunctionAlignment(4); // 2^4 bytes.

1681

1682

verifyIntrinsicTables();

1683

}

1684

1685

// This has so far only been implemented for 64-bit MachO.

1686

bool X86TargetLowering::useLoadStackGuardNode() const {

1687

return Subtarget.isTargetMachO() && Subtarget.is64Bit();

1688

}

1689

1690

bool X86TargetLowering::useStackGuardXorFP() const {

1691

// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.

1692

return Subtarget.getTargetTriple().isOSMSVCRT();

1693

}

1694

1695

/// Build a machine node that applies the XOR*_FP pseudo to \p Val.
///
/// \param DAG the selection DAG the node is created in.
/// \param Val the value fed to the XOR pseudo.
/// \param DL  debug location attached to the new node.
/// \returns result 0 of the new machine node, typed as the target pointer.
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  // Select the opcode that matches the subtarget's pointer width.
  unsigned Opcode = X86::XOR32_FP;
  if (Subtarget.is64Bit())
    Opcode = X86::XOR64_FP;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
}

1702

1703

/// Choose how an illegal vector type should be legalized.
///
/// When the experimental widening-legalization option is enabled, any vector
/// whose element count is not one and whose element type is not i1 is widened.
/// Every other case is delegated to the generic TargetLoweringBase policy.
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  // Compare the element type as an EVT. Going through getSimpleVT() here
  // would assert for vectors whose element type is an extended (non-MVT)
  // type, which can legitimately reach this hook before type legalization.
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

1712

1713

/// Compute the result type of a SETCC whose operands have type \p VT.
///
/// Scalars compare into i8. Vectors compare into a vXi1 mask when AVX-512
/// will provide a mask compare for the legalized type; otherwise the result
/// is \p VT with its element type changed to an integer.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext &Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  // Without AVX-512 there is no vXi1 compare to consider.
  if (!Subtarget.hasAVX512())
    return VT.changeVectorElementTypeToInteger();

  const unsigned NumElts = VT.getVectorNumElements();

  // Follow the type-legalization chain until a legal type is reached.
  EVT LegalVT = VT;
  while (getTypeAction(Context, LegalVT) != TypeLegal)
    LegalVT = getTypeToTransformTo(Context, LegalVT);

  const MVT LegalSimpleVT = LegalVT.getSimpleVT();

  // A 512-bit vector always gets a vXi1 compare.
  if (LegalSimpleVT.is512BitVector())
    return EVT::getVectorVT(Context, MVT::i1, NumElts);

  // For narrower vectors with VLX, vXi32/vXi64 compares use vXi1 masks, and
  // BWI extends that to vXi16/vXi8 as well.
  if (LegalSimpleVT.isVector() && Subtarget.hasVLX()) {
    const MVT EltVT = LegalSimpleVT.getVectorElementType();
    if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
      return EVT::getVectorVT(Context, MVT::i1, NumElts);
  }

  return VT.changeVectorElementTypeToInteger();
}

1743

1744

/// Helper for getByValTypeAlignment to determine

1745

/// the desired ByVal argument alignment.

1746

static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {

1747

if (MaxAlign == 16)

1748

return;

1749

if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {

1750

if (VTy->getBitWidth() == 128)

1751

MaxAlign = 16;

1752

} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {

1753

unsigned EltAlign = 0;

1754

getMaxByValAlign(ATy->getElementType(), EltAlign);

1755

if (EltAlign > MaxAlign)

1756

MaxAlign = EltAlign;

1757

} else if (StructType *STy = dyn_cast<StructType>(Ty)) {

1758

for (auto *EltTy : STy->elements()) {

1759

unsigned EltAlign = 0;

1760

getMaxByValAlign(EltTy, EltAlign);

1761

if (EltAlign > MaxAlign)

1762

MaxAlign = EltAlign;

1763

if (MaxAlign == 16)

1764

break;

1765

}

1766

}

1767

}

1768

1769

/// Return the desired alignment for ByVal aggregate

1770

/// function arguments in the caller parameter area. For X86, aggregates

1771

/// that contain SSE vectors are placed at 16-byte boundaries while the rest

1772

/// are at 4-byte boundaries.

1773

unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,

1774

const DataLayout &DL) const {

1775

if (Subtarget.is64Bit()) {

1776

// Max of 8 and alignment of type.

1777

unsigned TyAlign = DL.getABITypeAlignment(Ty);

1778

if (TyAlign > 8)

1779

return TyAlign;

1780

return 8;

1781

}

1782

1783

unsigned Align = 4;

1784

if (Subtarget.hasSSE1())

1785

getMaxByValAlign(Ty, Align);

1786

return Align;

1787

}

1788

1789

/// Returns the target specific optimal type for load

1790

/// and store operations as a result of memset, memcpy, and memmove

1791

/// lowering. If DstAlign is zero that means it's safe to destination

1792

/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it

1793

/// means there isn't a need to check it against alignment requirement,

1794

/// probably because the source does not need to be loaded. If 'IsMemset' is

1795

/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that

1796

/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy

1797

/// source is constant so it does not need to be loaded.

1798

/// It returns EVT::Other if the type should be determined using generic

1799

/// target-independent logic.

1800

EVT

1801

X86TargetLowering::getOptimalMemOpType(uint64_t Size,

1802

unsigned DstAlign, unsigned SrcAlign,

1803

bool IsMemset, bool ZeroMemset,

1804

bool MemcpyStrSrc,

1805

MachineFunction &MF) const {

1806

const Function *F = MF.getFunction();

1807

if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {

1808

if (Size >= 16 &&

1809

(!Subtarget.isUnalignedMem16Slow() ||

1810

((DstAlign == 0 || DstAlign >= 16) &&

1811

(SrcAlign == 0 || SrcAlign >= 16)))) {

1812

// FIXME: Check if unaligned 32-byte accesses are slow.

1813

if (Size >= 32 && Subtarget.hasAVX()) {

1814

// Although this isn't a well-supported type for AVX1, we'll let

1815

// legalization and shuffle lowering produce the optimal codegen. If we

1816

// choose an optimal type with a vector element larger than a byte,

1817

// getMemsetStores() may create an intermediate splat (using an integer

1818

// multiply) before we splat as a vector.

1819

return MVT::v32i8;

1820

}

1821

if (Subtarget.hasSSE2())

1822

return MVT::v16i8;

1823

// TODO: Can SSE1 handle a byte vector?

1824

if (Subtarget.hasSSE1())

1825

return MVT::v4f32;

1826

} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&

1827

!Subtarget.is64Bit() && Subtarget.hasSSE2()) {

1828

// Do not use f64 to lower memcpy if source is string constant. It's

1829

// better to use i32 to avoid the loads.

1830

// Also, do not use f64 to lower memset unless this is a memset of zeros.

1831

// The gymnastics of splatting a byte value into an XMM register and then

1832

// only using 8-byte stores (because this is a CPU with slow unaligned

1833

// 16-byte accesses) makes that a loser.

1834

return MVT::f64;

1835

}

1836

}

1837

// This is a compromise. If we reach here, unaligned accesses may be slow on

1838

// this target. However, creating smaller, aligned accesses could be even

1839

// slower and would certainly be a lot more code.

1840

if (Subtarget.is64Bit() && Size >= 8)

1841

return MVT::i64;

1842

return MVT::i32;

1843

}

1844

1845

/// Report whether \p VT is safe to use as a memory-operation type.
/// f32 and f64 are safe only when the matching scalar-SSE flag is set; every
/// other type is accepted.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  switch (VT.SimpleTy) {
  case MVT::f32:
    return X86ScalarSSEf32;
  case MVT::f64:
    return X86ScalarSSEf64;
  default:
    return true;
  }
}

1852

1853

bool

1854

X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,

1855

unsigned,

1856

unsigned,

1857

bool *Fast) const {

1858

if (Fast) {

1859

switch (VT.getSizeInBits()) {

1860

default:

1861

// 8-byte and under are always assumed to be fast.

1862

*Fast = true;

1863

break;

1864

case 128:

1865

*Fast = !Subtarget.isUnalignedMem16Slow();

1866

break;

1867

case 256:

1868

*Fast = !Subtarget.isUnalignedMem32Slow();

1869

break;

1870

// TODO: What about AVX-512 (512-bit) accesses?

1871

}

1872

}

1873

// Misaligned accesses of any size are always allowed.

1874

return true;

1875

}

1876

1877

/// Return the entry encoding for a jump table in the

1878

/// current function. The returned value is a member of the

1879

/// MachineJumpTableInfo::JTEntryKind enum.

1880

unsigned X86TargetLowering::getJumpTableEncoding() const {

1881

// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF

1882

// symbol.

1883

if (isPositionIndependent() && Subtarget.isPICStyleGOT())

1884

return MachineJumpTableInfo::EK_Custom32;

1885

1886

// Otherwise, use the normal jump table encoding heuristics.

1887

return TargetLowering::getJumpTableEncoding();

1888

}

1889

1890

bool X86TargetLowering::useSoftFloat() const {

1891

return Subtarget.useSoftFloat();

1892

}

1893

1894

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,

1895

ArgListTy &Args) const {

1896

1897

// Only relabel X86-32 for C / Stdcall CCs.

1898

if (Subtarget.is64Bit())

1899

return;

1900

if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)

1901

return;

1902

unsigned ParamRegs = 0;

1903

if (auto *M = MF->getFunction()->getParent())

1904

ParamRegs = M->getNumberRegisterParameters();

1905

1906

// Mark the first N int arguments as having reg

1907

for (unsigned Idx = 0; Idx < Args.size(); Idx++) {

1908

Type *T = Args[Idx].Ty;

1909

if (T->isPointerTy() || T->isIntegerTy())

1910

if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {

1911

unsigned numRegs = 1;

1912

if (MF->getDataLayout().getTypeAllocSize(T) > 4)

1913

numRegs = 2;

1914

if (ParamRegs < numRegs)

1915

return;

1916

ParamRegs -= numRegs;

1917

Args[Idx].IsInReg = true;

1918

}

1919

}

1920

}

1921

1922

/// Produce the expression for one custom-encoded jump table entry: a @GOTOFF
/// reference to the symbol of \p MBB. Must only be called in GOT PIC mode.
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  auto *BlockSym = MBB->getSymbol();
  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

1932

1933

/// Returns relocation base for the given PIC jumptable.

1934

SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,

1935

SelectionDAG &DAG) const {

1936

if (!Subtarget.is64Bit())

1937

// This doesn't have SDLoc associated with it, but is not really the

1938

// same as a Register.

1939

return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),

1940

getPointerTy(DAG.getDataLayout()));

1941

return Table;

1942

}

1943

1944

/// This returns the relocation base for the given PIC jumptable,

1945

/// the same as getPICJumpTableRelocBase, but as an MCExpr.

1946

const MCExpr *X86TargetLowering::

1947

getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,

1948

MCContext &Ctx) const {

1949

// X86-64 uses RIP relative addressing based on the jump table label.

1950

if (Subtarget.isPICStyleRIPRel())

1951

return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

1952

1953

// Otherwise, the reference is relative to the PIC base.

1954

return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

1955

}

1956

1957

std::pair<const TargetRegisterClass *, uint8_t>

1958

X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,

1959

MVT VT) const {

1960

const TargetRegisterClass *RRC = nullptr;

1961

uint8_t Cost = 1;

1962

switch (VT.SimpleTy) {

1963

default:

1964

return TargetLowering::findRepresentativeClass(TRI, VT);

1965

case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:

1966

RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;

1967

break;

1968

case MVT::x86mmx:

1969

RRC = &X86::VR64RegClass;

1970

break;

1971

case MVT::f32: case MVT::f64:

1972

case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:

1973

case MVT::v4f32: case MVT::v2f64:

1974

case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:

1975

case MVT::v8f32: case MVT::v4f64:

1976

case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:

1977

case MVT::v16f32: case MVT::v8f64:

1978

RRC = &X86::VR128XRegClass;

1979

break;

1980

}

1981

return std::make_pair(RRC, Cost);

1982

}

1983

1984

unsigned X86TargetLowering::getAddressSpace() const {

1985

if (Subtarget.is64Bit())

1986

return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;

1987

return 256;

1988

}

1989

1990

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {

1991

return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||

1992

(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));

1993

}

1994

1995

/// Build a constant pointer for a fixed offset inside an address space.
///
/// \param IRB          builder supplying the LLVMContext.
/// \param Offset       value converted to the pointer, as a 32-bit integer.
/// \param AddressSpace address space of the resulting pointer type.
/// \returns an inttoptr constant of type "i8* in \p AddressSpace"-pointer
///          (a pointer, in \p AddressSpace, to an i8*).
static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  LLVMContext &Ctx = IRB.getContext();
  Constant *OffsetValue = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
}

2001

2002

/// Return the IR value that addresses the stack guard.
///
/// Targets with a dedicated stack guard slot get a segment-relative constant;
/// all others fall back to the TargetLowering default.
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return TargetLowering::getIRStackGuard(IRB);

  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  if (Subtarget.isTargetFuchsia())
    return SegmentOffset(IRB, 0x10, getAddressSpace());

  // %fs:0x28, unless we're using a Kernel code model, in which case
  // it's %gs:0x28. gs:0x14 on i386.
  const unsigned SlotOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
}

2020

2021

/// Insert the declarations the stack protector needs into module \p M.
///
/// For MSVC CRT targets this declares the `__security_cookie` global and the
/// `__security_check_cookie` function. Targets with a dedicated stack guard
/// slot need nothing. Everything else uses the TargetLowering default.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    // getOrInsertFunction hands back a cast constant rather than a Function
    // when the module already declares the symbol with another prototype, so
    // use dyn_cast and only adjust the declaration when we really have one;
    // cast<Function> would assert in that situation.
    auto *SecurityCheckCookie = dyn_cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    if (SecurityCheckCookie) {
      SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
      SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

2042

2043

/// Return the stack guard value used during SelectionDAG lowering: the
/// `__security_cookie` global on MSVC CRT targets, otherwise the
/// TargetLowering default.
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  if (!Subtarget.getTargetTriple().isOSMSVCRT())
    return TargetLowering::getSDagStackGuard(M);

  // MSVC CRT has a global variable holding security cookie.
  return M.getGlobalVariable("__security_cookie");
}

2049

2050

/// Return the function that validates the stack guard: the module's
/// `__security_check_cookie` on MSVC CRT targets, otherwise the
/// TargetLowering default.
Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  if (!Subtarget.getTargetTriple().isOSMSVCRT())
    return TargetLowering::getSSPStackGuardCheck(M);

  // MSVC CRT has a function to validate security cookie.
  return M.getFunction("__security_check_cookie");
}

2056

2057

/// Return the location of the SafeStack (unsafe stack) pointer.
///
/// Contiki uses the default location helper, Android and Fuchsia use fixed
/// segment-relative slots, and every other target defers to TargetLowering.
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // %gs:0x24 on i386
    const unsigned SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
  }

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget.isTargetFuchsia())
    return SegmentOffset(IRB, 0x18, getAddressSpace());

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

2079

2080

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,

2081

unsigned DestAS) const {

2082

assert(SrcAS != DestAS && "Expected different address spaces!")(static_cast <bool> (SrcAS != DestAS && "Expected different address spaces!"
) ? void (0) : __assert_fail ("SrcAS != DestAS && \"Expected different address spaces!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2082, __extension__ __PRETTY_FUNCTION__));

2083

2084

return SrcAS < 256 && DestAS < 256;

2085

}

2086

2087

//===----------------------------------------------------------------------===//

2088

// Return Value Calling Convention Implementation

2089

//===----------------------------------------------------------------------===//

2090

2091

#include "X86GenCallingConv.inc"

2092

2093

bool X86TargetLowering::CanLowerReturn(

2094

CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,

2095

const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {

2096

SmallVector<CCValAssign, 16> RVLocs;

2097

CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);

2098

return CCInfo.CheckReturn(Outs, RetCC_X86);

2099

}

2100

2101

/// Returns a zero-terminated list of scratch registers. The calling
/// convention argument is ignored; the list always contains only R11.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchList[] = {X86::R11, 0};
  return &ScratchList[0];
}

2105

2106

/// Lowers masks values (v*i1) to the local register values

2107

/// \returns DAG node after lowering to register type

2108

static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,

2109

const SDLoc &Dl, SelectionDAG &DAG) {

2110

EVT ValVT = ValArg.getValueType();

2111

2112

if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||

2113

(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {

2114

// Two stage lowering might be required

2115

// bitcast: v8i1 -> i8 / v16i1 -> i16

2116

// anyextend: i8 -> i32 / i16 -> i32

2117

EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;

2118

SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);

2119

if (ValLoc == MVT::i32)

2120

ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);

2121

return ValToCopy;

2122

} else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||

2123

(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {

2124

// One stage lowering is required

2125

// bitcast: v32i1 -> i32 / v64i1 -> i64

2126

return DAG.getBitcast(ValLoc, ValArg);

2127

} else

2128

return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);

2129

}

2130

2131

/// Breaks v64i1 value into two registers and adds the new node to the DAG

2132

static void Passv64i1ArgInRegs(

2133

const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,

2134

SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,

2135

CCValAssign &NextVA, const X86Subtarget &Subtarget) {

2136

assert(Subtarget.hasBWI() && "Expected AVX512BW target!")(static_cast <bool> (Subtarget.hasBWI() && "Expected AVX512BW target!"
) ? void (0) : __assert_fail ("Subtarget.hasBWI() && \"Expected AVX512BW target!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2136, __extension__ __PRETTY_FUNCTION__));

2137

assert(Subtarget.is32Bit() && "Expecting 32 bit target")(static_cast <bool> (Subtarget.is32Bit() && "Expecting 32 bit target"
) ? void (0) : __assert_fail ("Subtarget.is32Bit() && \"Expecting 32 bit target\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2137, __extension__ __PRETTY_FUNCTION__));

2138

assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value")(static_cast <bool> (Arg.getValueType() == MVT::i64 &&
"Expecting 64 bit value") ? void (0) : __assert_fail ("Arg.getValueType() == MVT::i64 && \"Expecting 64 bit value\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2138, __extension__ __PRETTY_FUNCTION__));

2139

assert(VA.isRegLoc() && NextVA.isRegLoc() &&(static_cast <bool> (VA.isRegLoc() && NextVA.isRegLoc
() && "The value should reside in two registers") ? void
(0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The value should reside in two registers\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2140, __extension__ __PRETTY_FUNCTION__))

2140

"The value should reside in two registers")(static_cast <bool> (VA.isRegLoc() && NextVA.isRegLoc
() && "The value should reside in two registers") ? void
(0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The value should reside in two registers\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2140, __extension__ __PRETTY_FUNCTION__));

2141

2142

// Before splitting the value we cast it to i64

2143

Arg = DAG.getBitcast(MVT::i64, Arg);

2144

2145

// Splitting the value into two i32 types

2146

SDValue Lo, Hi;

2147

Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,

2148

DAG.getConstant(0, Dl, MVT::i32));

2149

Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,

2150

DAG.getConstant(1, Dl, MVT::i32));

2151

2152

// Attach the two i32 types into corresponding registers

2153

RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));

2154

RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));

2155

}

2156

2157

/// Lowers a function return into an X86ISD::RET_FLAG node (or X86ISD::IRET
/// for the X86_INTR calling convention).
///
/// The operands of the resulting node are, in order: the chain, the number of
/// bytes to pop on return, one entry per returned value (the value itself for
/// FP0/FP1 returns, a register operand otherwise), optionally the sret
/// register (RAX/EAX), any callee-saved registers handled via copy, and
/// finally the glue value if one was produced.
///
/// \param Chain   incoming chain.
/// \param Outs    description of the returned values.
/// \param OutVals the returned values, indexed in parallel with \p Outs.
/// \returns the return node.
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  // Note: I may advance twice in one iteration (custom v64i1 split below),
  // while OutsIndex always advances once, so the two can diverge.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      // Mask vectors (v*i1) need a dedicated lowering to a scalar register.
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      // The value occupies this location and the next one; consume both.
      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Passv64i1ArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  // Append the callee-saved registers that are preserved via copies; the list
  // is zero-terminated and may be null.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

2361

2362

/// Checks whether the single value produced by \p N is consumed only by
/// return nodes, through exactly one CopyToReg or FP_EXTEND user.
///
/// \param Chain [in,out] on success, replaced by the chain operand of the
///              CopyToReg user (left as passed in for an FP_EXTEND user);
///              untouched when the function returns false.
/// \returns true when the pattern matches.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  const bool OneValueOneUse =
      N->getNumValues() == 1 && N->hasNUsesOfValue(1, 0);
  if (!OneValueOneUse)
    return false;

  SDNode *User = *N->use_begin();
  SDValue CandidateChain = Chain;

  const unsigned UserOpcode = User->getOpcode();
  if (UserOpcode == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    const unsigned LastIdx = User->getNumOperands() - 1;
    if (User->getOperand(LastIdx).getValueType() == MVT::Glue)
      return false;
    CandidateChain = User->getOperand(0);
  } else if (UserOpcode != ISD::FP_EXTEND) {
    return false;
  }

  // There must be at least one return using the intermediate node.
  if (User->use_empty())
    return false;

  SDNode::use_iterator It = User->use_begin();
  const SDNode::use_iterator End = User->use_end();
  for (; It != End; ++It) {
    if (It->getOpcode() != X86ISD::RET_FLAG)
      return false;

    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    const unsigned NumOps = It->getNumOperands();
    if (NumOps > 4)
      return false;
    // With exactly four operands the last one has to be glue.
    if (NumOps == 4 && It->getOperand(NumOps - 1).getValueType() != MVT::Glue)
      return false;
  }

  Chain = CandidateChain;
  return true;
}

2398

2399

/// Picks the type a return value of type \p VT should be extended to.
/// \p ExtendKind is not consulted.
///
/// \returns \p VT itself unless it is narrower than the minimum register
/// type selected below, in which case that minimum type is returned.
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();

  // The ABI does not require i1, i8 or i16 to be extended.
  //
  // On Darwin, there is code in the wild relying on Clang's old behaviour of
  // always extending i8/i16 return values, so keep doing that for now.
  // (PR26665).
  const bool KeepNarrow =
      VT == MVT::i1 || (!IsDarwin && (VT == MVT::i8 || VT == MVT::i16));

  MVT FloorVT = MVT::i32;
  if (KeepNarrow)
    FloorVT = MVT::i8;

  EVT MinimumVT = getRegisterType(Context, FloorVT);
  if (VT.bitsLT(MinimumVT))
    return MinimumVT;
  return VT;
}

2416

2417

/// Reads two 32 bit registers and creates a 64 bit mask value.

2418

/// \param VA The current 32 bit value that need to be assigned.

2419

/// \param NextVA The next 32 bit value that need to be assigned.

2420

/// \param Root The parent DAG node.

2421

/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for

2422

/// glue purposes. In the case the DAG is already using

2423

/// physical register instead of virtual, we should glue

2424

/// our new SDValue to InFlag SDvalue.

2425

/// \return a new SDvalue of size 64bit.

2426

static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,

2427

SDValue &Root, SelectionDAG &DAG,

2428

const SDLoc &Dl, const X86Subtarget &Subtarget,

2429

SDValue *InFlag = nullptr) {

2430

assert((Subtarget.hasBWI()) && "Expected AVX512BW target!")(static_cast <bool> ((Subtarget.hasBWI()) && "Expected AVX512BW target!"
) ? void (0) : __assert_fail ("(Subtarget.hasBWI()) && \"Expected AVX512BW target!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2430, __extension__ __PRETTY_FUNCTION__));

2431

2432

assert(VA.getValVT() == MVT::v64i1 &&(static_cast <bool> (VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type") ? void (0) :
__assert_fail ("VA.getValVT() == MVT::v64i1 && \"Expecting first location of 64 bit width type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2433, __extension__ __PRETTY_FUNCTION__))

2433

"Expecting first location of 64 bit width type")(static_cast <bool> (VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type") ? void (0) :
__assert_fail ("VA.getValVT() == MVT::v64i1 && \"Expecting first location of 64 bit width type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2433, __extension__ __PRETTY_FUNCTION__));

2434

assert(NextVA.getValVT() == VA.getValVT() &&(static_cast <bool> (NextVA.getValVT() == VA.getValVT()
&& "The locations should have the same type") ? void
(0) : __assert_fail ("NextVA.getValVT() == VA.getValVT() && \"The locations should have the same type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2435, __extension__ __PRETTY_FUNCTION__))

2435

"The locations should have the same type")(static_cast <bool> (NextVA.getValVT() == VA.getValVT()
&& "The locations should have the same type") ? void
(0) : __assert_fail ("NextVA.getValVT() == VA.getValVT() && \"The locations should have the same type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2435, __extension__ __PRETTY_FUNCTION__));

2436

assert(VA.isRegLoc() && NextVA.isRegLoc() &&(static_cast <bool> (VA.isRegLoc() && NextVA.isRegLoc
() && "The values should reside in two registers") ? void
(0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The values should reside in two registers\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2437, __extension__ __PRETTY_FUNCTION__))

2437

"The values should reside in two registers")(static_cast <bool> (VA.isRegLoc() && NextVA.isRegLoc
() && "The values should reside in two registers") ? void
(0) : __assert_fail ("VA.isRegLoc() && NextVA.isRegLoc() && \"The values should reside in two registers\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2437, __extension__ __PRETTY_FUNCTION__));

2438

2439

SDValue Lo, Hi;

2440

unsigned Reg;

2441

SDValue ArgValueLo, ArgValueHi;

2442

2443

MachineFunction &MF = DAG.getMachineFunction();

2444

const TargetRegisterClass *RC = &X86::GR32RegClass;

2445

2446

// Read a 32 bit value from the registers

2447

if (nullptr == InFlag) {

2448

// When no physical register is present,

2449

// create an intermediate virtual register

2450

Reg = MF.addLiveIn(VA.getLocReg(), RC);

2451

ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);

2452

Reg = MF.addLiveIn(NextVA.getLocReg(), RC);

2453

ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);

2454

} else {

2455

// When a physical register is available read the value from it and glue

2456

// the reads together.

2457

ArgValueLo =

2458

DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);

2459

*InFlag = ArgValueLo.getValue(2);

2460

ArgValueHi =

2461

DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);

2462

*InFlag = ArgValueHi.getValue(2);

2463

}

2464

2465

// Convert the i32 type into v32i1 type

2466

Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

2467

2468

// Convert the i32 type into v32i1 type

2469

Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

2470

2471

// Concatenate the two values together

2472

return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);

2473

}

2474

2475

/// The function will lower a register of various sizes (8/16/32/64)

2476

/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)

2477

/// \returns a DAG node contains the operand after lowering to mask type.

2478

static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,

2479

const EVT &ValLoc, const SDLoc &Dl,

2480

SelectionDAG &DAG) {

2481

SDValue ValReturned = ValArg;

2482

2483

if (ValVT == MVT::v1i1)

2484

return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

2485

2486

if (ValVT == MVT::v64i1) {

2487

// In 32 bit machine, this case is handled by getv64i1Argument

2488

assert(ValLoc == MVT::i64 && "Expecting only i64 locations")(static_cast <bool> (ValLoc == MVT::i64 && "Expecting only i64 locations"
) ? void (0) : __assert_fail ("ValLoc == MVT::i64 && \"Expecting only i64 locations\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2488, __extension__ __PRETTY_FUNCTION__));

2489

// In 64 bit machine, There is no need to truncate the value only bitcast

2490

} else {

2491

MVT maskLen;

2492

switch (ValVT.getSimpleVT().SimpleTy) {

2493

case MVT::v8i1:

2494

maskLen = MVT::i8;

2495

break;

2496

case MVT::v16i1:

2497

maskLen = MVT::i16;

2498

break;

2499

case MVT::v32i1:

2500

maskLen = MVT::i32;

2501

break;

2502

default:

2503

llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2503);

2504

}

2505

2506

ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);

2507

}

2508

return DAG.getBitcast(ValVT, ValReturned);

2509

}

2510

2511

/// Lower the result values of a call into the

2512

/// appropriate copies out of appropriate physical registers.

2513

///

2514

SDValue X86TargetLowering::LowerCallResult(

2515

SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,

2516

const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,

2517

SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,

2518

uint32_t *RegMask) const {

2519

2520

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

2521

// Assign locations to each value returned by this call.

2522

SmallVector<CCValAssign, 16> RVLocs;

2523

bool Is64Bit = Subtarget.is64Bit();

2524

CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,

2525

*DAG.getContext());

2526

CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

2527

2528

// Copy all of the result registers out of their specified physreg.

2529

for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;

2530

++I, ++InsIndex) {

2531

CCValAssign &VA = RVLocs[I];

2532

EVT CopyVT = VA.getLocVT();

2533

2534

// In some calling conventions we need to remove the used registers

2535

// from the register mask.

2536

if (RegMask) {

2537

for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);

2538

SubRegs.isValid(); ++SubRegs)

2539

RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

2540

}

2541

2542

// If this is x86-64, and we disabled SSE, we can't return FP values

2543

if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&

2544

((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {

2545

errorUnsupported(DAG, dl, "SSE register return with SSE disabled");

2546

VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.

2547

}

2548

2549

// If we prefer to use the value in xmm registers, copy it out as f80 and

2550

// use a truncate to move it from fp stack reg to xmm reg.

2551

bool RoundAfterCopy = false;

2552

if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&

2553

isScalarFPTypeInSSEReg(VA.getValVT())) {

2554

if (!Subtarget.hasX87())

2555

report_fatal_error("X87 register return with X87 disabled");

2556

CopyVT = MVT::f80;

2557

RoundAfterCopy = (CopyVT != VA.getLocVT());

2558

}

2559

2560

SDValue Val;

2561

if (VA.needsCustom()) {

2562

2563

2564

Val =

2565

getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);

2566

} else {

2567

Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)

2568

.getValue(1);

2569

Val = Chain.getValue(0);

2570

InFlag = Chain.getValue(2);

2571

}

2572

2573

if (RoundAfterCopy)

2574

Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,

2575

// This truncation won't change the value.

2576

DAG.getIntPtrConstant(1, dl));

2577

2578

if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {

2579

if (VA.getValVT().isVector() &&

2580

((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||

2581

(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {

2582

// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8

2583

Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);

2584

} else

2585

Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);

2586

}

2587

2588

InVals.push_back(Val);

2589

}

2590

2591

return Chain;

2592

}

2593

2594

//===----------------------------------------------------------------------===//

2595

// C & StdCall & Fast Calling Convention implementation

2596

//===----------------------------------------------------------------------===//

2597

// StdCall calling convention seems to be standard for many Windows' API

2598

// routines and around. It differs from C calling convention just a little:

2599

// callee should clean up the stack, not caller. Symbols should be also

2600

// decorated in some fancy way :) It doesn't support any vector arguments.

2601

// For info on fast calling convention see Fast Calling Convention (tail call)

2602

// implementation LowerX86_32FastCCCallTo.

2603

2604

/// Result of classifying the struct-return ('sret') handling of a call or of
/// a function, as computed by callIsStructReturn / argsAreStructReturn.
enum StructReturnType {
  /// No arguments, or the first argument is not flagged sret.
  NotStructReturn,
  /// The first argument is sret and is flagged inreg, or the target is MCU.
  RegStructReturn,
  /// The first argument is sret and neither of the above conditions holds.
  StackStructReturn
};

2611

/// Classifies the struct-return semantics of a call from its outgoing
/// arguments. Only the first argument's flags are inspected.
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  const bool FirstIsSRet = !Outs.empty() && Outs[0].Flags.isSRet();
  if (!FirstIsSRet)
    return NotStructReturn;

  const bool UsesRegister = Outs[0].Flags.isInReg() || IsMCU;
  return UsesRegister ? RegStructReturn : StackStructReturn;
}

2623

2624

/// Determines whether a function uses struct return semantics.

2625

static StructReturnType

2626

argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {

2627

if (Ins.empty())

2628

return NotStructReturn;

2629

2630

const ISD::ArgFlagsTy &Flags = Ins[0].Flags;

2631

if (!Flags.isSRet())

2632

return NotStructReturn;

2633

if (Flags.isInReg() || IsMCU)

2634

return RegStructReturn;

2635

return StackStructReturn;

2636

}

2637

2638

/// Make a copy of an aggregate at address specified by "Src" to address

2639

/// "Dst" with size and alignment information specified by the specific

2640

/// parameter attribute. The copy will be passed as a byval function parameter.

2641

static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,

2642

SDValue Chain, ISD::ArgFlagsTy Flags,

2643

SelectionDAG &DAG, const SDLoc &dl) {

2644

SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

2645

2646

return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),

2647

/*isVolatile*/false, /*AlwaysInline=*/true,

2648

/*isTailCall*/false,

2649

MachinePointerInfo(), MachinePointerInfo());

2650

}

2651

2652

/// Return true if the calling convention is one that we can guarantee TCO for.

2653

static bool canGuaranteeTCO(CallingConv::ID CC) {

2654

return (CC == CallingConv::Fast || CC == CallingConv::GHC ||

2655

CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||

2656

CC == CallingConv::HHVM);

2657

}

2658

2659

/// Return true if we might ever do TCO for calls with this calling convention.

2660

static bool mayTailCallThisCC(CallingConv::ID CC) {

2661

switch (CC) {

2662

// C calling conventions:

2663

case CallingConv::C:

2664

case CallingConv::Win64:

2665

case CallingConv::X86_64_SysV:

2666

// Callee pop conventions:

2667

case CallingConv::X86_ThisCall:

2668

case CallingConv::X86_StdCall:

2669

case CallingConv::X86_VectorCall:

2670

case CallingConv::X86_FastCall:

2671

return true;

2672

default:

2673

return canGuaranteeTCO(CC);

2674

}

2675

}

2676

2677

/// Return true if the function is being made into a tailcall target by

2678

/// changing its ABI.

2679

static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {

2680

return GuaranteedTailCallOpt && canGuaranteeTCO(CC);

2681

}

2682

2683

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {

2684

auto Attr =

2685

CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");

2686

if (!CI->isTailCall() || Attr.getValueAsString() == "true")

2687

return false;

2688

2689

ImmutableCallSite CS(CI);

2690

CallingConv::ID CalleeCC = CS.getCallingConv();

2691

if (!mayTailCallThisCC(CalleeCC))

2692

return false;

2693

2694

return true;

2695

}

2696

2697

/// Lower the incoming argument Ins[i], whose location \p VA is in memory.
///
/// For byval arguments the result is the frame index of a fixed stack object
/// covering the argument. Otherwise the result is a load from a fixed stack
/// object at the argument's location offset (reusing or creating a
/// whole-argument object when the argument is a copy elision candidate). An
/// i1-based value that was extended in its location is converted back to its
/// value type after the load.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  const bool IsInterruptCC = CallConv == CallingConv::X86_INTR;

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  const bool ForceMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  const bool isImmutable = !ForceMutable && !Flags.isByVal();
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  const bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  // Type that is actually loaded from the stack slot.
  EVT ValVT;
  if (ExtendedInMem || VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (IsInterruptCC) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in regular call.
    // Offset of last argument need to be set to -4/-8 bytes.
    // Where offset of the first argument out of two, should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
    // The stack pointer needs to be realigned for 64 bit handlers with error
    // code, so the argument offset changes by 8 bytes.
    if (Subtarget.is64Bit() && Ins.size() == 2)
      Offset += 8;
  }

  if (Flags.isByVal()) {
    // Don't create zero-sized stack objects.
    const unsigned ByValSize = Flags.getByValSize();
    const unsigned Bytes = ByValSize == 0 ? 1 : ByValSize;
    int ByValFI =
        MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust SP offset of interrupt parameter.
    if (IsInterruptCC)
      MFI.setObjectOffset(ByValFI, Offset);
    return DAG.getFrameIndex(ByValFI, PtrVT);
  }

  // This is an argument in memory. We might be able to perform copy elision.
  if (Flags.isCopyElisionCandidate()) {
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      EVT ArgVT = Ins[i].ArgVT;
      int WholeFI = MFI.CreateFixedObject(ArgVT.getStoreSize(),
                                          VA.getLocMemOffset(),
                                          /*Immutable=*/false);
      SDValue PartAddr = DAG.getFrameIndex(WholeFI, PtrVT);
      return DAG.getLoad(ValVT, dl, Chain, PartAddr,
                         MachinePointerInfo::getFixedStack(
                             DAG.getMachineFunction(), WholeFI));
    }

    // This is not the first piece of an argument in memory. See if there is
    // already a fixed stack object including this offset. If so, assume it
    // was created by the PartOffset == 0 branch above and create a load from
    // the appropriate offset into it.
    const int64_t PartBegin = VA.getLocMemOffset();
    const int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int CoverFI = MFI.getObjectIndexBegin();
    while (MFI.isFixedObjectIndex(CoverFI)) {
      const int64_t ObjBegin = MFI.getObjectOffset(CoverFI);
      const int64_t ObjEnd = ObjBegin + MFI.getObjectSize(CoverFI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
      ++CoverFI;
    }

    // The index is still a fixed object only if the scan stopped on a match.
    if (MFI.isFixedObjectIndex(CoverFI)) {
      SDValue Addr =
          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(CoverFI, PtrVT),
                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
      return DAG.getLoad(ValVT, dl, Chain, Addr,
                         MachinePointerInfo::getFixedStack(
                             DAG.getMachineFunction(), CoverFI,
                             Ins[i].PartOffset));
    }
    // No covering object was found: fall through to the generic path below.
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  switch (VA.getLocInfo()) {
  case CCValAssign::ZExt:
    MFI.setObjectZExt(FI, true);
    break;
  case CCValAssign::SExt:
    MFI.setObjectSExt(FI, true);
    break;
  default:
    break;
  }

  // Adjust SP offset of interrupt parameter.
  if (IsInterruptCC)
    MFI.setObjectOffset(FI, Offset);

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  if (!ExtendedInMem)
    return Val;

  // The value was loaded with its location type; convert it back to the
  // value type (vector values via SCALAR_TO_VECTOR, scalars via TRUNCATE).
  if (VA.getValVT().isVector())
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val);
  return DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}

2820

2821

// FIXME: Get this from tablegen.

2822

static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,

2823

const X86Subtarget &Subtarget) {

2824

assert(Subtarget.is64Bit())(static_cast <bool> (Subtarget.is64Bit()) ? void (0) : __assert_fail
("Subtarget.is64Bit()", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2824, __extension__ __PRETTY_FUNCTION__));

2825

2826

if (Subtarget.isCallingConvWin64(CallConv)) {

2827

static const MCPhysReg GPR64ArgRegsWin64[] = {

2828

X86::RCX, X86::RDX, X86::R8, X86::R9

2829

};

2830

return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));

2831

}

2832

2833

static const MCPhysReg GPR64ArgRegs64Bit[] = {

2834

X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9

2835

};

2836

return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));

2837

}

2838

2839

// FIXME: Get this from tablegen.

2840

static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,

2841

CallingConv::ID CallConv,

2842

const X86Subtarget &Subtarget) {

2843

2844

if (Subtarget.isCallingConvWin64(CallConv)) {

2845

// The XMM registers which might contain var arg parameters are shadowed

2846

// in their paired GPR. So we only need to save the GPR to their home

2847

// slots.

2848

// TODO: __vectorcall will change this.

2849

return None;

2850

}

2851

2852

const Function *Fn = MF.getFunction();

2853

bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);

2854

bool isSoftFloat = Subtarget.useSoftFloat();

2855

assert(!(isSoftFloat && NoImplicitFloatOps) &&(static_cast <bool> (!(isSoftFloat && NoImplicitFloatOps
) && "SSE register cannot be used when SSE is disabled!"
) ? void (0) : __assert_fail ("!(isSoftFloat && NoImplicitFloatOps) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2856, __extension__ __PRETTY_FUNCTION__))

2856

"SSE register cannot be used when SSE is disabled!")(static_cast <bool> (!(isSoftFloat && NoImplicitFloatOps
) && "SSE register cannot be used when SSE is disabled!"
) ? void (0) : __assert_fail ("!(isSoftFloat && NoImplicitFloatOps) && \"SSE register cannot be used when SSE is disabled!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 2856, __extension__ __PRETTY_FUNCTION__));

2857

if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())

2858

// Kernel mode asks for SSE to be disabled, so there are no XMM argument

2859

// registers.

2860

return None;

2861

2862

static const MCPhysReg XMMArgRegs64Bit[] = {

2863

X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,

2864

X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7

2865

};

2866

return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));

2867

}

2868

2869

#ifndef NDEBUG

2870

static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {

2871

return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),

2872

[](const CCValAssign &A, const CCValAssign &B) -> bool {

2873

return A.getValNo() < B.getValNo();

2874

});

2875

}

2876

#endif

2877

2878

/// Lower the incoming (formal) arguments described by \p Ins.
///
/// Each argument is assigned a location with CC_X86 and materialized either
/// as a copy from a live-in register or through LowerMemArgument(); the
/// resulting values are appended to \p InVals. The function also records the
/// sret register, sets up the vararg register save area and musttail
/// forwarded registers when needed, and stores the stack size and
/// bytes-to-pop information in the X86MachineFunctionInfo. Returns the
/// updated chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const Function *Fn = MF.getFunction();

  // An externally visible "main" on Cygwin/MinGW targets always gets a frame
  // pointer.
  const bool IsCygMingMain = Fn->hasExternalLinkage() &&
                             Subtarget.isTargetCygMing() &&
                             Fn->getName() == "main";
  if (IsCygMingMain)
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const bool Is64Bit = Subtarget.is64Bit();
  const bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  const bool IsInterruptCC = CallConv == CallingConv::X86_INTR;
  const bool IsFastOrThisCall = CallConv == CallingConv::X86_FastCall ||
                                CallConv == CallingConv::X86_ThisCall;
  const bool GuaranteedTCO = MF.getTarget().Options.GuaranteedTailCallOpt;
  const MVT PtrVT = getPointerTy(DAG.getDataLayout());

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  if (IsInterruptCC) {
    // With two arguments, the second one must be pointer-sized.
    const MVT SecondArgVT = Is64Bit ? MVT::i64 : MVT::i32;
    const bool isLegal =
        Ins.size() == 1 || (Ins.size() == 2 && Ins[1].VT == SecondArgVT);
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
  }

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallConv == CallingConv::X86_VectorCall)
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  // Map the location type of a register argument to the register class used
  // for its live-in virtual register.
  auto SelectArgRegClass = [&](EVT RegVT) -> const TargetRegisterClass * {
    if (RegVT == MVT::i32)
      return &X86::GR32RegClass;
    if (Is64Bit && RegVT == MVT::i64)
      return &X86::GR64RegClass;
    if (RegVT == MVT::f32)
      return Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
    if (RegVT == MVT::f64)
      return Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
    if (RegVT == MVT::f80)
      return &X86::RFP80RegClass;
    if (RegVT == MVT::f128)
      return &X86::FR128RegClass;
    if (RegVT.is512BitVector())
      return &X86::VR512RegClass;
    if (RegVT.is256BitVector())
      return Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
    if (RegVT.is128BitVector())
      return Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
    if (RegVT == MVT::x86mmx)
      return &X86::VR64RegClass;
    if (RegVT == MVT::v1i1)
      return &X86::VK1RegClass;
    if (RegVT == MVT::v8i1)
      return &X86::VK8RegClass;
    if (RegVT == MVT::v16i1)
      return &X86::VK16RegClass;
    if (RegVT == MVT::v32i1)
      return &X86::VK32RegClass;
    if (RegVT == MVT::v64i1)
      return &X86::VK64RegClass;
    llvm_unreachable("Unknown argument type!");
  };

  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (!VA.isRegLoc()) {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    } else {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        // This consumes the next location as well, hence the extra increment.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        const TargetRegisterClass *RC = SelectArgRegClass(RegVT);
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      switch (VA.getLocInfo()) {
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
        break;
      default:
        break;
      }

      if (VA.isExtInLoc()) {
        const bool IsMaskInGPR =
            VA.getValVT().isVector() &&
            VA.getValVT().getScalarType() == MVT::i1 &&
            ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
             (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8));

        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) {
          // Handle MMX values passed in XMM regs.
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        } else if (IsMaskInGPR) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else {
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        }
      }
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
  if (CallConv != CallingConv::Swift) {
    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
      if (!Ins[I].Flags.isSRet())
        continue;

      // All x86 ABIs require that for returning structs by value we copy the
      // sret argument into %rax/%eax (depending on ABI) for the return. Save
      // the argument into a virtual register so that we can access it from the
      // return points.
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      // Only the first sret argument is handled.
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv, GuaranteedTCO))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() && (Is64Bit || !IsFastOrThisCall))
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    const unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    const unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location.  Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN =
        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, PtrVT, RSFIN,
                                DAG.getIntPtrConstant(Offset, dl));
      // Each store is chained on its own register copy (result #1).
      SDValue Store = DAG.getStore(
          Val.getValue(1), dl, Val, FIN,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
                                            FuncInfo->getRegSaveFrameIndex(),
                                            Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      // Operands: chain, AL, register save frame index, FP offset, then the
      // live XMM register copies.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(
          DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(
          DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.append(LiveXMMRegs.begin(), LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }

  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    const bool CCSupportsAVX512 =
        Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                    CallConv == CallingConv::Intel_OCL_BI);
    if (Subtarget.hasAVX512() && CCSupportsAVX512)
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Conservatively forward AL on x86_64, since it might be used for varargs.
    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &F : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }

  // Some CCs need callee pop.
  unsigned BytesToPop = 0; // By default the callee pops nothing.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTCO)) {
    BytesToPop = StackSize; // Callee pops everything.
  } else if (IsInterruptCC && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    BytesToPop = Is64Bit ? 16 : 4;
  } else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
             !Subtarget.getTargetTriple().isOSMSVCRT() &&
             argsAreStructReturn(Ins, Subtarget.isTargetMCU()) ==
                 StackStructReturn) {
    // If this is an sret function, the return should pop the hidden pointer.
    BytesToPop = 4;
  }
  FuncInfo->setBytesToPopOnReturn(BytesToPop);

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    // fastcc functions can't have varargs.
    if (IsFastOrThisCall)
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame).  Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // For regcall and "no_caller_saved_registers" functions, every live-in
  // register is removed from the callee-saved set.
  const bool DisableLiveInCSRs =
      CallConv == CallingConv::X86_RegCall ||
      Fn->hasFnAttribute("no_caller_saved_registers");
  if (DisableLiveInCSRs) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<unsigned, unsigned> LiveIn : MRI.liveins())
      MRI.disableCalleeSavedRegister(LiveIn.first);
  }

  return Chain;
}

3246

3247

/// Emit the node that places the outgoing argument \p Arg into its stack
/// location: the address is \p StackPtr plus the memory offset recorded in
/// \p VA. Byval arguments are copied with CreateCopyOfByValArgument();
/// everything else is written with a plain store.
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  const unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue OffsetNode = DAG.getIntPtrConstant(LocMemOffset, dl);
  SDValue SlotAddr = DAG.getNode(
      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

  if (!Flags.isByVal())
    return DAG.getStore(
        Chain, dl, Arg, SlotAddr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));

  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
}

3263

3264

/// Emit a load of return address if tail call

3265

/// optimization is performed and it is required.

3266

SDValue X86TargetLowering::EmitTailCallLoadRetAddr(

3267

SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,

3268

bool Is64Bit, int FPDiff, const SDLoc &dl) const {

3269

// Adjust the Return address stack slot.

3270

EVT VT = getPointerTy(DAG.getDataLayout());

3271

OutRetAddr = getReturnAddressFrameIndex(DAG);

3272

3273

// Load the "old" Return address.

3274

OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());

3275

return SDValue(OutRetAddr.getNode(), 1);

3276

}

3277

3278

/// Emit a store of the return address if tail call

3279

/// optimization is performed and it is required (FPDiff!=0).

3280

static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,

3281

SDValue Chain, SDValue RetAddrFrIdx,

3282

EVT PtrVT, unsigned SlotSize,

3283

int FPDiff, const SDLoc &dl) {

3284

// Store the return address to the appropriate stack slot.

3285

if (!FPDiff) return Chain;

3286

// Calculate the new stack slot for the return address.

3287

int NewReturnAddrFI =

3288

MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,

3289

false);

3290

SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);

3291

Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,

3292

MachinePointerInfo::getFixedStack(

3293

DAG.getMachineFunction(), NewReturnAddrFI));

3294

return Chain;

3295

}

3296

3297

/// Returns a vector_shuffle mask for an movs{s|d}, movd

3298

/// operation of specified width.

3299

static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,

3300

SDValue V2) {

3301

unsigned NumElems = VT.getVectorNumElements();

3302

SmallVector<int, 8> Mask;

3303

Mask.push_back(NumElems);

3304

for (unsigned i = 1; i != NumElems; ++i)

3305

Mask.push_back(i);

3306

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

3307

}

3308

3309

/// Lower the outgoing call described by \p CLI into SelectionDAG nodes.
///
/// The work is done in this order:
///   1. decide whether the call is a tail call / sibcall / musttail call,
///   2. assign a register or stack location to every outgoing argument,
///   3. emit CALLSEQ_START (skipped for sibcalls) and, for tail calls that
///      move the frame, load the return address,
///   4. promote each argument as its location requires and either queue it
///      for a register copy or store it to the stack,
///   5. for non-sibcall tail calls, store memory arguments to their final
///      slots and store the return address back,
///   6. glue the register copies together, legalize the callee operand and
///      build the register mask,
///   7. emit X86ISD::TC_RETURN (tail call), or X86ISD::CALL followed by
///      CALLSEQ_END and LowerCallResult.
///
/// \p CLI.IsTailCall is updated in place. For a non-tail call, handling of
/// the result values is delegated to LowerCallResult, which receives
/// \p InVals.
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  CallingConv::ID CallConv = CLI.CallConv;
  bool &isTailCall = CLI.IsTailCall;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
  bool IsSibcall = false;
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
  auto DisableTailCallsAttr =
      MF.getFunction()->getFnAttribute("disable-tail-calls");
  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;

  // The attribute may sit on the call instruction or on the called function.
  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
                 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

  if (CallConv == CallingConv::X86_INTR)
    report_fatal_error("X86 interrupts may not be called directly");

  if (DisableTailCallsAttr.getValueAsString() == "true")
    isTailCall = false;

  if (Subtarget.isPICStyleGOT() &&
      !MF.getTarget().Options.GuaranteedTailCallOpt) {
    // When a GOT is in use, tail calls to external symbols with default
    // visibility are disabled. Tail calling such a symbol needs a GOT
    // relocation, which forces early binding of the symbol and breaks code
    // that requires lazy function symbol resolution. musttail or
    // GuaranteedTailCallOpt override this.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }

  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
  if (IsMustTail) {
    // Force a tail call. The verifier rules are enough to guarantee that
    // this can be lowered without moving the return address around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check whether a tail call is really possible.
    isTailCall = IsEligibleForTailCallOptimization(
        Callee, CallConv, isVarArg, SR != NotStructReturn,
        MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins,
        DAG);

    // Sibcalls are automatically detected tail calls that need no ABI
    // changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze the operands of the call, assigning a location to each one.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate the shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Outs, CC_X86);

  // The vectorcall calling convention needs a second pass for the HVA types.
  if (CallingConv::X86_VectorCall == CallConv)
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);

  // Number of bytes that are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall) {
    // Sibcall: the memory operands are already available in the stack of the
    // caller's own caller.
    NumBytes = 0;
  } else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
             canGuaranteeTCO(CallConv)) {
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  }

  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Record the movement delta of the return-address stack slot, but only
    // when FPDiff is below the delta recorded so far.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // With an inalloca argument all stack space has already been allocated for
  // us and sits right at the top of the stack. Multiple arguments passed in
  // memory are not supported together with inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls that move the frame.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // The loop below assumes that the locations are in the same order as the
  // input arguments.

  // Walk the register/memloc assignments, inserting copies/loads. Stack
  // arguments of tail calls are handled later.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
    assert(OutIndex < Outs.size() && "Invalid Out index");

    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[I];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[OutIndex];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (Arg.getValueType().isVector() &&
          Arg.getValueType().getVectorElementType() == MVT::i1) {
        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
      } else if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getBitcast(MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else {
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument to a stack temporary and pass its address.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(
          Chain, dl, Arg, SpillSlot,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.needsCustom()) {
      // Split v64i1 value into two registers; this consumes the next
      // location as well.
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
                         Subtarget);
    } else if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument XMM register to be copied to
        // the corresponding shadow register if the callee is a varargs
        // function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      // Materialize the stack pointer copy lazily, on first use.
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy(DAG.getDataLayout()));
      MemOpChains.push_back(
          LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer in the EBX register before function
    // calls via the PLT.
    if (!isTailCall) {
      SDValue GOTBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                    getPointerTy(DAG.getDataLayout()));
      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), GOTBase));
    } else {
      // When tail calling with PIC/GOT style code, load the address of the
      // callee into ECX; the value in ECX is used as the target of the tail
      // jump. This circumvents the ebx/callee-saved problem for tail calls
      // on PIC/GOT architectures. Normally the GOT address would go into ebx
      // followed by a call to target@PLT, but for tail calls ebx would be
      // restored (it is callee saved) before jumping to target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasLocalLinkage() &&
          G->getGlobal()->hasDefaultVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...)
    // in the declaration) %al is used as hidden argument to specify the
    // number of SSE registers used. The contents of %al do not need to match
    // exactly the number of registers, but must be an upper bound on the
    // number of SSE registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget.hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    SDValue XMMCount = DAG.getConstant(NumXMMRegs, dl, MVT::i8);
    RegsToPass.push_back(std::make_pair(unsigned(X86::AL), XMMCount));
  }

  if (isVarArg && IsMustTail) {
    // Pass along the register parameters recorded for forwarding to
    // musttail calls.
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }

  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because each store then effectively depends on every
    // argument instead of just the arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
      CCValAssign &VA = ArgLocs[I];

      if (VA.isRegLoc()) {
        if (VA.needsCustom()) {
          assert((CallConv == CallingConv::X86_RegCall) &&
                 "Expecting custom case only in regcall calling convention");
          // One argument was passed through two register locations, so the
          // next location is skipped.
          ++I;
        }

        continue;
      }

      SDValue Arg = OutVals[OutsIndex];
      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
      // Skip inalloca arguments. They don't require any work.
      if (Flags.isInAlloca())
        continue;

      // Create the frame index of the final slot.
      int32_t Offset = VA.getLocMemOffset() + FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
      int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

      if (Flags.isByVal()) {
        // Copy relative to the frame pointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                        getPointerTy(DAG.getDataLayout()));
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                             StackPtr, Source);

        MemOpChains2.push_back(
            CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl));
      } else {
        // Store relative to the frame pointer.
        MemOpChains2.push_back(DAG.getStore(
            ArgChain, dl, Arg, FIN,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (const auto &RegAndVal : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, dl, RegAndVal.first, RegAndVal.second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, all calls have to go through a
    // register, since the call instruction's 32-bit pc-relative offset may
    // not be large enough to hold the whole address.
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.
    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);

    // Direct calls to dllimported functions should use an extra load in
    // non-JIT mode, so those are left alone here.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportStorageClass()) {
      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

      Callee = DAG.getTargetGlobalAddress(
          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

      if (OpFlags == X86II::MO_GOTPCREL) {
        // Add a wrapper.
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
                             getPointerTy(DAG.getDataLayout()), Callee);
        // Add extra indirection.
        Callee = DAG.getLoad(
            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
    unsigned char OpFlags =
        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

    Callee = DAG.getTargetExternalSymbol(
        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
  } else if (Subtarget.isTarget64BitILP32() &&
             Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit callee address to 64 bits according to the x32
    // ABI.
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known
  // live into the call.
  for (const auto &RegAndVal : RegsToPass)
    Ops.push_back(DAG.getRegister(RegAndVal.first,
                                  RegAndVal.second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
  // set X86_INTR calling convention because it has the same CSR mask
  // (same preserved registers).
  const uint32_t *Mask = RegInfo->getCallPreservedMask(
      MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  // If this is an invoke in a 32-bit function using a funclet-based
  // personality, assume the function clobbers all registers. If an exception
  // is thrown, the runtime will not restore CSRs.
  // FIXME: Model this more precisely so that we can register allocate across
  // the normal edge and spill and fill across the exceptional edge.
  if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
    const Function *CallerFn = MF.getFunction();
    EHPersonality Pers =
        CallerFn->hasPersonalityFn()
            ? classifyEHPersonality(CallerFn->getPersonalityFn())
            : EHPersonality::Unknown;
    if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }

  // Define a new register mask from the existing mask.
  uint32_t *RegMask = nullptr;

  // In some calling conventions the used physical registers have to be
  // removed from the reg mask.
  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

    // Allocate a new Reg Mask and copy Mask into it.
    RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
    memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

    // Make sure all sub registers of the argument registers are reset in
    // the RegMask.
    for (auto const &RegPair : RegsToPass) {
      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs) {
        const unsigned SubReg = *SubRegs;
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
      }
    }

    // Create the RegMask operand according to the updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else {
    // Create the RegMask operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  // Work out how many bytes the callee pops, for the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt)) {
    NumBytesForCalleeToPop = NumBytes; // Callee pops everything.
  } else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
             !Subtarget.getTargetTriple().isOSMSVCRT() &&
             SR == StackStructReturn) {
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  } else {
    NumBytesForCalleeToPop = 0; // Callee pops nothing.
  }

  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
    // No need to reset the stack after the call if the call doesn't return.
    // To make the MI verify, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(
        Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true),
        DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}

3849

3850

//===----------------------------------------------------------------------===//

3851

// Fast Calling Convention (tail call) implementation

3852

//===----------------------------------------------------------------------===//

3853

3854

// Like stdcall, the callee cleans up the arguments, except that ECX is

3855

// reserved for storing the tail called function address. Only 2 registers are

3856

// free for argument passing (inreg). Tail call optimization is performed

3857

// provided:

3858

// * tailcallopt is enabled

3859

// * caller/callee are fastcc

3860

// On X86_64 architecture with GOT-style position independent code only local

3861

// (within module) calls are supported at the moment.

3862

// To keep the stack aligned according to platform abi the function

3863

// GetAlignedArgumentStackSize ensures that argument delta is always multiples

3864

// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)

3865

// If a tail called function callee has more arguments than the caller the

3866

// caller needs to make sure that there is room to move the RETADDR to. This is

3867

// achieved by reserving an area the size of the argument delta right after the

3868

// original RETADDR, but before the saved framepointer or the spilled registers

3869

// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)

3870

// stack layout:

3871

// arg1

3872

// arg2

3873

// RETADDR

3874

// [ new RETADDR

3875

// move area ]

3876

// (possible EBP)

3877

// ESI

3878

// EDI

3879

// local1 ..

3880

3881

/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align

3882

/// requirement.

3883

unsigned

3884

X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,

3885

SelectionDAG& DAG) const {

3886

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

3887

const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

3888

unsigned StackAlignment = TFI.getStackAlignment();

3889

uint64_t AlignMask = StackAlignment - 1;

3890

int64_t Offset = StackSize;

3891

unsigned SlotSize = RegInfo->getSlotSize();

3892

if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {

3893

// Number smaller than 12 so just add the difference.

3894

Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));

3895

} else {

3896

// Mask out lower bits, add stackalignment once plus the 12 bytes.

3897

Offset = ((~AlignMask) & Offset) + StackAlignment +

3898

(StackAlignment-SlotSize);

3899

}

3900

return Offset;

3901

}

3902

3903

/// Return true if the given stack call argument is already available in the

3904

/// same position (relatively) of the caller's incoming argument stack.

3905

static

3906

bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,

3907

MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,

3908

const X86InstrInfo *TII, const CCValAssign &VA) {

3909

unsigned Bytes = Arg.getValueSizeInBits() / 8;

3910

3911

for (;;) {

3912

// Look through nodes that don't alter the bits of the incoming value.

3913

unsigned Op = Arg.getOpcode();

3914

if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {

3915

Arg = Arg.getOperand(0);

3916

continue;

3917

}

3918

if (Op == ISD::TRUNCATE) {

3919

const SDValue &TruncInput = Arg.getOperand(0);

3920

if (TruncInput.getOpcode() == ISD::AssertZext &&

3921

cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==

3922

Arg.getValueType()) {

3923

Arg = TruncInput.getOperand(0);

3924

continue;

3925

}

3926

}

3927

break;

3928

}

3929

3930

int FI = INT_MAX2147483647;

3931

if (Arg.getOpcode() == ISD::CopyFromReg) {

3932

unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();

3933

if (!TargetRegisterInfo::isVirtualRegister(VR))

3934

return false;

3935

MachineInstr *Def = MRI->getVRegDef(VR);

3936

if (!Def)

3937

return false;

3938

if (!Flags.isByVal()) {

3939

if (!TII->isLoadFromStackSlot(*Def, FI))

3940

return false;

3941

} else {

3942

unsigned Opcode = Def->getOpcode();

3943

if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||

3944

Opcode == X86::LEA64_32r) &&

3945

Def->getOperand(1).isFI()) {

3946

FI = Def->getOperand(1).getIndex();

3947

Bytes = Flags.getByValSize();

3948

} else

3949

return false;

3950

}

3951

} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {

3952

if (Flags.isByVal())

3953

// ByVal argument is passed in as a pointer but it's now being

3954

// dereferenced. e.g.

3955

// define @foo(%struct.X* %A) {

3956

// tail call @bar(%struct.X* byval %A)

3957

// }

3958

return false;

3959

SDValue Ptr = Ld->getBasePtr();

3960

FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);

3961

if (!FINode)

3962

return false;

3963

FI = FINode->getIndex();

3964

} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {

3965

FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);

3966

FI = FINode->getIndex();

3967

Bytes = Flags.getByValSize();

3968

} else

3969

return false;

3970

3971

assert(FI != INT_MAX)(static_cast <bool> (FI != 2147483647) ? void (0) : __assert_fail
("FI != INT_MAX", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 3971, __extension__ __PRETTY_FUNCTION__));

3972

if (!MFI.isFixedObjectIndex(FI))

3973

return false;

3974

3975

if (Offset != MFI.getObjectOffset(FI))

3976

return false;

3977

3978

// If this is not byval, check that the argument stack object is immutable.

3979

// inalloca and argument copy elision can create mutable argument stack

3980

// objects. Byval objects can be mutated, but a byval call intends to pass the

3981

// mutated memory.

3982

if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))

3983

return false;

3984

3985

if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {

3986

// If the argument location is wider than the argument type, check that any

3987

// extension flags match.

3988

if (Flags.isZExt() != MFI.isObjectZExt(FI) ||

3989

Flags.isSExt() != MFI.isObjectSExt(FI)) {

3990

return false;

3991

}

3992

}

3993

3994

return Bytes == MFI.getObjectSize(FI);

3995

}

3996

3997

/// Check whether the call is eligible for tail call optimization. Targets

3998

/// that want to do tail call optimization should implement this function.

3999

bool X86TargetLowering::IsEligibleForTailCallOptimization(

4000

SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,

4001

bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,

4002

const SmallVectorImpl<ISD::OutputArg> &Outs,

4003

const SmallVectorImpl<SDValue> &OutVals,

4004

const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {

4005

if (!mayTailCallThisCC(CalleeCC))

4006

return false;

4007

4008

// If -tailcallopt is specified, make fastcc functions tail-callable.

4009

MachineFunction &MF = DAG.getMachineFunction();

4010

const Function *CallerF = MF.getFunction();

4011

4012

// If the function return type is x86_fp80 and the callee return type is not,

4013

// then the FP_EXTEND of the call result is not a nop. It's not safe to

4014

// perform a tailcall optimization here.

4015

if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())

4016

return false;

4017

4018

CallingConv::ID CallerCC = CallerF->getCallingConv();

4019

bool CCMatch = CallerCC == CalleeCC;

4020

bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);

4021

bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

4022

4023

// Win64 functions have extra shadow space for argument homing. Don't do the

4024

// sibcall if the caller and callee have mismatched expectations for this

4025

// space.

4026

if (IsCalleeWin64 != IsCallerWin64)

4027

return false;

4028

4029

if (DAG.getTarget().Options.GuaranteedTailCallOpt) {

4030

if (canGuaranteeTCO(CalleeCC) && CCMatch)

4031

return true;

4032

return false;

4033

}

4034

4035

// Look for obvious safe cases to perform tail call optimization that do not

4036

// require ABI changes. This is what gcc calls sibcall.

4037

4038

// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to

4039

// emit a special epilogue.

4040

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

4041

if (RegInfo->needsStackRealignment(MF))

4042

return false;

4043

4044

// Also avoid sibcall optimization if either caller or callee uses struct

4045

// return semantics.

4046

if (isCalleeStructRet || isCallerStructRet)

4047

return false;

4048

4049

// Do not sibcall optimize vararg calls unless all arguments are passed via

4050

// registers.

4051

LLVMContext &C = *DAG.getContext();

4052

if (isVarArg && !Outs.empty()) {

4053

// Optimizing for varargs on Win64 is unlikely to be safe without

4054

// additional testing.

4055

if (IsCalleeWin64 || IsCallerWin64)

4056

return false;

4057

4058

SmallVector<CCValAssign, 16> ArgLocs;

4059

CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

4060

4061

CCInfo.AnalyzeCallOperands(Outs, CC_X86);

4062

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)

4063

if (!ArgLocs[i].isRegLoc())

4064

return false;

4065

}

4066

4067

// If the call result is in ST0 / ST1, it needs to be popped off the x87

4068

// stack. Therefore, if it's not used by the call it is not safe to optimize

4069

// this into a sibcall.

4070

bool Unused = false;

4071

for (unsigned i = 0, e = Ins.size(); i != e; ++i) {

4072

if (!Ins[i].Used) {

4073

Unused = true;

4074

break;

4075

}

4076

}

4077

if (Unused) {

4078

SmallVector<CCValAssign, 16> RVLocs;

4079

CCState CCInfo(CalleeCC, false, MF, RVLocs, C);

4080

CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

4081

for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {

4082

CCValAssign &VA = RVLocs[i];

4083

if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)

4084

return false;

4085

}

4086

}

4087

4088

// Check that the call results are passed in the same way.

4089

if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,

4090

RetCC_X86, RetCC_X86))

4091

return false;

4092

// The callee has to preserve all registers the caller needs to preserve.

4093

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

4094

const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

4095

if (!CCMatch) {

4096

const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);

4097

if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

4098

return false;

4099

}

4100

4101

unsigned StackArgsSize = 0;

4102

4103

// If the callee takes no arguments then go on to check the results of the

4104

// call.

4105

if (!Outs.empty()) {

4106

// Check if stack adjustment is needed. For now, do not do this if any

4107

// argument is passed on the stack.

4108

SmallVector<CCValAssign, 16> ArgLocs;

4109

CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

4110

4111

// Allocate shadow area for Win64

4112

if (IsCalleeWin64)

4113

CCInfo.AllocateStack(32, 8);

4114

4115

CCInfo.AnalyzeCallOperands(Outs, CC_X86);

4116

StackArgsSize = CCInfo.getNextStackOffset();

4117

4118

if (CCInfo.getNextStackOffset()) {

4119

// Check if the arguments are already laid out in the right way as

4120

// the caller's fixed stack objects.

4121

MachineFrameInfo &MFI = MF.getFrameInfo();

4122

const MachineRegisterInfo *MRI = &MF.getRegInfo();

4123

const X86InstrInfo *TII = Subtarget.getInstrInfo();

4124

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

4125

CCValAssign &VA = ArgLocs[i];

4126

SDValue Arg = OutVals[i];

4127

ISD::ArgFlagsTy Flags = Outs[i].Flags;

4128

if (VA.getLocInfo() == CCValAssign::Indirect)

4129

return false;

4130

if (!VA.isRegLoc()) {

4131

if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,

4132

MFI, MRI, TII, VA))

4133

return false;

4134

}

4135

}

4136

}

4137

4138

bool PositionIndependent = isPositionIndependent();

4139

// If the tailcall address may be in a register, then make sure it's

4140

// possible to register allocate for it. In 32-bit, the call address can

4141

// only target EAX, EDX, or ECX since the tail call must be scheduled after

4142

// callee-saved registers are restored. These happen to be the same

4143

// registers used to pass 'inreg' arguments so watch out for those.

4144

if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&

4145

!isa<ExternalSymbolSDNode>(Callee)) ||

4146

PositionIndependent)) {

4147

unsigned NumInRegs = 0;

4148

// In PIC we need an extra register to formulate the address computation

4149

// for the callee.

4150

unsigned MaxInRegs = PositionIndependent ? 2 : 3;

4151

4152

for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

4153

CCValAssign &VA = ArgLocs[i];

4154

if (!VA.isRegLoc())

4155

continue;

4156

unsigned Reg = VA.getLocReg();

4157

switch (Reg) {

4158

default: break;

4159

case X86::EAX: case X86::EDX: case X86::ECX:

4160

if (++NumInRegs == MaxInRegs)

4161

return false;

4162

break;

4163

}

4164

}

4165

}

4166

4167

const MachineRegisterInfo &MRI = MF.getRegInfo();

4168

if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))

4169

return false;

4170

}

4171

4172

bool CalleeWillPop =

4173

X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,

4174

MF.getTarget().Options.GuaranteedTailCallOpt);

4175

4176

if (unsigned BytesToPop =

4177

MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {

4178

// If we have bytes to pop, the callee must pop them.

4179

bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;

4180

if (!CalleePopMatches)

4181

return false;

4182

} else if (CalleeWillPop && StackArgsSize > 0) {

4183

// If we don't have bytes to pop, make sure the callee doesn't pop any.

4184

return false;

4185

}

4186

4187

return true;

4188

}

4189

4190

/// Create the X86 FastISel instance by forwarding \p funcInfo and \p libInfo
/// to X86::createFastISel.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
  return Selector;
}

4195

4196

//===----------------------------------------------------------------------===//

4197

// Other Lowering Hooks

4198

//===----------------------------------------------------------------------===//

4199

4200

/// Return true if \p Op has a single use and is a normal load.
static bool MayFoldLoad(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  return ISD::isNormalLoad(Op.getNode());
}

4203

4204

/// Return true if \p Op has a single use and that user is a normal store.
static bool MayFoldIntoStore(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  return ISD::isNormalStore(*Op.getNode()->use_begin());
}

4207

4208

/// Return true if \p Op has a single use and that user is an ISD::ZERO_EXTEND.
static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
  return UserOpcode == ISD::ZERO_EXTEND;
}

4215

4216

static bool isTargetShuffle(unsigned Opcode) {

4217

switch(Opcode) {

4218

default: return false;

4219

case X86ISD::BLENDI:

4220

case X86ISD::PSHUFB:

4221

case X86ISD::PSHUFD:

4222

case X86ISD::PSHUFHW:

4223

case X86ISD::PSHUFLW:

4224

case X86ISD::SHUFP:

4225

case X86ISD::INSERTPS:

4226

case X86ISD::EXTRQI:

4227

case X86ISD::INSERTQI:

4228

case X86ISD::PALIGNR:

4229

case X86ISD::VSHLDQ:

4230

case X86ISD::VSRLDQ:

4231

case X86ISD::MOVLHPS:

4232

case X86ISD::MOVHLPS:

4233

case X86ISD::MOVLPS:

4234

case X86ISD::MOVLPD:

4235

case X86ISD::MOVSHDUP:

4236

case X86ISD::MOVSLDUP:

4237

case X86ISD::MOVDDUP:

4238

case X86ISD::MOVSS:

4239

case X86ISD::MOVSD:

4240

case X86ISD::UNPCKL:

4241

case X86ISD::UNPCKH:

4242

case X86ISD::VBROADCAST:

4243

case X86ISD::VPERMILPI:

4244

case X86ISD::VPERMILPV:

4245

case X86ISD::VPERM2X128:

4246

case X86ISD::VPERMIL2:

4247

case X86ISD::VPERMI:

4248

case X86ISD::VPPERM:

4249

case X86ISD::VPERMV:

4250

case X86ISD::VPERMV3:

4251

case X86ISD::VPERMIV3:

4252

case X86ISD::VZEXT_MOVL:

4253

return true;

4254

}

4255

}

4256

4257

static bool isTargetShuffleVariableMask(unsigned Opcode) {

4258

switch (Opcode) {

4259

default: return false;

4260

// Target Shuffles.

4261

case X86ISD::PSHUFB:

4262

case X86ISD::VPERMILPV:

4263

case X86ISD::VPERMIL2:

4264

case X86ISD::VPPERM:

4265

case X86ISD::VPERMV:

4266

case X86ISD::VPERMV3:

4267

case X86ISD::VPERMIV3:

4268

return true;

4269

// 'Faux' Target Shuffles.

4270

case ISD::AND:

4271

case X86ISD::ANDNP:

4272

return true;

4273

}

4274

}

4275

4276

/// Return a FrameIndex node for the return address slot. The fixed stack
/// object is created the first time it is requested (recorded index 0 means
/// "not yet created") and its index is cached in X86MachineFunctionInfo.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  int RAIndex = FuncInfo->getRAIndex();
  if (RAIndex == 0) {
    // Set up a frame object for the return address: one slot wide, located
    // one slot below offset zero, and not immutable.
    const unsigned SlotSize = RegInfo->getSlotSize();
    const int64_t SlotOffset = -(int64_t)SlotSize;
    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, SlotOffset, false);
    FuncInfo->setRAIndex(RAIndex);
  }

  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
}

4293

4294

bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,

4295

bool hasSymbolicDisplacement) {

4296

// Offset should fit into 32 bit immediate field.

4297

if (!isInt<32>(Offset))

4298

return false;

4299

4300

// If we don't have a symbolic displacement - we don't have any extra

4301

// restrictions.

4302

if (!hasSymbolicDisplacement)

4303

return true;

4304

4305

// FIXME: Some tweaks might be needed for medium code model.

4306

if (M != CodeModel::Small && M != CodeModel::Kernel)

4307

return false;

4308

4309

// For small code model we assume that latest object is 16MB before end of 31

4310

// bits boundary. We may also accept pretty large negative constants knowing

4311

// that all objects are in the positive half of address space.

4312

if (M == CodeModel::Small && Offset < 16*1024*1024)

4313

return true;

4314

4315

// For kernel code model we know that all object resist in the negative half

4316

// of 32bits address space. We may not accept negative offsets, since they may

4317

// be just off and we may accept pretty large positive ones.

4318

if (M == CodeModel::Kernel && Offset >= 0)

4319

return true;

4320

4321

return false;

4322

}

4323

4324

/// Determines whether the callee is required to pop its own arguments.

4325

/// Callee pop is necessary to support tail calls.

4326

bool X86::isCalleePop(CallingConv::ID CallingConv,

4327

bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {

4328

// If GuaranteeTCO is true, we force some calls to be callee pop so that we

4329

// can guarantee TCO.

4330

if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))

4331

return true;

4332

4333

switch (CallingConv) {

4334

default:

4335

return false;

4336

case CallingConv::X86_StdCall:

4337

case CallingConv::X86_FastCall:

4338

case CallingConv::X86_ThisCall:

4339

case CallingConv::X86_VectorCall:

4340

return !is64Bit;

4341

}

4342

}

4343

4344

/// \brief Return true if the condition is an unsigned comparison operation.

4345

static bool isX86CCUnsigned(unsigned X86CC) {

4346

switch (X86CC) {

4347

default:

4348

llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4348);

4349

case X86::COND_E:

4350

case X86::COND_NE:

4351

case X86::COND_B:

4352

case X86::COND_A:

4353

case X86::COND_BE:

4354

case X86::COND_AE:

4355

return true;

4356

case X86::COND_G:

4357

case X86::COND_GE:

4358

case X86::COND_L:

4359

case X86::COND_LE:

4360

return false;

4361

}

4362

}

4363

4364

/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
/// Signed predicates map to G/GE/L/LE, unsigned ones to A/AE/B/BE, and
/// equality to E/NE. Any other predicate is invalid here.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  // Equality.
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETNE:  return X86::COND_NE;
  // Signed ordering.
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETLE:  return X86::COND_LE;
  // Unsigned ordering.
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETUGE: return X86::COND_AE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETULE: return X86::COND_BE;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}

4379

4380

/// Do a one-to-one translation of a ISD::CondCode to the X86-specific

4381

/// condition code, returning the condition code and the LHS/RHS of the

4382

/// comparison to make.

4383

static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,

4384

bool isFP, SDValue &LHS, SDValue &RHS,

4385

SelectionDAG &DAG) {

4386

if (!isFP) {

4387

if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {

4388

if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {

4389

// X > -1 -> X == 0, jump !sign.

4390

RHS = DAG.getConstant(0, DL, RHS.getValueType());

4391

return X86::COND_NS;

4392

}

4393

if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {

4394

// X < 0 -> X == 0, jump on sign.

4395

return X86::COND_S;

4396

}

4397

if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {

4398

// X < 1 -> X <= 0

4399

RHS = DAG.getConstant(0, DL, RHS.getValueType());

4400

return X86::COND_LE;

4401

}

4402

}

4403

4404

return TranslateIntegerX86CC(SetCCOpcode);

4405

}

4406

4407

// First determine if it is required or is profitable to flip the operands.

4408

4409

// If LHS is a foldable load, but RHS is not, flip the condition.

4410

if (ISD::isNON_EXTLoad(LHS.getNode()) &&

4411

!ISD::isNON_EXTLoad(RHS.getNode())) {

4412

SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);

4413

std::swap(LHS, RHS);

4414

}

4415

4416

switch (SetCCOpcode) {

4417

default: break;

4418

case ISD::SETOLT:

4419

case ISD::SETOLE:

4420

case ISD::SETUGT:

4421

case ISD::SETUGE:

4422

std::swap(LHS, RHS);

4423

break;

4424

}

4425

4426

// On a floating point condition, the flags are set as follows:

4427

// ZF PF CF op

4428

// 0 | 0 | 0 | X > Y

4429

// 0 | 0 | 1 | X < Y

4430

// 1 | 0 | 0 | X == Y

4431

// 1 | 1 | 1 | unordered

4432

switch (SetCCOpcode) {

4433

default: llvm_unreachable("Condcode should be pre-legalized away")::llvm::llvm_unreachable_internal("Condcode should be pre-legalized away"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4433);

4434

case ISD::SETUEQ:

4435

case ISD::SETEQ: return X86::COND_E;

4436

case ISD::SETOLT: // flipped

4437

case ISD::SETOGT:

4438

case ISD::SETGT: return X86::COND_A;

4439

case ISD::SETOLE: // flipped

4440

case ISD::SETOGE:

4441

case ISD::SETGE: return X86::COND_AE;

4442

case ISD::SETUGT: // flipped

4443

case ISD::SETULT:

4444

case ISD::SETLT: return X86::COND_B;

4445

case ISD::SETUGE: // flipped

4446

case ISD::SETULE:

4447

case ISD::SETLE: return X86::COND_BE;

4448

case ISD::SETONE:

4449

case ISD::SETNE: return X86::COND_NE;

4450

case ISD::SETUO: return X86::COND_P;

4451

case ISD::SETO: return X86::COND_NP;

4452

case ISD::SETOEQ:

4453

case ISD::SETUNE: return X86::COND_INVALID;

4454

}

4455

}

4456

4457

/// Is there a floating point cmov for the specific X86 condition code?

4458

/// Current x86 isa includes the following FP cmov instructions:

4459

/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.

4460

static bool hasFPCMov(unsigned X86CC) {

4461

switch (X86CC) {

4462

default:

4463

return false;

4464

case X86::COND_B:

4465

case X86::COND_BE:

4466

case X86::COND_E:

4467

case X86::COND_P:

4468

case X86::COND_A:

4469

case X86::COND_AE:

4470

case X86::COND_NE:

4471

case X86::COND_NP:

4472

return true;

4473

}

4474

}

4475

4476

4477

bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

4478

const CallInst &I,

4479

unsigned Intrinsic) const {

4480

4481

const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);

4482

if (!IntrData)

4483

return false;

4484

4485

Info.opc = ISD::INTRINSIC_W_CHAIN;

4486

Info.readMem = false;

4487

Info.writeMem = false;

4488

Info.vol = false;

4489

Info.offset = 0;

4490

4491

switch (IntrData->Type) {

4492

case EXPAND_FROM_MEM: {

4493

Info.ptrVal = I.getArgOperand(0);

4494

Info.memVT = MVT::getVT(I.getType());

4495

Info.align = 1;

4496

Info.readMem = true;

4497

break;

4498

}

4499

case COMPRESS_TO_MEM: {

4500

Info.ptrVal = I.getArgOperand(0);

4501

Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());

4502

Info.align = 1;

4503

Info.writeMem = true;

4504

break;

4505

}

4506

case TRUNCATE_TO_MEM_VI8:

4507

case TRUNCATE_TO_MEM_VI16:

4508

case TRUNCATE_TO_MEM_VI32: {

4509

Info.ptrVal = I.getArgOperand(0);

4510

MVT VT = MVT::getVT(I.getArgOperand(1)->getType());

4511

MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;

4512

if (IntrData->Type == TRUNCATE_TO_MEM_VI8)

4513

ScalarVT = MVT::i8;

4514

else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)

4515

ScalarVT = MVT::i16;

4516

else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)

4517

ScalarVT = MVT::i32;

4518

4519

Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());

4520

Info.align = 1;

4521

Info.writeMem = true;

4522

break;

4523

}

4524

default:

4525

return false;

4526

}

4527

4528

return true;

4529

}

4530

4531

/// Returns true if the target can instruction select the

4532

/// specified FP immediate natively. If false, the legalizer will

4533

/// materialize the FP immediate as a load from a constant pool.

4534

bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {

4535

for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {

4536

if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))

4537

return true;

4538

}

4539

return false;

4540

}

4541

4542

bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,

4543

ISD::LoadExtType ExtTy,

4544

EVT NewVT) const {

4545

// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF

4546

// relocation target a movq or addq instruction: don't let the load shrink.

4547

SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();

4548

if (BasePtr.getOpcode() == X86ISD::WrapperRIP)

4549

if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))

4550

return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

4551

return true;

4552

}

4553

4554

/// \brief Returns true if it is beneficial to convert a load of a constant

4555

/// to just the constant itself.

4556

bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,

4557

Type *Ty) const {

4558

assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
__assert_fail ("Ty->isIntegerTy()", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4558, __extension__ __PRETTY_FUNCTION__));

4559

4560

unsigned BitSize = Ty->getPrimitiveSizeInBits();

4561

if (BitSize == 0 || BitSize > 64)

4562

return false;

4563

return true;

4564

}

4565

4566

/// Return true unless \p VT is a vector type and the subtarget has AVX512.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
  return !IsAVX512Vector;
}

4574

4575

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,

4576

unsigned Index) const {

4577

if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))

4578

return false;

4579

4580

// Mask vectors support all subregister combinations and operations that

4581

// extract half of vector.

4582

if (ResVT.getVectorElementType() == MVT::i1)

4583

return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&

4584

(Index == ResVT.getVectorNumElements()));

4585

4586

return (Index % ResVT.getVectorNumElements()) == 0;

4587

}

4588

4589

bool X86TargetLowering::isCheapToSpeculateCttz() const {

4590

// Speculate cttz only if we can directly use TZCNT.

4591

return Subtarget.hasBMI();

4592

}

4593

4594

bool X86TargetLowering::isCheapToSpeculateCtlz() const {

4595

// Speculate ctlz only if we can directly use LZCNT.

4596

return Subtarget.hasLZCNT();

4597

}

4598

4599

bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,

4600

const SelectionDAG &DAG) const {

4601

// Do not merge to float value size (128 bytes) if no implicit

4602

// float attribute is set.

4603

bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(

4604

Attribute::NoImplicitFloat);

4605

4606

if (NoFloat) {

4607

unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;

4608

return (MemVT.getSizeInBits() <= MaxIntSize);

4609

}

4610

return true;

4611

}

4612

4613

bool X86TargetLowering::isCtlzFast() const {

4614

return Subtarget.hasFastLZCNT();

4615

}

4616

4617

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(

4618

const Instruction &AndI) const {

4619

return true;

4620

}

4621

4622

/// Return true if the subtarget has BMI and the type of \p Y is i32 or i64.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  const EVT VT = Y.getValueType();
  return VT == MVT::i32 || VT == MVT::i64;
}

4633

4634

/// Return the type to use for a fast equality comparison of \p NumBits bits,
/// or MVT::INVALID_SIMPLE_VALUE_TYPE if there is none.
///
/// A legal integer type of that width is preferred. Otherwise 128-bit and
/// 256-bit comparisons use v16i8 / v32i8 when those types are legal.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  const MVT IntVT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(IntVT))
    return IntVT;

  if (NumBits == 128) {
    // PMOVMSKB can handle this.
    if (isTypeLegal(MVT::v16i8))
      return MVT::v16i8;
  } else if (NumBits == 256) {
    // VPMOVMSKB can handle this.
    if (isTypeLegal(MVT::v32i8))
      return MVT::v32i8;
  }

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

4653

4654

/// Val is the undef sentinel value or equal to the specified value.

4655

static bool isUndefOrEqual(int Val, int CmpVal) {

4656

return ((Val == SM_SentinelUndef) || (Val == CmpVal));

4657

}

4658

4659

/// Val is either the undef or zero sentinel value.

4660

static bool isUndefOrZero(int Val) {

4661

return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));

4662

}

4663

4664

/// Return true if every element in Mask, beginning

4665

/// from position Pos and ending in Pos+Size is the undef sentinel value.

4666

static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {

4667

for (unsigned i = Pos, e = Pos + Size; i != e; ++i)

4668

if (Mask[i] != SM_SentinelUndef)

4669

return false;

4670

return true;

4671

}

4672

4673

/// Return true if Val is undef or if its value falls within the

4674

/// specified range (L, H].

4675

static bool isUndefOrInRange(int Val, int Low, int Hi) {

4676

return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);

4677

}

4678

4679

/// Return true if every element in Mask is undef or if its value

4680

/// falls within the specified range (L, H].

4681

static bool isUndefOrInRange(ArrayRef<int> Mask,

4682

int Low, int Hi) {

4683

for (int M : Mask)

4684

if (!isUndefOrInRange(M, Low, Hi))

4685

return false;

4686

return true;

4687

}

4688

4689

/// Return true if Val is undef, zero or if its value falls within the

4690

/// specified range (L, H].

4691

static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {

4692

return isUndefOrZero(Val) || (Val >= Low && Val < Hi);

4693

}

4694

4695

/// Return true if every element in Mask is undef, zero or if its value

4696

/// falls within the specified range (L, H].

4697

static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {

4698

for (int M : Mask)

4699

if (!isUndefOrZeroOrInRange(M, Low, Hi))

4700

return false;

4701

return true;

4702

}

4703

4704

/// Return true if every element in Mask, beginning

4705

/// from position Pos and ending in Pos+Size, falls within the specified

4706

/// sequential range (Low, Low+Size]. or is undef.

4707

static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,

4708

unsigned Pos, unsigned Size, int Low) {

4709

for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)

4710

if (!isUndefOrEqual(Mask[i], Low))

4711

return false;

4712

return true;

4713

}

4714

4715

/// Return true if every element in Mask, beginning

4716

/// from position Pos and ending in Pos+Size, falls within the specified

4717

/// sequential range (Low, Low+Size], or is undef or is zero.

4718

static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,

4719

unsigned Size, int Low) {

4720

for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)

4721

if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)

4722

return false;

4723

return true;

4724

}

4725

4726

/// Return true if every element in Mask, beginning

4727

/// from position Pos and ending in Pos+Size is undef or is zero.

4728

static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,

4729

unsigned Size) {

4730

for (unsigned i = Pos, e = Pos + Size; i != e; ++i)

4731

if (!isUndefOrZero(Mask[i]))

4732

return false;

4733

return true;

4734

}

4735

4736

/// \brief Helper function to test whether a shuffle mask could be

4737

/// simplified by widening the elements being shuffled.

4738

///

4739

/// Appends the mask for wider elements in WidenedMask if valid. Otherwise

4740

/// leaves it in an unspecified state.

4741

///

4742

/// NOTE: This must handle normal vector shuffle masks and *target* vector

4743

/// shuffle masks. The latter have the special property of a '-2' representing

4744

/// a zero-ed lane of a vector.

4745

static bool canWidenShuffleElements(ArrayRef<int> Mask,

4746

SmallVectorImpl<int> &WidenedMask) {

4747

WidenedMask.assign(Mask.size() / 2, 0);

4748

for (int i = 0, Size = Mask.size(); i < Size; i += 2) {

4749

int M0 = Mask[i];

4750

int M1 = Mask[i + 1];

4751

4752

// If both elements are undef, its trivial.

4753

if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {

4754

WidenedMask[i / 2] = SM_SentinelUndef;

4755

continue;

4756

}

4757

4758

// Check for an undef mask and a mask value properly aligned to fit with

4759

// a pair of values. If we find such a case, use the non-undef mask's value.

4760

if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {

4761

WidenedMask[i / 2] = M1 / 2;

4762

continue;

4763

}

4764

if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {

4765

WidenedMask[i / 2] = M0 / 2;

4766

continue;

4767

}

4768

4769

// When zeroing, we need to spread the zeroing across both lanes to widen.

4770

if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {

4771

if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&

4772

(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {

4773

WidenedMask[i / 2] = SM_SentinelZero;

4774

continue;

4775

}

4776

return false;

4777

}

4778

4779

// Finally check if the two mask values are adjacent and aligned with

4780

// a pair.

4781

if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {

4782

WidenedMask[i / 2] = M0 / 2;

4783

continue;

4784

}

4785

4786

// Otherwise we can't safely widen the elements used in this shuffle.

4787

return false;

4788

}

4789

assert(WidenedMask.size() == Mask.size() / 2 &&(static_cast <bool> (WidenedMask.size() == Mask.size() /
2 && "Incorrect size of mask after widening the elements!"
) ? void (0) : __assert_fail ("WidenedMask.size() == Mask.size() / 2 && \"Incorrect size of mask after widening the elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4790, __extension__ __PRETTY_FUNCTION__))

4790

"Incorrect size of mask after widening the elements!")(static_cast <bool> (WidenedMask.size() == Mask.size() /
2 && "Incorrect size of mask after widening the elements!"
) ? void (0) : __assert_fail ("WidenedMask.size() == Mask.size() / 2 && \"Incorrect size of mask after widening the elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4790, __extension__ __PRETTY_FUNCTION__));

4791

4792

return true;

4793

}

4794

4795

/// Returns true if Elt is a constant zero or a floating point constant +0.0.

4796

bool X86::isZeroNode(SDValue Elt) {

4797

return isNullConstant(Elt) || isNullFPConstant(Elt);

4798

}

4799

4800

// Build a vector of constants.

4801

// Use an UNDEF node if MaskElt == -1.

4802

// Split 64-bit constants in the 32-bit mode.

4803

static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,

4804

const SDLoc &dl, bool IsMask = false) {

4805

4806

SmallVector<SDValue, 32> Ops;

4807

bool Split = false;

4808

4809

MVT ConstVecVT = VT;

4810

unsigned NumElts = VT.getVectorNumElements();

4811

bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);

4812

if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {

4813

ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);

4814

Split = true;

4815

}

4816

4817

MVT EltVT = ConstVecVT.getVectorElementType();

4818

for (unsigned i = 0; i < NumElts; ++i) {

4819

bool IsUndef = Values[i] < 0 && IsMask;

4820

SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :

4821

DAG.getConstant(Values[i], dl, EltVT);

4822

Ops.push_back(OpNode);

4823

if (Split)

4824

Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :

4825

DAG.getConstant(0, dl, EltVT));

4826

}

4827

SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);

4828

if (Split)

4829

ConstsNode = DAG.getBitcast(VT, ConstsNode);

4830

return ConstsNode;

4831

}

4832

4833

// Build a constant vector of type VT from raw element bits.
// Bit i of Undefs marks element i as UNDEF. If i64 is not a legal type and VT
// has i64 elements, every element is emitted as two i32 halves (low, then
// high). f32/f64 elements are materialised as floating point constants. The
// result is always bitcast to VT.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");

  const unsigned NumElts = VT.getVectorNumElements();
  const bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  const bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;
  const MVT ConstVecVT = Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
  const MVT EltVT = ConstVecVT.getVectorElementType();

  SmallVector<SDValue, 32> Ops;
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }

    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

    if (Split) {
      // Low 32 bits first, then the high 32 bits.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
      continue;
    }

    if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
      continue;
    }

    if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
      continue;
    }

    Ops.push_back(DAG.getConstant(V, dl, EltVT));
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

4873

4874

/// Returns a vector of specified type with all zero elements.

4875

static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,

4876

SelectionDAG &DAG, const SDLoc &dl) {

4877

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector() || VT.getVectorElementType() == MVT
::i1) && "Unexpected vector type") ? void (0) : __assert_fail
("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4879, __extension__ __PRETTY_FUNCTION__))

4878

VT.getVectorElementType() == MVT::i1) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector() || VT.getVectorElementType() == MVT
::i1) && "Unexpected vector type") ? void (0) : __assert_fail
("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4879, __extension__ __PRETTY_FUNCTION__))

4879

"Unexpected vector type")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector() || VT.getVectorElementType() == MVT
::i1) && "Unexpected vector type") ? void (0) : __assert_fail
("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4879, __extension__ __PRETTY_FUNCTION__));

4880

4881

// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest

4882

// type. This ensures they get CSE'd. But if the integer type is not

4883

// available, use a floating-point +0.0 instead.

4884

SDValue Vec;

4885

if (!Subtarget.hasSSE2() && VT.is128BitVector()) {

4886

Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);

4887

} else if (VT.getVectorElementType() == MVT::i1) {

4888

assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&(static_cast <bool> ((Subtarget.hasBWI() || VT.getVectorNumElements
() <= 16) && "Unexpected vector type") ? void (0) :
__assert_fail ("(Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4889, __extension__ __PRETTY_FUNCTION__))

4889

"Unexpected vector type")(static_cast <bool> ((Subtarget.hasBWI() || VT.getVectorNumElements
() <= 16) && "Unexpected vector type") ? void (0) :
__assert_fail ("(Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4889, __extension__ __PRETTY_FUNCTION__));

4890

assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&(static_cast <bool> ((Subtarget.hasVLX() || VT.getVectorNumElements
() >= 8) && "Unexpected vector type") ? void (0) :
__assert_fail ("(Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4891, __extension__ __PRETTY_FUNCTION__))

4891

"Unexpected vector type")(static_cast <bool> ((Subtarget.hasVLX() || VT.getVectorNumElements
() >= 8) && "Unexpected vector type") ? void (0) :
__assert_fail ("(Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) && \"Unexpected vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4891, __extension__ __PRETTY_FUNCTION__));

4892

Vec = DAG.getConstant(0, dl, VT);

4893

} else {

4894

unsigned Num32BitElts = VT.getSizeInBits() / 32;

4895

Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));

4896

}

4897

return DAG.getBitcast(VT, Vec);

4898

}

4899

4900

/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
/// IdxVal is rounded down to the first element of that chunk. A BUILD_VECTOR
/// input is narrowed directly; anything else yields an EXTRACT_SUBVECTOR.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  const EVT VT = Vec.getValueType();
  const EVT ElVT = VT.getVectorElementType();
  const unsigned Factor = VT.getSizeInBits() / vectorWidth;
  const EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                        VT.getVectorNumElements() / Factor);

  // Number of elements in one vectorWidth-bit chunk.
  const unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // ElemsPerChunk is a power of 2, so clearing the low bits of the index
  // gives the first element of the chunk we want.
  const unsigned ChunkStart = IdxVal & ~(ElemsPerChunk - 1);

  if (Vec.getOpcode() != ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                       DAG.getIntPtrConstant(ChunkStart, dl));

  // If the input is a buildvector just emit a smaller one.
  return DAG.getBuildVector(ResultVT, dl,
                            Vec->ops().slice(ChunkStart, ElemsPerChunk));
}

4924

4925

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This

4926

/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128

4927

/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4

4928

/// instructions or a simple subregister reference. Idx is an index in the

4929

/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes

4930

/// lowering EXTRACT_VECTOR_ELT operations easier.

4931

static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,

4932

SelectionDAG &DAG, const SDLoc &dl) {

4933

assert((Vec.getValueType().is256BitVector() ||(static_cast <bool> ((Vec.getValueType().is256BitVector
() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"
) ? void (0) : __assert_fail ("(Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4934, __extension__ __PRETTY_FUNCTION__))

4934

Vec.getValueType().is512BitVector()) && "Unexpected vector size!")(static_cast <bool> ((Vec.getValueType().is256BitVector
() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"
) ? void (0) : __assert_fail ("(Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4934, __extension__ __PRETTY_FUNCTION__));

4935

return extractSubVector(Vec, IdxVal, DAG, dl, 128);

4936

}

4937

4938

/// Generate a DAG to grab 256-bits from a 512-bit vector.

4939

static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,

4940

SelectionDAG &DAG, const SDLoc &dl) {

4941

assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!")(static_cast <bool> (Vec.getValueType().is512BitVector(
) && "Unexpected vector size!") ? void (0) : __assert_fail
("Vec.getValueType().is512BitVector() && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4941, __extension__ __PRETTY_FUNCTION__));

4942

return extractSubVector(Vec, IdxVal, DAG, dl, 256);

4943

}

4944

4945

/// Insert the vectorWidth-bit vector Vec into Result at the chunk containing
/// element IdxVal (the index is rounded down to the start of that chunk).
/// Inserting an UNDEF Vec returns Result unchanged.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");

  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;

  // Number of elements in one vectorWidth-bit chunk.
  const EVT ElVT = Vec.getValueType().getVectorElementType();
  const unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // ElemsPerChunk is a power of 2, so clearing the low bits of the index
  // gives the first element of the chunk we want.
  const unsigned ChunkStart = IdxVal & ~(ElemsPerChunk - 1);

  const EVT ResultVT = Result.getValueType();
  SDValue VecIdx = DAG.getIntPtrConstant(ChunkStart, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}

4968

4969

/// Generate a DAG to put 128-bits into a vector > 128 bits. This

4970

/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or

4971

/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a

4972

/// simple superregister reference. Idx is an index in the 128 bits

4973

/// we want. It need not be aligned to a 128-bit boundary. That makes

4974

/// lowering INSERT_VECTOR_ELT operations easier.

4975

static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,

4976

SelectionDAG &DAG, const SDLoc &dl) {

4977

assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")(static_cast <bool> (Vec.getValueType().is128BitVector(
) && "Unexpected vector size!") ? void (0) : __assert_fail
("Vec.getValueType().is128BitVector() && \"Unexpected vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 4977, __extension__ __PRETTY_FUNCTION__));

4978

return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);

4979

}

4980

4981

/// Generate a DAG to put the 256-bit vector Vec into Result at the chunk
/// containing element IdxVal.
static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  const unsigned ChunkBits = 256;
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
}

4986

4987

// Return true if the instruction zeroes the unused upper part of the

4988

// destination and accepts mask.

4989

static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {

4990

switch (Opcode) {

4991

default:

4992

return false;

4993

case X86ISD::TESTM:

4994

case X86ISD::TESTNM:

4995

case X86ISD::PCMPEQM:

4996

case X86ISD::PCMPGTM:

4997

case X86ISD::CMPM:

4998

case X86ISD::CMPMU:

4999

case X86ISD::CMPM_RND:

5000

return true;

5001

}

5002

}

5003

5004

/// Insert i1-subvector to i1-vector.

5005

static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,

5006

const X86Subtarget &Subtarget) {

5007

5008

SDLoc dl(Op);

5009

SDValue Vec = Op.getOperand(0);

5010

SDValue SubVec = Op.getOperand(1);

5011

SDValue Idx = Op.getOperand(2);

5012

5013

if (!isa<ConstantSDNode>(Idx))

5014

return SDValue();

5015

5016

// Inserting undef is a nop. We can just return the original vector.

5017

if (SubVec.isUndef())

5018

return Vec;

5019

5020

unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

5021

if (IdxVal == 0 && Vec.isUndef()) // the operation is legal

5022

return Op;

5023

5024

MVT OpVT = Op.getSimpleValueType();

5025

unsigned NumElems = OpVT.getVectorNumElements();

5026

5027

SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

5028

5029

// Extend to natively supported kshift.

5030

MVT WideOpVT = OpVT;

5031

if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)

5032

WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

5033

5034

// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts

5035

// if necessary.

5036

if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {

5037

// May need to promote to a legal type.

5038

Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

5039

getZeroVector(WideOpVT, Subtarget, DAG, dl),

5040

SubVec, Idx);

5041

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

5042

}

5043

5044

MVT SubVecVT = SubVec.getSimpleValueType();

5045

unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

5046

5047

assert(IdxVal + SubVecNumElems <= NumElems &&(static_cast <bool> (IdxVal + SubVecNumElems <= NumElems
&& IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR") ? void (0) : __assert_fail
("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5049, __extension__ __PRETTY_FUNCTION__))

5048

IdxVal % SubVecVT.getSizeInBits() == 0 &&(static_cast <bool> (IdxVal + SubVecNumElems <= NumElems
&& IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR") ? void (0) : __assert_fail
("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5049, __extension__ __PRETTY_FUNCTION__))

5049

"Unexpected index value in INSERT_SUBVECTOR")(static_cast <bool> (IdxVal + SubVecNumElems <= NumElems
&& IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR") ? void (0) : __assert_fail
("IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && \"Unexpected index value in INSERT_SUBVECTOR\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5049, __extension__ __PRETTY_FUNCTION__));

5050

5051

SDValue Undef = DAG.getUNDEF(WideOpVT);

5052

5053

if (IdxVal == 0) {

5054

// Zero lower bits of the Vec

5055

SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);

5056

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,

5057

ZeroIdx);

5058

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);

5059

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);

5060

// Merge them together, SubVec should be zero extended.

5061

SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

5062

getZeroVector(WideOpVT, Subtarget, DAG, dl),

5063

SubVec, ZeroIdx);

5064

Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

5065

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op,

5066

ZeroIdx);

5067

}

5068

5069

SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

5070

Undef, SubVec, ZeroIdx);

5071

5072

if (Vec.isUndef()) {

5073

assert(IdxVal != 0 && "Unexpected index")(static_cast <bool> (IdxVal != 0 && "Unexpected index"
) ? void (0) : __assert_fail ("IdxVal != 0 && \"Unexpected index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5073, __extension__ __PRETTY_FUNCTION__));

5074

Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

5075

DAG.getConstant(IdxVal, dl, MVT::i8));

5076

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

5077

}

5078

5079

if (ISD::isBuildVectorAllZeros(Vec.getNode())) {

5080

5081

NumElems = WideOpVT.getVectorNumElements();

5082

unsigned ShiftLeft = NumElems - SubVecNumElems;

5083

unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

5084

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

5085

DAG.getConstant(ShiftLeft, dl, MVT::i8));

5086

Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,

5087

DAG.getConstant(ShiftRight, dl, MVT::i8));

5088

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

5089

}

5090

5091

// Simple case when we put subvector in the upper part

5092

if (IdxVal + SubVecNumElems == NumElems) {

5093

SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,

5094

DAG.getConstant(IdxVal, dl, MVT::i8));

5095

if (SubVecNumElems * 2 == NumElems) {

5096

// Special case, use legal zero extending insert_subvector. This allows

5097

// isel to opimitize when bits are known zero.

5098

Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);

5099

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

5100

getZeroVector(WideOpVT, Subtarget, DAG, dl),

5101

Vec, ZeroIdx);

5102

} else {

5103

// Otherwise use explicit shifts to zero the bits.

5104

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,

5105

Undef, Vec, ZeroIdx);

5106

NumElems = WideOpVT.getVectorNumElements();

5107

SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);

5108

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);

5109

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);

5110

}

5111

Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

5112

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

5113

}

5114

5115

// Inserting into the middle is more complicated.

5116

5117

NumElems = WideOpVT.getVectorNumElements();

5118

5119

// Widen the vector if needed.

5120

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

5121

// Move the current value of the bit to be replace to the lsbs.

5122

Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,

5123

DAG.getConstant(IdxVal, dl, MVT::i8));

5124

// Xor with the new bit.

5125

Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);

5126

// Shift to MSB, filling bottom bits with 0.

5127

unsigned ShiftLeft = NumElems - SubVecNumElems;

5128

Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,

5129

DAG.getConstant(ShiftLeft, dl, MVT::i8));

5130

// Shift to the final position, filling upper bits with 0.

5131

unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

5132

Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,

5133

DAG.getConstant(ShiftRight, dl, MVT::i8));

5134

// Xor with original vector leaving the new value.

5135

Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);

5136

// Reduce to original width if needed.

5137

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

5138

}

5139

5140

/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128

5141

/// instructions. This is used because creating CONCAT_VECTOR nodes of

5142

/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower

5143

/// large BUILD_VECTORS.

5144

static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,

5145

unsigned NumElems, SelectionDAG &DAG,

5146

const SDLoc &dl) {

5147

SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);

5148

return insert128BitVector(V, V2, NumElems / 2, DAG, dl);

5149

}

5150

5151

/// Concat two 256-bit vectors: V1 is inserted at element 0 and V2 at element
/// NumElems / 2 of an otherwise undef vector of type VT.
static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  const unsigned UpperIdx = NumElems / 2;
  SDValue Lower = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(Lower, V2, UpperIdx, DAG, dl);
}

5157

5158

/// Returns a vector of specified type with all bits set.

5159

/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.

5160

/// Then bitcast to their original type, ensuring they get CSE'd.

5161

static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {

5162

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5163, __extension__ __PRETTY_FUNCTION__))

5163

"Expected a 128/256/512-bit vector type")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected a 128/256/512-bit vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5163, __extension__ __PRETTY_FUNCTION__));

5164

5165

APInt Ones = APInt::getAllOnesValue(32);

5166

unsigned NumElts = VT.getSizeInBits() / 32;

5167

SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));

5168

return DAG.getBitcast(VT, Vec);

5169

}

5170

5171

/// Build a VSEXT/VZEXT style extension of In to VT.
/// 128-bit to 128-bit extensions use the generic *_EXTEND_VECTOR_INREG
/// builders; for wider types only the low part of In that feeds the result is
/// kept before emitting the Opc node.
static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector()) {
    if (X86ISD::VSEXT == Opc)
      return DAG.getSignExtendVectorInReg(In, DL, VT);
    return DAG.getZeroExtendVectorInReg(In, DL, VT);
  }

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    int NeededBits = std::max(128, (int)VT.getSizeInBits() / Scale);
    In = extractSubVector(In, 0, DAG, DL, NeededBits);
  }

  return DAG.getNode(Opc, DL, VT, In);
}

5190

5191

/// Returns a vector_shuffle node for an unpackl operation.

5192

static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,

5193

SDValue V1, SDValue V2) {

5194

SmallVector<int, 8> Mask;

5195

createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);

5196

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

5197

}

5198

5199

/// Returns a vector_shuffle node for an unpackh operation.

5200

static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,

5201

SDValue V1, SDValue V2) {

5202

SmallVector<int, 8> Mask;

5203

createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);

5204

return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);

5205

}

5206

5207

/// Return a vector_shuffle of the specified vector of zero or undef vector.

5208

/// This produces a shuffle where the low element of V2 is swizzled into the

5209

/// zero/undef vector, landing at element Idx.

5210

/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).

5211

static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,

5212

bool IsZero,

5213

const X86Subtarget &Subtarget,

5214

SelectionDAG &DAG) {

5215

MVT VT = V2.getSimpleValueType();

5216

SDValue V1 = IsZero

5217

? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);

5218

int NumElems = VT.getVectorNumElements();

5219

SmallVector<int, 16> MaskVec(NumElems);

5220

for (int i = 0; i != NumElems; ++i)

5221

// If this is the insertion idx, put the low elt of V2 here.

5222

MaskVec[i] = (i == Idx) ? NumElems : i;

5223

return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);

5224

}

5225

5226

/// Strip any chain of BITCAST nodes from V and return the first value that is
/// either null or not a bitcast.
static SDValue peekThroughBitcasts(SDValue V) {
  for (;;) {
    if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
      return V;
    V = V.getOperand(0);
  }
}

5231

5232

/// Strip BITCAST nodes from V for as long as the bitcast's source operand has
/// a single use.
static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST) {
    SDValue Src = V.getOperand(0);
    if (!Src.hasOneUse())
      break;
    V = Src;
  }
  return V;
}

5238

5239

/// If Op (looking through bitcasts) is a load whose base pointer is a
/// constant pool entry, optionally wrapped in X86ISD::Wrapper/WrapperRIP,
/// return that entry's Constant. Returns nullptr for anything else, including
/// machine constant pool entries.
static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  // Look through the address wrapper node, if there is one.
  SDValue Ptr = Load->getBasePtr();
  const unsigned PtrOpc = Ptr->getOpcode();
  if (PtrOpc == X86ISD::Wrapper || PtrOpc == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  if (auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr))
    if (!CNode->isMachineConstantPoolEntry())
      return dyn_cast<Constant>(CNode->getConstVal());

  return nullptr;
}

5257

5258

// Extract raw constant bits from constant pools.

5259

static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,

5260

APInt &UndefElts,

5261

SmallVectorImpl<APInt> &EltBits,

5262

bool AllowWholeUndefs = true,

5263

bool AllowPartialUndefs = true) {

5264

assert(EltBits.empty() && "Expected an empty EltBits vector")(static_cast <bool> (EltBits.empty() && "Expected an empty EltBits vector"
) ? void (0) : __assert_fail ("EltBits.empty() && \"Expected an empty EltBits vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5264, __extension__ __PRETTY_FUNCTION__));

5265

5266

Op = peekThroughBitcasts(Op);

5267

5268

EVT VT = Op.getValueType();

5269

unsigned SizeInBits = VT.getSizeInBits();

5270

assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!")(static_cast <bool> ((SizeInBits % EltSizeInBits) == 0 &&
"Can't split constant!") ? void (0) : __assert_fail ("(SizeInBits % EltSizeInBits) == 0 && \"Can't split constant!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5270, __extension__ __PRETTY_FUNCTION__));

5271

unsigned NumElts = SizeInBits / EltSizeInBits;

5272

5273

// Bitcast a source array of element bits to the target size.

5274

auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {

5275

unsigned NumSrcElts = UndefSrcElts.getBitWidth();

5276

unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();

5277

assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&(static_cast <bool> ((NumSrcElts * SrcEltSizeInBits) ==
SizeInBits && "Constant bit sizes don't match") ? void
(0) : __assert_fail ("(NumSrcElts * SrcEltSizeInBits) == SizeInBits && \"Constant bit sizes don't match\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5278, __extension__ __PRETTY_FUNCTION__))

5278

"Constant bit sizes don't match")(static_cast <bool> ((NumSrcElts * SrcEltSizeInBits) ==
SizeInBits && "Constant bit sizes don't match") ? void
(0) : __assert_fail ("(NumSrcElts * SrcEltSizeInBits) == SizeInBits && \"Constant bit sizes don't match\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5278, __extension__ __PRETTY_FUNCTION__));

5279

5280

// Don't split if we don't allow undef bits.

5281

bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;

5282

if (UndefSrcElts.getBoolValue() && !AllowUndefs)

5283

return false;

5284

5285

// If we're already the right size, don't bother bitcasting.

5286

if (NumSrcElts == NumElts) {

5287

UndefElts = UndefSrcElts;

5288

EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());

5289

return true;

5290

}

5291

5292

// Extract all the undef/constant element data and pack into single bitsets.

5293

APInt UndefBits(SizeInBits, 0);

5294

APInt MaskBits(SizeInBits, 0);

5295

5296

for (unsigned i = 0; i != NumSrcElts; ++i) {

5297

unsigned BitOffset = i * SrcEltSizeInBits;

5298

if (UndefSrcElts[i])

5299

UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);

5300

MaskBits.insertBits(SrcEltBits[i], BitOffset);

5301

}

5302

5303

// Split the undef/constant single bitset data into the target elements.

5304

UndefElts = APInt(NumElts, 0);

5305

EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

5306

5307

for (unsigned i = 0; i != NumElts; ++i) {

5308

unsigned BitOffset = i * EltSizeInBits;

5309

APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

5310

5311

// Only treat an element as UNDEF if all bits are UNDEF.

5312

if (UndefEltBits.isAllOnesValue()) {

5313

if (!AllowWholeUndefs)

5314

return false;

5315

UndefElts.setBit(i);

5316

continue;

5317

}

5318

5319

// If only some bits are UNDEF then treat them as zero (or bail if not

5320

// supported).

5321

if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)

5322

return false;

5323

5324

APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);

5325

EltBits[i] = Bits.getZExtValue();

5326

}

5327

return true;

5328

};

5329

5330

// Collect constant bits and insert into mask/undef bit masks.

5331

auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,

5332

unsigned UndefBitIndex) {

5333

if (!Cst)

5334

return false;

5335

if (isa<UndefValue>(Cst)) {

5336

Undefs.setBit(UndefBitIndex);

5337

return true;

5338

}

5339

if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {

5340

Mask = CInt->getValue();

5341

return true;

5342

}

5343

if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {

5344

Mask = CFP->getValueAPF().bitcastToAPInt();

5345

return true;

5346

}

5347

return false;

5348

};

5349

5350

// Handle UNDEFs.

5351

if (Op.isUndef()) {

5352

APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);

5353

SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));

5354

return CastBitData(UndefSrcElts, SrcEltBits);

5355

}

5356

5357

// Extract scalar constant bits.

5358

if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {

5359

APInt UndefSrcElts = APInt::getNullValue(1);

5360

SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());

5361

return CastBitData(UndefSrcElts, SrcEltBits);

5362

}

5363

5364

// Extract constant bits from build vector.

5365

if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {

5366

unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();

5367

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

5368

5369

APInt UndefSrcElts(NumSrcElts, 0);

5370

SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));

5371

for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {

5372

const SDValue &Src = Op.getOperand(i);

5373

if (Src.isUndef()) {

5374

UndefSrcElts.setBit(i);

5375

continue;

5376

}

5377

auto *Cst = cast<ConstantSDNode>(Src);

5378

SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);

5379

}

5380

return CastBitData(UndefSrcElts, SrcEltBits);

5381

}

5382

5383

// Extract constant bits from constant pool vector.

5384

if (auto *Cst = getTargetConstantFromNode(Op)) {

5385

Type *CstTy = Cst->getType();

5386

if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))

5387

return false;

5388

5389

unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();

5390

unsigned NumSrcElts = CstTy->getVectorNumElements();

5391

5392

APInt UndefSrcElts(NumSrcElts, 0);

5393

SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));

5394

for (unsigned i = 0; i != NumSrcElts; ++i)

5395

if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],

5396

UndefSrcElts, i))

5397

return false;

5398

5399

return CastBitData(UndefSrcElts, SrcEltBits);

5400

}

5401

5402

// Extract constant bits from a broadcasted constant pool scalar.

5403

if (Op.getOpcode() == X86ISD::VBROADCAST &&

5404

EltSizeInBits <= VT.getScalarSizeInBits()) {

5405

if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {

5406

unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();

5407

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

5408

5409

APInt UndefSrcElts(NumSrcElts, 0);

5410

SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));

5411

if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {

5412

if (UndefSrcElts[0])

5413

UndefSrcElts.setBits(0, NumSrcElts);

5414

SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);

5415

return CastBitData(UndefSrcElts, SrcEltBits);

5416

}

5417

}

5418

}

5419

5420

// Extract a rematerialized scalar constant insertion.

5421

if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&

5422

Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&

5423

isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {

5424

unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();

5425

unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

5426

5427

APInt UndefSrcElts(NumSrcElts, 0);

5428

SmallVector<APInt, 64> SrcEltBits;

5429

auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));

5430

SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));

5431

SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));

5432

return CastBitData(UndefSrcElts, SrcEltBits);

5433

}

5434

5435

return false;

5436

}

5437

5438

static bool getTargetShuffleMaskIndices(SDValue MaskNode,

5439

unsigned MaskEltSizeInBits,

5440

SmallVectorImpl<uint64_t> &RawMask) {

5441

APInt UndefElts;

5442

SmallVector<APInt, 64> EltBits;

5443

5444

// Extract the raw target constant bits.

5445

// FIXME: We currently don't support UNDEF bits or mask entries.

5446

if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,

5447

EltBits, /* AllowWholeUndefs */ false,

5448

/* AllowPartialUndefs */ false))

5449

return false;

5450

5451

// Insert the extracted elements into the mask.

5452

for (APInt Elt : EltBits)

5453

RawMask.push_back(Elt.getZExtValue());

5454

5455

return true;

5456

}

5457

5458

/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.

5459

/// Note: This ignores saturation, so inputs must be checked first.

5460

static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,

5461

bool Unary) {

5462

assert(Mask.empty() && "Expected an empty shuffle mask vector")(static_cast <bool> (Mask.empty() && "Expected an empty shuffle mask vector"
) ? void (0) : __assert_fail ("Mask.empty() && \"Expected an empty shuffle mask vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5462, __extension__ __PRETTY_FUNCTION__));

5463

unsigned NumElts = VT.getVectorNumElements();

5464

unsigned NumLanes = VT.getSizeInBits() / 128;

5465

unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();

5466

unsigned Offset = Unary ? 0 : NumElts;

5467

5468

for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {

5469

for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)

5470

Mask.push_back(Elt + (Lane * NumEltsPerLane));

5471

for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)

5472

Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);

5473

}

5474

}

5475

5476

/// Calculates the shuffle mask corresponding to the target-specific opcode.

5477

/// If the mask could be calculated, returns it in \p Mask, returns the shuffle

5478

/// operands in \p Ops, and returns true.

5479

/// Sets \p IsUnary to true if only one source is used. Note that this will set

5480

/// IsUnary for shuffles which use a single input multiple times, and in those

5481

/// cases it will adjust the mask to only have indices within that single input.

5482

/// It is an error to call this with non-empty Mask/Ops vectors.

5483

static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,

5484

SmallVectorImpl<SDValue> &Ops,

5485

SmallVectorImpl<int> &Mask, bool &IsUnary) {

5486

unsigned NumElems = VT.getVectorNumElements();

5487

SDValue ImmN;

5488

5489

assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector")(static_cast <bool> (Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"
) ? void (0) : __assert_fail ("Mask.empty() && \"getTargetShuffleMask expects an empty Mask vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5489, __extension__ __PRETTY_FUNCTION__));

5490

assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector")(static_cast <bool> (Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"
) ? void (0) : __assert_fail ("Ops.empty() && \"getTargetShuffleMask expects an empty Ops vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5490, __extension__ __PRETTY_FUNCTION__));

5491

5492

IsUnary = false;

5493

bool IsFakeUnary = false;

5494

switch(N->getOpcode()) {

5495

case X86ISD::BLENDI:

5496

assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType()
== VT && "Unexpected value type") ? void (0) : __assert_fail
("N->getOperand(0).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5496, __extension__ __PRETTY_FUNCTION__));

5497

assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType()
== VT && "Unexpected value type") ? void (0) : __assert_fail
("N->getOperand(1).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5497, __extension__ __PRETTY_FUNCTION__));

5498

ImmN = N->getOperand(N->getNumOperands()-1);

5499

DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5500

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5501

break;

5502

case X86ISD::SHUFP:

5503

5504

5505

ImmN = N->getOperand(N->getNumOperands()-1);

5506

DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5507

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5508

break;

5509

case X86ISD::INSERTPS:

5510

5511

5512

ImmN = N->getOperand(N->getNumOperands()-1);

5513

DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5514

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5515

break;

5516

case X86ISD::EXTRQI:

5517

5518

if (isa<ConstantSDNode>(N->getOperand(1)) &&

5519

isa<ConstantSDNode>(N->getOperand(2))) {

5520

int BitLen = N->getConstantOperandVal(1);

5521

int BitIdx = N->getConstantOperandVal(2);

5522

DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);

5523

IsUnary = true;

5524

}

5525

break;

5526

case X86ISD::INSERTQI:

5527

5528

5529

if (isa<ConstantSDNode>(N->getOperand(2)) &&

5530

isa<ConstantSDNode>(N->getOperand(3))) {

5531

int BitLen = N->getConstantOperandVal(2);

5532

int BitIdx = N->getConstantOperandVal(3);

5533

DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);

5534

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5535

}

5536

break;

5537

case X86ISD::UNPCKH:

5538

5539

5540

DecodeUNPCKHMask(VT, Mask);

5541

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5542

break;

5543

case X86ISD::UNPCKL:

5544

5545

5546

DecodeUNPCKLMask(VT, Mask);

5547

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5548

break;

5549

case X86ISD::MOVHLPS:

5550

5551

5552

DecodeMOVHLPSMask(NumElems, Mask);

5553

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5554

break;

5555

case X86ISD::MOVLHPS:

5556

5557

5558

DecodeMOVLHPSMask(NumElems, Mask);

5559

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5560

break;

5561

case X86ISD::PALIGNR:

5562

assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")(static_cast <bool> (VT.getScalarType() == MVT::i8 &&
"Byte vector expected") ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5562, __extension__ __PRETTY_FUNCTION__));

5563

5564

5565

ImmN = N->getOperand(N->getNumOperands()-1);

5566

DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5567

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5568

Ops.push_back(N->getOperand(1));

5569

Ops.push_back(N->getOperand(0));

5570

break;

5571

case X86ISD::VSHLDQ:

5572

5573

5574

ImmN = N->getOperand(N->getNumOperands() - 1);

5575

DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5576

IsUnary = true;

5577

break;

5578

case X86ISD::VSRLDQ:

5579

5580

5581

ImmN = N->getOperand(N->getNumOperands() - 1);

5582

DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5583

IsUnary = true;

5584

break;

5585

case X86ISD::PSHUFD:

5586

case X86ISD::VPERMILPI:

5587

5588

ImmN = N->getOperand(N->getNumOperands()-1);

5589

DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5590

IsUnary = true;

5591

break;

5592

case X86ISD::PSHUFHW:

5593

5594

ImmN = N->getOperand(N->getNumOperands()-1);

5595

DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5596

IsUnary = true;

5597

break;

5598

case X86ISD::PSHUFLW:

5599

5600

ImmN = N->getOperand(N->getNumOperands()-1);

5601

DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5602

IsUnary = true;

5603

break;

5604

case X86ISD::VZEXT_MOVL:

5605

5606

DecodeZeroMoveLowMask(VT, Mask);

5607

IsUnary = true;

5608

break;

5609

case X86ISD::VBROADCAST: {

5610

SDValue N0 = N->getOperand(0);

5611

// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,

5612

// add the pre-extracted value to the Ops vector.

5613

if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&

5614

N0.getOperand(0).getValueType() == VT &&

5615

N0.getConstantOperandVal(1) == 0)

5616

Ops.push_back(N0.getOperand(0));

5617

5618

// We only decode broadcasts of same-sized vectors, unless the broadcast

5619

// came from an extract from the original width. If we found one, we

5620

// pushed it the Ops vector above.

5621

if (N0.getValueType() == VT || !Ops.empty()) {

5622

DecodeVectorBroadcast(VT, Mask);

5623

IsUnary = true;

5624

break;

5625

}

5626

return false;

5627

}

5628

case X86ISD::VPERMILPV: {

5629

5630

IsUnary = true;

5631

SDValue MaskNode = N->getOperand(1);

5632

unsigned MaskEltSize = VT.getScalarSizeInBits();

5633

SmallVector<uint64_t, 32> RawMask;

5634

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {

5635

DecodeVPERMILPMask(VT, RawMask, Mask);

5636

break;

5637

}

5638

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5639

DecodeVPERMILPMask(C, MaskEltSize, Mask);

5640

break;

5641

}

5642

return false;

5643

}

5644

case X86ISD::PSHUFB: {

5645

5646

5647

5648

IsUnary = true;

5649

SDValue MaskNode = N->getOperand(1);

5650

SmallVector<uint64_t, 32> RawMask;

5651

if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {

5652

DecodePSHUFBMask(RawMask, Mask);

5653

break;

5654

}

5655

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5656

DecodePSHUFBMask(C, Mask);

5657

break;

5658

}

5659

return false;

5660

}

5661

case X86ISD::VPERMI:

5662

5663

ImmN = N->getOperand(N->getNumOperands()-1);

5664

DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5665

IsUnary = true;

5666

break;

5667

case X86ISD::MOVSS:

5668

case X86ISD::MOVSD:

5669

5670

5671

DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);

5672

break;

5673

case X86ISD::VPERM2X128:

5674

5675

5676

ImmN = N->getOperand(N->getNumOperands()-1);

5677

DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);

5678

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5679

break;

5680

case X86ISD::MOVSLDUP:

5681

5682

DecodeMOVSLDUPMask(VT, Mask);

5683

IsUnary = true;

5684

break;

5685

case X86ISD::MOVSHDUP:

5686

5687

DecodeMOVSHDUPMask(VT, Mask);

5688

IsUnary = true;

5689

break;

5690

case X86ISD::MOVDDUP:

5691

5692

DecodeMOVDDUPMask(VT, Mask);

5693

IsUnary = true;

5694

break;

5695

case X86ISD::MOVLPD:

5696

case X86ISD::MOVLPS:

5697

// Not yet implemented

5698

return false;

5699

case X86ISD::VPERMIL2: {

5700

5701

5702

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5703

unsigned MaskEltSize = VT.getScalarSizeInBits();

5704

SDValue MaskNode = N->getOperand(2);

5705

SDValue CtrlNode = N->getOperand(3);

5706

if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {

5707

unsigned CtrlImm = CtrlOp->getZExtValue();

5708

SmallVector<uint64_t, 32> RawMask;

5709

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {

5710

DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);

5711

break;

5712

}

5713

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5714

DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);

5715

break;

5716

}

5717

}

5718

return false;

5719

}

5720

case X86ISD::VPPERM: {

5721

5722

5723

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);

5724

SDValue MaskNode = N->getOperand(2);

5725

SmallVector<uint64_t, 32> RawMask;

5726

if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {

5727

DecodeVPPERMMask(RawMask, Mask);

5728

break;

5729

}

5730

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5731

DecodeVPPERMMask(C, Mask);

5732

break;

5733

}

5734

return false;

5735

}

5736

case X86ISD::VPERMV: {

5737

5738

IsUnary = true;

5739

// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.

5740

Ops.push_back(N->getOperand(1));

5741

SDValue MaskNode = N->getOperand(0);

5742

SmallVector<uint64_t, 32> RawMask;

5743

unsigned MaskEltSize = VT.getScalarSizeInBits();

5744

if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {

5745

DecodeVPERMVMask(RawMask, Mask);

5746

break;

5747

}

5748

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5749

DecodeVPERMVMask(C, MaskEltSize, Mask);

5750

break;

5751

}

5752

return false;

5753

}

5754

case X86ISD::VPERMV3: {

5755

5756

assert(N->getOperand(2).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(2).getValueType()
== VT && "Unexpected value type") ? void (0) : __assert_fail
("N->getOperand(2).getValueType() == VT && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5756, __extension__ __PRETTY_FUNCTION__));

5757

IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);

5758

// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.

5759

Ops.push_back(N->getOperand(0));

5760

Ops.push_back(N->getOperand(2));

5761

SDValue MaskNode = N->getOperand(1);

5762

unsigned MaskEltSize = VT.getScalarSizeInBits();

5763

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5764

DecodeVPERMV3Mask(C, MaskEltSize, Mask);

5765

break;

5766

}

5767

return false;

5768

}

5769

case X86ISD::VPERMIV3: {

5770

5771

5772

IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);

5773

// Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.

5774

Ops.push_back(N->getOperand(1));

5775

Ops.push_back(N->getOperand(2));

5776

SDValue MaskNode = N->getOperand(0);

5777

unsigned MaskEltSize = VT.getScalarSizeInBits();

5778

if (auto *C = getTargetConstantFromNode(MaskNode)) {

5779

DecodeVPERMV3Mask(C, MaskEltSize, Mask);

5780

break;

5781

}

5782

return false;

5783

}

5784

default: llvm_unreachable("unknown target shuffle node")::llvm::llvm_unreachable_internal("unknown target shuffle node"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5784);

5785

}

5786

5787

// Empty mask indicates the decode failed.

5788

if (Mask.empty())

5789

return false;

5790

5791

// Check if we're getting a shuffle mask with zero'd elements.

5792

if (!AllowSentinelZero)

5793

if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))

5794

return false;

5795

5796

// If we have a fake unary shuffle, the shuffle mask is spread across two

5797

// inputs that are actually the same node. Re-map the mask to always point

5798

// into the first input.

5799

if (IsFakeUnary)

5800

for (int &M : Mask)

5801

if (M >= (int)Mask.size())

5802

M -= Mask.size();

5803

5804

// If we didn't already add operands in the opcode-specific code, default to

5805

// adding 1 or 2 operands starting at 0.

5806

if (Ops.empty()) {

5807

Ops.push_back(N->getOperand(0));

5808

if (!IsUnary || IsFakeUnary)

5809

Ops.push_back(N->getOperand(1));

5810

}

5811

5812

return true;

5813

}

5814

5815

/// Check a target shuffle mask's inputs to see if we can set any values to

5816

/// SM_SentinelZero - this is for elements that are known to be zero

5817

/// (not just zeroable) from their inputs.

5818

/// Returns true if the target shuffle mask was decoded.

5819

static bool setTargetShuffleZeroElements(SDValue N,

5820

SmallVectorImpl<int> &Mask,

5821

SmallVectorImpl<SDValue> &Ops) {

5822

bool IsUnary;

5823

if (!isTargetShuffle(N.getOpcode()))

5824

return false;

5825

5826

MVT VT = N.getSimpleValueType();

5827

if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))

5828

return false;

5829

5830

SDValue V1 = Ops[0];

5831

SDValue V2 = IsUnary ? V1 : Ops[1];

5832

5833

V1 = peekThroughBitcasts(V1);

5834

V2 = peekThroughBitcasts(V2);

5835

5836

assert((VT.getSizeInBits() % Mask.size()) == 0 &&(static_cast <bool> ((VT.getSizeInBits() % Mask.size())
== 0 && "Illegal split of shuffle value type") ? void
(0) : __assert_fail ("(VT.getSizeInBits() % Mask.size()) == 0 && \"Illegal split of shuffle value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5837, __extension__ __PRETTY_FUNCTION__))

5837

"Illegal split of shuffle value type")(static_cast <bool> ((VT.getSizeInBits() % Mask.size())
== 0 && "Illegal split of shuffle value type") ? void
(0) : __assert_fail ("(VT.getSizeInBits() % Mask.size()) == 0 && \"Illegal split of shuffle value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5837, __extension__ __PRETTY_FUNCTION__));

5838

unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

5839

5840

// Extract known constant input data.

5841

APInt UndefSrcElts[2];

5842

SmallVector<APInt, 32> SrcEltBits[2];

5843

bool IsSrcConstant[2] = {

5844

getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],

5845

SrcEltBits[0], true, false),

5846

getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],

5847

SrcEltBits[1], true, false)};

5848

5849

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

5850

int M = Mask[i];

5851

5852

// Already decoded as SM_SentinelZero / SM_SentinelUndef.

5853

if (M < 0)

5854

continue;

5855

5856

// Determine shuffle input and normalize the mask.

5857

unsigned SrcIdx = M / Size;

5858

SDValue V = M < Size ? V1 : V2;

5859

M %= Size;

5860

5861

// We are referencing an UNDEF input.

5862

if (V.isUndef()) {

5863

Mask[i] = SM_SentinelUndef;

5864

continue;

5865

}

5866

5867

// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.

5868

// TODO: We currently only set UNDEF for integer types - floats use the same

5869

// registers as vectors and many of the scalar folded loads rely on the

5870

// SCALAR_TO_VECTOR pattern.

5871

if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&

5872

(Size % V.getValueType().getVectorNumElements()) == 0) {

5873

int Scale = Size / V.getValueType().getVectorNumElements();

5874

int Idx = M / Scale;

5875

if (Idx != 0 && !VT.isFloatingPoint())

5876

Mask[i] = SM_SentinelUndef;

5877

else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))

5878

Mask[i] = SM_SentinelZero;

5879

continue;

5880

}

5881

5882

// Attempt to extract from the source's constant bits.

5883

if (IsSrcConstant[SrcIdx]) {

5884

if (UndefSrcElts[SrcIdx][M])

5885

Mask[i] = SM_SentinelUndef;

5886

else if (SrcEltBits[SrcIdx][M] == 0)

5887

Mask[i] = SM_SentinelZero;

5888

}

5889

}

5890

5891

assert(VT.getVectorNumElements() == Mask.size() &&(static_cast <bool> (VT.getVectorNumElements() == Mask.
size() && "Different mask size from vector size!") ? void
(0) : __assert_fail ("VT.getVectorNumElements() == Mask.size() && \"Different mask size from vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5892, __extension__ __PRETTY_FUNCTION__))

5892

"Different mask size from vector size!")(static_cast <bool> (VT.getVectorNumElements() == Mask.
size() && "Different mask size from vector size!") ? void
(0) : __assert_fail ("VT.getVectorNumElements() == Mask.size() && \"Different mask size from vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5892, __extension__ __PRETTY_FUNCTION__));

5893

return true;

5894

}

5895

5896

// Attempt to decode ops that could be represented as a shuffle mask.

5897

// The decoded shuffle mask may contain a different number of elements to the

5898

// destination value type.

5899

static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,

5900

SmallVectorImpl<SDValue> &Ops,

5901

SelectionDAG &DAG) {

5902

Mask.clear();

5903

Ops.clear();

5904

5905

MVT VT = N.getSimpleValueType();

5906

unsigned NumElts = VT.getVectorNumElements();

5907

unsigned NumSizeInBits = VT.getSizeInBits();

5908

unsigned NumBitsPerElt = VT.getScalarSizeInBits();

5909

assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&(static_cast <bool> ((NumBitsPerElt % 8) == 0 &&
(NumSizeInBits % 8) == 0 && "Expected byte aligned value types"
) ? void (0) : __assert_fail ("(NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && \"Expected byte aligned value types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5910, __extension__ __PRETTY_FUNCTION__))

5910

"Expected byte aligned value types")(static_cast <bool> ((NumBitsPerElt % 8) == 0 &&
(NumSizeInBits % 8) == 0 && "Expected byte aligned value types"
) ? void (0) : __assert_fail ("(NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && \"Expected byte aligned value types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5910, __extension__ __PRETTY_FUNCTION__));

5911

5912

unsigned Opcode = N.getOpcode();

5913

switch (Opcode) {

5914

case ISD::AND:

5915

case X86ISD::ANDNP: {

5916

// Attempt to decode as a per-byte mask.

5917

APInt UndefElts;

5918

SmallVector<APInt, 32> EltBits;

5919

SDValue N0 = N.getOperand(0);

5920

SDValue N1 = N.getOperand(1);

5921

bool IsAndN = (X86ISD::ANDNP == Opcode);

5922

uint64_t ZeroMask = IsAndN ? 255 : 0;

5923

if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))

5924

return false;

5925

for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {

5926

if (UndefElts[i]) {

5927

Mask.push_back(SM_SentinelUndef);

5928

continue;

5929

}

5930

uint64_t ByteBits = EltBits[i].getZExtValue();

5931

if (ByteBits != 0 && ByteBits != 255)

5932

return false;

5933

Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);

5934

}

5935

Ops.push_back(IsAndN ? N1 : N0);

5936

return true;

5937

}

5938

case ISD::SCALAR_TO_VECTOR: {

5939

// Match against a scalar_to_vector of an extract from a vector,

5940

// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.

5941

SDValue N0 = N.getOperand(0);

5942

SDValue SrcExtract;

5943

5944

if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

5945

N0.getOperand(0).getValueType() == VT) ||

5946

(N0.getOpcode() == X86ISD::PEXTRW &&

5947

N0.getOperand(0).getValueType() == MVT::v8i16) ||

5948

(N0.getOpcode() == X86ISD::PEXTRB &&

5949

N0.getOperand(0).getValueType() == MVT::v16i8)) {

5950

SrcExtract = N0;

5951

}

5952

5953

if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))

5954

return false;

5955

5956

SDValue SrcVec = SrcExtract.getOperand(0);

5957

EVT SrcVT = SrcVec.getValueType();

5958

unsigned NumSrcElts = SrcVT.getVectorNumElements();

5959

unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

5960

5961

unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);

5962

if (NumSrcElts <= SrcIdx)

5963

return false;

5964

5965

Ops.push_back(SrcVec);

5966

Mask.push_back(SrcIdx);

5967

Mask.append(NumZeros, SM_SentinelZero);

5968

Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);

5969

return true;

5970

}

5971

case X86ISD::PINSRB:

5972

case X86ISD::PINSRW: {

5973

SDValue InVec = N.getOperand(0);

5974

SDValue InScl = N.getOperand(1);

5975

uint64_t InIdx = N.getConstantOperandVal(2);

5976

assert(InIdx < NumElts && "Illegal insertion index")(static_cast <bool> (InIdx < NumElts && "Illegal insertion index"
) ? void (0) : __assert_fail ("InIdx < NumElts && \"Illegal insertion index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5976, __extension__ __PRETTY_FUNCTION__));

5977

5978

// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.

5979

if (X86::isZeroNode(InScl)) {

5980

Ops.push_back(InVec);

5981

for (unsigned i = 0; i != NumElts; ++i)

5982

Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);

5983

return true;

5984

}

5985

5986

// Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.

5987

// TODO: Expand this to support INSERT_VECTOR_ELT/etc.

5988

unsigned ExOp =

5989

(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);

5990

if (InScl.getOpcode() != ExOp)

5991

return false;

5992

5993

SDValue ExVec = InScl.getOperand(0);

5994

uint64_t ExIdx = InScl.getConstantOperandVal(1);

5995

assert(ExIdx < NumElts && "Illegal extraction index")(static_cast <bool> (ExIdx < NumElts && "Illegal extraction index"
) ? void (0) : __assert_fail ("ExIdx < NumElts && \"Illegal extraction index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 5995, __extension__ __PRETTY_FUNCTION__));

5996

Ops.push_back(InVec);

5997

Ops.push_back(ExVec);

5998

for (unsigned i = 0; i != NumElts; ++i)

5999

Mask.push_back(i == InIdx ? NumElts + ExIdx : i);

6000

return true;

6001

}

6002

case X86ISD::PACKSS:

6003

case X86ISD::PACKUS: {

6004

SDValue N0 = N.getOperand(0);

6005

SDValue N1 = N.getOperand(1);

6006

assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&(static_cast <bool> (N0.getValueType().getVectorNumElements
() == (NumElts / 2) && N1.getValueType().getVectorNumElements
() == (NumElts / 2) && "Unexpected input value type")
? void (0) : __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6008, __extension__ __PRETTY_FUNCTION__))

6007

N1.getValueType().getVectorNumElements() == (NumElts / 2) &&(static_cast <bool> (N0.getValueType().getVectorNumElements
() == (NumElts / 2) && N1.getValueType().getVectorNumElements
() == (NumElts / 2) && "Unexpected input value type")
? void (0) : __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6008, __extension__ __PRETTY_FUNCTION__))

6008

"Unexpected input value type")(static_cast <bool> (N0.getValueType().getVectorNumElements
() == (NumElts / 2) && N1.getValueType().getVectorNumElements
() == (NumElts / 2) && "Unexpected input value type")
? void (0) : __assert_fail ("N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && \"Unexpected input value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6008, __extension__ __PRETTY_FUNCTION__));

6009

6010

// If we know input saturation won't happen we can treat this

6011

// as a truncation shuffle.

6012

if (Opcode == X86ISD::PACKSS) {

6013

if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||

6014

(!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))

6015

return false;

6016

} else {

6017

APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);

6018

if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||

6019

(!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))

6020

return false;

6021

}

6022

6023

bool IsUnary = (N0 == N1);

6024

6025

Ops.push_back(N0);

6026

if (!IsUnary)

6027

Ops.push_back(N1);

6028

6029

createPackShuffleMask(VT, Mask, IsUnary);

6030

return true;

6031

}

6032

case X86ISD::VSHLI:

6033

case X86ISD::VSRLI: {

6034

uint64_t ShiftVal = N.getConstantOperandVal(1);

6035

// Out of range bit shifts are guaranteed to be zero.

6036

if (NumBitsPerElt <= ShiftVal) {

6037

Mask.append(NumElts, SM_SentinelZero);

6038

return true;

6039

}

6040

6041

// We can only decode 'whole byte' bit shifts as shuffles.

6042

if ((ShiftVal % 8) != 0)

6043

break;

6044

6045

uint64_t ByteShift = ShiftVal / 8;

6046

unsigned NumBytes = NumSizeInBits / 8;

6047

unsigned NumBytesPerElt = NumBitsPerElt / 8;

6048

Ops.push_back(N.getOperand(0));

6049

6050

// Clear mask to all zeros and insert the shifted byte indices.

6051

Mask.append(NumBytes, SM_SentinelZero);

6052

6053

if (X86ISD::VSHLI == Opcode) {

6054

for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)

6055

for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)

6056

Mask[i + j] = i + j - ByteShift;

6057

} else {

6058

for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)

6059

for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)

6060

Mask[i + j - ByteShift] = i + j;

6061

}

6062

return true;

6063

}

6064

case ISD::ZERO_EXTEND_VECTOR_INREG:

6065

case X86ISD::VZEXT: {

6066

// TODO - add support for VPMOVZX with smaller input vector types.

6067

SDValue Src = N.getOperand(0);

6068

MVT SrcVT = Src.getSimpleValueType();

6069

if (NumSizeInBits != SrcVT.getSizeInBits())

6070

break;

6071

DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);

6072

Ops.push_back(Src);

6073

return true;

6074

}

6075

}

6076

6077

return false;

6078

}

6079

6080

/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.

6081

static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,

6082

SmallVectorImpl<int> &Mask) {

6083

int MaskWidth = Mask.size();

6084

SmallVector<SDValue, 16> UsedInputs;

6085

for (int i = 0, e = Inputs.size(); i < e; ++i) {

6086

int lo = UsedInputs.size() * MaskWidth;

6087

int hi = lo + MaskWidth;

6088

6089

// Strip UNDEF input usage.

6090

if (Inputs[i].isUndef())

6091

for (int &M : Mask)

6092

if ((lo <= M) && (M < hi))

6093

M = SM_SentinelUndef;

6094

6095

// Check for unused inputs.

6096

if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {

6097

UsedInputs.push_back(Inputs[i]);

6098

continue;

6099

}

6100

for (int &M : Mask)

6101

if (lo <= M)

6102

M -= MaskWidth;

6103

}

6104

Inputs = UsedInputs;

6105

}

6106

6107

/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs

6108

/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the

6109

/// remaining input indices in case we now have a unary shuffle and adjust the

6110

/// inputs accordingly.

6111

/// Returns true if the target shuffle mask was decoded.

6112

static bool resolveTargetShuffleInputs(SDValue Op,

6113

SmallVectorImpl<SDValue> &Inputs,

6114

SmallVectorImpl<int> &Mask,

6115

SelectionDAG &DAG) {

6116

if (!setTargetShuffleZeroElements(Op, Mask, Inputs))

6117

if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))

6118

return false;

6119

6120

resolveTargetShuffleInputsAndMask(Inputs, Mask);

6121

return true;

6122

}

6123

6124

/// Returns the scalar element that will make up the ith

6125

/// element of the result of the vector shuffle.

6126

static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,

6127

unsigned Depth) {

6128

if (Depth == 6)

6129

return SDValue(); // Limit search depth.

6130

6131

SDValue V = SDValue(N, 0);

6132

EVT VT = V.getValueType();

6133

unsigned Opcode = V.getOpcode();

6134

6135

// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.

6136

if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {

6137

int Elt = SV->getMaskElt(Index);

6138

6139

if (Elt < 0)

6140

return DAG.getUNDEF(VT.getVectorElementType());

6141

6142

unsigned NumElems = VT.getVectorNumElements();

6143

SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)

6144

: SV->getOperand(1);

6145

return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);

6146

}

6147

6148

// Recurse into target specific vector shuffles to find scalars.

6149

if (isTargetShuffle(Opcode)) {

6150

MVT ShufVT = V.getSimpleValueType();

6151

MVT ShufSVT = ShufVT.getVectorElementType();

6152

int NumElems = (int)ShufVT.getVectorNumElements();

6153

SmallVector<int, 16> ShuffleMask;

6154

SmallVector<SDValue, 16> ShuffleOps;

6155

bool IsUnary;

6156

6157

if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))

6158

return SDValue();

6159

6160

int Elt = ShuffleMask[Index];

6161

if (Elt == SM_SentinelZero)

6162

return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)

6163

: DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);

6164

if (Elt == SM_SentinelUndef)

6165

return DAG.getUNDEF(ShufSVT);

6166

6167

assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range")(static_cast <bool> (0 <= Elt && Elt < (2
*NumElems) && "Shuffle index out of range") ? void (0
) : __assert_fail ("0 <= Elt && Elt < (2*NumElems) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6167, __extension__ __PRETTY_FUNCTION__));

6168

SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];

6169

return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,

6170

Depth+1);

6171

}

6172

6173

// Actual nodes that may contain scalar elements

6174

if (Opcode == ISD::BITCAST) {

6175

V = V.getOperand(0);

6176

EVT SrcVT = V.getValueType();

6177

unsigned NumElems = VT.getVectorNumElements();

6178

6179

if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)

6180

return SDValue();

6181

}

6182

6183

if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)

6184

return (Index == 0) ? V.getOperand(0)

6185

: DAG.getUNDEF(VT.getVectorElementType());

6186

6187

if (V.getOpcode() == ISD::BUILD_VECTOR)

6188

return V.getOperand(Index);

6189

6190

return SDValue();

6191

}

6192

6193

// Use PINSRB/PINSRW/PINSRD to create a build vector.

6194

static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,

6195

unsigned NumNonZero, unsigned NumZero,

6196

SelectionDAG &DAG,

6197

const X86Subtarget &Subtarget) {

6198

MVT VT = Op.getSimpleValueType();

6199

unsigned NumElts = VT.getVectorNumElements();

6200

assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||(static_cast <bool> (((VT == MVT::v8i16 && Subtarget
.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) &&
Subtarget.hasSSE41())) && "Illegal vector insertion"
) ? void (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6202, __extension__ __PRETTY_FUNCTION__))

6201

((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&(static_cast <bool> (((VT == MVT::v8i16 && Subtarget
.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) &&
Subtarget.hasSSE41())) && "Illegal vector insertion"
) ? void (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6202, __extension__ __PRETTY_FUNCTION__))

6202

"Illegal vector insertion")(static_cast <bool> (((VT == MVT::v8i16 && Subtarget
.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) &&
Subtarget.hasSSE41())) && "Illegal vector insertion"
) ? void (0) : __assert_fail ("((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && \"Illegal vector insertion\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6202, __extension__ __PRETTY_FUNCTION__));

6203

6204

SDLoc dl(Op);

6205

SDValue V;

6206

bool First = true;

6207

6208

for (unsigned i = 0; i < NumElts; ++i) {

6209

bool IsNonZero = (NonZeros & (1 << i)) != 0;

6210

if (!IsNonZero)

6211

continue;

6212

6213

// If the build vector contains zeros or our first insertion is not the

6214

// first index then insert into zero vector to break any register

6215

// dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.

6216

if (First) {

6217

First = false;

6218

if (NumZero || 0 != i)

6219

V = getZeroVector(VT, Subtarget, DAG, dl);

6220

else {

6221

assert(0 == i && "Expected insertion into zero-index")(static_cast <bool> (0 == i && "Expected insertion into zero-index"
) ? void (0) : __assert_fail ("0 == i && \"Expected insertion into zero-index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6221, __extension__ __PRETTY_FUNCTION__));

6222

V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);

6223

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);

6224

V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);

6225

V = DAG.getBitcast(VT, V);

6226

continue;

6227

}

6228

}

6229

V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),

6230

DAG.getIntPtrConstant(i, dl));

6231

}

6232

6233

return V;

6234

}

6235

6236

/// Custom lower build_vector of v16i8.

6237

static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,

6238

unsigned NumNonZero, unsigned NumZero,

6239

SelectionDAG &DAG,

6240

const X86Subtarget &Subtarget) {

6241

if (NumNonZero > 8 && !Subtarget.hasSSE41())

6242

return SDValue();

6243

6244

// SSE4.1 - use PINSRB to insert each byte directly.

6245

if (Subtarget.hasSSE41())

6246

return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,

6247

Subtarget);

6248

6249

SDLoc dl(Op);

6250

SDValue V;

6251

bool First = true;

6252

6253

// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.

6254

for (unsigned i = 0; i < 16; ++i) {

6255

bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;

6256

if (ThisIsNonZero && First) {

6257

if (NumZero)

6258

V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);

6259

else

6260

V = DAG.getUNDEF(MVT::v8i16);

6261

First = false;

6262

}

6263

6264

if ((i & 1) != 0) {

6265

// FIXME: Investigate extending to i32 instead of just i16.

6266

// FIXME: Investigate combining the first 4 bytes as a i32 instead.

6267

SDValue ThisElt, LastElt;

6268

bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;

6269

if (LastIsNonZero) {

6270

LastElt =

6271

DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));

6272

}

6273

if (ThisIsNonZero) {

6274

ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));

6275

ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,

6276

DAG.getConstant(8, dl, MVT::i8));

6277

if (LastIsNonZero)

6278

ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);

6279

} else

6280

ThisElt = LastElt;

6281

6282

if (ThisElt) {

6283

if (1 == i) {

6284

V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)

6285

: DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);

6286

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);

6287

V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);

6288

V = DAG.getBitcast(MVT::v8i16, V);

6289

} else {

6290

V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,

6291

DAG.getIntPtrConstant(i / 2, dl));

6292

}

6293

}

6294

}

6295

}

6296

6297

return DAG.getBitcast(MVT::v16i8, V);

6298

}

6299

6300

/// Custom lower build_vector of v8i16.

6301

static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,

6302

unsigned NumNonZero, unsigned NumZero,

6303

SelectionDAG &DAG,

6304

const X86Subtarget &Subtarget) {

6305

if (NumNonZero > 4 && !Subtarget.hasSSE41())

6306

return SDValue();

6307

6308

// Use PINSRW to insert each byte directly.

6309

return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,

6310

Subtarget);

6311

}

6312

6313

/// Custom lower build_vector of v4i32 or v4f32.

6314

static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,

6315

const X86Subtarget &Subtarget) {

6316

// Find all zeroable elements.

6317

std::bitset<4> Zeroable;

6318

for (int i=0; i < 4; ++i) {

6319

SDValue Elt = Op->getOperand(i);

6320

Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));

6321

}

6322

assert(Zeroable.size() - Zeroable.count() > 1 &&(static_cast <bool> (Zeroable.size() - Zeroable.count()
> 1 && "We expect at least two non-zero elements!"
) ? void (0) : __assert_fail ("Zeroable.size() - Zeroable.count() > 1 && \"We expect at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6323, __extension__ __PRETTY_FUNCTION__))

6323

"We expect at least two non-zero elements!")(static_cast <bool> (Zeroable.size() - Zeroable.count()
> 1 && "We expect at least two non-zero elements!"
) ? void (0) : __assert_fail ("Zeroable.size() - Zeroable.count() > 1 && \"We expect at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6323, __extension__ __PRETTY_FUNCTION__));

6324

6325

// We only know how to deal with build_vector nodes where elements are either

6326

// zeroable or extract_vector_elt with constant index.

6327

SDValue FirstNonZero;

6328

unsigned FirstNonZeroIdx;

6329

for (unsigned i=0; i < 4; ++i) {

6330

if (Zeroable[i])

6331

continue;

6332

SDValue Elt = Op->getOperand(i);

6333

if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

6334

!isa<ConstantSDNode>(Elt.getOperand(1)))

6335

return SDValue();

6336

// Make sure that this node is extracting from a 128-bit vector.

6337

MVT VT = Elt.getOperand(0).getSimpleValueType();

6338

if (!VT.is128BitVector())

6339

return SDValue();

6340

if (!FirstNonZero.getNode()) {

6341

FirstNonZero = Elt;

6342

FirstNonZeroIdx = i;

6343

}

6344

}

6345

6346

assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!")(static_cast <bool> (FirstNonZero.getNode() && "Unexpected build vector of all zeros!"
) ? void (0) : __assert_fail ("FirstNonZero.getNode() && \"Unexpected build vector of all zeros!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6346, __extension__ __PRETTY_FUNCTION__));

6347

SDValue V1 = FirstNonZero.getOperand(0);

6348

MVT VT = V1.getSimpleValueType();

6349

6350

// See if this build_vector can be lowered as a blend with zero.

6351

SDValue Elt;

6352

unsigned EltMaskIdx, EltIdx;

6353

int Mask[4];

6354

for (EltIdx = 0; EltIdx < 4; ++EltIdx) {

6355

if (Zeroable[EltIdx]) {

6356

// The zero vector will be on the right hand side.

6357

Mask[EltIdx] = EltIdx+4;

6358

continue;

6359

}

6360

6361

Elt = Op->getOperand(EltIdx);

6362

// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.

6363

EltMaskIdx = Elt.getConstantOperandVal(1);

6364

if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)

6365

break;

6366

Mask[EltIdx] = EltIdx;

6367

}

6368

6369

if (EltIdx == 4) {

6370

// Let the shuffle legalizer deal with blend operations.

6371

SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));

6372

if (V1.getSimpleValueType() != VT)

6373

V1 = DAG.getBitcast(VT, V1);

6374

return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);

6375

}

6376

6377

// See if we can lower this build_vector to a INSERTPS.

6378

if (!Subtarget.hasSSE41())

6379

return SDValue();

6380

6381

SDValue V2 = Elt.getOperand(0);

6382

if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)

6383

V1 = SDValue();

6384

6385

bool CanFold = true;

6386

for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {

6387

if (Zeroable[i])

6388

continue;

6389

6390

SDValue Current = Op->getOperand(i);

6391

SDValue SrcVector = Current->getOperand(0);

6392

if (!V1.getNode())

6393

V1 = SrcVector;

6394

CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);

6395

}

6396

6397

if (!CanFold)

6398

return SDValue();

6399

6400

assert(V1.getNode() && "Expected at least two non-zero elements!")(static_cast <bool> (V1.getNode() && "Expected at least two non-zero elements!"
) ? void (0) : __assert_fail ("V1.getNode() && \"Expected at least two non-zero elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6400, __extension__ __PRETTY_FUNCTION__));

6401

if (V1.getSimpleValueType() != MVT::v4f32)

6402

V1 = DAG.getBitcast(MVT::v4f32, V1);

6403

if (V2.getSimpleValueType() != MVT::v4f32)

6404

V2 = DAG.getBitcast(MVT::v4f32, V2);

6405

6406

// Ok, we can emit an INSERTPS instruction.

6407

unsigned ZMask = Zeroable.to_ulong();

6408

6409

unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;

6410

assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")(static_cast <bool> ((InsertPSMask & ~0xFFu) == 0 &&
"Invalid mask!") ? void (0) : __assert_fail ("(InsertPSMask & ~0xFFu) == 0 && \"Invalid mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6410, __extension__ __PRETTY_FUNCTION__));

6411

SDLoc DL(Op);

6412

SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,

6413

DAG.getIntPtrConstant(InsertPSMask, DL));

6414

return DAG.getBitcast(VT, Result);

6415

}

6416

6417

/// Return a vector logical shift node.

6418

static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,

6419

SelectionDAG &DAG, const TargetLowering &TLI,

6420

const SDLoc &dl) {

6421

assert(VT.is128BitVector() && "Unknown type for VShift")(static_cast <bool> (VT.is128BitVector() && "Unknown type for VShift"
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"Unknown type for VShift\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6421, __extension__ __PRETTY_FUNCTION__));

6422

MVT ShVT = MVT::v16i8;

6423

unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;

6424

SrcOp = DAG.getBitcast(ShVT, SrcOp);

6425

MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);

6426

assert(NumBits % 8 == 0 && "Only support byte sized shifts")(static_cast <bool> (NumBits % 8 == 0 && "Only support byte sized shifts"
) ? void (0) : __assert_fail ("NumBits % 8 == 0 && \"Only support byte sized shifts\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6426, __extension__ __PRETTY_FUNCTION__));

6427

SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);

6428

return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));

6429

}

6430

6431

/// Tries to turn a splat of a scalar i32/f32 load from a frame index into a
/// full-width vector load followed by a splat shuffle.
///
/// Only a normal, non-volatile load whose address is a frame index, or a
/// frame index plus a constant offset, is handled. The stack object's
/// alignment may be raised to the vector size in bytes; this happens before
/// the offset checks, so it can take effect even when the function then gives
/// up. Returns an empty SDValue when the transform does not apply.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {
  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  auto *LD = dyn_cast<LoadSDNode>(SrcOp);
  if (!LD)
    return SDValue();

  if (!ISD::isNormalLoad(LD) || LD->isVolatile())
    return SDValue();

  EVT PVT = LD->getValueType(0);
  if (PVT != MVT::i32 && PVT != MVT::f32)
    return SDValue();

  // Split the address into a frame index and a constant byte offset.
  SDValue Ptr = LD->getBasePtr();
  int FI = -1;
  int64_t Offset = 0;
  if (auto *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
    FI = FINode->getIndex();
  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
    Offset = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  } else {
    return SDValue();
  }

  // FIXME: 256-bit vector instructions don't require a strict alignment,
  // improve this code to support it better.
  unsigned RequiredAlign = VT.getSizeInBits() / 8;
  SDValue Chain = LD->getChain();

  // Make sure the stack object alignment is at least the vector size in
  // bytes.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
    // Can't change the alignment of a fixed object. FIXME: It's possible to
    // compute the exact stack offset and reference FI + adjust offset instead.
    // If someone *really* cares about this. That's the way to implement it.
    if (MFI.isFixedObjectIndex(FI))
      return SDValue();
    MFI.setObjectAlignment(FI, RequiredAlign);
  }

  // (Offset % RequiredAlign) must be a multiple of 4. The vector load address
  // is then Ptr + (Offset rounded down to a multiple of RequiredAlign).
  if (Offset < 0)
    return SDValue();
  if ((Offset % RequiredAlign) & 3)
    return SDValue();

  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  if (StartOffset) {
    SDLoc DL(Ptr);
    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
  }

  // Index of the originally loaded 4-byte element inside the widened load.
  int EltNo = (Offset - StartOffset) >> 2;
  unsigned NumElems = VT.getVectorNumElements();

  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(StartOffset));

  // Splat: every result lane reads element EltNo of the widened load.
  SmallVector<int, 8> Mask(NumElems, EltNo);

  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}

6503

6504

/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the

6505

/// elements can be replaced by a single large load which has the same value as

6506

/// a build_vector or insert_subvector whose loaded operands are 'Elts'.

6507

///

6508

/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a

6509

static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,

6510

const SDLoc &DL, SelectionDAG &DAG,

6511

const X86Subtarget &Subtarget,

6512

bool isAfterLegalize) {

6513

unsigned NumElems = Elts.size();

6514

6515

int LastLoadedElt = -1;

6516

SmallBitVector LoadMask(NumElems, false);

6517

SmallBitVector ZeroMask(NumElems, false);

←

Calling constructor for 'SmallBitVector'

→

←

Returning from constructor for 'SmallBitVector'

→

6518

SmallBitVector UndefMask(NumElems, false);

6519

6520

// For each element in the initializer, see if we've found a load, zero or an

6521

// undef.

6522

for (unsigned i = 0; i < NumElems; ++i) {

←

Loop condition is true. Entering loop body

→

6523

SDValue Elt = peekThroughBitcasts(Elts[i]);

6524

if (!Elt.getNode())

←

Assuming the condition is false

→

←

Taking false branch

→

6525

return SDValue();

6526

6527

if (Elt.isUndef())

←

Taking false branch

→

6528

UndefMask[i] = true;

6529

else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))

←

Assuming the condition is false

→

←

Assuming the condition is false

→

←

Taking false branch

→

6530

ZeroMask[i] = true;

6531

else if (ISD::isNON_EXTLoad(Elt.getNode())) {

←

Taking false branch

→

6532

LoadMask[i] = true;

6533

LastLoadedElt = i;

6534

// Each loaded element must be the correct fractional portion of the

6535

// requested vector load.

6536

if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())

6537

return SDValue();

6538

} else

6539

return SDValue();

6540

}

6541

assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&(static_cast <bool> ((ZeroMask | UndefMask | LoadMask).
count() == NumElems && "Incomplete element masks") ? void
(0) : __assert_fail ("(ZeroMask | UndefMask | LoadMask).count() == NumElems && \"Incomplete element masks\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6542, __extension__ __PRETTY_FUNCTION__))

6542

"Incomplete element masks")(static_cast <bool> ((ZeroMask | UndefMask | LoadMask).
count() == NumElems && "Incomplete element masks") ? void
(0) : __assert_fail ("(ZeroMask | UndefMask | LoadMask).count() == NumElems && \"Incomplete element masks\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6542, __extension__ __PRETTY_FUNCTION__));

6543

6544

// Handle Special Cases - all undef or undef/zero.

6545

if (UndefMask.count() == NumElems)

6546

return DAG.getUNDEF(VT);

6547

6548

// FIXME: Should we return this as a BUILD_VECTOR instead?

6549

if ((ZeroMask | UndefMask).count() == NumElems)

6550

return VT.isInteger() ? DAG.getConstant(0, DL, VT)

6551

: DAG.getConstantFP(0.0, DL, VT);

6552

6553

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

6554

int FirstLoadedElt = LoadMask.find_first();

6555

SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);

6556

LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);

6557

EVT LDBaseVT = EltBase.getValueType();

6558

6559

// Consecutive loads can contain UNDEFS but not ZERO elements.

6560

// Consecutive loads with UNDEFs and ZEROs elements require a

6561

// an additional shuffle stage to clear the ZERO elements.

6562

bool IsConsecutiveLoad = true;

6563

bool IsConsecutiveLoadWithZeros = true;

6564

for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {

6565

if (LoadMask[i]) {

6566

SDValue Elt = peekThroughBitcasts(Elts[i]);

6567

LoadSDNode *LD = cast<LoadSDNode>(Elt);

6568

if (!DAG.areNonVolatileConsecutiveLoads(

6569

LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,

6570

i - FirstLoadedElt)) {

6571

IsConsecutiveLoad = false;

6572

IsConsecutiveLoadWithZeros = false;

6573

break;

6574

}

6575

} else if (ZeroMask[i]) {

6576

IsConsecutiveLoad = false;

6577

}

6578

}

6579

6580

SmallVector<LoadSDNode *, 8> Loads;

6581

for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)

6582

if (LoadMask[i])

6583

Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

6584

6585

auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {

6586

auto MMOFlags = LDBase->getMemOperand()->getFlags();

6587

assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&(static_cast <bool> (!(MMOFlags & MachineMemOperand
::MOVolatile) && "Cannot merge volatile loads.") ? void
(0) : __assert_fail ("!(MMOFlags & MachineMemOperand::MOVolatile) && \"Cannot merge volatile loads.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6588, __extension__ __PRETTY_FUNCTION__))

6588

"Cannot merge volatile loads.")(static_cast <bool> (!(MMOFlags & MachineMemOperand
::MOVolatile) && "Cannot merge volatile loads.") ? void
(0) : __assert_fail ("!(MMOFlags & MachineMemOperand::MOVolatile) && \"Cannot merge volatile loads.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6588, __extension__ __PRETTY_FUNCTION__));

6589

SDValue NewLd =

6590

DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),

6591

LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);

6592

for (auto *LD : Loads)

6593

DAG.makeEquivalentMemoryOrdering(LD, NewLd);

6594

return NewLd;

6595

};

6596

6597

// LOAD - all consecutive load/undefs (must start/end with a load).

6598

// If we have found an entire vector of loads and undefs, then return a large

6599

// load of the entire vector width starting at the base pointer.

6600

// If the vector contains zeros, then attempt to shuffle those elements.

6601

if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&

6602

(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {

6603

assert(LDBase && "Did not find base load for merging consecutive loads")(static_cast <bool> (LDBase && "Did not find base load for merging consecutive loads"
) ? void (0) : __assert_fail ("LDBase && \"Did not find base load for merging consecutive loads\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6603, __extension__ __PRETTY_FUNCTION__));

6604

EVT EltVT = LDBase->getValueType(0);

6605

// Ensure that the input vector size for the merged loads matches the

6606

// cumulative size of the input elements.

6607

if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)

6608

return SDValue();

6609

6610

if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))

6611

return SDValue();

6612

6613

// Don't create 256-bit non-temporal aligned loads without AVX2 as these

6614

// will lower to regular temporal loads and use the cache.

6615

if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&

6616

VT.is256BitVector() && !Subtarget.hasInt256())

6617

return SDValue();

6618

6619

if (IsConsecutiveLoad)

6620

return CreateLoad(VT, LDBase);

6621

6622

// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded

6623

// vector and a zero vector to clear out the zero elements.

6624

if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {

6625

SmallVector<int, 4> ClearMask(NumElems, -1);

6626

for (unsigned i = 0; i < NumElems; ++i) {

6627

if (ZeroMask[i])

6628

ClearMask[i] = i + NumElems;

6629

else if (LoadMask[i])

6630

ClearMask[i] = i;

6631

}

6632

SDValue V = CreateLoad(VT, LDBase);

6633

SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)

6634

: DAG.getConstantFP(0.0, DL, VT);

6635

return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);

6636

}

6637

}

6638

6639

int LoadSize =

6640

(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

6641

6642

// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.

6643

if (IsConsecutiveLoad && FirstLoadedElt == 0 &&

6644

(LoadSize == 32 || LoadSize == 64) &&

6645

((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {

6646

MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)

6647

: MVT::getIntegerVT(LoadSize);

6648

MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);

6649

if (TLI.isTypeLegal(VecVT)) {

6650

SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);

6651

SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };

6652

SDValue ResNode =

6653

DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,

6654

LDBase->getPointerInfo(),

6655

LDBase->getAlignment(),

6656

false/*isVolatile*/, true/*ReadMem*/,

6657

false/*WriteMem*/);

6658

for (auto *LD : Loads)

6659

DAG.makeEquivalentMemoryOrdering(LD, ResNode);

6660

return DAG.getBitcast(VT, ResNode);

6661

}

6662

}

6663

6664

return SDValue();

6665

}

←

Potential leak of memory pointed to by 'ZeroMask.X'

6666

6667

/// Materialize one period of a repeated constant pattern as an IR constant
/// vector.
///
/// \p SplatValue is sliced into SplatBitSize / ScalarSize pieces, lowest bits
/// first, where ScalarSize is the scalar width of \p VT. Each piece becomes an
/// integer constant, or an IEEE single/double constant when \p VT is a
/// floating point type.
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  const unsigned ScalarSize = VT.getScalarSizeInBits();
  const unsigned PieceCount = SplatBitSize / ScalarSize;
  const bool IsFP = VT.isFloatingPoint();

  SmallVector<Constant *, 32> Pieces;
  for (unsigned Idx = 0; Idx != PieceCount; ++Idx) {
    APInt Bits = SplatValue.extractBits(ScalarSize, ScalarSize * Idx);

    if (!IsFP) {
      Pieces.push_back(
          Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Bits));
      continue;
    }

    if (ScalarSize == 32) {
      Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
      continue;
    }

    // Only f32 and f64 scalars are expected here.
    assert(ScalarSize == 64 && "Unsupported floating point scalar size");
    Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
  }
  return ConstantVector::get(ArrayRef<Constant *>(Pieces));
}

6689

6690

/// Return true if any user of \p N is a target shuffle, looking through
/// BITCAST users recursively.
///
/// A bitcast user that does not lead to a shuffle must not end the search:
/// the remaining users of \p N still have to be examined.
static bool isUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    if (isTargetShuffle(U->getOpcode()))
      return true;
    // Ignore bitcasts: inspect the users of the bitcast instead, and only
    // stop early when one of them is a shuffle.
    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
      return true;
  }
  return false;
}

6699

6700

// Check if the current node of build vector is a zero extended vector.

6701

// // If so, return the value extended.

6702

// // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.

6703

// // NumElt - return the number of zero extended identical values.

6704

// // EltType - return the type of the value include the zero extend.

6705

static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,

6706

unsigned &NumElt, MVT &EltType) {

6707

SDValue ExtValue = Op->getOperand(0);

6708

unsigned NumElts = Op->getNumOperands();

6709

unsigned Delta = NumElts;

6710

6711

for (unsigned i = 1; i < NumElts; i++) {

6712

if (Op->getOperand(i) == ExtValue) {

6713

Delta = i;

6714

break;

6715

}

6716

if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))

6717

return SDValue();

6718

}

6719

if (!isPowerOf2_32(Delta) || Delta == 1)

6720

return SDValue();

6721

6722

for (unsigned i = Delta; i < NumElts; i++) {

6723

if (i % Delta == 0) {

6724

if (Op->getOperand(i) != ExtValue)

6725

return SDValue();

6726

} else if (!(isNullConstant(Op->getOperand(i)) ||

6727

Op->getOperand(i).isUndef()))

6728

return SDValue();

6729

}

6730

unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();

6731

unsigned ExtVTSize = EltSize * Delta;

6732

EltType = MVT::getIntegerVT(ExtVTSize);

6733

NumElt = NumElts / Delta;

6734

return ExtValue;

6735

}

6736

6737

/// Attempt to use the vbroadcast instruction to generate a splat value

6738

/// from a splat BUILD_VECTOR which uses:

6739

/// a. A single scalar load, or a constant.

6740

/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).

6741

///

6742

/// The VBROADCAST node is returned when a pattern is found,

6743

/// or SDValue() otherwise.

6744

static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,

6745

const X86Subtarget &Subtarget,

6746

SelectionDAG &DAG) {

6747

// VBROADCAST requires AVX.

6748

// TODO: Splats could be generated for non-AVX CPUs using SSE

6749

// instructions, but there's less potential gain for only 128-bit vectors.

6750

if (!Subtarget.hasAVX())

6751

return SDValue();

6752

6753

MVT VT = BVOp->getSimpleValueType(0);

6754

SDLoc dl(BVOp);

6755

6756

assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Unsupported vector type for broadcast."
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported vector type for broadcast.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6757, __extension__ __PRETTY_FUNCTION__))

6757

"Unsupported vector type for broadcast.")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Unsupported vector type for broadcast."
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Unsupported vector type for broadcast.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6757, __extension__ __PRETTY_FUNCTION__));

6758

6759

BitVector UndefElements;

6760

SDValue Ld = BVOp->getSplatValue(&UndefElements);

6761

6762

// Attempt to use VBROADCASTM

6763

// From this paterrn:

6764

// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))

6765

// b. t1 = (build_vector t0 t0)

6766

6767

// Create (VBROADCASTM v2i1 X)

6768

if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {

6769

MVT EltType = VT.getScalarType();

6770

unsigned NumElts = VT.getVectorNumElements();

6771

SDValue BOperand;

6772

SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);

6773

if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||

6774

(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&

6775

Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {

6776

if (ZeroExtended)

6777

BOperand = ZeroExtended.getOperand(0);

6778

else

6779

BOperand = Ld.getOperand(0).getOperand(0);

6780

if (BOperand.getValueType().isVector() &&

6781

BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {

6782

if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||

6783

NumElts == 8)) || // for broadcastmb2q

6784

(EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||

6785

NumElts == 16))) { // for broadcastmw2d

6786

SDValue Brdcst =

6787

DAG.getNode(X86ISD::VBROADCASTM, dl,

6788

MVT::getVectorVT(EltType, NumElts), BOperand);

6789

return DAG.getBitcast(VT, Brdcst);

6790

}

6791

}

6792

}

6793

}

6794

6795

// We need a splat of a single value to use broadcast, and it doesn't

6796

// make any sense if the value is only in one element of the vector.

6797

if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {

6798

APInt SplatValue, Undef;

6799

unsigned SplatBitSize;

6800

bool HasUndef;

6801

// Check if this is a repeated constant pattern suitable for broadcasting.

6802

if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&

6803

SplatBitSize > VT.getScalarSizeInBits() &&

6804

SplatBitSize < VT.getSizeInBits()) {

6805

// Avoid replacing with broadcast when it's a use of a shuffle

6806

// instruction to preserve the present custom lowering of shuffles.

6807

if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())

6808

return SDValue();

6809

// replace BUILD_VECTOR with broadcast of the repeated constants.

6810

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

6811

LLVMContext *Ctx = DAG.getContext();

6812

MVT PVT = TLI.getPointerTy(DAG.getDataLayout());

6813

if (Subtarget.hasAVX()) {

6814

if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&

6815

!(SplatBitSize == 64 && Subtarget.is32Bit())) {

6816

// Splatted value can fit in one INTEGER constant in constant pool.

6817

// Load the constant and broadcast it.

6818

MVT CVT = MVT::getIntegerVT(SplatBitSize);

6819

Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);

6820

Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);

6821

SDValue CP = DAG.getConstantPool(C, PVT);

6822

unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

6823

6824

unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();

6825

Ld = DAG.getLoad(

6826

CVT, dl, DAG.getEntryNode(), CP,

6827

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

6828

Alignment);

6829

SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,

6830

MVT::getVectorVT(CVT, Repeat), Ld);

6831

return DAG.getBitcast(VT, Brdcst);

6832

} else if (SplatBitSize == 32 || SplatBitSize == 64) {

6833

// Splatted value can fit in one FLOAT constant in constant pool.

6834

// Load the constant and broadcast it.

6835

// AVX have support for 32 and 64 bit broadcast for floats only.

6836

// No 64bit integer in 32bit subtarget.

6837

MVT CVT = MVT::getFloatingPointVT(SplatBitSize);

6838

// Lower the splat via APFloat directly, to avoid any conversion.

6839

Constant *C =

6840

SplatBitSize == 32

6841

? ConstantFP::get(*Ctx,

6842

APFloat(APFloat::IEEEsingle(), SplatValue))

6843

: ConstantFP::get(*Ctx,

6844

APFloat(APFloat::IEEEdouble(), SplatValue));

6845

SDValue CP = DAG.getConstantPool(C, PVT);

6846

unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

6847

6848

unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();

6849

Ld = DAG.getLoad(

6850

CVT, dl, DAG.getEntryNode(), CP,

6851

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

6852

Alignment);

6853

SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,

6854

MVT::getVectorVT(CVT, Repeat), Ld);

6855

return DAG.getBitcast(VT, Brdcst);

6856

} else if (SplatBitSize > 64) {

6857

// Load the vector of constants and broadcast it.

6858

MVT CVT = VT.getScalarType();

6859

Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,

6860

*Ctx);

6861

SDValue VCP = DAG.getConstantPool(VecC, PVT);

6862

unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();

6863

unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();

6864

Ld = DAG.getLoad(

6865

MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,

6866

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

6867

Alignment);

6868

SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);

6869

return DAG.getBitcast(VT, Brdcst);

6870

}

6871

}

6872

}

6873

return SDValue();

6874

}

6875

6876

bool ConstSplatVal =

6877

(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

6878

6879

// Make sure that all of the users of a non-constant load are from the

6880

// BUILD_VECTOR node.

6881

if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))

6882

return SDValue();

6883

6884

unsigned ScalarSize = Ld.getValueSizeInBits();

6885

bool IsGE256 = (VT.getSizeInBits() >= 256);

6886

6887

// When optimizing for size, generate up to 5 extra bytes for a broadcast

6888

// instruction to save 8 or more bytes of constant pool data.

6889

// TODO: If multiple splats are generated to load the same constant,

6890

// it may be detrimental to overall size. There needs to be a way to detect

6891

// that condition to know if this is truly a size win.

6892

bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

6893

6894

// Handle broadcasting a single constant scalar from the constant pool

6895

// into a vector.

6896

// On Sandybridge (no AVX2), it is still better to load a constant vector

6897

// from the constant pool and not to broadcast it from a scalar.

6898

// But override that restriction when optimizing for size.

6899

// TODO: Check if splatting is recommended for other AVX-capable CPUs.

6900

if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {

6901

EVT CVT = Ld.getValueType();

6902

assert(!CVT.isVector() && "Must not broadcast a vector type")(static_cast <bool> (!CVT.isVector() && "Must not broadcast a vector type"
) ? void (0) : __assert_fail ("!CVT.isVector() && \"Must not broadcast a vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6902, __extension__ __PRETTY_FUNCTION__));

6903

6904

// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.

6905

// For size optimization, also splat v2f64 and v2i64, and for size opt

6906

// with AVX2, also splat i8 and i16.

6907

// With pattern matching, the VBROADCAST node may become a VMOVDDUP.

6908

if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||

6909

(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {

6910

const Constant *C = nullptr;

6911

if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))

6912

C = CI->getConstantIntValue();

6913

else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))

6914

C = CF->getConstantFPValue();

6915

6916

assert(C && "Invalid constant type")(static_cast <bool> (C && "Invalid constant type"
) ? void (0) : __assert_fail ("C && \"Invalid constant type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6916, __extension__ __PRETTY_FUNCTION__));

6917

6918

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

6919

SDValue CP =

6920

DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));

6921

unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();

6922

Ld = DAG.getLoad(

6923

CVT, dl, DAG.getEntryNode(), CP,

6924

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

6925

Alignment);

6926

6927

return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

6928

}

6929

}

6930

6931

bool IsLoad = ISD::isNormalLoad(Ld.getNode());

6932

6933

// Handle AVX2 in-register broadcasts.

6934

if (!IsLoad && Subtarget.hasInt256() &&

6935

(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))

6936

return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

6937

6938

// The scalar source must be a normal load.

6939

if (!IsLoad)

6940

return SDValue();

6941

6942

if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||

6943

(Subtarget.hasVLX() && ScalarSize == 64))

6944

return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

6945

6946

// The integer check is needed for the 64-bit into 128-bit so it doesn't match

6947

// double since there is no vbroadcastsd xmm

6948

if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {

6949

if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)

6950

return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

6951

}

6952

6953

// Unsupported broadcast.

6954

return SDValue();

6955

}

6956

6957

/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real

6958

/// underlying vector and index.

6959

///

6960

/// Modifies \p ExtractedFromVec to the real vector and returns the real

6961

/// index.

6962

static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,

6963

SDValue ExtIdx) {

6964

int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

6965

if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))

6966

return Idx;

6967

6968

// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already

6969

// lowered this:

6970

// (extract_vector_elt (v8f32 %1), Constant<6>)

6971

// to:

6972

// (extract_vector_elt (vector_shuffle<2,u,u,u>

6973

// (extract_subvector (v8f32 %0), Constant<4>),

6974

// undef)

6975

// Constant<0>)

6976

// In this case the vector is the extract_subvector expression and the index

6977

// is 2, as specified by the shuffle.

6978

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);

6979

SDValue ShuffleVec = SVOp->getOperand(0);

6980

MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();

6981

assert(ShuffleVecVT.getVectorElementType() ==(static_cast <bool> (ShuffleVecVT.getVectorElementType(
) == ExtractedFromVec.getSimpleValueType().getVectorElementType
()) ? void (0) : __assert_fail ("ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6982, __extension__ __PRETTY_FUNCTION__))

6982

ExtractedFromVec.getSimpleValueType().getVectorElementType())(static_cast <bool> (ShuffleVecVT.getVectorElementType(
) == ExtractedFromVec.getSimpleValueType().getVectorElementType
()) ? void (0) : __assert_fail ("ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 6982, __extension__ __PRETTY_FUNCTION__));

6983

6984

int ShuffleIdx = SVOp->getMaskElt(Idx);

6985

if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {

6986

ExtractedFromVec = ShuffleVec;

6987

return ShuffleIdx;

6988

}

6989

return Idx;

6990

}

6991

6992

/// Lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs of at
/// most two source vectors of the result type into a vector shuffle followed
/// by INSERT_VECTOR_ELT nodes for the remaining operands.
///
/// Returns SDValue() when INSERT_VECTOR_ELT is neither legal nor custom for
/// the type, when an extract uses a non-constant index or a source of a
/// different type, when more than two source vectors are involved, when no
/// extract is present at all, or when too many operands would need inserting.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue FirstSrc;
  SDValue SecondSrc;
  SmallVector<unsigned, 4> PendingInserts;
  SmallVector<int, 8> ShuffleMask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    const unsigned EltOpc = Elt.getOpcode();

    // Undef operands keep their -1 mask entry.
    if (EltOpc == ISD::UNDEF)
      continue;

    if (EltOpc != ISD::EXTRACT_VECTOR_ELT) {
      // Give up once two operands are already queued for insertion.
      if (PendingInserts.size() > 1)
        return SDValue();

      PendingInserts.push_back(i);
      continue;
    }

    SDValue SrcVec = Elt.getOperand(0);
    SDValue ExtIdx = Elt.getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Lane = getUnderlyingExtractedFromVec(SrcVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (SrcVec.getValueType() != VT)
      return SDValue();

    // Assign the source to the first free/matching shuffle input.
    if (!FirstSrc.getNode() || FirstSrc == SrcVec) {
      FirstSrc = SrcVec;
      ShuffleMask[i] = Lane;
    } else if (!SecondSrc.getNode() || SecondSrc == SrcVec) {
      SecondSrc = SrcVec;
      ShuffleMask[i] = Lane + NumElems;
    } else {
      // Quit if more than 2 vectors to shuffle
      return SDValue();
    }
  }

  if (!FirstSrc.getNode())
    return SDValue();

  if (!SecondSrc.getNode())
    SecondSrc = DAG.getUNDEF(VT);
  SDValue Result = DAG.getVectorShuffle(VT, DL, FirstSrc, SecondSrc,
                                        ShuffleMask);

  for (unsigned InsertAt : PendingInserts)
    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
                         Op.getOperand(InsertAt),
                         DAG.getIntPtrConstant(InsertAt, DL));

  return Result;
}

7063

7064

/// Pack a constant vXi1 build vector into an integer constant.
///
/// Bit i of the result is the low bit of operand i; undef operands contribute
/// a zero bit. The integer type is as wide as the vector, but at least 8 bits.
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");

  uint64_t Packed = 0;
  const unsigned OpCount = Op.getNumOperands();
  for (unsigned Bit = 0; Bit < OpCount; ++Bit) {
    SDValue Elt = Op.getOperand(Bit);
    if (Elt.isUndef())
      continue;
    const uint64_t LowBit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
    Packed |= LowBit << Bit;
  }

  SDLoc dl(Op);
  const int IntBits = std::max((int)Op.getValueSizeInBits(), 8);
  return DAG.getConstant(Packed, dl, MVT::getIntegerVT(IntBits));
}

7078

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.

7079

SDValue

7080

X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

7081

7082

MVT VT = Op.getSimpleValueType();

7083

assert((VT.getVectorElementType() == MVT::i1) &&(static_cast <bool> ((VT.getVectorElementType() == MVT::
i1) && "Unexpected type in LowerBUILD_VECTORvXi1!") ?
void (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i1) && \"Unexpected type in LowerBUILD_VECTORvXi1!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7084, __extension__ __PRETTY_FUNCTION__))

7084

"Unexpected type in LowerBUILD_VECTORvXi1!")(static_cast <bool> ((VT.getVectorElementType() == MVT::
i1) && "Unexpected type in LowerBUILD_VECTORvXi1!") ?
void (0) : __assert_fail ("(VT.getVectorElementType() == MVT::i1) && \"Unexpected type in LowerBUILD_VECTORvXi1!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7084, __extension__ __PRETTY_FUNCTION__));

7085

7086

SDLoc dl(Op);

7087

if (ISD::isBuildVectorAllZeros(Op.getNode()))

7088

return Op;

7089

7090

if (ISD::isBuildVectorAllOnes(Op.getNode()))

7091

return Op;

7092

7093

if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {

7094

if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {

7095

// Split the pieces.

7096

SDValue Lower =

7097

DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));

7098

SDValue Upper =

7099

DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));

7100

// We have to manually lower both halves so getNode doesn't try to

7101

// reassemble the build_vector.

7102

Lower = LowerBUILD_VECTORvXi1(Lower, DAG);

7103

Upper = LowerBUILD_VECTORvXi1(Upper, DAG);

7104

return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);

7105

}

7106

SDValue Imm = ConvertI1VectorToInteger(Op, DAG);

7107

if (Imm.getValueSizeInBits() == VT.getSizeInBits())

7108

return DAG.getBitcast(VT, Imm);

7109

SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);

7110

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,

7111

DAG.getIntPtrConstant(0, dl));

7112

}

7113

7114

// Vector has one or more non-const elements

7115

uint64_t Immediate = 0;

7116

SmallVector<unsigned, 16> NonConstIdx;

7117

bool IsSplat = true;

7118

bool HasConstElts = false;

7119

int SplatIdx = -1;

7120

for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {

7121

SDValue In = Op.getOperand(idx);

7122

if (In.isUndef())

7123

continue;

7124

if (!isa<ConstantSDNode>(In))

7125

NonConstIdx.push_back(idx);

7126

else {

7127

Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;

7128

HasConstElts = true;

7129

}

7130

if (SplatIdx < 0)

7131

SplatIdx = idx;

7132

else if (In != Op.getOperand(SplatIdx))

7133

IsSplat = false;

7134

}

7135

7136

// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"

7137

if (IsSplat)

7138

return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),

7139

DAG.getConstant(1, dl, VT),

7140

DAG.getConstant(0, dl, VT));

7141

7142

// insert elements one by one

7143

SDValue DstVec;

7144

SDValue Imm;

7145

if (Immediate) {

7146

MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));

7147

Imm = DAG.getConstant(Immediate, dl, ImmVT);

7148

}

7149

else if (HasConstElts)

7150

Imm = DAG.getConstant(0, dl, VT);

7151

else

7152

Imm = DAG.getUNDEF(VT);

7153

if (Imm.getValueSizeInBits() == VT.getSizeInBits())

7154

DstVec = DAG.getBitcast(VT, Imm);

7155

else {

7156

SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);

7157

DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,

7158

DAG.getIntPtrConstant(0, dl));

7159

}

7160

7161

for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {

7162

unsigned InsertIdx = NonConstIdx[i];

7163

DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,

7164

Op.getOperand(InsertIdx),

7165

DAG.getIntPtrConstant(InsertIdx, dl));

7166

}

7167

return DstVec;

7168

}

7169

7170

/// \brief Return true if \p N implements a horizontal binop and return the

7171

/// operands for the horizontal binop into V0 and V1.

7172

///

7173

/// This is a helper function of LowerToHorizontalOp().

7174

/// This function checks that the build_vector \p N in input implements a

7175

/// horizontal operation. Parameter \p Opcode defines the kind of horizontal

7176

/// operation to match.

7177

/// For example, if \p Opcode is equal to ISD::ADD, then this function

7178

/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode

7179

/// is equal to ISD::SUB, then this function checks if this is a horizontal

7180

/// arithmetic sub.

7181

///

7182

/// This function only analyzes elements of \p N whose indices are

7183

/// in range [BaseIdx, LastIdx).

7184

static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,

7185

SelectionDAG &DAG,

7186

unsigned BaseIdx, unsigned LastIdx,

7187

SDValue &V0, SDValue &V1) {

7188

EVT VT = N->getValueType(0);

7189

7190

assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!")(static_cast <bool> (BaseIdx * 2 <= LastIdx &&
"Invalid Indices in input!") ? void (0) : __assert_fail ("BaseIdx * 2 <= LastIdx && \"Invalid Indices in input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7190, __extension__ __PRETTY_FUNCTION__));

7191

assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&(static_cast <bool> (VT.isVector() && VT.getVectorNumElements
() >= LastIdx && "Invalid Vector in input!") ? void
(0) : __assert_fail ("VT.isVector() && VT.getVectorNumElements() >= LastIdx && \"Invalid Vector in input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7192, __extension__ __PRETTY_FUNCTION__))

7192

"Invalid Vector in input!")(static_cast <bool> (VT.isVector() && VT.getVectorNumElements
() >= LastIdx && "Invalid Vector in input!") ? void
(0) : __assert_fail ("VT.isVector() && VT.getVectorNumElements() >= LastIdx && \"Invalid Vector in input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7192, __extension__ __PRETTY_FUNCTION__));

7193

7194

bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);

7195

bool CanFold = true;

7196

unsigned ExpectedVExtractIdx = BaseIdx;

7197

unsigned NumElts = LastIdx - BaseIdx;

7198

V0 = DAG.getUNDEF(VT);

7199

V1 = DAG.getUNDEF(VT);

7200

7201

// Check if N implements a horizontal binop.

7202

for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {

7203

SDValue Op = N->getOperand(i + BaseIdx);

7204

7205

// Skip UNDEFs.

7206

if (Op->isUndef()) {

7207

// Update the expected vector extract index.

7208

if (i * 2 == NumElts)

7209

ExpectedVExtractIdx = BaseIdx;

7210

ExpectedVExtractIdx += 2;

7211

continue;

7212

}

7213

7214

CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

7215

7216

if (!CanFold)

7217

break;

7218

7219

SDValue Op0 = Op.getOperand(0);

7220

SDValue Op1 = Op.getOperand(1);

7221

7222

// Try to match the following pattern:

7223

// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))

7224

CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

7225

Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&

7226

Op0.getOperand(0) == Op1.getOperand(0) &&

7227

isa<ConstantSDNode>(Op0.getOperand(1)) &&

7228

isa<ConstantSDNode>(Op1.getOperand(1)));

7229

if (!CanFold)

7230

break;

7231

7232

unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();

7233

unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

7234

7235

if (i * 2 < NumElts) {

7236

if (V0.isUndef()) {

7237

V0 = Op0.getOperand(0);

7238

if (V0.getValueType() != VT)

7239

return false;

7240

}

7241

} else {

7242

if (V1.isUndef()) {

7243

V1 = Op0.getOperand(0);

7244

if (V1.getValueType() != VT)

7245

return false;

7246

}

7247

if (i * 2 == NumElts)

7248

ExpectedVExtractIdx = BaseIdx;

7249

}

7250

7251

SDValue Expected = (i * 2 < NumElts) ? V0 : V1;

7252

if (I0 == ExpectedVExtractIdx)

7253

CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;

7254

else if (IsCommutable && I1 == ExpectedVExtractIdx) {

7255

// Try to match the following dag sequence:

7256

// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))

7257

CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;

7258

} else

7259

CanFold = false;

7260

7261

ExpectedVExtractIdx += 2;

7262

}

7263

7264

return CanFold;

7265

}

7266

7267

/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by

7268

/// a concat_vector.

7269

///

7270

/// This is a helper function of LowerToHorizontalOp().

7271

/// This function expects two 256-bit vectors called V0 and V1.

7272

/// At first, each vector is split into two separate 128-bit vectors.

7273

/// Then, the resulting 128-bit vectors are used to implement two

7274

/// horizontal binary operations.

7275

///

7276

/// The kind of horizontal binary operation is defined by \p X86Opcode.

7277

///

7278

/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to

7279

/// the two new horizontal binop.

7280

/// When Mode is set, the first horizontal binop dag node would take as input

7281

/// the lower 128-bit of V0 and the upper 128-bit of V0. The second

7282

/// horizontal binop dag node would take as input the lower 128-bit of V1

7283

/// and the upper 128-bit of V1.

7284

/// Example:

7285

/// HADD V0_LO, V0_HI

7286

/// HADD V1_LO, V1_HI

7287

///

7288

/// Otherwise, the first horizontal binop dag node takes as input the lower

7289

/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop

7290

/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.

7291

/// Example:

7292

/// HADD V0_LO, V1_LO

7293

/// HADD V0_HI, V1_HI

7294

///

7295

/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower

7296

/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to

7297

/// the upper 128-bits of the result.

7298

static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,

7299

const SDLoc &DL, SelectionDAG &DAG,

7300

unsigned X86Opcode, bool Mode,

7301

bool isUndefLO, bool isUndefHI) {

7302

MVT VT = V0.getSimpleValueType();

7303

assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&(static_cast <bool> (VT.is256BitVector() && VT ==
V1.getSimpleValueType() && "Invalid nodes in input!"
) ? void (0) : __assert_fail ("VT.is256BitVector() && VT == V1.getSimpleValueType() && \"Invalid nodes in input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7304, __extension__ __PRETTY_FUNCTION__))

7304

"Invalid nodes in input!")(static_cast <bool> (VT.is256BitVector() && VT ==
V1.getSimpleValueType() && "Invalid nodes in input!"
) ? void (0) : __assert_fail ("VT.is256BitVector() && VT == V1.getSimpleValueType() && \"Invalid nodes in input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7304, __extension__ __PRETTY_FUNCTION__));

7305

7306

unsigned NumElts = VT.getVectorNumElements();

7307

SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);

7308

SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);

7309

SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);

7310

SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);

7311

MVT NewVT = V0_LO.getSimpleValueType();

7312

7313

SDValue LO = DAG.getUNDEF(NewVT);

7314

SDValue HI = DAG.getUNDEF(NewVT);

7315

7316

if (Mode) {

7317

// Don't emit a horizontal binop if the result is expected to be UNDEF.

7318

if (!isUndefLO && !V0->isUndef())

7319

LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);

7320

if (!isUndefHI && !V1->isUndef())

7321

HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);

7322

} else {

7323

// Don't emit a horizontal binop if the result is expected to be UNDEF.

7324

if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))

7325

LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

7326

7327

if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))

7328

HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);

7329

}

7330

7331

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);

7332

}

7333

7334

/// Returns true iff \p BV builds a vector with the result equivalent to

7335

/// the result of ADDSUB operation.

7336

/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation

7337

/// are written to the parameters \p Opnd0 and \p Opnd1.

7338

static bool isAddSub(const BuildVectorSDNode *BV,

7339

const X86Subtarget &Subtarget, SelectionDAG &DAG,

7340

SDValue &Opnd0, SDValue &Opnd1) {

7341

7342

MVT VT = BV->getSimpleValueType(0);

7343

if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&

7344

(!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&

7345

(!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))

7346

return false;

7347

7348

unsigned NumElts = VT.getVectorNumElements();

7349

SDValue InVec0 = DAG.getUNDEF(VT);

7350

SDValue InVec1 = DAG.getUNDEF(VT);

7351

7352

// Odd-numbered elements in the input build vector are obtained from

7353

// adding two integer/float elements.

7354

// Even-numbered elements in the input build vector are obtained from

7355

// subtracting two integer/float elements.

7356

unsigned ExpectedOpcode = ISD::FSUB;

7357

unsigned NextExpectedOpcode = ISD::FADD;

7358

bool AddFound = false;

7359

bool SubFound = false;

7360

7361

for (unsigned i = 0, e = NumElts; i != e; ++i) {

7362

SDValue Op = BV->getOperand(i);

7363

7364

// Skip 'undef' values.

7365

unsigned Opcode = Op.getOpcode();

7366

if (Opcode == ISD::UNDEF) {

7367

std::swap(ExpectedOpcode, NextExpectedOpcode);

7368

continue;

7369

}

7370

7371

// Early exit if we found an unexpected opcode.

7372

if (Opcode != ExpectedOpcode)

7373

return false;

7374

7375

SDValue Op0 = Op.getOperand(0);

7376

SDValue Op1 = Op.getOperand(1);

7377

7378

// Try to match the following pattern:

7379

// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))

7380

// Early exit if we cannot match that sequence.

7381

if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

7382

Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

7383

!isa<ConstantSDNode>(Op0.getOperand(1)) ||

7384

!isa<ConstantSDNode>(Op1.getOperand(1)) ||

7385

Op0.getOperand(1) != Op1.getOperand(1))

7386

return false;

7387

7388

unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();

7389

if (I0 != i)

7390

return false;

7391

7392

// We found a valid add/sub node. Update the information accordingly.

7393

if (i & 1)

7394

AddFound = true;

7395

else

7396

SubFound = true;

7397

7398

// Update InVec0 and InVec1.

7399

if (InVec0.isUndef()) {

7400

InVec0 = Op0.getOperand(0);

7401

if (InVec0.getSimpleValueType() != VT)

7402

return false;

7403

}

7404

if (InVec1.isUndef()) {

7405

InVec1 = Op1.getOperand(0);

7406

if (InVec1.getSimpleValueType() != VT)

7407

return false;

7408

}

7409

7410

// Make sure that operands in input to each add/sub node always

7411

// come from a same pair of vectors.

7412

if (InVec0 != Op0.getOperand(0)) {

7413

if (ExpectedOpcode == ISD::FSUB)

7414

return false;

7415

7416

// FADD is commutable. Try to commute the operands

7417

// and then test again.

7418

std::swap(Op0, Op1);

7419

if (InVec0 != Op0.getOperand(0))

7420

return false;

7421

}

7422

7423

if (InVec1 != Op1.getOperand(0))

7424

return false;

7425

7426

// Update the pair of expected opcodes.

7427

std::swap(ExpectedOpcode, NextExpectedOpcode);

7428

}

7429

7430

// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.

7431

if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())

7432

return false;

7433

7434

Opnd0 = InVec0;

7435

Opnd1 = InVec1;

7436

return true;

7437

}

7438

7439

/// Returns true if is possible to fold MUL and an idiom that has already been

7440

/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).

7441

/// If (and only if) true is returned, the operands of FMADDSUB are written to

7442

/// parameters \p Opnd0, \p Opnd1, \p Opnd2.

7443

///

7444

/// Prior to calling this function it should be known that there is some

7445

/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation

7446

/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called

7447

/// before replacement of such SDNode with ADDSUB operation. Thus the number

7448

/// of \p Opnd0 uses is expected to be equal to 2.

7449

/// For example, this function may be called for the following IR:

7450

/// %AB = fmul fast <2 x double> %A, %B

7451

/// %Sub = fsub fast <2 x double> %AB, %C

7452

/// %Add = fadd fast <2 x double> %AB, %C

7453

/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,

7454

/// <2 x i32> <i32 0, i32 3>

7455

/// There is a def for %Addsub here, which potentially can be replaced by

7456

/// X86ISD::ADDSUB operation:

7457

/// %Addsub = X86ISD::ADDSUB %AB, %C

7458

/// and such ADDSUB can further be replaced with FMADDSUB:

7459

/// %Addsub = FMADDSUB %A, %B, %C.

7460

///

7461

/// The main reason why this method is called before the replacement of the

7462

/// recognized ADDSUB idiom with ADDSUB operation is that such replacement

7463

/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit

7464

/// FMADDSUB is.

7465

static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,

7466

SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {

7467

if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||

7468

!Subtarget.hasAnyFMA())

7469

return false;

7470

7471

// FIXME: These checks must match the similar ones in

7472

// DAGCombiner::visitFADDForFMACombine. It would be good to have one

7473

// function that would answer if it is Ok to fuse MUL + ADD to FMADD

7474

// or MUL + ADDSUB to FMADDSUB.

7475

const TargetOptions &Options = DAG.getTarget().Options;

7476

bool AllowFusion =

7477

(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);

7478

if (!AllowFusion)

7479

return false;

7480

7481

Opnd2 = Opnd1;

7482

Opnd1 = Opnd0.getOperand(1);

7483

Opnd0 = Opnd0.getOperand(0);

7484

7485

return true;

7486

}

7487

7488

/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation

7489

/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.

7490

static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,

7491

const X86Subtarget &Subtarget,

7492

SelectionDAG &DAG) {

7493

SDValue Opnd0, Opnd1;

7494

if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))

7495

return SDValue();

7496

7497

MVT VT = BV->getSimpleValueType(0);

7498

SDLoc DL(BV);

7499

7500

// Try to generate X86ISD::FMADDSUB node here.

7501

SDValue Opnd2;

7502

// TODO: According to coverage reports, the FMADDSUB transform is not

7503

// triggered by any tests.

7504

if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))

7505

return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

7506

7507

// Do not generate X86ISD::ADDSUB node for 512-bit types even though

7508

// the ADDSUB idiom has been successfully recognized. There are no known

7509

// X86 targets with 512-bit ADDSUB instructions!

7510

// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom

7511

// recognition.

7512

if (VT.is512BitVector())

7513

return SDValue();

7514

7515

return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);

7516

}

7517

7518

/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.

7519

static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,

7520

const X86Subtarget &Subtarget,

7521

SelectionDAG &DAG) {

7522

MVT VT = BV->getSimpleValueType(0);

7523

unsigned NumElts = VT.getVectorNumElements();

7524

unsigned NumUndefsLO = 0;

7525

unsigned NumUndefsHI = 0;

7526

unsigned Half = NumElts/2;

7527

7528

// Count the number of UNDEF operands in the build_vector in input.

7529

for (unsigned i = 0, e = Half; i != e; ++i)

7530

if (BV->getOperand(i)->isUndef())

7531

NumUndefsLO++;

7532

7533

for (unsigned i = Half, e = NumElts; i != e; ++i)

7534

if (BV->getOperand(i)->isUndef())

7535

NumUndefsHI++;

7536

7537

// Early exit if this is either a build_vector of all UNDEFs or all the

7538

// operands but one are UNDEF.

7539

if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)

7540

return SDValue();

7541

7542

SDLoc DL(BV);

7543

SDValue InVec0, InVec1;

7544

if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {

7545

// Try to match an SSE3 float HADD/HSUB.

7546

if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))

7547

return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

7548

7549

if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))

7550

return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);

7551

} else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {

7552

// Try to match an SSSE3 integer HADD/HSUB.

7553

if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))

7554

return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

7555

7556

if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))

7557

return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);

7558

}

7559

7560

if (!Subtarget.hasAVX())

7561

return SDValue();

7562

7563

if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {

7564

// Try to match an AVX horizontal add/sub of packed single/double

7565

// precision floating point values from 256-bit vectors.

7566

SDValue InVec2, InVec3;

7567

if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&

7568

isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&

7569

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

7570

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

7571

return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

7572

7573

if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&

7574

isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&

7575

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

7576

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

7577

return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);

7578

} else if (VT == MVT::v8i32 || VT == MVT::v16i16) {

7579

// Try to match an AVX2 horizontal add/sub of signed integers.

7580

SDValue InVec2, InVec3;

7581

unsigned X86Opcode;

7582

bool CanFold = true;

7583

7584

if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&

7585

isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&

7586

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

7587

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

7588

X86Opcode = X86ISD::HADD;

7589

else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&

7590

isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&

7591

((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&

7592

((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))

7593

X86Opcode = X86ISD::HSUB;

7594

else

7595

CanFold = false;

7596

7597

if (CanFold) {

7598

// Fold this build_vector into a single horizontal add/sub.

7599

// Do this only if the target has AVX2.

7600

if (Subtarget.hasAVX2())

7601

return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

7602

7603

// Do not try to expand this build_vector into a pair of horizontal

7604

// add/sub if we can emit a pair of scalar add/sub.

7605

if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)

7606

return SDValue();

7607

7608

// Convert this build_vector into a pair of horizontal binop followed by

7609

// a concat vector.

7610

bool isUndefLO = NumUndefsLO == Half;

7611

bool isUndefHI = NumUndefsHI == Half;

7612

return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,

7613

isUndefLO, isUndefHI);

7614

}

7615

}

7616

7617

if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||

7618

VT == MVT::v16i16) && Subtarget.hasAVX()) {

7619

unsigned X86Opcode;

7620

if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))

7621

X86Opcode = X86ISD::HADD;

7622

else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))

7623

X86Opcode = X86ISD::HSUB;

7624

else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))

7625

X86Opcode = X86ISD::FHADD;

7626

else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))

7627

X86Opcode = X86ISD::FHSUB;

7628

else

7629

return SDValue();

7630

7631

// Don't try to expand this build_vector into a pair of horizontal add/sub

7632

// if we can simply emit a pair of scalar add/sub.

7633

if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)

7634

return SDValue();

7635

7636

// Convert this build_vector into two horizontal add/sub followed by

7637

// a concat vector.

7638

bool isUndefLO = NumUndefsLO == Half;

7639

bool isUndefHI = NumUndefsHI == Half;

7640

return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,

7641

isUndefLO, isUndefHI);

7642

}

7643

7644

return SDValue();

7645

}

7646

7647

/// If a BUILD_VECTOR's source elements all apply the same bit operation and

7648

/// one of their operands is constant, lower to a pair of BUILD_VECTOR and

7649

/// just apply the bit to the vectors.

7650

/// NOTE: Its not in our interest to start make a general purpose vectorizer

7651

/// from this, but enough scalar bit operations are created from the later

7652

/// legalization + scalarization stages to need basic support.

7653

static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,

7654

SelectionDAG &DAG) {

7655

SDLoc DL(Op);

7656

MVT VT = Op->getSimpleValueType(0);

7657

unsigned NumElems = VT.getVectorNumElements();

7658

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

7659

7660

// Check that all elements have the same opcode.

7661

// TODO: Should we allow UNDEFS and if so how many?

7662

unsigned Opcode = Op->getOperand(0).getOpcode();

7663

for (unsigned i = 1; i < NumElems; ++i)

7664

if (Opcode != Op->getOperand(i).getOpcode())

7665

return SDValue();

7666

7667

// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).

7668

switch (Opcode) {

7669

default:

7670

return SDValue();

7671

case ISD::AND:

7672

case ISD::XOR:

7673

case ISD::OR:

7674

if (!TLI.isOperationLegalOrPromote(Opcode, VT))

7675

return SDValue();

7676

break;

7677

}

7678

7679

SmallVector<SDValue, 4> LHSElts, RHSElts;

7680

for (SDValue Elt : Op->ops()) {

7681

SDValue LHS = Elt.getOperand(0);

7682

SDValue RHS = Elt.getOperand(1);

7683

7684

// We expect the canonicalized RHS operand to be the constant.

7685

if (!isa<ConstantSDNode>(RHS))

7686

return SDValue();

7687

LHSElts.push_back(LHS);

7688

RHSElts.push_back(RHS);

7689

}

7690

7691

SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);

7692

SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);

7693

return DAG.getNode(Opcode, DL, VT, LHS, RHS);

7694

}

7695

7696

/// Create a vector constant without a load. SSE/AVX provide the bare minimum

7697

/// functionality to do this, so it's all zeros, all ones, or some derivation

7698

/// that is cheap to calculate.

7699

static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,

7700

const X86Subtarget &Subtarget) {

7701

SDLoc DL(Op);

7702

MVT VT = Op.getSimpleValueType();

7703

7704

// Vectors containing all zeros can be matched by pxor and xorps.

7705

if (ISD::isBuildVectorAllZeros(Op.getNode())) {

7706

// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd

7707

// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.

7708

if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)

7709

return Op;

7710

7711

return getZeroVector(VT, Subtarget, DAG, DL);

7712

}

7713

7714

// Vectors containing all ones can be matched by pcmpeqd on 128-bit width

7715

// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use

7716

// vpcmpeqd on 256-bit vectors.

7717

if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {

7718

if (VT == MVT::v4i32 || VT == MVT::v16i32 ||

7719

(VT == MVT::v8i32 && Subtarget.hasInt256()))

7720

return Op;

7721

7722

return getOnesVector(VT, DAG, DL);

7723

}

7724

7725

return SDValue();

7726

}

7727

7728

// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be

7729

// reasoned to be a permutation of a vector by indices in a non-constant vector.

7730

// (build_vector (extract_elt V, (extract_elt I, 0)),

7731

// (extract_elt V, (extract_elt I, 1)),

7732

// ...

7733

// ->

7734

// (vpermv I, V)

7735

7736

// TODO: Handle undefs

7737

// TODO: Utilize pshufb and zero mask blending to support more efficient

7738

// construction of vectors with constant-0 elements.

7739

// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,

7740

// when no native operation available.

7741

static SDValue

7742

LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,

7743

const X86Subtarget &Subtarget) {

7744

// Look for VPERMV and PSHUFB opportunities.

7745

MVT VT = V.getSimpleValueType();

7746

switch (VT.SimpleTy) {

7747

default:

7748

return SDValue();

7749

case MVT::v16i8:

7750

if (!Subtarget.hasSSE3())

7751

return SDValue();

7752

break;

7753

case MVT::v8f32:

7754

case MVT::v8i32:

7755

if (!Subtarget.hasAVX2())

7756

return SDValue();

7757

break;

7758

case MVT::v4i64:

7759

case MVT::v4f64:

7760

if (!Subtarget.hasVLX())

7761

return SDValue();

7762

break;

7763

case MVT::v16f32:

7764

case MVT::v8f64:

7765

case MVT::v16i32:

7766

case MVT::v8i64:

7767

if (!Subtarget.hasAVX512())

7768

return SDValue();

7769

break;

7770

case MVT::v32i16:

7771

if (!Subtarget.hasBWI())

7772

return SDValue();

7773

break;

7774

case MVT::v8i16:

7775

case MVT::v16i16:

7776

if (!Subtarget.hasVLX() || !Subtarget.hasBWI())

7777

return SDValue();

7778

break;

7779

case MVT::v64i8:

7780

if (!Subtarget.hasVBMI())

7781

return SDValue();

7782

break;

7783

case MVT::v32i8:

7784

if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())

7785

return SDValue();

7786

break;

7787

}

7788

SDValue SrcVec, IndicesVec;

7789

// Check for a match of the permute source vector and permute index elements.

7790

// This is done by checking that the i-th build_vector operand is of the form:

7791

// (extract_elt SrcVec, (extract_elt IndicesVec, i)).

7792

for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {

7793

SDValue Op = V.getOperand(Idx);

7794

if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

7795

return SDValue();

7796

7797

// If this is the first extract encountered in V, set the source vector,

7798

// otherwise verify the extract is from the previously defined source

7799

// vector.

7800

if (!SrcVec)

7801

SrcVec = Op.getOperand(0);

7802

else if (SrcVec != Op.getOperand(0))

7803

return SDValue();

7804

SDValue ExtractedIndex = Op->getOperand(1);

7805

// Peek through extends.

7806

if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||

7807

ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)

7808

ExtractedIndex = ExtractedIndex.getOperand(0);

7809

if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)

7810

return SDValue();

7811

7812

// If this is the first extract from the index vector candidate, set the

7813

// indices vector, otherwise verify the extract is from the previously

7814

// defined indices vector.

7815

if (!IndicesVec)

7816

IndicesVec = ExtractedIndex.getOperand(0);

7817

else if (IndicesVec != ExtractedIndex.getOperand(0))

7818

return SDValue();

7819

7820

auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));

7821

if (!PermIdx || PermIdx->getZExtValue() != Idx)

7822

return SDValue();

7823

}

7824

MVT IndicesVT = VT;

7825

if (VT.isFloatingPoint())

7826

IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),

7827

VT.getVectorNumElements());

7828

IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

7829

return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,

7830

SDLoc(V), VT, IndicesVec, SrcVec);

7831

}

7832

7833

SDValue

7834

X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {

7835

SDLoc dl(Op);

7836

7837

MVT VT = Op.getSimpleValueType();

7838

MVT ExtVT = VT.getVectorElementType();

7839

unsigned NumElems = Op.getNumOperands();

7840

7841

// Generate vectors for predicate vectors.

7842

if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())

7843

return LowerBUILD_VECTORvXi1(Op, DAG);

7844

7845

if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))

7846

return VectorConstant;

7847

7848

BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());

7849

// TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB

7850

// transform here.

7851

if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))

7852

return AddSub;

7853

if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))

7854

return HorizontalOp;

7855

if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))

7856

return Broadcast;

7857

if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))

7858

return BitOp;

7859

7860

unsigned EVTBits = ExtVT.getSizeInBits();

7861

7862

unsigned NumZero = 0;

7863

unsigned NumNonZero = 0;

7864

uint64_t NonZeros = 0;

7865

bool IsAllConstants = true;

7866

SmallSet<SDValue, 8> Values;

7867

unsigned NumConstants = NumElems;

7868

for (unsigned i = 0; i < NumElems; ++i) {

7869

SDValue Elt = Op.getOperand(i);

7870

if (Elt.isUndef())

7871

continue;

7872

Values.insert(Elt);

7873

if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {

7874

IsAllConstants = false;

7875

NumConstants--;

7876

}

7877

if (X86::isZeroNode(Elt))

7878

NumZero++;

7879

else {

7880

assert(i < sizeof(NonZeros) * 8)(static_cast <bool> (i < sizeof(NonZeros) * 8) ? void
(0) : __assert_fail ("i < sizeof(NonZeros) * 8", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7880, __extension__ __PRETTY_FUNCTION__)); // Make sure the shift is within range.

7881

NonZeros |= ((uint64_t)1 << i);

7882

NumNonZero++;

7883

}

7884

}

7885

7886

// All undef vector. Return an UNDEF. All zero vectors were handled above.

7887

if (NumNonZero == 0)

7888

return DAG.getUNDEF(VT);

7889

7890

// If we are inserting one variable into a vector of non-zero constants, try

7891

// to avoid loading each constant element as a scalar. Load the constants as a

7892

// vector and then insert the variable scalar element. If insertion is not

7893

// supported, we assume that we will fall back to a shuffle to get the scalar

7894

// blended with the constants. Insertion into a zero vector is handled as a

7895

// special-case somewhere below here.

7896

LLVMContext &Context = *DAG.getContext();

7897

if (NumConstants == NumElems - 1 && NumNonZero != 1 &&

7898

(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||

7899

isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {

7900

// Create an all-constant vector. The variable element in the old

7901

// build vector is replaced by undef in the constant vector. Save the

7902

// variable scalar element and its index for use in the insertelement.

7903

Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);

7904

SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));

7905

SDValue VarElt;

7906

SDValue InsIndex;

7907

for (unsigned i = 0; i != NumElems; ++i) {

7908

SDValue Elt = Op.getOperand(i);

7909

if (auto *C = dyn_cast<ConstantSDNode>(Elt))

7910

ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());

7911

else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))

7912

ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());

7913

else if (!Elt.isUndef()) {

7914

assert(!VarElt.getNode() && !InsIndex.getNode() &&(static_cast <bool> (!VarElt.getNode() && !InsIndex
.getNode() && "Expected one variable element in this vector"
) ? void (0) : __assert_fail ("!VarElt.getNode() && !InsIndex.getNode() && \"Expected one variable element in this vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7915, __extension__ __PRETTY_FUNCTION__))

7915

"Expected one variable element in this vector")(static_cast <bool> (!VarElt.getNode() && !InsIndex
.getNode() && "Expected one variable element in this vector"
) ? void (0) : __assert_fail ("!VarElt.getNode() && !InsIndex.getNode() && \"Expected one variable element in this vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7915, __extension__ __PRETTY_FUNCTION__));

7916

VarElt = Elt;

7917

InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));

7918

}

7919

}

7920

Constant *CV = ConstantVector::get(ConstVecOps);

7921

SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

7922

7923

// The constants we just created may not be legal (eg, floating point). We

7924

// must lower the vector right here because we can not guarantee that we'll

7925

// legalize it before loading it. This is also why we could not just create

7926

// a new build vector here. If the build vector contains illegal constants,

7927

// it could get split back up into a series of insert elements.

7928

// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.

7929

SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);

7930

MachineFunction &MF = DAG.getMachineFunction();

7931

MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);

7932

SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);

7933

return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

7934

}

7935

7936

// Special case for single non-zero, non-undef, element.

7937

if (NumNonZero == 1) {

7938

unsigned Idx = countTrailingZeros(NonZeros);

7939

SDValue Item = Op.getOperand(Idx);

7940

7941

// If this is an insertion of an i64 value on x86-32, and if the top bits of

7942

// the value are obviously zero, truncate the value to i32 and do the

7943

// insertion that way. Only do this if the value is non-constant or if the

7944

// value is a constant being inserted into element 0. It is cheaper to do

7945

// a constant pool load than it is to do a movd + shuffle.

7946

if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&

7947

(!IsAllConstants || Idx == 0)) {

7948

if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {

7949

// Handle SSE only.

7950

assert(VT == MVT::v2i64 && "Expected an SSE value type!")(static_cast <bool> (VT == MVT::v2i64 && "Expected an SSE value type!"
) ? void (0) : __assert_fail ("VT == MVT::v2i64 && \"Expected an SSE value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7950, __extension__ __PRETTY_FUNCTION__));

7951

MVT VecVT = MVT::v4i32;

7952

7953

// Truncate the value (which may itself be a constant) to i32, and

7954

// convert it to a vector with movd (S2V+shuffle to zero extend).

7955

Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);

7956

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);

7957

return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(

7958

Item, Idx * 2, true, Subtarget, DAG));

7959

}

7960

}

7961

7962

// If we have a constant or non-constant insertion into the low element of

7963

// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into

7964

// the rest of the elements. This will be matched as movd/movq/movss/movsd

7965

// depending on what the source datatype is.

7966

if (Idx == 0) {

7967

if (NumZero == 0)

7968

return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

7969

7970

if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||

7971

(ExtVT == MVT::i64 && Subtarget.is64Bit())) {

7972

assert((VT.is128BitVector() || VT.is256BitVector() ||(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected an SSE value type!"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected an SSE value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7974, __extension__ __PRETTY_FUNCTION__))

7973

VT.is512BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected an SSE value type!"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected an SSE value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7974, __extension__ __PRETTY_FUNCTION__))

7974

"Expected an SSE value type!")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
() || VT.is512BitVector()) && "Expected an SSE value type!"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && \"Expected an SSE value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7974, __extension__ __PRETTY_FUNCTION__));

7975

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

7976

// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.

7977

return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);

7978

}

7979

7980

// We can't directly insert an i8 or i16 into a vector, so zero extend

7981

// it to i32 first.

7982

if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {

7983

Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);

7984

if (VT.getSizeInBits() >= 256) {

7985

MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);

7986

if (Subtarget.hasAVX()) {

7987

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);

7988

Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);

7989

} else {

7990

// Without AVX, we need to extend to a 128-bit vector and then

7991

// insert into the 256-bit vector.

7992

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);

7993

SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);

7994

Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);

7995

}

7996

} else {

7997

assert(VT.is128BitVector() && "Expected an SSE value type!")(static_cast <bool> (VT.is128BitVector() && "Expected an SSE value type!"
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"Expected an SSE value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 7997, __extension__ __PRETTY_FUNCTION__));

7998

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);

7999

Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);

8000

}

8001

return DAG.getBitcast(VT, Item);

8002

}

8003

}

8004

8005

// Is it a vector logical left shift?

8006

if (NumElems == 2 && Idx == 1 &&

8007

X86::isZeroNode(Op.getOperand(0)) &&

8008

!X86::isZeroNode(Op.getOperand(1))) {

8009

unsigned NumBits = VT.getSizeInBits();

8010

return getVShift(true, VT,

8011

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,

8012

VT, Op.getOperand(1)),

8013

NumBits/2, DAG, *this, dl);

8014

}

8015

8016

if (IsAllConstants) // Otherwise, it's better to do a constpool load.

8017

return SDValue();

8018

8019

// Otherwise, if this is a vector with i32 or f32 elements, and the element

8020

// is a non-constant being inserted into an element other than the low one,

8021

// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka

8022

// movd/movss) to move this into the low element, then shuffle it into

8023

// place.

8024

if (EVTBits == 32) {

8025

Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

8026

return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);

8027

}

8028

}

8029

8030

// Splat is obviously ok. Let legalizer expand it to a shuffle.

8031

if (Values.size() == 1) {

8032

if (EVTBits == 32) {

8033

// Instead of a shuffle like this:

8034

// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>

8035

// Check if it's possible to issue this instead.

8036

// shuffle (vload ptr)), undef, <1, 1, 1, 1>

8037

unsigned Idx = countTrailingZeros(NonZeros);

8038

SDValue Item = Op.getOperand(Idx);

8039

if (Op.getNode()->isOnlyUserOf(Item.getNode()))

8040

return LowerAsSplatVectorLoad(Item, VT, dl, DAG);

8041

}

8042

return SDValue();

8043

}

8044

8045

// A vector full of immediates; various special cases are already

8046

// handled, so this is best done with a single constant-pool load.

8047

if (IsAllConstants)

8048

return SDValue();

8049

8050

if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))

8051

return V;

8052

8053

// See if we can use a vector load to get all of the elements.

8054

if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {

8055

SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

8056

if (SDValue LD =

8057

EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))

8058

return LD;

8059

}

8060

8061

// For AVX-length vectors, build the individual 128-bit pieces and use

8062

// shuffles to put them in place.

8063

if (VT.is256BitVector() || VT.is512BitVector()) {

8064

EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);

8065

8066

// Build both the lower and upper subvector.

8067

SDValue Lower =

8068

DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));

8069

SDValue Upper = DAG.getBuildVector(

8070

HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

8071

8072

// Recreate the wider vector with the lower and upper part.

8073

if (VT.is256BitVector())

8074

return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);

8075

return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);

8076

}

8077

8078

// Let legalizer expand 2-wide build_vectors.

8079

if (EVTBits == 64) {

8080

if (NumNonZero == 1) {

8081

// One half is zero or undef.

8082

unsigned Idx = countTrailingZeros(NonZeros);

8083

SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,

8084

Op.getOperand(Idx));

8085

return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);

8086

}

8087

return SDValue();

8088

}

8089

8090

// If element VT is < 32 bits, convert it to inserts into a zero vector.

8091

if (EVTBits == 8 && NumElems == 16)

8092

if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,

8093

DAG, Subtarget))

8094

return V;

8095

8096

if (EVTBits == 16 && NumElems == 8)

8097

if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,

8098

DAG, Subtarget))

8099

return V;

8100

8101

// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS

8102

if (EVTBits == 32 && NumElems == 4)

8103

if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))

8104

return V;

8105

8106

// If element VT is == 32 bits, turn it into a number of shuffles.

8107

if (NumElems == 4 && NumZero > 0) {

8108

SmallVector<SDValue, 8> Ops(NumElems);

8109

for (unsigned i = 0; i < 4; ++i) {

8110

bool isZero = !(NonZeros & (1ULL << i));

8111

if (isZero)

8112

Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);

8113

else

8114

Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));

8115

}

8116

8117

for (unsigned i = 0; i < 2; ++i) {

8118

switch ((NonZeros >> (i*2)) & 0x3) {

8119

default: llvm_unreachable("Unexpected NonZero count")::llvm::llvm_unreachable_internal("Unexpected NonZero count",
"/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8119);

8120

case 0:

8121

Ops[i] = Ops[i*2]; // Must be a zero vector.

8122

break;

8123

case 1:

8124

Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);

8125

break;

8126

case 2:

8127

Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);

8128

break;

8129

case 3:

8130

Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);

8131

break;

8132

}

8133

}

8134

8135

bool Reverse1 = (NonZeros & 0x3) == 2;

8136

bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;

8137

int MaskVec[] = {

8138

Reverse1 ? 1 : 0,

8139

Reverse1 ? 0 : 1,

8140

static_cast<int>(Reverse2 ? NumElems+1 : NumElems),

8141

static_cast<int>(Reverse2 ? NumElems : NumElems+1)

8142

};

8143

return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);

8144

}

8145

8146

assert(Values.size() > 1 && "Expected non-undef and non-splat vector")(static_cast <bool> (Values.size() > 1 && "Expected non-undef and non-splat vector"
) ? void (0) : __assert_fail ("Values.size() > 1 && \"Expected non-undef and non-splat vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8146, __extension__ __PRETTY_FUNCTION__));

8147

8148

// Check for a build vector from mostly shuffle plus few inserting.

8149

if (SDValue Sh = buildFromShuffleMostly(Op, DAG))

8150

return Sh;

8151

8152

// For SSE 4.1, use insertps to put the high elements into the low element.

8153

if (Subtarget.hasSSE41()) {

8154

SDValue Result;

8155

if (!Op.getOperand(0).isUndef())

8156

Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));

8157

else

8158

Result = DAG.getUNDEF(VT);

8159

8160

for (unsigned i = 1; i < NumElems; ++i) {

8161

if (Op.getOperand(i).isUndef()) continue;

8162

Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,

8163

Op.getOperand(i), DAG.getIntPtrConstant(i, dl));

8164

}

8165

return Result;

8166

}

8167

8168

// Otherwise, expand into a number of unpckl*, start by extending each of

8169

// our (non-undef) elements to the full vector width with the element in the

8170

// bottom slot of the vector (which generates no code for SSE).

8171

SmallVector<SDValue, 8> Ops(NumElems);

8172

for (unsigned i = 0; i < NumElems; ++i) {

8173

if (!Op.getOperand(i).isUndef())

8174

Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));

8175

else

8176

Ops[i] = DAG.getUNDEF(VT);

8177

}

8178

8179

// Next, we iteratively mix elements, e.g. for v4f32:

8180

// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>

8181

// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>

8182

// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>

8183

for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {

8184

// Generate scaled UNPCKL shuffle mask.

8185

SmallVector<int, 16> Mask;

8186

for(unsigned i = 0; i != Scale; ++i)

8187

Mask.push_back(i);

8188

for (unsigned i = 0; i != Scale; ++i)

8189

Mask.push_back(NumElems+i);

8190

Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

8191

8192

for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)

8193

Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);

8194

}

8195

return Ops[0];

8196

}

8197

8198

// 256-bit AVX can use the vinsertf128 instruction

8199

// to create 256-bit vectors from two other 128-bit ones.

8200

static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {

8201

SDLoc dl(Op);

8202

MVT ResVT = Op.getSimpleValueType();

8203

8204

assert((ResVT.is256BitVector() ||(static_cast <bool> ((ResVT.is256BitVector() || ResVT.is512BitVector
()) && "Value type must be 256-/512-bit wide") ? void
(0) : __assert_fail ("(ResVT.is256BitVector() || ResVT.is512BitVector()) && \"Value type must be 256-/512-bit wide\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8205, __extension__ __PRETTY_FUNCTION__))

8205

ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide")(static_cast <bool> ((ResVT.is256BitVector() || ResVT.is512BitVector
()) && "Value type must be 256-/512-bit wide") ? void
(0) : __assert_fail ("(ResVT.is256BitVector() || ResVT.is512BitVector()) && \"Value type must be 256-/512-bit wide\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8205, __extension__ __PRETTY_FUNCTION__));

8206

8207

SDValue V1 = Op.getOperand(0);

8208

SDValue V2 = Op.getOperand(1);

8209

unsigned NumElems = ResVT.getVectorNumElements();

8210

if (ResVT.is256BitVector())

8211

return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

8212

8213

if (Op.getNumOperands() == 4) {

8214

MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),

8215

ResVT.getVectorNumElements()/2);

8216

SDValue V3 = Op.getOperand(2);

8217

SDValue V4 = Op.getOperand(3);

8218

return concat256BitVectors(

8219

concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),

8220

concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,

8221

NumElems, DAG, dl);

8222

}

8223

return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

8224

}

8225

8226

// Return true if all the operands of the given CONCAT_VECTORS node are zeros

8227

// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)

8228

static bool isExpandWithZeros(const SDValue &Op) {

8229

assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&(static_cast <bool> (Op.getOpcode() == ISD::CONCAT_VECTORS
&& "Expand with zeros only possible in CONCAT_VECTORS nodes!"
) ? void (0) : __assert_fail ("Op.getOpcode() == ISD::CONCAT_VECTORS && \"Expand with zeros only possible in CONCAT_VECTORS nodes!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8230, __extension__ __PRETTY_FUNCTION__))

8230

"Expand with zeros only possible in CONCAT_VECTORS nodes!")(static_cast <bool> (Op.getOpcode() == ISD::CONCAT_VECTORS
&& "Expand with zeros only possible in CONCAT_VECTORS nodes!"
) ? void (0) : __assert_fail ("Op.getOpcode() == ISD::CONCAT_VECTORS && \"Expand with zeros only possible in CONCAT_VECTORS nodes!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8230, __extension__ __PRETTY_FUNCTION__));

8231

8232

for (unsigned i = 1; i < Op.getNumOperands(); i++)

8233

if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))

8234

return false;

8235

8236

return true;

8237

}

8238

8239

// Returns true if the given node is a type promotion (by concatenating i1

8240

// zeros) of the result of a node that already zeros all upper bits of

8241

// k-register.

8242

static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {

8243

unsigned Opc = Op.getOpcode();

8244

8245

assert(Opc == ISD::CONCAT_VECTORS &&(static_cast <bool> (Opc == ISD::CONCAT_VECTORS &&
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected node to check for type promotion!") ? void (0) :
__assert_fail ("Opc == ISD::CONCAT_VECTORS && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && \"Unexpected node to check for type promotion!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8247, __extension__ __PRETTY_FUNCTION__))

8246

Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&(static_cast <bool> (Opc == ISD::CONCAT_VECTORS &&
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected node to check for type promotion!") ? void (0) :
__assert_fail ("Opc == ISD::CONCAT_VECTORS && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && \"Unexpected node to check for type promotion!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8247, __extension__ __PRETTY_FUNCTION__))

8247

"Unexpected node to check for type promotion!")(static_cast <bool> (Opc == ISD::CONCAT_VECTORS &&
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected node to check for type promotion!") ? void (0) :
__assert_fail ("Opc == ISD::CONCAT_VECTORS && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && \"Unexpected node to check for type promotion!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8247, __extension__ __PRETTY_FUNCTION__));

8248

8249

// As long as we are concatenating zeros to the upper part of a previous node

8250

// result, climb up the tree until a node with different opcode is

8251

// encountered

8252

while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {

8253

if (Opc == ISD::INSERT_SUBVECTOR) {

8254

if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&

8255

Op.getConstantOperandVal(2) == 0)

8256

Op = Op.getOperand(1);

8257

else

8258

return SDValue();

8259

} else { // Opc == ISD::CONCAT_VECTORS

8260

if (isExpandWithZeros(Op))

8261

Op = Op.getOperand(0);

8262

else

8263

return SDValue();

8264

}

8265

Opc = Op.getOpcode();

8266

}

8267

8268

// Check if the first inserted node zeroes the upper bits, or an 'and' result

8269

// of a node that zeros the upper bits (its masked version).

8270

if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||

8271

(Op.getOpcode() == ISD::AND &&

8272

(isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||

8273

isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {

8274

return Op;

8275

}

8276

8277

return SDValue();

8278

}

8279

8280

// Lower a CONCAT_VECTORS node whose result is a vector of i1 elements.
//
// The cases are tried in this order:
//  1. The node is a zero-padding type promotion recognised by
//     isTypePromotionOfi1ZeroUpBits: emit an INSERT_SUBVECTOR of the promoted
//     value into an all-zeros vector at index 0.
//  2. At most one operand is neither undef nor all-zeros: insert that operand
//     (if any) into a zero or undef vector.
//  3. More than two operands: concatenate each half of the operand list and
//     then concatenate the two halves.
//  4. Two operands: return the node unchanged for 16 or more result elements,
//     otherwise chain two INSERT_SUBVECTOR nodes.
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with instruction that zeroes all upper (irrelevant) bits of the
  // output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
  SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op);
  if (Promoted) {
    SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
    SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
                       ZeroC);
  }

  // Classify the operands: undef ones are ignored, all-zeros ones are
  // counted, and every other operand gets its bit set in NonZeros.
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    const SDValue Operand = Op.getOperand(i);
    if (Operand.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(Operand.getNode())) {
      ++NumZero;
      continue;
    }
    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
    NonZeros |= static_cast<uint64_t>(1) << i;
    ++NumNonZero;
  }

  // With zero or one non-zero operands a single insertion is enough.
  if (NumNonZero <= 1) {
    SDValue Base = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                           : DAG.getUNDEF(ResVT);
    if (NumNonZero == 0)
      return Base;
    unsigned OperandIdx = countTrailingZeros(NonZeros);
    SDValue Piece = Op.getOperand(OperandIdx);
    unsigned PieceNumElts = Piece.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Base, Piece,
                       DAG.getIntPtrConstant(OperandIdx * PieceNumElts, dl));
  }

  unsigned NumElems = ResVT.getVectorNumElements();

  // Split a wide concatenation into two half-width concatenations.
  if (NumOperands > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), NumElems / 2);
    ArrayRef<SDUse> AllOps = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(0, NumOperands / 2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(NumOperands / 2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(NumNonZero == 2 && "Simple cases not handled?");

  if (NumElems >= 16)
    return Op; // The operation is legal with KUNPCK

  // Place operand 0 at element 0 and operand 1 at the midpoint.
  SDValue LowInserted = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                                    DAG.getUNDEF(ResVT), Op.getOperand(0),
                                    DAG.getIntPtrConstant(0, dl));
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, LowInserted,
                     Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems / 2, dl));
}

8354

8355

// Entry point for lowering CONCAT_VECTORS. Vectors of i1 are handed to
// LowerCONCAT_VECTORSvXi1; every other type goes to LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  const bool IsMaskVector = VT.getVectorElementType() == MVT::i1;

  if (!IsMaskVector) {
    // A 256-bit result is made of 2 operands; a 512-bit vector may contain
    // 2 256-bit vectors or 4 128-bit vectors.
    assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
           (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
            Op.getNumOperands() == 4)));

    // AVX can use the vinsertf128 instruction to create 256-bit vectors
    // from two other 128-bit ones.
    return LowerAVXCONCAT_VECTORS(Op, DAG);
  }

  return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
}

8372

8373

//===----------------------------------------------------------------------===//

8374

// Vector shuffle lowering

8375

8376

// This is an experimental code path for lowering vector shuffles on x86. It is

8377

// designed to handle arbitrary vector shuffles and blends, gracefully

8378

// degrading performance as necessary. It works hard to recognize idiomatic

8379

// shuffles and lower them to optimal instruction patterns without leaving

8380

// a framework that allows reasonably efficient handling of all vector shuffle

8381

// patterns.

8382

//===----------------------------------------------------------------------===//

8383

8384

/// \brief Tiny helper function to identify a no-op mask.

8385

///

8386

/// This is a somewhat boring predicate function. It checks whether the mask

8387

/// array input, which is assumed to be a single-input shuffle mask of the kind

8388

/// used by the X86 shuffle instructions (not a fully general

8389

/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an

8390

/// in-place shuffle are 'no-op's.

8391

static bool isNoopShuffleMask(ArrayRef<int> Mask) {

8392

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

8393

assert(Mask[i] >= -1 && "Out of bound mask element!")(static_cast <bool> (Mask[i] >= -1 && "Out of bound mask element!"
) ? void (0) : __assert_fail ("Mask[i] >= -1 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8393, __extension__ __PRETTY_FUNCTION__));

8394

if (Mask[i] >= 0 && Mask[i] != i)

8395

return false;

8396

}

8397

return true;

8398

}

8399

8400

/// \brief Test whether there are elements crossing 128-bit lanes in this

8401

/// shuffle mask.

8402

///

8403

/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations

8404

/// and we routinely test for these.

8405

static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {

8406

int LaneSize = 128 / VT.getScalarSizeInBits();

8407

int Size = Mask.size();

8408

for (int i = 0; i < Size; ++i)

8409

if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)

8410

return true;

8411

return false;

8412

}

8413

8414

/// \brief Test whether a shuffle mask is equivalent within each sub-lane.

8415

///

8416

/// This checks a shuffle mask to see if it is performing the same

8417

/// lane-relative shuffle in each sub-lane. This trivially implies

8418

/// that it is also not lane-crossing. It may however involve a blend from the

8419

/// same lane of a second vector.

8420

///

8421

/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is

8422

/// non-trivial to compute in the face of undef lanes. The representation is

8423

/// suitable for use with existing 128-bit shuffles as entries from the second

8424

/// vector have been remapped to [LaneSize, 2*LaneSize).

8425

static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,

8426

ArrayRef<int> Mask,

8427

SmallVectorImpl<int> &RepeatedMask) {

8428

auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();

8429

RepeatedMask.assign(LaneSize, -1);

8430

int Size = Mask.size();

8431

for (int i = 0; i < Size; ++i) {

8432

assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0)(static_cast <bool> (Mask[i] == SM_SentinelUndef || Mask
[i] >= 0) ? void (0) : __assert_fail ("Mask[i] == SM_SentinelUndef || Mask[i] >= 0"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8432, __extension__ __PRETTY_FUNCTION__));

8433

if (Mask[i] < 0)

8434

continue;

8435

if ((Mask[i] % Size) / LaneSize != i / LaneSize)

8436

// This entry crosses lanes, so there is no way to model this shuffle.

8437

return false;

8438

8439

// Ok, handle the in-lane shuffles by detecting if and when they repeat.

8440

// Adjust second vector indices to start at LaneSize instead of Size.

8441

int LocalM = Mask[i] < Size ? Mask[i] % LaneSize

8442

: Mask[i] % LaneSize + LaneSize;

8443

if (RepeatedMask[i % LaneSize] < 0)

8444

// This is the first non-undef entry in this slot of a 128-bit lane.

8445

RepeatedMask[i % LaneSize] = LocalM;

8446

else if (RepeatedMask[i % LaneSize] != LocalM)

8447

// Found a mismatch with the repeated mask.

8448

return false;

8449

}

8450

return true;

8451

}

8452

8453

/// Test whether a shuffle mask is equivalent within each 128-bit lane.

8454

static bool

8455

is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,

8456

SmallVectorImpl<int> &RepeatedMask) {

8457

return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);

8458

}

8459

8460

/// Test whether a shuffle mask is equivalent within each 256-bit lane.

8461

static bool

8462

is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,

8463

SmallVectorImpl<int> &RepeatedMask) {

8464

return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);

8465

}

8466

8467

/// Test whether a target shuffle mask is equivalent within each sub-lane.

8468

/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.

8469

static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,

8470

ArrayRef<int> Mask,

8471

SmallVectorImpl<int> &RepeatedMask) {

8472

int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();

8473

RepeatedMask.assign(LaneSize, SM_SentinelUndef);

8474

int Size = Mask.size();

8475

for (int i = 0; i < Size; ++i) {

8476

assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0))(static_cast <bool> (isUndefOrZero(Mask[i]) || (Mask[i]
>= 0)) ? void (0) : __assert_fail ("isUndefOrZero(Mask[i]) || (Mask[i] >= 0)"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8476, __extension__ __PRETTY_FUNCTION__));

8477

if (Mask[i] == SM_SentinelUndef)

8478

continue;

8479

if (Mask[i] == SM_SentinelZero) {

8480

if (!isUndefOrZero(RepeatedMask[i % LaneSize]))

8481

return false;

8482

RepeatedMask[i % LaneSize] = SM_SentinelZero;

8483

continue;

8484

}

8485

if ((Mask[i] % Size) / LaneSize != i / LaneSize)

8486

// This entry crosses lanes, so there is no way to model this shuffle.

8487

return false;

8488

8489

// Ok, handle the in-lane shuffles by detecting if and when they repeat.

8490

// Adjust second vector indices to start at LaneSize instead of Size.

8491

int LocalM =

8492

Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;

8493

if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)

8494

// This is the first non-undef entry in this slot of a 128-bit lane.

8495

RepeatedMask[i % LaneSize] = LocalM;

8496

else if (RepeatedMask[i % LaneSize] != LocalM)

8497

// Found a mismatch with the repeated mask.

8498

return false;

8499

}

8500

return true;

8501

}

8502

8503

/// \brief Checks whether a shuffle mask is equivalent to an explicit list of

8504

/// arguments.

8505

///

8506

/// This is a fast way to test a shuffle mask against a fixed pattern:

8507

///

8508

/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }

8509

///

8510

/// It returns true if the mask is exactly as wide as the argument list, and

8511

/// each element of the mask is either -1 (signifying undef) or the value given

8512

/// in the argument.

8513

static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,

8514

ArrayRef<int> ExpectedMask) {

8515

if (Mask.size() != ExpectedMask.size())

8516

return false;

8517

8518

int Size = Mask.size();

8519

8520

// If the values are build vectors, we can look through them to find

8521

// equivalent inputs that make the shuffles equivalent.

8522

auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);

8523

auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

8524

8525

for (int i = 0; i < Size; ++i) {

8526

8527

if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {

8528

auto *MaskBV = Mask[i] < Size ? BV1 : BV2;

8529

auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;

8530

if (!MaskBV || !ExpectedBV ||

8531

MaskBV->getOperand(Mask[i] % Size) !=

8532

ExpectedBV->getOperand(ExpectedMask[i] % Size))

8533

return false;

8534

}

8535

}

8536

8537

return true;

8538

}

8539

8540

/// Checks whether a target shuffle mask is equivalent to an explicit pattern.

8541

///

8542

/// The masks must be exactly the same width.

8543

///

8544

/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding

8545

/// value in ExpectedMask is always accepted. Otherwise the indices must match.

8546

///

8547

/// SM_SentinelZero is accepted as a valid negative index but must match in both.

8548

static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,

8549

ArrayRef<int> ExpectedMask) {

8550

int Size = Mask.size();

8551

if (Size != (int)ExpectedMask.size())

8552

return false;

8553

8554

for (int i = 0; i < Size; ++i)

8555

if (Mask[i] == SM_SentinelUndef)

8556

continue;

8557

else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)

8558

return false;

8559

else if (Mask[i] != ExpectedMask[i])

8560

return false;

8561

8562

return true;

8563

}

8564

8565

// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle

8566

// mask.

8567

static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,

8568

const APInt &Zeroable) {

8569

int NumElts = Mask.size();

8570

assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes")(static_cast <bool> (NumElts == (int)Zeroable.getBitWidth
() && "Mismatch mask sizes") ? void (0) : __assert_fail
("NumElts == (int)Zeroable.getBitWidth() && \"Mismatch mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8570, __extension__ __PRETTY_FUNCTION__));

8571

8572

SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);

8573

for (int i = 0; i != NumElts; ++i) {

8574

int M = Mask[i];

8575

if (M == SM_SentinelUndef)

8576

continue;

8577

assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index")(static_cast <bool> (0 <= M && M < (2 * NumElts
) && "Out of range shuffle index") ? void (0) : __assert_fail
("0 <= M && M < (2 * NumElts) && \"Out of range shuffle index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8577, __extension__ __PRETTY_FUNCTION__));

8578

TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);

8579

}

8580

return TargetMask;

8581

}

8582

8583

// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd

8584

// instructions.

8585

static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {

8586

if (VT != MVT::v8i32 && VT != MVT::v8f32)

8587

return false;

8588

8589

SmallVector<int, 8> Unpcklwd;

8590

createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,

8591

/* Unary = */ false);

8592

SmallVector<int, 8> Unpckhwd;

8593

createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,

8594

/* Unary = */ false);

8595

bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||

8596

isTargetShuffleEquivalent(Mask, Unpckhwd));

8597

return IsUnpackwdMask;

8598

}

8599

8600

/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.

8601

///

8602

/// This helper function produces an 8-bit shuffle immediate corresponding to

8603

/// the ubiquitous shuffle encoding scheme used in x86 instructions for

8604

/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for

8605

/// example.

8606

///

8607

/// NB: We rely heavily on "undef" masks preserving the input lane.

8608

static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {

8609

assert(Mask.size() == 4 && "Only 4-lane shuffle masks")(static_cast <bool> (Mask.size() == 4 && "Only 4-lane shuffle masks"
) ? void (0) : __assert_fail ("Mask.size() == 4 && \"Only 4-lane shuffle masks\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8609, __extension__ __PRETTY_FUNCTION__));

8610

assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!")(static_cast <bool> (Mask[0] >= -1 && Mask[0
] < 4 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[0] >= -1 && Mask[0] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8610, __extension__ __PRETTY_FUNCTION__));

8611

assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!")(static_cast <bool> (Mask[1] >= -1 && Mask[1
] < 4 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[1] >= -1 && Mask[1] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8611, __extension__ __PRETTY_FUNCTION__));

8612

assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!")(static_cast <bool> (Mask[2] >= -1 && Mask[2
] < 4 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[2] >= -1 && Mask[2] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8612, __extension__ __PRETTY_FUNCTION__));

8613

assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!")(static_cast <bool> (Mask[3] >= -1 && Mask[3
] < 4 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[3] >= -1 && Mask[3] < 4 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8613, __extension__ __PRETTY_FUNCTION__));

8614

8615

unsigned Imm = 0;

8616

Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;

8617

Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;

8618

Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;

8619

Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;

8620

return Imm;

8621

}

8622

8623

/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
/// constant node.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}

8627

8628

/// \brief Compute whether each element of a shuffle is zeroable.

8629

///

8630

/// A "zeroable" vector shuffle element is one which can be lowered to zero.

8631

/// Either it is an undef element in the shuffle mask, the element of the input

8632

/// referenced is undef, or the element of the input referenced is known to be

8633

/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle

8634

/// as many lanes with this technique as possible to simplify the remaining

8635

/// shuffle.

8636

static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,

8637

SDValue V1, SDValue V2) {

8638

APInt Zeroable(Mask.size(), 0);

8639

V1 = peekThroughBitcasts(V1);

8640

V2 = peekThroughBitcasts(V2);

8641

8642

bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());

8643

bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

8644

8645

int VectorSizeInBits = V1.getValueSizeInBits();

8646

int ScalarSizeInBits = VectorSizeInBits / Mask.size();

8647

assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size")(static_cast <bool> (!(VectorSizeInBits % ScalarSizeInBits
) && "Illegal shuffle mask size") ? void (0) : __assert_fail
("!(VectorSizeInBits % ScalarSizeInBits) && \"Illegal shuffle mask size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8647, __extension__ __PRETTY_FUNCTION__));

8648

8649

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

8650

int M = Mask[i];

8651

// Handle the easy cases.

8652

if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {

8653

Zeroable.setBit(i);

8654

continue;

8655

}

8656

8657

// Determine shuffle input and normalize the mask.

8658

SDValue V = M < Size ? V1 : V2;

8659

M %= Size;

8660

8661

// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.

8662

if (V.getOpcode() != ISD::BUILD_VECTOR)

8663

continue;

8664

8665

// If the BUILD_VECTOR has fewer elements then the bitcasted portion of

8666

// the (larger) source element must be UNDEF/ZERO.

8667

if ((Size % V.getNumOperands()) == 0) {

8668

int Scale = Size / V->getNumOperands();

8669

SDValue Op = V.getOperand(M / Scale);

8670

if (Op.isUndef() || X86::isZeroNode(Op))

8671

Zeroable.setBit(i);

8672

else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {

8673

APInt Val = Cst->getAPIntValue();

8674

Val.lshrInPlace((M % Scale) * ScalarSizeInBits);

8675

Val = Val.getLoBits(ScalarSizeInBits);

8676

if (Val == 0)

8677

Zeroable.setBit(i);

8678

} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {

8679

APInt Val = Cst->getValueAPF().bitcastToAPInt();

8680

Val.lshrInPlace((M % Scale) * ScalarSizeInBits);

8681

Val = Val.getLoBits(ScalarSizeInBits);

8682

if (Val == 0)

8683

Zeroable.setBit(i);

8684

}

8685

continue;

8686

}

8687

8688

// If the BUILD_VECTOR has more elements then all the (smaller) source

8689

// elements must be UNDEF or ZERO.

8690

if ((V.getNumOperands() % Size) == 0) {

8691

int Scale = V->getNumOperands() / Size;

8692

bool AllZeroable = true;

8693

for (int j = 0; j < Scale; ++j) {

8694

SDValue Op = V.getOperand((M * Scale) + j);

8695

AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));

8696

}

8697

if (AllZeroable)

8698

Zeroable.setBit(i);

8699

continue;

8700

}

8701

}

8702

8703

return Zeroable;

8704

}

8705

8706

// The Shuffle result is as follow:

8707

// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.

8708

// Each Zeroable's element correspond to a particular Mask's element.

8709

// As described in computeZeroableShuffleElements function.

8710

8711

// The function looks for a sub-mask that the nonzero elements are in

8712

// increasing order. If such sub-mask exist. The function returns true.

8713

static bool isNonZeroElementsInOrder(const APInt &Zeroable,

8714

ArrayRef<int> Mask, const EVT &VectorType,

8715

bool &IsZeroSideLeft) {

8716

int NextElement = -1;

8717

// Check if the Mask's nonzero elements are in increasing order.

8718

for (int i = 0, e = Mask.size(); i < e; i++) {

8719

// Checks if the mask's zeros elements are built from only zeros.

8720

8721

if (Mask[i] < 0)

8722

return false;

8723

if (Zeroable[i])

8724

continue;

8725

// Find the lowest non zero element

8726

if (NextElement < 0) {

8727

NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;

8728

IsZeroSideLeft = NextElement != 0;

8729

}

8730

// Exit if the mask's non zero elements are not in increasing order.

8731

if (NextElement != Mask[i])

8732

return false;

8733

NextElement++;

8734

}

8735

return true;

8736

}

8737

8738

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.

8739

static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,

8740

ArrayRef<int> Mask, SDValue V1,

8741

SDValue V2,

8742

const APInt &Zeroable,

8743

const X86Subtarget &Subtarget,

8744

SelectionDAG &DAG) {

8745

int Size = Mask.size();

8746

int LaneSize = 128 / VT.getScalarSizeInBits();

8747

const int NumBytes = VT.getSizeInBits() / 8;

8748

const int NumEltBytes = VT.getScalarSizeInBits() / 8;

8749

8750

assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||(static_cast <bool> ((Subtarget.hasSSSE3() && VT
.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector
()) || (Subtarget.hasBWI() && VT.is512BitVector())) ?
void (0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8752, __extension__ __PRETTY_FUNCTION__))

8751

(Subtarget.hasAVX2() && VT.is256BitVector()) ||(static_cast <bool> ((Subtarget.hasSSSE3() && VT
.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector
()) || (Subtarget.hasBWI() && VT.is512BitVector())) ?
void (0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8752, __extension__ __PRETTY_FUNCTION__))

8752

(Subtarget.hasBWI() && VT.is512BitVector()))(static_cast <bool> ((Subtarget.hasSSSE3() && VT
.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector
()) || (Subtarget.hasBWI() && VT.is512BitVector())) ?
void (0) : __assert_fail ("(Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8752, __extension__ __PRETTY_FUNCTION__));

8753

8754

SmallVector<SDValue, 64> PSHUFBMask(NumBytes);

8755

// Sign bit set in i8 mask means zero element.

8756

SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

8757

8758

SDValue V;

8759

for (int i = 0; i < NumBytes; ++i) {

8760

int M = Mask[i / NumEltBytes];

8761

if (M < 0) {

8762

PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);

8763

continue;

8764

}

8765

if (Zeroable[i / NumEltBytes]) {

8766

PSHUFBMask[i] = ZeroMask;

8767

continue;

8768

}

8769

8770

// We can only use a single input of V1 or V2.

8771

SDValue SrcV = (M >= Size ? V2 : V1);

8772

if (V && V != SrcV)

8773

return SDValue();

8774

V = SrcV;

8775

M %= Size;

8776

8777

// PSHUFB can't cross lanes, ensure this doesn't happen.

8778

if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))

8779

return SDValue();

8780

8781

M = M % LaneSize;

8782

M = M * NumEltBytes + (i % NumEltBytes);

8783

PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);

8784

}

8785

assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8785, __extension__ __PRETTY_FUNCTION__));

8786

8787

MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);

8788

return DAG.getBitcast(

8789

VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),

8790

DAG.getBuildVector(I8VT, DL, PSHUFBMask)));

8791

}

8792

8793

// Forward declaration; used by lowerVectorShuffleToEXPAND below.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

8796

8797

// X86 has dedicated shuffle that can be lowered to VEXPAND

8798

static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,

8799

const APInt &Zeroable,

8800

ArrayRef<int> Mask, SDValue &V1,

8801

SDValue &V2, SelectionDAG &DAG,

8802

const X86Subtarget &Subtarget) {

8803

bool IsLeftZeroSide = true;

8804

if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),

8805

IsLeftZeroSide))

8806

return SDValue();

8807

unsigned VEXPANDMask = (~Zeroable).getZExtValue();

8808

MVT IntegerType =

8809

MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));

8810

SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);

8811

unsigned NumElts = VT.getVectorNumElements();

8812

assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&(static_cast <bool> ((NumElts == 4 || NumElts == 8 || NumElts
== 16) && "Unexpected number of vector elements") ? void
(0) : __assert_fail ("(NumElts == 4 || NumElts == 8 || NumElts == 16) && \"Unexpected number of vector elements\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8813, __extension__ __PRETTY_FUNCTION__))

8813

"Unexpected number of vector elements")(static_cast <bool> ((NumElts == 4 || NumElts == 8 || NumElts
== 16) && "Unexpected number of vector elements") ? void
(0) : __assert_fail ("(NumElts == 4 || NumElts == 8 || NumElts == 16) && \"Unexpected number of vector elements\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8813, __extension__ __PRETTY_FUNCTION__));

8814

SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),

8815

Subtarget, DAG, DL);

8816

SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);

8817

SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;

8818

return DAG.getSelect(DL, VT, VMask,

8819

DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),

8820

ZeroVector);

8821

}

8822

8823

static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,

8824

unsigned &UnpackOpcode, bool IsUnary,

8825

ArrayRef<int> TargetMask, SDLoc &DL,

8826

SelectionDAG &DAG,

8827

const X86Subtarget &Subtarget) {

8828

int NumElts = VT.getVectorNumElements();

8829

8830

bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;

8831

for (int i = 0; i != NumElts; i += 2) {

8832

int M1 = TargetMask[i + 0];

8833

int M2 = TargetMask[i + 1];

8834

Undef1 &= (SM_SentinelUndef == M1);

8835

Undef2 &= (SM_SentinelUndef == M2);

8836

Zero1 &= isUndefOrZero(M1);

8837

Zero2 &= isUndefOrZero(M2);

8838

}

8839

assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&(static_cast <bool> (!((Undef1 || Zero1) && (Undef2
|| Zero2)) && "Zeroable shuffle detected") ? void (0
) : __assert_fail ("!((Undef1 || Zero1) && (Undef2 || Zero2)) && \"Zeroable shuffle detected\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8840, __extension__ __PRETTY_FUNCTION__))

8840

"Zeroable shuffle detected")(static_cast <bool> (!((Undef1 || Zero1) && (Undef2
|| Zero2)) && "Zeroable shuffle detected") ? void (0
) : __assert_fail ("!((Undef1 || Zero1) && (Undef2 || Zero2)) && \"Zeroable shuffle detected\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8840, __extension__ __PRETTY_FUNCTION__));

8841

8842

// Attempt to match the target mask against the unpack lo/hi mask patterns.

8843

SmallVector<int, 64> Unpckl, Unpckh;

8844

createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);

8845

if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {

8846

UnpackOpcode = X86ISD::UNPCKL;

8847

V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));

8848

V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);

8849

return true;

8850

}

8851

8852

createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);

8853

if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {

8854

UnpackOpcode = X86ISD::UNPCKH;

8855

V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));

8856

V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);

8857

return true;

8858

}

8859

8860

// If an unary shuffle, attempt to match as an unpack lo/hi with zero.

8861

if (IsUnary && (Zero1 || Zero2)) {

8862

// Don't bother if we can blend instead.

8863

if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&

8864

isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))

8865

return false;

8866

8867

bool MatchLo = true, MatchHi = true;

8868

for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {

8869

int M = TargetMask[i];

8870

8871

// Ignore if the input is known to be zero or the index is undef.

8872

if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||

8873

(M == SM_SentinelUndef))

8874

continue;

8875

8876

MatchLo &= (M == Unpckl[i]);

8877

MatchHi &= (M == Unpckh[i]);

8878

}

8879

8880

if (MatchLo || MatchHi) {

8881

UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;

8882

V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;

8883

V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;

8884

return true;

8885

}

8886

}

8887

8888

// If a binary shuffle, commute and try again.

8889

if (!IsUnary) {

8890

ShuffleVectorSDNode::commuteMask(Unpckl);

8891

if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {

8892

UnpackOpcode = X86ISD::UNPCKL;

8893

std::swap(V1, V2);

8894

return true;

8895

}

8896

8897

ShuffleVectorSDNode::commuteMask(Unpckh);

8898

if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {

8899

UnpackOpcode = X86ISD::UNPCKH;

8900

std::swap(V1, V2);

8901

return true;

8902

}

8903

}

8904

8905

return false;

8906

}

8907

8908

// X86 has dedicated unpack instructions that can handle specific blend

8909

// operations: UNPCKH and UNPCKL.

8910

static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,

8911

ArrayRef<int> Mask, SDValue V1,

8912

SDValue V2, SelectionDAG &DAG) {

8913

SmallVector<int, 8> Unpckl;

8914

createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);

8915

if (isShuffleEquivalent(V1, V2, Mask, Unpckl))

8916

return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

8917

8918

SmallVector<int, 8> Unpckh;

8919

createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);

8920

if (isShuffleEquivalent(V1, V2, Mask, Unpckh))

8921

return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

8922

8923

// Commute and try again.

8924

ShuffleVectorSDNode::commuteMask(Unpckl);

8925

if (isShuffleEquivalent(V1, V2, Mask, Unpckl))

8926

return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

8927

8928

ShuffleVectorSDNode::commuteMask(Unpckh);

8929

if (isShuffleEquivalent(V1, V2, Mask, Unpckh))

8930

return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

8931

8932

return SDValue();

8933

}

8934

8935

// X86 has dedicated pack instructions that can handle specific truncation

8936

// operations: PACKSS and PACKUS.

8937

static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,

8938

SDValue &V2, unsigned &PackOpcode,

8939

ArrayRef<int> TargetMask,

8940

SelectionDAG &DAG,

8941

const X86Subtarget &Subtarget) {

8942

unsigned NumElts = VT.getVectorNumElements();

8943

unsigned BitSize = VT.getScalarSizeInBits();

8944

MVT PackSVT = MVT::getIntegerVT(BitSize * 2);

8945

MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

8946

8947

auto MatchPACK = [&](SDValue N1, SDValue N2) {

8948

SDValue VV1 = DAG.getBitcast(PackVT, N1);

8949

SDValue VV2 = DAG.getBitcast(PackVT, N2);

8950

if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&

8951

(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {

8952

V1 = VV1;

8953

V2 = VV2;

8954

SrcVT = PackVT;

8955

PackOpcode = X86ISD::PACKSS;

8956

return true;

8957

}

8958

8959

if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {

8960

APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);

8961

if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&

8962

(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {

8963

V1 = VV1;

8964

V2 = VV2;

8965

SrcVT = PackVT;

8966

PackOpcode = X86ISD::PACKUS;

8967

return true;

8968

}

8969

}

8970

8971

return false;

8972

};

8973

8974

// Try binary shuffle.

8975

SmallVector<int, 32> BinaryMask;

8976

createPackShuffleMask(VT, BinaryMask, false);

8977

if (isTargetShuffleEquivalent(TargetMask, BinaryMask))

8978

if (MatchPACK(V1, V2))

8979

return true;

8980

8981

// Try unary shuffle.

8982

SmallVector<int, 32> UnaryMask;

8983

createPackShuffleMask(VT, UnaryMask, true);

8984

if (isTargetShuffleEquivalent(TargetMask, UnaryMask))

8985

if (MatchPACK(V1, V1))

8986

return true;

8987

8988

return false;

8989

}

8990

8991

// Try to lower a shuffle as a single PACKSS/PACKUS node.
//
// Returns the pack node when matchVectorShuffleWithPACK accepts the mask, and
// an empty SDValue otherwise.
static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
                                          ArrayRef<int> Mask, SDValue V1,
                                          SDValue V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  MVT PackVT;
  // Only read after a successful match, which always assigns it.
  unsigned PackOpcode;
  if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                                 Subtarget))
    return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                       DAG.getBitcast(PackVT, V2));

  return SDValue();
}

9004

9005

/// \brief Try to emit a bitmask instruction for a shuffle.

9006

///

9007

/// This handles cases where we can model a blend exactly as a bitmask due to

9008

/// one of the inputs being zeroable.

9009

static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,

9010

SDValue V2, ArrayRef<int> Mask,

9011

const APInt &Zeroable,

9012

SelectionDAG &DAG) {

9013

assert(!VT.isFloatingPoint() && "Floating point types are not supported")(static_cast <bool> (!VT.isFloatingPoint() && "Floating point types are not supported"
) ? void (0) : __assert_fail ("!VT.isFloatingPoint() && \"Floating point types are not supported\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9013, __extension__ __PRETTY_FUNCTION__));

9014

MVT EltVT = VT.getVectorElementType();

9015

SDValue Zero = DAG.getConstant(0, DL, EltVT);

9016

SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

9017

SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);

9018

SDValue V;

9019

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

9020

if (Zeroable[i])

9021

continue;

9022

if (Mask[i] % Size != i)

9023

return SDValue(); // Not a blend.

9024

if (!V)

9025

V = Mask[i] < Size ? V1 : V2;

9026

else if (V != (Mask[i] < Size ? V1 : V2))

9027

return SDValue(); // Can only let one input through the mask.

9028

9029

VMaskOps[i] = AllOnes;

9030

}

9031

if (!V)

9032

return SDValue(); // No non-zeroable elements!

9033

9034

SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);

9035

return DAG.getNode(ISD::AND, DL, VT, V, VMask);

9036

}

9037

9038

/// \brief Try to emit a blend instruction for a shuffle using bit math.

9039

///

9040

/// This is used as a fallback approach when first class blend instructions are

9041

/// unavailable. Currently it is only suitable for integer vectors, but could

9042

/// be generalized for floating point vectors if desirable.

9043

static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,

9044

SDValue V2, ArrayRef<int> Mask,

9045

SelectionDAG &DAG) {

9046

assert(VT.isInteger() && "Only supports integer vector types!")(static_cast <bool> (VT.isInteger() && "Only supports integer vector types!"
) ? void (0) : __assert_fail ("VT.isInteger() && \"Only supports integer vector types!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9046, __extension__ __PRETTY_FUNCTION__));

9047

MVT EltVT = VT.getVectorElementType();

9048

SDValue Zero = DAG.getConstant(0, DL, EltVT);

9049

SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

9050

SmallVector<SDValue, 16> MaskOps;

9051

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

9052

if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)

9053

return SDValue(); // Shuffled input!

9054

MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);

9055

}

9056

9057

SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);

9058

V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);

9059

// We have to cast V2 around.

9060

MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

9061

V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,

9062

DAG.getBitcast(MaskVT, V1Mask),

9063

DAG.getBitcast(MaskVT, V2)));

9064

return DAG.getNode(ISD::OR, DL, VT, V1, V2);

9065

}

9066

9067

// Forward declaration; used by lowerVectorShuffleAsBlend below.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);

9071

9072

static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,

9073

MutableArrayRef<int> TargetMask,

9074

bool &ForceV1Zero, bool &ForceV2Zero,

9075

uint64_t &BlendMask) {

9076

bool V1IsZeroOrUndef =

9077

V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());

9078

bool V2IsZeroOrUndef =

9079

V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

9080

9081

BlendMask = 0;

9082

ForceV1Zero = false, ForceV2Zero = false;

9083

assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask")(static_cast <bool> (TargetMask.size() <= 64 &&
"Shuffle mask too big for blend mask") ? void (0) : __assert_fail
("TargetMask.size() <= 64 && \"Shuffle mask too big for blend mask\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9083, __extension__ __PRETTY_FUNCTION__));

9084

9085

// Attempt to generate the binary blend mask. If an input is zero then

9086

// we can use any lane.

9087

// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.

9088

for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {

9089

int M = TargetMask[i];

9090

if (M == SM_SentinelUndef)

9091

continue;

9092

if (M == i)

9093

continue;

9094

if (M == i + Size) {

9095

BlendMask |= 1ull << i;

9096

continue;

9097

}

9098

if (M == SM_SentinelZero) {

9099

if (V1IsZeroOrUndef) {

9100

ForceV1Zero = true;

9101

TargetMask[i] = i;

9102

continue;

9103

}

9104

if (V2IsZeroOrUndef) {

9105

ForceV2Zero = true;

9106

BlendMask |= 1ull << i;

9107

TargetMask[i] = i + Size;

9108

continue;

9109

}

9110

}

9111

return false;

9112

}

9113

return true;

9114

}

9115

9116

// Widen a blend bit mask so that each original bit covers Scale bits.
//
// For every set bit i among the low Size bits of BlendMask, bits
// [i * Scale, i * Scale + Scale) are set in the result. Bits of BlendMask at
// or above position Size are ignored.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

9124

9125

/// \brief Try to emit a blend instruction for a shuffle.

9126

///

9127

/// This doesn't do any checks for the availability of instructions for blending

9128

/// these values. It relies on the availability of the X86ISD::BLENDI pattern to

9129

/// be matched in the backend with the type given. What it does check for is

9130

/// that the shuffle mask is a blend, or convertible into a blend with zero.

9131

static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,

9132

SDValue V2, ArrayRef<int> Original,

9133

const APInt &Zeroable,

9134

const X86Subtarget &Subtarget,

9135

SelectionDAG &DAG) {

9136

SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

9137

9138

uint64_t BlendMask = 0;

9139

bool ForceV1Zero = false, ForceV2Zero = false;

9140

if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,

9141

BlendMask))

9142

return SDValue();

9143

9144

// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.

9145

if (ForceV1Zero)

9146

V1 = getZeroVector(VT, Subtarget, DAG, DL);

9147

if (ForceV2Zero)

9148

V2 = getZeroVector(VT, Subtarget, DAG, DL);

9149

9150

switch (VT.SimpleTy) {

9151

case MVT::v2f64:

9152

case MVT::v4f32:

9153

case MVT::v4f64:

9154

case MVT::v8f32:

9155

return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,

9156

DAG.getConstant(BlendMask, DL, MVT::i8));

9157

9158

case MVT::v4i64:

9159

case MVT::v8i32:

9160

assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"256-bit integer blends require AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9160, __extension__ __PRETTY_FUNCTION__));

9161

LLVM_FALLTHROUGH[[clang::fallthrough]];

9162

case MVT::v2i64:

9163

case MVT::v4i32:

9164

// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into

9165

// that instruction.

9166

if (Subtarget.hasAVX2()) {

9167

// Scale the blend by the number of 32-bit dwords per element.

9168

int Scale = VT.getScalarSizeInBits() / 32;

9169

BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);

9170

MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;

9171

V1 = DAG.getBitcast(BlendVT, V1);

9172

V2 = DAG.getBitcast(BlendVT, V2);

9173

return DAG.getBitcast(

9174

VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,

9175

DAG.getConstant(BlendMask, DL, MVT::i8)));

9176

}

9177

LLVM_FALLTHROUGH[[clang::fallthrough]];

9178

case MVT::v8i16: {

9179

// For integer shuffles we need to expand the mask and cast the inputs to

9180

// v8i16s prior to blending.

9181

int Scale = 8 / VT.getVectorNumElements();

9182

BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);

9183

V1 = DAG.getBitcast(MVT::v8i16, V1);

9184

V2 = DAG.getBitcast(MVT::v8i16, V2);

9185

return DAG.getBitcast(VT,

9186

DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,

9187

DAG.getConstant(BlendMask, DL, MVT::i8)));

9188

}

9189

9190

case MVT::v16i16: {

9191

9192

SmallVector<int, 8> RepeatedMask;

9193

if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {

9194

// We can lower these with PBLENDW which is mirrored across 128-bit lanes.

9195

assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!")(static_cast <bool> (RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!") ? void (0) : __assert_fail
("RepeatedMask.size() == 8 && \"Repeated mask size doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9195, __extension__ __PRETTY_FUNCTION__));

9196

BlendMask = 0;

9197

for (int i = 0; i < 8; ++i)

9198

if (RepeatedMask[i] >= 8)

9199

BlendMask |= 1ull << i;

9200

return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,

9201

DAG.getConstant(BlendMask, DL, MVT::i8));

9202

}

9203

LLVM_FALLTHROUGH[[clang::fallthrough]];

9204

}

9205

case MVT::v16i8:

9206

case MVT::v32i8: {

9207

assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&(static_cast <bool> ((VT.is128BitVector() || Subtarget.
hasAVX2()) && "256-bit byte-blends require AVX2 support!"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || Subtarget.hasAVX2()) && \"256-bit byte-blends require AVX2 support!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9208, __extension__ __PRETTY_FUNCTION__))

9208

"256-bit byte-blends require AVX2 support!")(static_cast <bool> ((VT.is128BitVector() || Subtarget.
hasAVX2()) && "256-bit byte-blends require AVX2 support!"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || Subtarget.hasAVX2()) && \"256-bit byte-blends require AVX2 support!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9208, __extension__ __PRETTY_FUNCTION__));

9209

9210

if (Subtarget.hasBWI() && Subtarget.hasVLX()) {

9211

MVT IntegerType =

9212

MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));

9213

SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);

9214

return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);

9215

}

9216

9217

// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.

9218

if (SDValue Masked =

9219

lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))

9220

return Masked;

9221

9222

// Scale the blend by the number of bytes per element.

9223

int Scale = VT.getScalarSizeInBits() / 8;

9224

9225

// This form of blend is always done on bytes. Compute the byte vector

9226

// type.

9227

MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

9228

9229

// Compute the VSELECT mask. Note that VSELECT is really confusing in the

9230

// mix of LLVM's code generator and the x86 backend. We tell the code

9231

// generator that boolean values in the elements of an x86 vector register

9232

// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'

9233

// mapping a select to operand #1, and 'false' mapping to operand #2. The

9234

// reality in x86 is that vector masks (pre-AVX-512) use only the high bit

9235

// of the element (the remaining are ignored) and 0 in that high bit would

9236

// mean operand #1 while 1 in the high bit would mean operand #2. So while

9237

// the LLVM model for boolean values in vector elements gets the relevant

9238

// bit set, it is set backwards and over constrained relative to x86's

9239

// actual model.

9240

SmallVector<SDValue, 32> VSELECTMask;

9241

for (int i = 0, Size = Mask.size(); i < Size; ++i)

9242

for (int j = 0; j < Scale; ++j)

9243

VSELECTMask.push_back(

9244

Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)

9245

: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,

9246

MVT::i8));

9247

9248

V1 = DAG.getBitcast(BlendVT, V1);

9249

V2 = DAG.getBitcast(BlendVT, V2);

9250

return DAG.getBitcast(

9251

VT,

9252

DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),

9253

V1, V2));

9254

}

9255

case MVT::v16f32:

9256

case MVT::v8f64:

9257

case MVT::v8i64:

9258

case MVT::v16i32:

9259

case MVT::v32i16:

9260

case MVT::v64i8: {

9261

MVT IntegerType =

9262

MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));

9263

SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);

9264

return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);

9265

}

9266

default:

9267

llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9267);

9268

}

9269

}

9270

9271

/// \brief Try to lower as a blend of elements from two inputs followed by

9272

/// a single-input permutation.

9273

///

9274

/// This matches the pattern where we can blend elements from two inputs and

9275

/// then reduce the shuffle to a single-input permutation.

9276

static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,

9277

SDValue V1, SDValue V2,

9278

ArrayRef<int> Mask,

9279

SelectionDAG &DAG) {

9280

// We build up the blend mask while checking whether a blend is a viable way

9281

// to reduce the shuffle.

9282

SmallVector<int, 32> BlendMask(Mask.size(), -1);

9283

SmallVector<int, 32> PermuteMask(Mask.size(), -1);

9284

9285

for (int i = 0, Size = Mask.size(); i < Size; ++i) {

9286

if (Mask[i] < 0)

9287

continue;

9288

9289

assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.")(static_cast <bool> (Mask[i] < Size * 2 && "Shuffle input is out of bounds."
) ? void (0) : __assert_fail ("Mask[i] < Size * 2 && \"Shuffle input is out of bounds.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9289, __extension__ __PRETTY_FUNCTION__));

9290

9291

if (BlendMask[Mask[i] % Size] < 0)

9292

BlendMask[Mask[i] % Size] = Mask[i];

9293

else if (BlendMask[Mask[i] % Size] != Mask[i])

9294

return SDValue(); // Can't blend in the needed input!

9295

9296

PermuteMask[i] = Mask[i] % Size;

9297

}

9298

9299

SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);

9300

return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);

9301

}

9302

9303

/// \brief Generic routine to decompose a shuffle and blend into independent

9304

/// blends and permutes.

9305

///

9306

/// This matches the extremely common pattern for handling combined

9307

/// shuffle+blend operations on newer X86 ISAs where we have very fast blend

9308

/// operations. It will try to pick the best arrangement of shuffles and

9309

/// blends.

9310

static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,

9311

MVT VT, SDValue V1,

9312

SDValue V2,

9313

ArrayRef<int> Mask,

9314

SelectionDAG &DAG) {

9315

// Shuffle the input elements into the desired positions in V1 and V2 and

9316

// blend them together.

9317

SmallVector<int, 32> V1Mask(Mask.size(), -1);

9318

SmallVector<int, 32> V2Mask(Mask.size(), -1);

9319

SmallVector<int, 32> BlendMask(Mask.size(), -1);

9320

for (int i = 0, Size = Mask.size(); i < Size; ++i)

9321

if (Mask[i] >= 0 && Mask[i] < Size) {

9322

V1Mask[i] = Mask[i];

9323

BlendMask[i] = i;

9324

} else if (Mask[i] >= Size) {

9325

V2Mask[i] = Mask[i] - Size;

9326

BlendMask[i] = i + Size;

9327

}

9328

9329

// Try to lower with the simpler initial blend strategy unless one of the

9330

// input shuffles would be a no-op. We prefer to shuffle inputs as the

9331

// shuffle may be able to fold with a load or other benefit. However, when

9332

// we'll have to do 2x as many shuffles in order to achieve this, blending

9333

// first is a better strategy.

9334

if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))

9335

if (SDValue BlendPerm =

9336

lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))

9337

return BlendPerm;

9338

9339

V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);

9340

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

9341

return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);

9342

}

9343

9344

/// \brief Try to lower a vector shuffle as a rotation.

9345

///

9346

/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.

9347

static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,

9348

ArrayRef<int> Mask) {

9349

int NumElts = Mask.size();

9350

9351

// We need to detect various ways of spelling a rotation:

9352

// [11, 12, 13, 14, 15, 0, 1, 2]

9353

// [-1, 12, 13, 14, -1, -1, 1, -1]

9354

// [-1, -1, -1, -1, -1, -1, 1, 2]

9355

// [ 3, 4, 5, 6, 7, 8, 9, 10]

9356

// [-1, 4, 5, 6, -1, -1, 9, -1]

9357

// [-1, 4, 5, 6, -1, -1, -1, -1]

9358

int Rotation = 0;

9359

SDValue Lo, Hi;

9360

for (int i = 0; i < NumElts; ++i) {

9361

int M = Mask[i];

9362

assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&(static_cast <bool> ((M == SM_SentinelUndef || (0 <=
M && M < (2*NumElts))) && "Unexpected mask index."
) ? void (0) : __assert_fail ("(M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && \"Unexpected mask index.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9363, __extension__ __PRETTY_FUNCTION__))

9363

"Unexpected mask index.")(static_cast <bool> ((M == SM_SentinelUndef || (0 <=
M && M < (2*NumElts))) && "Unexpected mask index."
) ? void (0) : __assert_fail ("(M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && \"Unexpected mask index.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9363, __extension__ __PRETTY_FUNCTION__));

9364

if (M < 0)

9365

continue;

9366

9367

// Determine where a rotated vector would have started.

9368

int StartIdx = i - (M % NumElts);

9369

if (StartIdx == 0)

9370

// The identity rotation isn't interesting, stop.

9371

return -1;

9372

9373

// If we found the tail of a vector the rotation must be the missing

9374

// front. If we found the head of a vector, it must be how much of the

9375

// head.

9376

int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

9377

9378

if (Rotation == 0)

9379

Rotation = CandidateRotation;

9380

else if (Rotation != CandidateRotation)

9381

// The rotations don't match, so we can't match this mask.

9382

return -1;

9383

9384

// Compute which value this mask is pointing at.

9385

SDValue MaskV = M < NumElts ? V1 : V2;

9386

9387

// Compute which of the two target values this index should be assigned

9388

// to. This reflects whether the high elements are remaining or the low

9389

// elements are remaining.

9390

SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

9391

9392

// Either set up this value if we've not encountered it before, or check

9393

// that it remains consistent.

9394

if (!TargetV)

9395

TargetV = MaskV;

9396

else if (TargetV != MaskV)

9397

// This may be a rotation, but it pulls from the inputs in some

9398

// unsupported interleaving.

9399

return -1;

9400

}

9401

9402

// Check that we successfully analyzed the mask, and normalize the results.

9403

assert(Rotation != 0 && "Failed to locate a viable rotation!")(static_cast <bool> (Rotation != 0 && "Failed to locate a viable rotation!"
) ? void (0) : __assert_fail ("Rotation != 0 && \"Failed to locate a viable rotation!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9403, __extension__ __PRETTY_FUNCTION__));

9404

assert((Lo || Hi) && "Failed to find a rotated input vector!")(static_cast <bool> ((Lo || Hi) && "Failed to find a rotated input vector!"
) ? void (0) : __assert_fail ("(Lo || Hi) && \"Failed to find a rotated input vector!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9404, __extension__ __PRETTY_FUNCTION__));

9405

if (!Lo)

9406

Lo = Hi;

9407

else if (!Hi)

9408

Hi = Lo;

9409

9410

V1 = Lo;

9411

V2 = Hi;

9412

9413

return Rotation;

9414

}

9415

9416

/// \brief Try to lower a vector shuffle as a byte rotation.

9417

///

9418

/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary

9419

/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use

9420

/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will

9421

/// try to generically lower a vector shuffle through such an pattern. It

9422

/// does not check for the profitability of lowering either as PALIGNR or

9423

/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.

9424

/// This matches shuffle vectors that look like:

9425

///

9426

/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]

9427

///

9428

/// Essentially it concatenates V1 and V2, shifts right by some number of

9429

/// elements, and takes the low elements as the result. Note that while this is

9430

/// specified as a *right shift* because x86 is little-endian, it is a *left

9431

/// rotate* of the vector lanes.

9432

static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,

9433

ArrayRef<int> Mask) {

9434

// Don't accept any shuffles with zero elements.

9435

if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))

9436

return -1;

9437

9438

// PALIGNR works on 128-bit lanes.

9439

SmallVector<int, 16> RepeatedMask;

9440

if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))

9441

return -1;

9442

9443

int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);

9444

if (Rotation <= 0)

9445

return -1;

9446

9447

// PALIGNR rotates bytes, so we need to scale the

9448

// rotation based on how many bytes are in the vector lane.

9449

int NumElts = RepeatedMask.size();

9450

int Scale = 16 / NumElts;

9451

return Rotation * Scale;

9452

}

9453

9454

/// \brief Lower a shuffle as a byte rotation of two inputs.
///
/// Uses matchVectorShuffleAsByteRotate to recognise the mask and returns an
/// empty SDValue when it does not match. With SSSE3 the rotation is emitted
/// as a single X86ISD::PALIGNR on the byte-vector type; otherwise it is built
/// from X86ISD::VSHLDQ, X86ISD::VSRLDQ and ISD::OR on v16i8. The result is
/// bitcast back to VT.
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  // The matcher rewrites these to the low/high operands of the rotation.
  SDValue LoSrc = V1;
  SDValue HiSrc = V2;
  const int ByteRotation = matchVectorShuffleAsByteRotate(VT, LoSrc, HiSrc, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // PALIGNR and PSLLDQ/PSRLDQ work on byte vectors, so view both inputs as an
  // i8 vector of the same total width.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  SDValue LoBytes = DAG.getBitcast(ByteVT, LoSrc);
  SDValue HiBytes = DAG.getBitcast(ByteVT, HiSrc);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    SDValue Rotated =
        DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, LoBytes, HiBytes,
                    DAG.getConstant(ByteRotation, DL, MVT::i8));
    return DAG.getBitcast(VT, Rotated);
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation: shift the low operand left by the bytes the
  // rotation does not cover, shift the high operand right by the rotation,
  // and OR the two halves together.
  SDValue LoShift =
      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, LoBytes,
                  DAG.getConstant(16 - ByteRotation, DL, MVT::i8));
  SDValue HiShift =
      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, HiBytes,
                  DAG.getConstant(ByteRotation, DL, MVT::i8));
  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
  return DAG.getBitcast(VT, Merged);
}

9499

9500

/// \brief Try to lower a vector shuffle as a dword/qword rotation.

9501

///

9502

/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary

9503

/// rotation of the concatenation of two vectors; This routine will

9504

/// try to generically lower a vector shuffle through such an pattern.

9505

///

9506

/// Essentially it concatenates V1 and V2, shifts right by some number of

9507

/// elements, and takes the low elements as the result. Note that while this is

9508

/// specified as a *right shift* because x86 is little-endian, it is a *left

9509

/// rotate* of the vector lanes.

9510

static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,

9511

SDValue V1, SDValue V2,

9512

ArrayRef<int> Mask,

9513

const X86Subtarget &Subtarget,

9514

SelectionDAG &DAG) {

9515

assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&(static_cast <bool> ((VT.getScalarType() == MVT::i32 ||
VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? void (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9516, __extension__ __PRETTY_FUNCTION__))

9516

"Only 32-bit and 64-bit elements are supported!")(static_cast <bool> ((VT.getScalarType() == MVT::i32 ||
VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"
) ? void (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9516, __extension__ __PRETTY_FUNCTION__));

9517

9518

// 128/256-bit vectors are only supported with VLX.

9519

assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))(static_cast <bool> ((Subtarget.hasVLX() || (!VT.is128BitVector
() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"
) ? void (0) : __assert_fail ("(Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && \"VLX required for 128/256-bit vectors\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9520, __extension__ __PRETTY_FUNCTION__))

9520

&& "VLX required for 128/256-bit vectors")(static_cast <bool> ((Subtarget.hasVLX() || (!VT.is128BitVector
() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"
) ? void (0) : __assert_fail ("(Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && \"VLX required for 128/256-bit vectors\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9520, __extension__ __PRETTY_FUNCTION__));

9521

9522

SDValue Lo = V1, Hi = V2;

9523

int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);

9524

if (Rotation <= 0)

9525

return SDValue();

9526

9527

return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,

9528

DAG.getConstant(Rotation, DL, MVT::i8));

9529

}

9530

9531

/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).

9532

///

9533

/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and

9534

/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function

9535

/// matches elements from one of the input vectors shuffled to the left or

9536

/// right with zeroable elements 'shifted in'. It handles both the strictly

9537

/// bit-wise element shifts and the byte shift across an entire 128-bit double

9538

/// quad word lane.

9539

///

9540

/// PSHL : (little-endian) left bit shift.

9541

/// [ zz, 0, zz, 2 ]

9542

/// [ -1, 4, zz, -1 ]

9543

/// PSRL : (little-endian) right bit shift.

9544

/// [ 1, zz, 3, zz]

9545

/// [ -1, -1, 7, zz]

9546

/// PSLLDQ : (little-endian) left byte shift

9547

/// [ zz, 0, 1, 2, 3, 4, 5, 6]

9548

/// [ zz, zz, -1, -1, 2, 3, 4, -1]

9549

/// [ zz, zz, zz, zz, zz, zz, -1, 1]

9550

/// PSRLDQ : (little-endian) right byte shift

9551

/// [ 5, 6, 7, zz, zz, zz, zz, zz]

9552

/// [ -1, 5, 6, 7, zz, zz, zz, zz]

9553

/// [ 1, 2, -1, -1, -1, -1, zz, zz]

9554

static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,

9555

unsigned ScalarSizeInBits,

9556

ArrayRef<int> Mask, int MaskOffset,

9557

const APInt &Zeroable,

9558

const X86Subtarget &Subtarget) {

9559

int Size = Mask.size();

9560

unsigned SizeInBits = Size * ScalarSizeInBits;

9561

9562

auto CheckZeros = [&](int Shift, int Scale, bool Left) {

9563

for (int i = 0; i < Size; i += Scale)

9564

for (int j = 0; j < Shift; ++j)

9565

if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])

9566

return false;

9567

9568

return true;

9569

};

9570

9571

auto MatchShift = [&](int Shift, int Scale, bool Left) {

9572

for (int i = 0; i != Size; i += Scale) {

9573

unsigned Pos = Left ? i + Shift : i;

9574

unsigned Low = Left ? i : i + Shift;

9575

unsigned Len = Scale - Shift;

9576

if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))

9577

return -1;

9578

}

9579

9580

int ShiftEltBits = ScalarSizeInBits * Scale;

9581

bool ByteShift = ShiftEltBits > 64;

9582

Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)

9583

: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);

9584

int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

9585

9586

// Normalize the scale for byte shifts to still produce an i64 element

9587

// type.

9588

Scale = ByteShift ? Scale / 2 : Scale;

9589

9590

// We need to round trip through the appropriate type for the shift.

9591

MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);

9592

ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)

9593

: MVT::getVectorVT(ShiftSVT, Size / Scale);

9594

return (int)ShiftAmt;

9595

};

9596

9597

// SSE/AVX supports logical shifts up to 64-bit integers - so we can just

9598

// keep doubling the size of the integer elements up to that. We can

9599

// then shift the elements of the integer vector by whole multiples of

9600

// their width within the elements of the larger integer vector. Test each

9601

// multiple to see if we can find a match with the moved element indices

9602

// and that the shifted in elements are all zeroable.

9603

unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);

9604

for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)

9605

for (int Shift = 1; Shift != Scale; ++Shift)

9606

for (bool Left : {true, false})

9607

if (CheckZeros(Shift, Scale, Left)) {

9608

int ShiftAmt = MatchShift(Shift, Scale, Left);

9609

if (0 < ShiftAmt)

9610

return ShiftAmt;

9611

}

9612

9613

// no match

9614

return -1;

9615

}

9616

9617

/// \brief Lower a shuffle as a logical shift of one of its inputs.
///
/// The mask is matched with matchVectorShuffleAsShift, first against V1 and,
/// if that fails, against V2. On success the chosen input is bitcast to the
/// shift type, shifted by the matched amount and bitcast back to VT. Returns
/// an empty SDValue if neither input matches.
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Both are filled in by the matcher on success.
  MVT ShiftVT;
  unsigned Opcode;

  const unsigned EltBits = VT.getScalarSizeInBits();

  // Try to match shuffle against V1 shift.
  SDValue Src = V1;
  int ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
                                           /*MaskOffset=*/0, Zeroable,
                                           Subtarget);

  if (ShiftAmt < 0) {
    // If V1 failed, try to match shuffle against V2 shift: V2's elements are
    // numbered starting at Size in the mask.
    Src = V2;
    ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
                                         /*MaskOffset=*/Size, Zeroable,
                                         Subtarget);
    if (ShiftAmt < 0)
      return SDValue();
  }

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");

  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted,
                                DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, Shifted);
}

9651

9652

// EXTRQ: Extract Len elements from lower half of source, starting at Idx.

9653

// Remainder of lower half result is zero and upper half is all undef.

9654

static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,

9655

ArrayRef<int> Mask, uint64_t &BitLen,

9656

uint64_t &BitIdx, const APInt &Zeroable) {

9657

int Size = Mask.size();

9658

int HalfSize = Size / 2;

9659

9660

assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask")(static_cast <bool> (!Zeroable.isAllOnesValue() &&
"Fully zeroable shuffle mask") ? void (0) : __assert_fail ("!Zeroable.isAllOnesValue() && \"Fully zeroable shuffle mask\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9660, __extension__ __PRETTY_FUNCTION__));

9661

9662

// Upper half must be undefined.

9663

if (!isUndefInRange(Mask, HalfSize, HalfSize))

9664

return false;

9665

9666

// Determine the extraction length from the part of the

9667

// lower half that isn't zeroable.

9668

int Len = HalfSize;

9669

for (; Len > 0; --Len)

9670

if (!Zeroable[Len - 1])

9671

break;

9672

assert(Len > 0 && "Zeroable shuffle mask")(static_cast <bool> (Len > 0 && "Zeroable shuffle mask"
) ? void (0) : __assert_fail ("Len > 0 && \"Zeroable shuffle mask\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9672, __extension__ __PRETTY_FUNCTION__));

9673

9674

// Attempt to match first Len sequential elements from the lower half.

9675

SDValue Src;

9676

int Idx = -1;

9677

for (int i = 0; i != Len; ++i) {

9678

int M = Mask[i];

9679

if (M == SM_SentinelUndef)

9680

continue;

9681

SDValue &V = (M < Size ? V1 : V2);

9682

M = M % Size;

9683

9684

// The extracted elements must start at a valid index and all mask

9685

// elements must be in the lower half.

9686

if (i > M || M >= HalfSize)

9687

return false;

9688

9689

if (Idx < 0 || (Src == V && Idx == (M - i))) {

9690

Src = V;

9691

Idx = M - i;

9692

continue;

9693

}

9694

return false;

9695

}

9696

9697

if (!Src || Idx < 0)

9698

return false;

9699

9700

assert((Idx + Len) <= HalfSize && "Illegal extraction mask")(static_cast <bool> ((Idx + Len) <= HalfSize &&
"Illegal extraction mask") ? void (0) : __assert_fail ("(Idx + Len) <= HalfSize && \"Illegal extraction mask\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9700, __extension__ __PRETTY_FUNCTION__));

9701

BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;

9702

BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;

9703

V1 = Src;

9704

return true;

9705

}

9706

9707

// INSERTQ: Extract lowest Len elements from lower half of second source and

9708

// insert over first source, starting at Idx.

9709

// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }

9710

static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,

9711

ArrayRef<int> Mask, uint64_t &BitLen,

9712

uint64_t &BitIdx) {

9713

int Size = Mask.size();

9714

int HalfSize = Size / 2;

9715

9716

9717

// Upper half must be undefined.

9718

if (!isUndefInRange(Mask, HalfSize, HalfSize))

9719

return false;

9720

9721

for (int Idx = 0; Idx != HalfSize; ++Idx) {

9722

SDValue Base;

9723

9724

// Attempt to match first source from mask before insertion point.

9725

if (isUndefInRange(Mask, 0, Idx)) {

9726

/* EMPTY */

9727

} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {

9728

Base = V1;

9729

} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {

9730

Base = V2;

9731

} else {

9732

continue;

9733

}

9734

9735

// Extend the extraction length looking to match both the insertion of

9736

// the second source and the remaining elements of the first.

9737

for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {

9738

SDValue Insert;

9739

int Len = Hi - Idx;

9740

9741

// Match insertion.

9742

if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {

9743

Insert = V1;

9744

} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {

9745

Insert = V2;

9746

} else {

9747

continue;

9748

}

9749

9750

// Match the remaining elements of the lower half.

9751

if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {

9752

/* EMPTY */

9753

} else if ((!Base || (Base == V1)) &&

9754

isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {

9755

Base = V1;

9756

} else if ((!Base || (Base == V2)) &&

9757

isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,

9758

Size + Hi)) {

9759

Base = V2;

9760

} else {

9761

continue;

9762

}

9763

9764

BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;

9765

BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;

9766

V1 = Base;

9767

V2 = Insert;

9768

return true;

9769

}

9770

}

9771

9772

return false;

9773

}

9774

9775

/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.

9776

static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,

9777

SDValue V2, ArrayRef<int> Mask,

9778

const APInt &Zeroable,

9779

SelectionDAG &DAG) {

9780

uint64_t BitLen, BitIdx;

9781

if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))

9782

return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,

9783

DAG.getConstant(BitLen, DL, MVT::i8),

9784

DAG.getConstant(BitIdx, DL, MVT::i8));

9785

9786

if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))

9787

return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),

9788

V2 ? V2 : DAG.getUNDEF(VT),

9789

DAG.getConstant(BitLen, DL, MVT::i8),

9790

DAG.getConstant(BitIdx, DL, MVT::i8));

9791

9792

return SDValue();

9793

}

9794

9795

/// \brief Lower a vector shuffle as a zero or any extension.

9796

///

9797

/// Given a specific number of elements, element bit width, and extension

9798

/// stride, produce either a zero or any extension based on the available

9799

/// features of the subtarget. The extended elements are consecutive and

9800

/// begin and can start from an offsetted element index in the input; to

9801

/// avoid excess shuffling the offset must either being in the bottom lane

9802

/// or at the start of a higher lane. All extended elements must be from

9803

/// the same lane.

9804

static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(

9805

const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,

9806

ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {

9807

assert(Scale > 1 && "Need a scale to extend.")(static_cast <bool> (Scale > 1 && "Need a scale to extend."
) ? void (0) : __assert_fail ("Scale > 1 && \"Need a scale to extend.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9807, __extension__ __PRETTY_FUNCTION__));

9808

int EltBits = VT.getScalarSizeInBits();

9809

int NumElements = VT.getVectorNumElements();

9810

int NumEltsPerLane = 128 / EltBits;

9811

int OffsetLane = Offset / NumEltsPerLane;

9812

assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&(static_cast <bool> ((EltBits == 8 || EltBits == 16 || EltBits
== 32) && "Only 8, 16, and 32 bit elements can be extended."
) ? void (0) : __assert_fail ("(EltBits == 8 || EltBits == 16 || EltBits == 32) && \"Only 8, 16, and 32 bit elements can be extended.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9813, __extension__ __PRETTY_FUNCTION__))

9813

"Only 8, 16, and 32 bit elements can be extended.")(static_cast <bool> ((EltBits == 8 || EltBits == 16 || EltBits
== 32) && "Only 8, 16, and 32 bit elements can be extended."
) ? void (0) : __assert_fail ("(EltBits == 8 || EltBits == 16 || EltBits == 32) && \"Only 8, 16, and 32 bit elements can be extended.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9813, __extension__ __PRETTY_FUNCTION__));

9814

assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.")(static_cast <bool> (Scale * EltBits <= 64 &&
"Cannot zero extend past 64 bits.") ? void (0) : __assert_fail
("Scale * EltBits <= 64 && \"Cannot zero extend past 64 bits.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9814, __extension__ __PRETTY_FUNCTION__));

9815

assert(0 <= Offset && "Extension offset must be positive.")(static_cast <bool> (0 <= Offset && "Extension offset must be positive."
) ? void (0) : __assert_fail ("0 <= Offset && \"Extension offset must be positive.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9815, __extension__ __PRETTY_FUNCTION__));

9816

assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&(static_cast <bool> ((Offset < NumEltsPerLane || Offset
% NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."
) ? void (0) : __assert_fail ("(Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && \"Extension offset must be in the first lane or start an upper lane.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9817, __extension__ __PRETTY_FUNCTION__))

9817

"Extension offset must be in the first lane or start an upper lane.")(static_cast <bool> ((Offset < NumEltsPerLane || Offset
% NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."
) ? void (0) : __assert_fail ("(Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && \"Extension offset must be in the first lane or start an upper lane.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9817, __extension__ __PRETTY_FUNCTION__));

9818

9819

// Check that an index is in same lane as the base offset.

9820

auto SafeOffset = [&](int Idx) {

9821

return OffsetLane == (Idx / NumEltsPerLane);

9822

};

9823

9824

// Shift along an input so that the offset base moves to the first element.

9825

auto ShuffleOffset = [&](SDValue V) {

9826

if (!Offset)

9827

return V;

9828

9829

SmallVector<int, 8> ShMask((unsigned)NumElements, -1);

9830

for (int i = 0; i * Scale < NumElements; ++i) {

9831

int SrcIdx = i + Offset;

9832

ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;

9833

}

9834

return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);

9835

};

9836

9837

// Found a valid zext mask! Try various lowering strategies based on the

9838

// input type and available ISA extensions.

9839

if (Subtarget.hasSSE41()) {

9840

// Not worth offsetting 128-bit vectors if scale == 2, a pattern using

9841

// PUNPCK will catch this in a later shuffle match.

9842

if (Offset && Scale == 2 && VT.is128BitVector())

9843

return SDValue();

9844

MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),

9845

NumElements / Scale);

9846

InputV = ShuffleOffset(InputV);

9847

InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);

9848

return DAG.getBitcast(VT, InputV);

9849

}

9850

9851

assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.")(static_cast <bool> (VT.is128BitVector() && "Only 128-bit vectors can be extended."
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"Only 128-bit vectors can be extended.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9851, __extension__ __PRETTY_FUNCTION__));

9852

9853

// For any extends we can cheat for larger element sizes and use shuffle

9854

// instructions that can fold with a load and/or copy.

9855

if (AnyExt && EltBits == 32) {

9856

int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,

9857

-1};

9858

return DAG.getBitcast(

9859

VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,

9860

DAG.getBitcast(MVT::v4i32, InputV),

9861

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

9862

}

9863

if (AnyExt && EltBits == 16 && Scale > 2) {

9864

int PSHUFDMask[4] = {Offset / 2, -1,

9865

SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};

9866

InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,

9867

DAG.getBitcast(MVT::v4i32, InputV),

9868

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));

9869

int PSHUFWMask[4] = {1, -1, -1, -1};

9870

unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);

9871

return DAG.getBitcast(

9872

VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,

9873

DAG.getBitcast(MVT::v8i16, InputV),

9874

getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));

9875

}

9876

9877

// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes

9878

// to 64-bits.

9879

if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {

9880

assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!")(static_cast <bool> (NumElements == (int)Mask.size() &&
"Unexpected shuffle mask size!") ? void (0) : __assert_fail (
"NumElements == (int)Mask.size() && \"Unexpected shuffle mask size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9880, __extension__ __PRETTY_FUNCTION__));

9881

assert(VT.is128BitVector() && "Unexpected vector width!")(static_cast <bool> (VT.is128BitVector() && "Unexpected vector width!"
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"Unexpected vector width!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9881, __extension__ __PRETTY_FUNCTION__));

9882

9883

int LoIdx = Offset * EltBits;

9884

SDValue Lo = DAG.getBitcast(

9885

MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,

9886

DAG.getConstant(EltBits, DL, MVT::i8),

9887

DAG.getConstant(LoIdx, DL, MVT::i8)));

9888

9889

if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||

9890

!SafeOffset(Offset + 1))

9891

return DAG.getBitcast(VT, Lo);

9892

9893

int HiIdx = (Offset + 1) * EltBits;

9894

SDValue Hi = DAG.getBitcast(

9895

MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,

9896

DAG.getConstant(EltBits, DL, MVT::i8),

9897

DAG.getConstant(HiIdx, DL, MVT::i8)));

9898

return DAG.getBitcast(VT,

9899

DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));

9900

}

9901

9902

// If this would require more than 2 unpack instructions to expand, use

9903

// pshufb when available. We can only use more than 2 unpack instructions

9904

// when zero extending i8 elements which also makes it easier to use pshufb.

9905

if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {

9906

assert(NumElements == 16 && "Unexpected byte vector width!")(static_cast <bool> (NumElements == 16 && "Unexpected byte vector width!"
) ? void (0) : __assert_fail ("NumElements == 16 && \"Unexpected byte vector width!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9906, __extension__ __PRETTY_FUNCTION__));

9907

SDValue PSHUFBMask[16];

9908

for (int i = 0; i < 16; ++i) {

9909

int Idx = Offset + (i / Scale);

9910

PSHUFBMask[i] = DAG.getConstant(

9911

(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);

9912

}

9913

InputV = DAG.getBitcast(MVT::v16i8, InputV);

9914

return DAG.getBitcast(

9915

VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,

9916

DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));

9917

}

9918

9919

// If we are extending from an offset, ensure we start on a boundary that

9920

// we can unpack from.

9921

int AlignToUnpack = Offset % (NumElements / Scale);

9922

if (AlignToUnpack) {

9923

SmallVector<int, 8> ShMask((unsigned)NumElements, -1);

9924

for (int i = AlignToUnpack; i < NumElements; ++i)

9925

ShMask[i - AlignToUnpack] = i;

9926

InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);

9927

Offset -= AlignToUnpack;

9928

}

9929

9930

// Otherwise emit a sequence of unpacks.

9931

do {

9932

unsigned UnpackLoHi = X86ISD::UNPCKL;

9933

if (Offset >= (NumElements / 2)) {

9934

UnpackLoHi = X86ISD::UNPCKH;

9935

Offset -= (NumElements / 2);

9936

}

9937

9938

MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);

9939

SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)

9940

: getZeroVector(InputVT, Subtarget, DAG, DL);

9941

InputV = DAG.getBitcast(InputVT, InputV);

9942

InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);

9943

Scale /= 2;

9944

EltBits *= 2;

9945

NumElements /= 2;

9946

} while (Scale > 1);

9947

return DAG.getBitcast(VT, InputV);

9948

}

9949

9950

/// \brief Try to lower a vector shuffle as a zero extension on any microarch.

9951

///

9952

/// This routine will try to do everything in its power to cleverly lower

9953

/// a shuffle which happens to match the pattern of a zero extend. It doesn't

9954

/// check for the profitability of this lowering, it tries to aggressively

9955

/// match this pattern. It will use all of the micro-architectural details it

9956

/// can to emit an efficient lowering. It handles both blends with all-zero

9957

/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to

9958

/// masking out later).

9959

///

9960

/// The reason we have dedicated lowering for zext-style shuffles is that they

9961

/// are both incredibly common and often quite performance sensitive.

9962

static SDValue lowerVectorShuffleAsZeroOrAnyExtend(

9963

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

9964

const APInt &Zeroable, const X86Subtarget &Subtarget,

9965

SelectionDAG &DAG) {

9966

int Bits = VT.getSizeInBits();

9967

int NumLanes = Bits / 128;

9968

int NumElements = VT.getVectorNumElements();

9969

int NumEltsPerLane = NumElements / NumLanes;

9970

assert(VT.getScalarSizeInBits() <= 32 &&(static_cast <bool> (VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit") ? void (0) : __assert_fail
("VT.getScalarSizeInBits() <= 32 && \"Exceeds 32-bit integer zero extension limit\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9971, __extension__ __PRETTY_FUNCTION__))

9971

"Exceeds 32-bit integer zero extension limit")(static_cast <bool> (VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit") ? void (0) : __assert_fail
("VT.getScalarSizeInBits() <= 32 && \"Exceeds 32-bit integer zero extension limit\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9971, __extension__ __PRETTY_FUNCTION__));

9972

assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size")(static_cast <bool> ((int)Mask.size() == NumElements &&
"Unexpected shuffle mask size") ? void (0) : __assert_fail (
"(int)Mask.size() == NumElements && \"Unexpected shuffle mask size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9972, __extension__ __PRETTY_FUNCTION__));

9973

9974

// Define a helper function to check a particular ext-scale and lower to it if

9975

// valid.

9976

auto Lower = [&](int Scale) -> SDValue {

9977

SDValue InputV;

9978

bool AnyExt = true;

9979

int Offset = 0;

9980

int Matches = 0;

9981

for (int i = 0; i < NumElements; ++i) {

9982

int M = Mask[i];

9983

if (M < 0)

9984

continue; // Valid anywhere but doesn't tell us anything.

9985

if (i % Scale != 0) {

9986

// Each of the extended elements need to be zeroable.

9987

if (!Zeroable[i])

9988

return SDValue();

9989

9990

// We no longer are in the anyext case.

9991

AnyExt = false;

9992

continue;

9993

}

9994

9995

// Each of the base elements needs to be consecutive indices into the

9996

// same input vector.

9997

SDValue V = M < NumElements ? V1 : V2;

9998

M = M % NumElements;

9999

if (!InputV) {

10000

InputV = V;

10001

Offset = M - (i / Scale);

10002

} else if (InputV != V)

10003

return SDValue(); // Flip-flopping inputs.

10004

10005

// Offset must start in the lowest 128-bit lane or at the start of an

10006

// upper lane.

10007

// FIXME: Is it ever worth allowing a negative base offset?

10008

if (!((0 <= Offset && Offset < NumEltsPerLane) ||

10009

(Offset % NumEltsPerLane) == 0))

10010

return SDValue();

10011

10012

// If we are offsetting, all referenced entries must come from the same

10013

// lane.

10014

if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))

10015

return SDValue();

10016

10017

if ((M % NumElements) != (Offset + (i / Scale)))

10018

return SDValue(); // Non-consecutive strided elements.

10019

Matches++;

10020

}

10021

10022

// If we fail to find an input, we have a zero-shuffle which should always

10023

// have already been handled.

10024

// FIXME: Maybe handle this here in case during blending we end up with one?

10025

if (!InputV)

10026

return SDValue();

10027

10028

// If we are offsetting, don't extend if we only match a single input, we

10029

// can always do better by using a basic PSHUF or PUNPCK.

10030

if (Offset != 0 && Matches < 2)

10031

return SDValue();

10032

10033

return lowerVectorShuffleAsSpecificZeroOrAnyExtend(

10034

DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);

10035

};

10036

10037

// The widest scale possible for extending is to a 64-bit integer.

10038

assert(Bits % 64 == 0 &&(static_cast <bool> (Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"
) ? void (0) : __assert_fail ("Bits % 64 == 0 && \"The number of bits in a vector must be divisible by 64 on x86!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10039, __extension__ __PRETTY_FUNCTION__))

10039

"The number of bits in a vector must be divisible by 64 on x86!")(static_cast <bool> (Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"
) ? void (0) : __assert_fail ("Bits % 64 == 0 && \"The number of bits in a vector must be divisible by 64 on x86!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10039, __extension__ __PRETTY_FUNCTION__));

10040

int NumExtElements = Bits / 64;

10041

10042

// Each iteration, try extending the elements half as much, but into twice as

10043

// many elements.

10044

for (; NumExtElements < NumElements; NumExtElements *= 2) {

10045

assert(NumElements % NumExtElements == 0 &&(static_cast <bool> (NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size."
) ? void (0) : __assert_fail ("NumElements % NumExtElements == 0 && \"The input vector size must be divisible by the extended size.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10046, __extension__ __PRETTY_FUNCTION__))

10046

"The input vector size must be divisible by the extended size.")(static_cast <bool> (NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size."
) ? void (0) : __assert_fail ("NumElements % NumExtElements == 0 && \"The input vector size must be divisible by the extended size.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10046, __extension__ __PRETTY_FUNCTION__));

10047

if (SDValue V = Lower(NumElements / NumExtElements))

10048

return V;

10049

}

10050

10051

// General extends failed, but 128-bit vectors may be able to use MOVQ.

10052

if (Bits != 128)

10053

return SDValue();

10054

10055

// Returns one of the source operands if the shuffle can be reduced to a

10056

// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.

10057

auto CanZExtLowHalf = [&]() {

10058

for (int i = NumElements / 2; i != NumElements; ++i)

10059

if (!Zeroable[i])

10060

return SDValue();

10061

if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))

10062

return V1;

10063

if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))

10064

return V2;

10065

return SDValue();

10066

};

10067

10068

if (SDValue V = CanZExtLowHalf()) {

10069

V = DAG.getBitcast(MVT::v2i64, V);

10070

V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);

10071

return DAG.getBitcast(VT, V);

10072

}

10073

10074

// No viable ext lowering found.

10075

return SDValue();

10076

}

10077

10078

/// \brief Try to get a scalar value for a specific element of a vector.

10079

///

10080

/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.

10081

static SDValue getScalarValueForVectorElement(SDValue V, int Idx,

10082

SelectionDAG &DAG) {

10083

MVT VT = V.getSimpleValueType();

10084

MVT EltVT = VT.getVectorElementType();

10085

V = peekThroughBitcasts(V);

10086

10087

// If the bitcasts shift the element size, we can't extract an equivalent

10088

// element from it.

10089

MVT NewVT = V.getSimpleValueType();

10090

if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())

10091

return SDValue();

10092

10093

if (V.getOpcode() == ISD::BUILD_VECTOR ||

10094

(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {

10095

// Ensure the scalar operand is the same size as the destination.

10096

// FIXME: Add support for scalar truncation where possible.

10097

SDValue S = V.getOperand(Idx);

10098

if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())

10099

return DAG.getBitcast(EltVT, S);

10100

}

10101

10102

return SDValue();

10103

}

10104

10105

/// \brief Helper to test for a load that can be folded with x86 shuffles.

10106

///

10107

/// This is particularly important because the set of instructions varies

10108

/// significantly based on whether the operand is a load or not.

10109

static bool isShuffleFoldableLoad(SDValue V) {

10110

V = peekThroughBitcasts(V);

10111

return ISD::isNON_EXTLoad(V.getNode());

10112

}

10113

10114

/// \brief Try to lower insertion of a single element into a zero vector.

10115

///

10116

/// This is a common pattern that we have especially efficient patterns to lower

10117

/// across all subtarget feature sets.

10118

static SDValue lowerVectorShuffleAsElementInsertion(

10119

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

10120

const APInt &Zeroable, const X86Subtarget &Subtarget,

10121

SelectionDAG &DAG) {

10122

MVT ExtVT = VT;

10123

MVT EltVT = VT.getVectorElementType();

10124

10125

int V2Index =

10126

find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -

10127

Mask.begin();

10128

bool IsV1Zeroable = true;

10129

for (int i = 0, Size = Mask.size(); i < Size; ++i)

10130

if (i != V2Index && !Zeroable[i]) {

10131

IsV1Zeroable = false;

10132

break;

10133

}

10134

10135

// Check for a single input from a SCALAR_TO_VECTOR node.

10136

// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and

10137

// all the smarts here sunk into that routine. However, the current

10138

// lowering of BUILD_VECTOR makes that nearly impossible until the old

10139

// vector shuffle lowering is dead.

10140

SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),

10141

DAG);

10142

if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {

10143

// We need to zext the scalar if it is smaller than an i32.

10144

V2S = DAG.getBitcast(EltVT, V2S);

10145

if (EltVT == MVT::i8 || EltVT == MVT::i16) {

10146

// Using zext to expand a narrow element won't work for non-zero

10147

// insertions.

10148

if (!IsV1Zeroable)

10149

return SDValue();

10150

10151

// Zero-extend directly to i32.

10152

ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);

10153

V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

10154

}

10155

V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);

10156

} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||

10157

EltVT == MVT::i16) {

10158

// Either not inserting from the low element of the input or the input

10159

// element size is too small to use VZEXT_MOVL to clear the high bits.

10160

return SDValue();

10161

}

10162

10163

if (!IsV1Zeroable) {

10164

// If V1 can't be treated as a zero vector we have fewer options to lower

10165

// this. We can't support integer vectors or non-zero targets cheaply, and

10166

// the V1 elements can't be permuted in any way.

10167

assert(VT == ExtVT && "Cannot change extended type when non-zeroable!")(static_cast <bool> (VT == ExtVT && "Cannot change extended type when non-zeroable!"
) ? void (0) : __assert_fail ("VT == ExtVT && \"Cannot change extended type when non-zeroable!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10167, __extension__ __PRETTY_FUNCTION__));

10168

if (!VT.isFloatingPoint() || V2Index != 0)

10169

return SDValue();

10170

SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());

10171

V1Mask[V2Index] = -1;

10172

if (!isNoopShuffleMask(V1Mask))

10173

return SDValue();

10174

if (!VT.is128BitVector())

10175

return SDValue();

10176

10177

// Otherwise, use MOVSD or MOVSS.

10178

assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&(static_cast <bool> ((EltVT == MVT::f32 || EltVT == MVT
::f64) && "Only two types of floating point element types to handle!"
) ? void (0) : __assert_fail ("(EltVT == MVT::f32 || EltVT == MVT::f64) && \"Only two types of floating point element types to handle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10179, __extension__ __PRETTY_FUNCTION__))

10179

"Only two types of floating point element types to handle!")(static_cast <bool> ((EltVT == MVT::f32 || EltVT == MVT
::f64) && "Only two types of floating point element types to handle!"
) ? void (0) : __assert_fail ("(EltVT == MVT::f32 || EltVT == MVT::f64) && \"Only two types of floating point element types to handle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10179, __extension__ __PRETTY_FUNCTION__));

10180

return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,

10181

ExtVT, V1, V2);

10182

}

10183

10184

// This lowering only works for the low element with floating point vectors.

10185

if (VT.isFloatingPoint() && V2Index != 0)

10186

return SDValue();

10187

10188

V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);

10189

if (ExtVT != VT)

10190

V2 = DAG.getBitcast(VT, V2);

10191

10192

if (V2Index != 0) {

10193

// If we have 4 or fewer lanes we can cheaply shuffle the element into

10194

// the desired position. Otherwise it is more efficient to do a vector

10195

// shift left. We know that we can do a vector shift left because all

10196

// the inputs are zero.

10197

if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {

10198

SmallVector<int, 4> V2Shuffle(Mask.size(), 1);

10199

V2Shuffle[V2Index] = 0;

10200

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);

10201

} else {

10202

V2 = DAG.getBitcast(MVT::v16i8, V2);

10203

V2 = DAG.getNode(

10204

X86ISD::VSHLDQ, DL, MVT::v16i8, V2,

10205

DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,

10206

DAG.getTargetLoweringInfo().getScalarShiftAmountTy(

10207

DAG.getDataLayout(), VT)));

10208

V2 = DAG.getBitcast(VT, V2);

10209

}

10210

}

10211

return V2;

10212

}

10213

10214

/// Try to lower broadcast of a single - truncated - integer element,

10215

/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.

10216

///

10217

/// This assumes we have AVX2.

10218

static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,

10219

SDValue V0, int BroadcastIdx,

10220

const X86Subtarget &Subtarget,

10221

SelectionDAG &DAG) {

10222

assert(Subtarget.hasAVX2() &&(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower integer broadcasts with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10223, __extension__ __PRETTY_FUNCTION__))

10223

"We can only lower integer broadcasts with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower integer broadcasts with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10223, __extension__ __PRETTY_FUNCTION__));

10224

10225

EVT EltVT = VT.getVectorElementType();

10226

EVT V0VT = V0.getValueType();

10227

10228

assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!")(static_cast <bool> (VT.isInteger() && "Unexpected non-integer trunc broadcast!"
) ? void (0) : __assert_fail ("VT.isInteger() && \"Unexpected non-integer trunc broadcast!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10228, __extension__ __PRETTY_FUNCTION__));

10229

assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!")(static_cast <bool> (V0VT.isVector() && "Unexpected non-vector vector-sized value!"
) ? void (0) : __assert_fail ("V0VT.isVector() && \"Unexpected non-vector vector-sized value!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10229, __extension__ __PRETTY_FUNCTION__));

10230

10231

EVT V0EltVT = V0VT.getVectorElementType();

10232

if (!V0EltVT.isInteger())

10233

return SDValue();

10234

10235

const unsigned EltSize = EltVT.getSizeInBits();

10236

const unsigned V0EltSize = V0EltVT.getSizeInBits();

10237

10238

// This is only a truncation if the original element type is larger.

10239

if (V0EltSize <= EltSize)

10240

return SDValue();

10241

10242

assert(((V0EltSize % EltSize) == 0) &&(static_cast <bool> (((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!") ? void (
0) : __assert_fail ("((V0EltSize % EltSize) == 0) && \"Scalar type sizes must all be powers of 2 on x86!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10243, __extension__ __PRETTY_FUNCTION__))

10243

"Scalar type sizes must all be powers of 2 on x86!")(static_cast <bool> (((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!") ? void (
0) : __assert_fail ("((V0EltSize % EltSize) == 0) && \"Scalar type sizes must all be powers of 2 on x86!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10243, __extension__ __PRETTY_FUNCTION__));

10244

10245

const unsigned V0Opc = V0.getOpcode();

10246

const unsigned Scale = V0EltSize / EltSize;

10247

const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

10248

10249

if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&

10250

V0Opc != ISD::BUILD_VECTOR)

10251

return SDValue();

10252

10253

SDValue Scalar = V0.getOperand(V0BroadcastIdx);

10254

10255

// If we're extracting non-least-significant bits, shift so we can truncate.

10256

// Hopefully, we can fold away the trunc/srl/load into the broadcast.

10257

// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer

10258

// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.

10259

if (const int OffsetIdx = BroadcastIdx % Scale)

10260

Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,

10261

DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));

10262

10263

return DAG.getNode(X86ISD::VBROADCAST, DL, VT,

10264

DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));

10265

}

10266

10267

/// \brief Try to lower broadcast of a single element.

10268

///

10269

/// For convenience, this code also bundles all of the subtarget feature set

10270

/// filtering. While a little annoying to re-dispatch on type here, there isn't

10271

/// a convenient way to factor it out.

10272

static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,

10273

SDValue V1, SDValue V2,

10274

ArrayRef<int> Mask,

10275

const X86Subtarget &Subtarget,

10276

SelectionDAG &DAG) {

10277

if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||

10278

(Subtarget.hasAVX() && VT.isFloatingPoint()) ||

10279

(Subtarget.hasAVX2() && VT.isInteger())))

10280

return SDValue();

10281

10282

// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise

10283

// we can only broadcast from a register with AVX2.

10284

unsigned NumElts = Mask.size();

10285

unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())

10286

? X86ISD::MOVDDUP

10287

: X86ISD::VBROADCAST;

10288

bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

10289

10290

// Check that the mask is a broadcast.

10291

int BroadcastIdx = -1;

10292

for (int i = 0; i != (int)NumElts; ++i) {

10293

SmallVector<int, 8> BroadcastMask(NumElts, i);

10294

if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {

10295

BroadcastIdx = i;

10296

break;

10297

}

10298

}

10299

10300

if (BroadcastIdx < 0)

10301

return SDValue();

10302

assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "(static_cast <bool> (BroadcastIdx < (int)Mask.size()
&& "We only expect to be called with " "a sorted mask where the broadcast "
"comes from V1.") ? void (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10304, __extension__ __PRETTY_FUNCTION__))

10303

"a sorted mask where the broadcast "(static_cast <bool> (BroadcastIdx < (int)Mask.size()
&& "We only expect to be called with " "a sorted mask where the broadcast "
"comes from V1.") ? void (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10304, __extension__ __PRETTY_FUNCTION__))

10304

"comes from V1.")(static_cast <bool> (BroadcastIdx < (int)Mask.size()
&& "We only expect to be called with " "a sorted mask where the broadcast "
"comes from V1.") ? void (0) : __assert_fail ("BroadcastIdx < (int)Mask.size() && \"We only expect to be called with \" \"a sorted mask where the broadcast \" \"comes from V1.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10304, __extension__ __PRETTY_FUNCTION__));

10305

10306

// Go up the chain of (vector) values to find a scalar load that we can

10307

// combine with the broadcast.

10308

SDValue V = V1;

10309

for (;;) {

10310

switch (V.getOpcode()) {

10311

case ISD::BITCAST: {

10312

SDValue VSrc = V.getOperand(0);

10313

MVT SrcVT = VSrc.getSimpleValueType();

10314

if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())

10315

break;

10316

V = VSrc;

10317

continue;

10318

}

10319

case ISD::CONCAT_VECTORS: {

10320

int OperandSize = Mask.size() / V.getNumOperands();

10321

V = V.getOperand(BroadcastIdx / OperandSize);

10322

BroadcastIdx %= OperandSize;

10323

continue;

10324

}

10325

case ISD::INSERT_SUBVECTOR: {

10326

SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);

10327

auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));

10328

if (!ConstantIdx)

10329

break;

10330

10331

int BeginIdx = (int)ConstantIdx->getZExtValue();

10332

int EndIdx =

10333

BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();

10334

if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {

10335

BroadcastIdx -= BeginIdx;

10336

V = VInner;

10337

} else {

10338

V = VOuter;

10339

}

10340

continue;

10341

}

10342

}

10343

break;

10344

}

10345

10346

// Check if this is a broadcast of a scalar. We special case lowering

10347

// for scalars so that we can more effectively fold with loads.

10348

// First, look through bitcast: if the original value has a larger element

10349

// type than the shuffle, the broadcast element is in essence truncated.

10350

// Make that explicit to ease folding.

10351

if (V.getOpcode() == ISD::BITCAST && VT.isInteger())

10352

if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(

10353

DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))

10354

return TruncBroadcast;

10355

10356

MVT BroadcastVT = VT;

10357

10358

// Peek through any bitcast (only useful for loads).

10359

SDValue BC = peekThroughBitcasts(V);

10360

10361

// Also check the simpler case, where we can directly reuse the scalar.

10362

if (V.getOpcode() == ISD::BUILD_VECTOR ||

10363

(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {

10364

V = V.getOperand(BroadcastIdx);

10365

10366

// If we can't broadcast from a register, check that the input is a load.

10367

if (!BroadcastFromReg && !isShuffleFoldableLoad(V))

10368

return SDValue();

10369

} else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {

10370

// 32-bit targets need to load i64 as a f64 and then bitcast the result.

10371

if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {

10372

BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());

10373

Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())

10374

? X86ISD::MOVDDUP

10375

: Opcode;

10376

}

10377

10378

// If we are broadcasting a load that is only used by the shuffle

10379

// then we can reduce the vector load to the broadcasted scalar load.

10380

LoadSDNode *Ld = cast<LoadSDNode>(BC);

10381

SDValue BaseAddr = Ld->getOperand(1);

10382

EVT SVT = BroadcastVT.getScalarType();

10383

unsigned Offset = BroadcastIdx * SVT.getStoreSize();

10384

SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);

10385

V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,

10386

DAG.getMachineFunction().getMachineMemOperand(

10387

Ld->getMemOperand(), Offset, SVT.getStoreSize()));

10388

DAG.makeEquivalentMemoryOrdering(Ld, V);

10389

} else if (!BroadcastFromReg) {

10390

// We can't broadcast from a vector register.

10391

return SDValue();

10392

} else if (BroadcastIdx != 0) {

10393

// We can only broadcast from the zero-element of a vector register,

10394

// but it can be advantageous to broadcast from the zero-element of a

10395

// subvector.

10396

if (!VT.is256BitVector() && !VT.is512BitVector())

10397

return SDValue();

10398

10399

// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.

10400

if (VT == MVT::v4f64 || VT == MVT::v4i64)

10401

return SDValue();

10402

10403

// Only broadcast the zero-element of a 128-bit subvector.

10404

unsigned EltSize = VT.getScalarSizeInBits();

10405

if (((BroadcastIdx * EltSize) % 128) != 0)

10406

return SDValue();

10407

10408

// The shuffle input might have been a bitcast we looked through; look at

10409

// the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll

10410

// later bitcast it to BroadcastVT.

10411

MVT SrcVT = V.getSimpleValueType();

10412

assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&(static_cast <bool> (SrcVT.getScalarSizeInBits() == BroadcastVT
.getScalarSizeInBits() && "Unexpected vector element size"
) ? void (0) : __assert_fail ("SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && \"Unexpected vector element size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10413, __extension__ __PRETTY_FUNCTION__))

10413

"Unexpected vector element size")(static_cast <bool> (SrcVT.getScalarSizeInBits() == BroadcastVT
.getScalarSizeInBits() && "Unexpected vector element size"
) ? void (0) : __assert_fail ("SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && \"Unexpected vector element size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10413, __extension__ __PRETTY_FUNCTION__));

10414

assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&(static_cast <bool> ((SrcVT.is256BitVector() || SrcVT.is512BitVector
()) && "Unexpected vector size") ? void (0) : __assert_fail
("(SrcVT.is256BitVector() || SrcVT.is512BitVector()) && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10415, __extension__ __PRETTY_FUNCTION__))

10415

"Unexpected vector size")(static_cast <bool> ((SrcVT.is256BitVector() || SrcVT.is512BitVector
()) && "Unexpected vector size") ? void (0) : __assert_fail
("(SrcVT.is256BitVector() || SrcVT.is512BitVector()) && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10415, __extension__ __PRETTY_FUNCTION__));

10416

10417

MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);

10418

V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,

10419

DAG.getIntPtrConstant(BroadcastIdx, DL));

10420

}

10421

10422

if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())

10423

V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,

10424

DAG.getBitcast(MVT::f64, V));

10425

10426

// Bitcast back to the same scalar type as BroadcastVT.

10427

MVT SrcVT = V.getSimpleValueType();

10428

if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {

10429

10430

10431

if (SrcVT.isVector()) {

10432

unsigned NumSrcElts = SrcVT.getVectorNumElements();

10433

SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);

10434

} else {

10435

SrcVT = BroadcastVT.getScalarType();

10436

}

10437

V = DAG.getBitcast(SrcVT, V);

10438

}

10439

10440

// 32-bit targets need to load i64 as a f64 and then bitcast the result.

10441

if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {

10442

V = DAG.getBitcast(MVT::f64, V);

10443

unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();

10444

BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);

10445

}

10446

10447

// We only support broadcasting from 128-bit vectors to minimize the

10448

// number of patterns we need to deal with in isel. So extract down to

10449

// 128-bits.

10450

if (SrcVT.getSizeInBits() > 128)

10451

V = extract128BitVector(V, 0, DAG, DL);

10452

10453

return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));

10454

}

10455

10456

// Check for whether we can use INSERTPS to perform the shuffle. We only use

10457

// INSERTPS when the V1 elements are already in the correct locations

10458

// because otherwise we can just always use two SHUFPS instructions which

10459

// are much smaller to encode than a SHUFPS and an INSERTPS. We can also

10460

// perform INSERTPS if a single V1 element is out of place and all V2

10461

// elements are zeroable.

10462

static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,

10463

unsigned &InsertPSMask,

10464

const APInt &Zeroable,

10465

ArrayRef<int> Mask,

10466

SelectionDAG &DAG) {

10467

assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType().is128BitVector
() && "Bad operand type!") ? void (0) : __assert_fail
("V1.getSimpleValueType().is128BitVector() && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10467, __extension__ __PRETTY_FUNCTION__));

10468

assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType().is128BitVector
() && "Bad operand type!") ? void (0) : __assert_fail
("V2.getSimpleValueType().is128BitVector() && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10468, __extension__ __PRETTY_FUNCTION__));

10469

assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")(static_cast <bool> (Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 4 && \"Unexpected mask size for v4 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10469, __extension__ __PRETTY_FUNCTION__));

10470

10471

// Attempt to match INSERTPS with one element from VA or VB being

10472

// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask

10473

// are updated.

10474

auto matchAsInsertPS = [&](SDValue VA, SDValue VB,

10475

ArrayRef<int> CandidateMask) {

10476

unsigned ZMask = 0;

10477

int VADstIndex = -1;

10478

int VBDstIndex = -1;

10479

bool VAUsedInPlace = false;

10480

10481

for (int i = 0; i < 4; ++i) {

10482

// Synthesize a zero mask from the zeroable elements (includes undefs).

10483

if (Zeroable[i]) {

10484

ZMask |= 1 << i;

10485

continue;

10486

}

10487

10488

// Flag if we use any VA inputs in place.

10489

if (i == CandidateMask[i]) {

10490

VAUsedInPlace = true;

10491

continue;

10492

}

10493

10494

// We can only insert a single non-zeroable element.

10495

if (VADstIndex >= 0 || VBDstIndex >= 0)

10496

return false;

10497

10498

if (CandidateMask[i] < 4) {

10499

// VA input out of place for insertion.

10500

VADstIndex = i;

10501

} else {

10502

// VB input for insertion.

10503

VBDstIndex = i;

10504

}

10505

}

10506

10507

// Don't bother if we have no (non-zeroable) element for insertion.

10508

if (VADstIndex < 0 && VBDstIndex < 0)

10509

return false;

10510

10511

// Determine element insertion src/dst indices. The src index is from the

10512

// start of the inserted vector, not the start of the concatenated vector.

10513

unsigned VBSrcIndex = 0;

10514

if (VADstIndex >= 0) {

10515

// If we have a VA input out of place, we use VA as the V2 element

10516

// insertion and don't use the original V2 at all.

10517

VBSrcIndex = CandidateMask[VADstIndex];

10518

VBDstIndex = VADstIndex;

10519

VB = VA;

10520

} else {

10521

VBSrcIndex = CandidateMask[VBDstIndex] - 4;

10522

}

10523

10524

// If no V1 inputs are used in place, then the result is created only from

10525

// the zero mask and the V2 insertion - so remove V1 dependency.

10526

if (!VAUsedInPlace)

10527

VA = DAG.getUNDEF(MVT::v4f32);

10528

10529

// Update V1, V2 and InsertPSMask accordingly.

10530

V1 = VA;

10531

V2 = VB;

10532

10533

// Insert the V2 element into the desired position.

10534

InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;

10535

10536

return true;

10537

};

10538

10539

if (matchAsInsertPS(V1, V2, Mask))

10540

return true;

10541

10542

// Commute and try again.

10543

SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());

10544

ShuffleVectorSDNode::commuteMask(CommutedMask);

10545

if (matchAsInsertPS(V2, V1, CommutedMask))

10546

return true;

10547

10548

return false;

10549

}

10550

10551

/// Lower a v4f32 shuffle to a single X86ISD::INSERTPS node when
/// matchVectorShuffleAsInsertPS accepts the mask.
///
/// Returns an empty SDValue if the mask does not fit the INSERTPS pattern.
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // The matcher rewrites V1/V2 in place and fills in the immediate.
  unsigned Imm;
  if (matchVectorShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG)) {
    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
  }

  return SDValue();
}

10567

10568

/// \brief Try to lower a shuffle as a permute of the inputs followed by an

10569

/// UNPCK instruction.

10570

///

10571

/// This specifically targets cases where we end up with alternating between

10572

/// the two inputs, and so can permute them into something that feeds a single

10573

/// UNPCK instruction. Note that this routine only targets integer vectors

10574

/// because for floating point vectors we have a generalized SHUFPS lowering

10575

/// strategy that handles everything that doesn't *exactly* match an unpack,

10576

/// making this clever lowering unnecessary.

10577

static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,

10578

SDValue V1, SDValue V2,

10579

ArrayRef<int> Mask,

10580

SelectionDAG &DAG) {

10581

assert(!VT.isFloatingPoint() &&(static_cast <bool> (!VT.isFloatingPoint() && "This routine only supports integer vectors."
) ? void (0) : __assert_fail ("!VT.isFloatingPoint() && \"This routine only supports integer vectors.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10582, __extension__ __PRETTY_FUNCTION__))

10582

"This routine only supports integer vectors.")(static_cast <bool> (!VT.isFloatingPoint() && "This routine only supports integer vectors."
) ? void (0) : __assert_fail ("!VT.isFloatingPoint() && \"This routine only supports integer vectors.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10582, __extension__ __PRETTY_FUNCTION__));

10583

assert(VT.is128BitVector() &&(static_cast <bool> (VT.is128BitVector() && "This routine only works on 128-bit vectors."
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"This routine only works on 128-bit vectors.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10584, __extension__ __PRETTY_FUNCTION__))

10584

"This routine only works on 128-bit vectors.")(static_cast <bool> (VT.is128BitVector() && "This routine only works on 128-bit vectors."
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"This routine only works on 128-bit vectors.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10584, __extension__ __PRETTY_FUNCTION__));

10585

assert(!V2.isUndef() &&(static_cast <bool> (!V2.isUndef() && "This routine should only be used when blending two inputs."
) ? void (0) : __assert_fail ("!V2.isUndef() && \"This routine should only be used when blending two inputs.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10586, __extension__ __PRETTY_FUNCTION__))

10586

"This routine should only be used when blending two inputs.")(static_cast <bool> (!V2.isUndef() && "This routine should only be used when blending two inputs."
) ? void (0) : __assert_fail ("!V2.isUndef() && \"This routine should only be used when blending two inputs.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10586, __extension__ __PRETTY_FUNCTION__));

10587

assert(Mask.size() >= 2 && "Single element masks are invalid.")(static_cast <bool> (Mask.size() >= 2 && "Single element masks are invalid."
) ? void (0) : __assert_fail ("Mask.size() >= 2 && \"Single element masks are invalid.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10587, __extension__ __PRETTY_FUNCTION__));

10588

10589

int Size = Mask.size();

10590

10591

int NumLoInputs =

10592

count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });

10593

int NumHiInputs =

10594

count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

10595

10596

bool UnpackLo = NumLoInputs >= NumHiInputs;

10597

10598

auto TryUnpack = [&](int ScalarSize, int Scale) {

10599

SmallVector<int, 16> V1Mask((unsigned)Size, -1);

10600

SmallVector<int, 16> V2Mask((unsigned)Size, -1);

10601

10602

for (int i = 0; i < Size; ++i) {

10603

if (Mask[i] < 0)

10604

continue;

10605

10606

// Each element of the unpack contains Scale elements from this mask.

10607

int UnpackIdx = i / Scale;

10608

10609

// We only handle the case where V1 feeds the first slots of the unpack.

10610

// We rely on canonicalization to ensure this is the case.

10611

if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))

10612

return SDValue();

10613

10614

// Setup the mask for this input. The indexing is tricky as we have to

10615

// handle the unpack stride.

10616

SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;

10617

VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =

10618

Mask[i] % Size;

10619

}

10620

10621

// If we will have to shuffle both inputs to use the unpack, check whether

10622

// we can just unpack first and shuffle the result. If so, skip this unpack.

10623

if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&

10624

!isNoopShuffleMask(V2Mask))

10625

return SDValue();

10626

10627

// Shuffle the inputs into place.

10628

V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);

10629

V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

10630

10631

// Cast the inputs to the type we will use to unpack them.

10632

MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);

10633

V1 = DAG.getBitcast(UnpackVT, V1);

10634

V2 = DAG.getBitcast(UnpackVT, V2);

10635

10636

// Unpack the inputs and cast the result back to the desired type.

10637

return DAG.getBitcast(

10638

VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,

10639

UnpackVT, V1, V2));

10640

};

10641

10642

// We try each unpack from the largest to the smallest to try and find one

10643

// that fits this mask.

10644

int OrigScalarSize = VT.getScalarSizeInBits();

10645

for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)

10646

if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))

10647

return Unpack;

10648

10649

// If none of the unpack-rooted lowerings worked (or were profitable) try an

10650

// initial unpack.

10651

if (NumLoInputs == 0 || NumHiInputs == 0) {

10652

assert((NumLoInputs > 0 || NumHiInputs > 0) &&(static_cast <bool> ((NumLoInputs > 0 || NumHiInputs
> 0) && "We have to have *some* inputs!") ? void (
0) : __assert_fail ("(NumLoInputs > 0 || NumHiInputs > 0) && \"We have to have *some* inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10653, __extension__ __PRETTY_FUNCTION__))

10653

"We have to have *some* inputs!")(static_cast <bool> ((NumLoInputs > 0 || NumHiInputs
> 0) && "We have to have *some* inputs!") ? void (
0) : __assert_fail ("(NumLoInputs > 0 || NumHiInputs > 0) && \"We have to have *some* inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10653, __extension__ __PRETTY_FUNCTION__));

10654

int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

10655

10656

// FIXME: We could consider the total complexity of the permute of each

10657

// possible unpacking. Or at the least we should consider how many

10658

// half-crossings are created.

10659

// FIXME: We could consider commuting the unpacks.

10660

10661

SmallVector<int, 32> PermMask((unsigned)Size, -1);

10662

for (int i = 0; i < Size; ++i) {

10663

if (Mask[i] < 0)

10664

continue;

10665

10666

assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!")(static_cast <bool> (Mask[i] % Size >= HalfOffset &&
"Found input from wrong half!") ? void (0) : __assert_fail (
"Mask[i] % Size >= HalfOffset && \"Found input from wrong half!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10666, __extension__ __PRETTY_FUNCTION__));

10667

10668

PermMask[i] =

10669

2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);

10670

}

10671

return DAG.getVectorShuffle(

10672

VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,

10673

DL, VT, V1, V2),

10674

DAG.getUNDEF(VT), PermMask);

10675

}

10676

10677

return SDValue();

10678

}

10679

10680

/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.

10681

///

10682

/// This is the basis function for the 2-lane 64-bit shuffles as we have full

10683

/// support for floating point shuffles but not integer shuffles. These

10684

/// instructions will incur a domain crossing penalty on some chips though so

10685

/// it is better to avoid lowering through this for integer vectors where

10686

/// possible.

10687

static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

10688

const APInt &Zeroable,

10689

SDValue V1, SDValue V2,

10690

const X86Subtarget &Subtarget,

10691

SelectionDAG &DAG) {

10692

assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v2f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v2f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10692, __extension__ __PRETTY_FUNCTION__));

10693

assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v2f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v2f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10693, __extension__ __PRETTY_FUNCTION__));

10694

assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")(static_cast <bool> (Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 2 && \"Unexpected mask size for v2 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10694, __extension__ __PRETTY_FUNCTION__));

10695

10696

if (V2.isUndef()) {

10697

// Check for being able to broadcast a single element.

10698

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

10699

DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))

10700

return Broadcast;

10701

10702

// Straight shuffle of a single input vector. Simulate this by using the

10703

// single input as both of the "inputs" to this instruction..

10704

unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

10705

10706

if (Subtarget.hasAVX()) {

10707

// If we have AVX, we can use VPERMILPS which will allow folding a load

10708

// into the shuffle.

10709

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,

10710

DAG.getConstant(SHUFPDMask, DL, MVT::i8));

10711

}

10712

10713

return DAG.getNode(

10714

X86ISD::SHUFP, DL, MVT::v2f64,

10715

Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,

10716

Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,

10717

DAG.getConstant(SHUFPDMask, DL, MVT::i8));

10718

}

10719

assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!")(static_cast <bool> (Mask[0] >= 0 && Mask[0]
< 2 && "Non-canonicalized blend!") ? void (0) : __assert_fail
("Mask[0] >= 0 && Mask[0] < 2 && \"Non-canonicalized blend!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10719, __extension__ __PRETTY_FUNCTION__));

10720

assert(Mask[1] >= 2 && "Non-canonicalized blend!")(static_cast <bool> (Mask[1] >= 2 && "Non-canonicalized blend!"
) ? void (0) : __assert_fail ("Mask[1] >= 2 && \"Non-canonicalized blend!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10720, __extension__ __PRETTY_FUNCTION__));

10721

10722

// If we have a single input, insert that into V1 if we can do so cheaply.

10723

if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {

10724

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

10725

DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))

10726

return Insertion;

10727

// Try inverting the insertion since for v2 masks it is easy to do and we

10728

// can't reliably sort the mask one way or the other.

10729

int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),

10730

Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};

10731

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

10732

DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))

10733

return Insertion;

10734

}

10735

10736

// Try to use one of the special instruction patterns to handle two common

10737

// blend patterns if a zero-blend above didn't work.

10738

if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||

10739

isShuffleEquivalent(V1, V2, Mask, {1, 3}))

10740

if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))

10741

// We can either use a special instruction to load over the low double or

10742

// to move just the low double.

10743

return DAG.getNode(

10744

isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,

10745

DL, MVT::v2f64, V2,

10746

DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

10747

10748

if (Subtarget.hasSSE41())

10749

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,

10750

Zeroable, Subtarget, DAG))

10751

return Blend;

10752

10753

// Use dedicated unpack instructions for masks that match their pattern.

10754

if (SDValue V =

10755

lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))

10756

return V;

10757

10758

unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);

10759

return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,

10760

DAG.getConstant(SHUFPDMask, DL, MVT::i8));

10761

}

10762

10763

/// \brief Handle lowering of 2-lane 64-bit integer shuffles.

10764

///

10765

/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by

10766

/// the integer unit to minimize domain crossing penalties. However, for blends

10767

/// it falls back to the floating point shuffle operation with appropriate bit

10768

/// casting.

10769

static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

10770

const APInt &Zeroable,

10771

SDValue V1, SDValue V2,

10772

const X86Subtarget &Subtarget,

10773

SelectionDAG &DAG) {

10774

assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v2i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v2i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10774, __extension__ __PRETTY_FUNCTION__));

10775

assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v2i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v2i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10775, __extension__ __PRETTY_FUNCTION__));

10776

10777

10778

if (V2.isUndef()) {

10779

// Check for being able to broadcast a single element.

10780

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

10781

DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))

10782

return Broadcast;

10783

10784

// Straight shuffle of a single input vector. For everything from SSE2

10785

// onward this has a single fast instruction with no scary immediates.

10786

// We have to map the mask as it is actually a v4i32 shuffle instruction.

10787

V1 = DAG.getBitcast(MVT::v4i32, V1);

10788

int WidenedMask[4] = {

10789

std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,

10790

std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};

10791

return DAG.getBitcast(

10792

MVT::v2i64,

10793

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

10794

getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));

10795

}

10796

assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!")(static_cast <bool> (Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"
) ? void (0) : __assert_fail ("Mask[0] != -1 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10796, __extension__ __PRETTY_FUNCTION__));

10797

assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!")(static_cast <bool> (Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"
) ? void (0) : __assert_fail ("Mask[1] != -1 && \"No undef lanes in multi-input v2 shuffles!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10797, __extension__ __PRETTY_FUNCTION__));

10798

assert(Mask[0] < 2 && "We sort V1 to be the first input.")(static_cast <bool> (Mask[0] < 2 && "We sort V1 to be the first input."
) ? void (0) : __assert_fail ("Mask[0] < 2 && \"We sort V1 to be the first input.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10798, __extension__ __PRETTY_FUNCTION__));

10799

assert(Mask[1] >= 2 && "We sort V2 to be the second input.")(static_cast <bool> (Mask[1] >= 2 && "We sort V2 to be the second input."
) ? void (0) : __assert_fail ("Mask[1] >= 2 && \"We sort V2 to be the second input.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10799, __extension__ __PRETTY_FUNCTION__));

10800

10801

// Try to use shift instructions.

10802

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,

10803

Zeroable, Subtarget, DAG))

10804

return Shift;

10805

10806

// When loading a scalar and then shuffling it into a vector we can often do

10807

// the insertion cheaply.

10808

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

10809

DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))

10810

return Insertion;

10811

// Try inverting the insertion since for v2 masks it is easy to do and we

10812

// can't reliably sort the mask one way or the other.

10813

int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};

10814

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

10815

DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))

10816

return Insertion;

10817

10818

// We have different paths for blend lowering, but they all must use the

10819

// *exact* same predicate.

10820

bool IsBlendSupported = Subtarget.hasSSE41();

10821

if (IsBlendSupported)

10822

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,

10823

Zeroable, Subtarget, DAG))

10824

return Blend;

10825

10826

// Use dedicated unpack instructions for masks that match their pattern.

10827

if (SDValue V =

10828

lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))

10829

return V;

10830

10831

// Try to use byte rotation instructions.

10832

// It's more profitable for pre-SSSE3 to use shuffles/unpacks.

10833

if (Subtarget.hasSSSE3()) {

10834

if (Subtarget.hasVLX())

10835

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,

10836

Mask, Subtarget, DAG))

10837

return Rotate;

10838

10839

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

10840

DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))

10841

return Rotate;

10842

}

10843

10844

// If we have direct support for blends, we should lower by decomposing into

10845

// a permute. That will be faster than the domain cross.

10846

if (IsBlendSupported)

10847

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,

10848

Mask, DAG);

10849

10850

// We implement this with SHUFPD which is pretty lame because it will likely

10851

// incur 2 cycles of stall for integer vectors on Nehalem and older chips.

10852

// However, all the alternatives are still more cycles and newer chips don't

10853

// have this problem. It would be really nice if x86 had better shuffles here.

10854

V1 = DAG.getBitcast(MVT::v2f64, V1);

10855

V2 = DAG.getBitcast(MVT::v2f64, V2);

10856

return DAG.getBitcast(MVT::v2i64,

10857

DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));

10858

}

10859

10860

/// \brief Test whether this can be lowered with a single SHUFPS instruction.

10861

///

10862

/// This is used to disable more specialized lowerings when the shufps lowering

10863

/// will happen to be efficient.

10864

static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {

10865

// This routine only handles 128-bit shufps.

10866

assert(Mask.size() == 4 && "Unsupported mask size!")(static_cast <bool> (Mask.size() == 4 && "Unsupported mask size!"
) ? void (0) : __assert_fail ("Mask.size() == 4 && \"Unsupported mask size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10866, __extension__ __PRETTY_FUNCTION__));

10867

assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!")(static_cast <bool> (Mask[0] >= -1 && Mask[0
] < 8 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[0] >= -1 && Mask[0] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10867, __extension__ __PRETTY_FUNCTION__));

10868

assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!")(static_cast <bool> (Mask[1] >= -1 && Mask[1
] < 8 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[1] >= -1 && Mask[1] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10868, __extension__ __PRETTY_FUNCTION__));

10869

assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!")(static_cast <bool> (Mask[2] >= -1 && Mask[2
] < 8 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[2] >= -1 && Mask[2] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10869, __extension__ __PRETTY_FUNCTION__));

10870

assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!")(static_cast <bool> (Mask[3] >= -1 && Mask[3
] < 8 && "Out of bound mask element!") ? void (0) :
__assert_fail ("Mask[3] >= -1 && Mask[3] < 8 && \"Out of bound mask element!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 10870, __extension__ __PRETTY_FUNCTION__));

10871

10872

// To lower with a single SHUFPS we need to have the low half and high half

10873

// each requiring a single input.

10874

if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))

10875

return false;

10876

if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))

10877

return false;

10878

10879

return true;

10880

}

10881

10882

/// \brief Lower a vector shuffle using the SHUFPS instruction.

10883

///

10884

/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.

10885

/// It makes no assumptions about whether this is the *best* lowering, it simply

10886

/// uses it.

10887

static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,

10888

ArrayRef<int> Mask, SDValue V1,

10889

SDValue V2, SelectionDAG &DAG) {

10890

SDValue LowV = V1, HighV = V2;

10891

int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

10892

10893

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

10894

10895

if (NumV2Elements == 1) {

10896

int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

10897

10898

// Compute the index adjacent to V2Index and in the same half by toggling

10899

// the low bit.

10900

int V2AdjIndex = V2Index ^ 1;

10901

10902

if (Mask[V2AdjIndex] < 0) {

10903

// Handles all the cases where we have a single V2 element and an undef.

10904

// This will only ever happen in the high lanes because we commute the

10905

// vector otherwise.

10906

if (V2Index < 2)

10907

std::swap(LowV, HighV);

10908

NewMask[V2Index] -= 4;

10909

} else {

10910

// Handle the case where the V2 element ends up adjacent to a V1 element.

10911

// To make this work, blend them together as the first step.

10912

int V1Index = V2AdjIndex;

10913

int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};

10914

V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,

10915

getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

10916

10917

// Now proceed to reconstruct the final blend as we have the necessary

10918

// high or low half formed.

10919

if (V2Index < 2) {

10920

LowV = V2;

10921

HighV = V1;

10922

} else {

10923

HighV = V2;

10924

}

10925

NewMask[V1Index] = 2; // We put the V1 element in V2[2].

10926

NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].

10927

}

10928

} else if (NumV2Elements == 2) {

10929

if (Mask[0] < 4 && Mask[1] < 4) {

10930

// Handle the easy case where we have V1 in the low lanes and V2 in the

10931

// high lanes.

10932

NewMask[2] -= 4;

10933

NewMask[3] -= 4;

10934

} else if (Mask[2] < 4 && Mask[3] < 4) {

10935

// We also handle the reversed case because this utility may get called

10936

// when we detect a SHUFPS pattern but can't easily commute the shuffle to

10937

// arrange things in the right direction.

10938

NewMask[0] -= 4;

10939

NewMask[1] -= 4;

10940

HighV = V1;

10941

LowV = V2;

10942

} else {

10943

// We have a mixture of V1 and V2 in both low and high lanes. Rather than

10944

// trying to place elements directly, just blend them and set up the final

10945

// shuffle to place them.

10946

10947

// The first two blend mask elements are for V1, the second two are for

10948

// V2.

10949

int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],

10950

Mask[2] < 4 ? Mask[2] : Mask[3],

10951

(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,

10952

(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};

10953

V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,

10954

getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

10955

10956

// Now we do a normal shuffle of V1 by giving V1 as both operands to

10957

// a blend.

10958

LowV = HighV = V1;

10959

NewMask[0] = Mask[0] < 4 ? 0 : 2;

10960

NewMask[1] = Mask[0] < 4 ? 2 : 0;

10961

NewMask[2] = Mask[2] < 4 ? 1 : 3;

10962

NewMask[3] = Mask[2] < 4 ? 3 : 1;

10963

}

10964

}

10965

return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,

10966

getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));

10967

}

10968

10969

/// \brief Lower 4-lane 32-bit floating point shuffles.

10970

///

10971

/// Uses instructions exclusively from the floating point unit to minimize

10972

/// domain crossing penalties, as these are sufficient to implement all v4f32

10973

/// shuffles.

10974

static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

10975

const APInt &Zeroable,

10976

SDValue V1, SDValue V2,

10977

const X86Subtarget &Subtarget,

10978

SelectionDAG &DAG) {

10979

10980

10981

10982

10983

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

10984

10985

if (NumV2Elements == 0) {

10986

// Check for being able to broadcast a single element.

10987

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

10988

DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))

10989

return Broadcast;

10990

10991

// Use even/odd duplicate instructions for masks that match their pattern.

10992

if (Subtarget.hasSSE3()) {

10993

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))

10994

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);

10995

if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))

10996

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);

10997

}

10998

10999

if (Subtarget.hasAVX()) {

11000

// If we have AVX, we can use VPERMILPS which will allow folding a load

11001

// into the shuffle.

11002

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,

11003

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

11004

}

11005

11006

// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid

11007

// in SSE1 because otherwise they are widened to v2f64 and never get here.

11008

if (!Subtarget.hasSSE2()) {

11009

if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))

11010

return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);

11011

if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))

11012

return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);

11013

}

11014

11015

// Otherwise, use a straight shuffle of a single input vector. We pass the

11016

// input vector to both operands to simulate this with a SHUFPS.

11017

return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,

11018

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

11019

}

11020

11021

// There are special ways we can lower some single-element blends. However, we

11022

// have custom ways we can lower more complex single-element blends below that

11023

// we defer to if both this and BLENDPS fail to match, so restrict this to

11024

// when the V2 input is targeting element 0 of the mask -- that is the fast

11025

// case here.

11026

if (NumV2Elements == 1 && Mask[0] >= 4)

11027

if (SDValue V = lowerVectorShuffleAsElementInsertion(

11028

DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))

11029

return V;

11030

11031

if (Subtarget.hasSSE41()) {

11032

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,

11033

Zeroable, Subtarget, DAG))

11034

return Blend;

11035

11036

// Use INSERTPS if we can complete the shuffle efficiently.

11037

if (SDValue V =

11038

lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))

11039

return V;

11040

11041

if (!isSingleSHUFPSMask(Mask))

11042

if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(

11043

DL, MVT::v4f32, V1, V2, Mask, DAG))

11044

return BlendPerm;

11045

}

11046

11047

// Use low/high mov instructions. These are only valid in SSE1 because

11048

// otherwise they are widened to v2f64 and never get here.

11049

if (!Subtarget.hasSSE2()) {

11050

if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))

11051

return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);

11052

if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))

11053

return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

11054

}

11055

11056

// Use dedicated unpack instructions for masks that match their pattern.

11057

if (SDValue V =

11058

lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))

11059

return V;

11060

11061

// Otherwise fall back to a SHUFPS lowering strategy.

11062

return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);

11063

}

11064

11065

/// \brief Lower 4-lane i32 vector shuffles.

11066

///

11067

/// We try to handle these with integer-domain shuffles where we can, but for

11068

/// blends we use the floating point domain blend instructions.

11069

static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

11070

const APInt &Zeroable,

11071

SDValue V1, SDValue V2,

11072

const X86Subtarget &Subtarget,

11073

SelectionDAG &DAG) {

11074

assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v4i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v4i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11074, __extension__ __PRETTY_FUNCTION__));

11075

assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v4i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v4i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11075, __extension__ __PRETTY_FUNCTION__));

11076

11077

11078

// Whenever we can lower this as a zext, that instruction is strictly faster

11079

// than any alternative. It also allows us to fold memory operands into the

11080

// shuffle in many cases.

11081

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

11082

DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

11083

return ZExt;

11084

11085

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

11086

11087

if (NumV2Elements == 0) {

11088

// Check for being able to broadcast a single element.

11089

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

11090

DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))

11091

return Broadcast;

11092

11093

// Straight shuffle of a single input vector. For everything from SSE2

11094

// onward this has a single fast instruction with no scary immediates.

11095

// We coerce the shuffle pattern to be compatible with UNPCK instructions

11096

// but we aren't actually going to use the UNPCK instruction because doing

11097

// so prevents folding a load into this instruction or making a copy.

11098

const int UnpackLoMask[] = {0, 0, 1, 1};

11099

const int UnpackHiMask[] = {2, 2, 3, 3};

11100

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))

11101

Mask = UnpackLoMask;

11102

else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))

11103

Mask = UnpackHiMask;

11104

11105

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

11106

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

11107

}

11108

11109

// Try to use shift instructions.

11110

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,

11111

Zeroable, Subtarget, DAG))

11112

return Shift;

11113

11114

// There are special ways we can lower some single-element blends.

11115

if (NumV2Elements == 1)

11116

if (SDValue V = lowerVectorShuffleAsElementInsertion(

11117

DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

11118

return V;

11119

11120

// We have different paths for blend lowering, but they all must use the

11121

// *exact* same predicate.

11122

bool IsBlendSupported = Subtarget.hasSSE41();

11123

if (IsBlendSupported)

11124

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,

11125

Zeroable, Subtarget, DAG))

11126

return Blend;

11127

11128

if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,

11129

Zeroable, DAG))

11130

return Masked;

11131

11132

// Use dedicated unpack instructions for masks that match their pattern.

11133

if (SDValue V =

11134

lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))

11135

return V;

11136

11137

// Try to use byte rotation instructions.

11138

// Its more profitable for pre-SSSE3 to use shuffles/unpacks.

11139

if (Subtarget.hasSSSE3()) {

11140

if (Subtarget.hasVLX())

11141

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,

11142

Mask, Subtarget, DAG))

11143

return Rotate;

11144

11145

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

11146

DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))

11147

return Rotate;

11148

}

11149

11150

// Assume that a single SHUFPS is faster than an alternative sequence of

11151

// multiple instructions (even if the CPU has a domain penalty).

11152

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

11153

if (!isSingleSHUFPSMask(Mask)) {

11154

// If we have direct support for blends, we should lower by decomposing into

11155

// a permute. That will be faster than the domain cross.

11156

if (IsBlendSupported)

11157

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,

11158

Mask, DAG);

11159

11160

// Try to lower by permuting the inputs into an unpack instruction.

11161

if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(

11162

DL, MVT::v4i32, V1, V2, Mask, DAG))

11163

return Unpack;

11164

}

11165

11166

// We implement this with SHUFPS because it can blend from two vectors.

11167

// Because we're going to eventually use SHUFPS, we use SHUFPS even to build

11168

// up the inputs, bypassing domain shift penalties that we would incur if we

11169

// directly used PSHUFD on Nehalem and older. For newer chips, this isn't

11170

// relevant.

11171

SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);

11172

SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);

11173

SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);

11174

return DAG.getBitcast(MVT::v4i32, ShufPS);

11175

}

11176

11177

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2

11178

/// shuffle lowering, and the most complex part.

11179

///

11180

/// The lowering strategy is to try to form pairs of input lanes which are

11181

/// targeted at the same half of the final vector, and then use a dword shuffle

11182

/// to place them onto the right half, and finally unpack the paired lanes into

11183

/// their final position.

11184

///

11185

/// The exact breakdown of how to form these dword pairs and align them on the

11186

/// correct sides is really tricky. See the comments within the function for

11187

/// more of the details.

11188

///

11189

/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each

11190

/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to

11191

/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16

11192

/// vector, form the analogous 128-bit 8-element Mask.

11193

static SDValue lowerV8I16GeneralSingleInputVectorShuffle(

11194

const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,

11195

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

11196

assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!")(static_cast <bool> (VT.getVectorElementType() == MVT::
i16 && "Bad input type!") ? void (0) : __assert_fail (
"VT.getVectorElementType() == MVT::i16 && \"Bad input type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11196, __extension__ __PRETTY_FUNCTION__));

11197

MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

11198

11199

assert(Mask.size() == 8 && "Shuffle mask length doesn't match!")(static_cast <bool> (Mask.size() == 8 && "Shuffle mask length doesn't match!"
) ? void (0) : __assert_fail ("Mask.size() == 8 && \"Shuffle mask length doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11199, __extension__ __PRETTY_FUNCTION__));

11200

MutableArrayRef<int> LoMask = Mask.slice(0, 4);

11201

MutableArrayRef<int> HiMask = Mask.slice(4, 4);

11202

11203

SmallVector<int, 4> LoInputs;

11204

copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });

11205

std::sort(LoInputs.begin(), LoInputs.end());

11206

LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());

11207

SmallVector<int, 4> HiInputs;

11208

copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });

11209

std::sort(HiInputs.begin(), HiInputs.end());

11210

HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());

11211

int NumLToL =

11212

std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();

11213

int NumHToL = LoInputs.size() - NumLToL;

11214

int NumLToH =

11215

std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();

11216

int NumHToH = HiInputs.size() - NumLToH;

11217

MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);

11218

MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);

11219

MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);

11220

MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

11221

11222

// If we are splatting two values from one half - one to each half, then

11223

// we can shuffle that half so each is splatted to a dword, then splat those

11224

// to their respective halves.

11225

auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,

11226

int DOffset) {

11227

int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};

11228

int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};

11229

V = DAG.getNode(ShufWOp, DL, VT, V,

11230

getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

11231

V = DAG.getBitcast(PSHUFDVT, V);

11232

V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,

11233

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));

11234

return DAG.getBitcast(VT, V);

11235

};

11236

11237

if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)

11238

return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);

11239

if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)

11240

return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);

11241

11242

// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all

11243

// such inputs we can swap two of the dwords across the half mark and end up

11244

// with <=2 inputs to each half in each half. Once there, we can fall through

11245

// to the generic code below. For example:

11246

11247

// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]

11248

// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]

11249

11250

// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half

11251

// and an existing 2-into-2 on the other half. In this case we may have to

11252

// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or

11253

// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.

11254

// Fortunately, we don't have to handle anything but a 2-into-2 pattern

11255

// because any other situation (including a 3-into-1 or 1-into-3 in the other

11256

// half than the one we target for fixing) will be fixed when we re-enter this

11257

// path. We will also combine away any sequence of PSHUFD instructions that

11258

// result into a single instruction. Here is an example of the tricky case:

11259

11260

// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]

11261

// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]

11262

11263

// This now has a 1-into-3 in the high half! Instead, we do two shuffles:

11264

11265

// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]

11266

// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]

11267

11268

// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]

11269

// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]

11270

11271

// The result is fine to be handled by the generic logic.

11272

auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,

11273

ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,

11274

int AOffset, int BOffset) {

11275

assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&(static_cast <bool> ((AToAInputs.size() == 3 || AToAInputs
.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."
) ? void (0) : __assert_fail ("(AToAInputs.size() == 3 || AToAInputs.size() == 1) && \"Must call this with A having 3 or 1 inputs from the A half.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11276, __extension__ __PRETTY_FUNCTION__))

11276

"Must call this with A having 3 or 1 inputs from the A half.")(static_cast <bool> ((AToAInputs.size() == 3 || AToAInputs
.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."
) ? void (0) : __assert_fail ("(AToAInputs.size() == 3 || AToAInputs.size() == 1) && \"Must call this with A having 3 or 1 inputs from the A half.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11276, __extension__ __PRETTY_FUNCTION__));

11277

assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&(static_cast <bool> ((BToAInputs.size() == 1 || BToAInputs
.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."
) ? void (0) : __assert_fail ("(BToAInputs.size() == 1 || BToAInputs.size() == 3) && \"Must call this with B having 1 or 3 inputs from the B half.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11278, __extension__ __PRETTY_FUNCTION__))

11278

"Must call this with B having 1 or 3 inputs from the B half.")(static_cast <bool> ((BToAInputs.size() == 1 || BToAInputs
.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."
) ? void (0) : __assert_fail ("(BToAInputs.size() == 1 || BToAInputs.size() == 3) && \"Must call this with B having 1 or 3 inputs from the B half.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11278, __extension__ __PRETTY_FUNCTION__));

11279

assert(AToAInputs.size() + BToAInputs.size() == 4 &&(static_cast <bool> (AToAInputs.size() + BToAInputs.size
() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."
) ? void (0) : __assert_fail ("AToAInputs.size() + BToAInputs.size() == 4 && \"Must call this with either 3:1 or 1:3 inputs (summing to 4).\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11280, __extension__ __PRETTY_FUNCTION__))

11280

"Must call this with either 3:1 or 1:3 inputs (summing to 4).")(static_cast <bool> (AToAInputs.size() + BToAInputs.size
() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."
) ? void (0) : __assert_fail ("AToAInputs.size() + BToAInputs.size() == 4 && \"Must call this with either 3:1 or 1:3 inputs (summing to 4).\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11280, __extension__ __PRETTY_FUNCTION__));

11281

11282

bool ThreeAInputs = AToAInputs.size() == 3;

11283

11284

// Compute the index of dword with only one word among the three inputs in

11285

// a half by taking the sum of the half with three inputs and subtracting

11286

// the sum of the actual three inputs. The difference is the remaining

11287

// slot.

11288

int ADWord, BDWord;

11289

int &TripleDWord = ThreeAInputs ? ADWord : BDWord;

11290

int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;

11291

int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;

11292

ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;

11293

int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];

11294

int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);

11295

int TripleNonInputIdx =

11296

TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);

11297

TripleDWord = TripleNonInputIdx / 2;

11298

11299

// We use xor with one to compute the adjacent DWord to whichever one the

11300

// OneInput is in.

11301

OneInputDWord = (OneInput / 2) ^ 1;

11302

11303

// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA

11304

// and BToA inputs. If there is also such a problem with the BToB and AToB

11305

// inputs, we don't try to fix it necessarily -- we'll recurse and see it in

11306

// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it

11307

// is essential that we don't *create* a 3<-1 as then we might oscillate.

11308

if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {

11309

// Compute how many inputs will be flipped by swapping these DWords. We

11310

// need

11311

// to balance this to ensure we don't form a 3-1 shuffle in the other

11312

// half.

11313

int NumFlippedAToBInputs =

11314

std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +

11315

std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);

11316

int NumFlippedBToBInputs =

11317

std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +

11318

std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);

11319

if ((NumFlippedAToBInputs == 1 &&

11320

(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||

11321

(NumFlippedBToBInputs == 1 &&

11322

(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {

11323

// We choose whether to fix the A half or B half based on whether that

11324

// half has zero flipped inputs. At zero, we may not be able to fix it

11325

// with that half. We also bias towards fixing the B half because that

11326

// will more commonly be the high half, and we have to bias one way.

11327

auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,

11328

ArrayRef<int> Inputs) {

11329

int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.

11330

bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);

11331

// Determine whether the free index is in the flipped dword or the

11332

// unflipped dword based on where the pinned index is. We use this bit

11333

// in an xor to conditionally select the adjacent dword.

11334

int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));

11335

bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);

11336

if (IsFixIdxInput == IsFixFreeIdxInput)

11337

FixFreeIdx += 1;

11338

IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);

11339

assert(IsFixIdxInput != IsFixFreeIdxInput &&(static_cast <bool> (IsFixIdxInput != IsFixFreeIdxInput
&& "We need to be changing the number of flipped inputs!"
) ? void (0) : __assert_fail ("IsFixIdxInput != IsFixFreeIdxInput && \"We need to be changing the number of flipped inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11340, __extension__ __PRETTY_FUNCTION__))

11340

"We need to be changing the number of flipped inputs!")(static_cast <bool> (IsFixIdxInput != IsFixFreeIdxInput
&& "We need to be changing the number of flipped inputs!"
) ? void (0) : __assert_fail ("IsFixIdxInput != IsFixFreeIdxInput && \"We need to be changing the number of flipped inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11340, __extension__ __PRETTY_FUNCTION__));

11341

int PSHUFHalfMask[] = {0, 1, 2, 3};

11342

std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);

11343

V = DAG.getNode(

11344

FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,

11345

MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,

11346

getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

11347

11348

for (int &M : Mask)

11349

if (M >= 0 && M == FixIdx)

11350

M = FixFreeIdx;

11351

else if (M >= 0 && M == FixFreeIdx)

11352

M = FixIdx;

11353

};

11354

if (NumFlippedBToBInputs != 0) {

11355

int BPinnedIdx =

11356

BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;

11357

FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);

11358

} else {

11359

assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!")(static_cast <bool> (NumFlippedAToBInputs != 0 &&
"Impossible given predicates!") ? void (0) : __assert_fail (
"NumFlippedAToBInputs != 0 && \"Impossible given predicates!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11359, __extension__ __PRETTY_FUNCTION__));

11360

int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;

11361

FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);

11362

}

11363

}

11364

}

11365

11366

int PSHUFDMask[] = {0, 1, 2, 3};

11367

PSHUFDMask[ADWord] = BDWord;

11368

PSHUFDMask[BDWord] = ADWord;

11369

V = DAG.getBitcast(

11370

VT,

11371

DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),

11372

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

11373

11374

// Adjust the mask to match the new locations of A and B.

11375

for (int &M : Mask)

11376

if (M >= 0 && M/2 == ADWord)

11377

M = 2 * BDWord + M % 2;

11378

else if (M >= 0 && M/2 == BDWord)

11379

M = 2 * ADWord + M % 2;

11380

11381

// Recurse back into this routine to re-compute state now that this isn't

11382

// a 3 and 1 problem.

11383

return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,

11384

DAG);

11385

};

11386

if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))

11387

return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);

11388

if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))

11389

return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

11390

11391

// At this point there are at most two inputs to the low and high halves from

11392

// each half. That means the inputs can always be grouped into dwords and

11393

// those dwords can then be moved to the correct half with a dword shuffle.

11394

// We use at most one low and one high word shuffle to collect these paired

11395

// inputs into dwords, and finally a dword shuffle to place them.

11396

int PSHUFLMask[4] = {-1, -1, -1, -1};

11397

int PSHUFHMask[4] = {-1, -1, -1, -1};

11398

int PSHUFDMask[4] = {-1, -1, -1, -1};

11399

11400

// First fix the masks for all the inputs that are staying in their

11401

// original halves. This will then dictate the targets of the cross-half

11402

// shuffles.

11403

auto fixInPlaceInputs =

11404

[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,

11405

MutableArrayRef<int> SourceHalfMask,

11406

MutableArrayRef<int> HalfMask, int HalfOffset) {

11407

if (InPlaceInputs.empty())

11408

return;

11409

if (InPlaceInputs.size() == 1) {

11410

SourceHalfMask[InPlaceInputs[0] - HalfOffset] =

11411

InPlaceInputs[0] - HalfOffset;

11412

PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;

11413

return;

11414

}

11415

if (IncomingInputs.empty()) {

11416

// Just fix all of the in place inputs.

11417

for (int Input : InPlaceInputs) {

11418

SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;

11419

PSHUFDMask[Input / 2] = Input / 2;

11420

}

11421

return;

11422

}

11423

11424

assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!")(static_cast <bool> (InPlaceInputs.size() == 2 &&
"Cannot handle 3 or 4 inputs!") ? void (0) : __assert_fail (
"InPlaceInputs.size() == 2 && \"Cannot handle 3 or 4 inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11424, __extension__ __PRETTY_FUNCTION__));

11425

SourceHalfMask[InPlaceInputs[0] - HalfOffset] =

11426

InPlaceInputs[0] - HalfOffset;

11427

// Put the second input next to the first so that they are packed into

11428

// a dword. We find the adjacent index by toggling the low bit.

11429

int AdjIndex = InPlaceInputs[0] ^ 1;

11430

SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;

11431

std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);

11432

PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;

11433

};

11434

fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);

11435

fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

11436

11437

// Now gather the cross-half inputs and place them into a free dword of

11438

// their target half.

11439

// FIXME: This operation could almost certainly be simplified dramatically to

11440

// look more like the 3-1 fixing operation.

11441

auto moveInputsToRightHalf = [&PSHUFDMask](

11442

MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,

11443

MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,

11444

MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,

11445

int DestOffset) {

11446

auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {

11447

return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;

11448

};

11449

auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,

11450

int Word) {

11451

int LowWord = Word & ~1;

11452

int HighWord = Word | 1;

11453

return isWordClobbered(SourceHalfMask, LowWord) ||

11454

isWordClobbered(SourceHalfMask, HighWord);

11455

};

11456

11457

if (IncomingInputs.empty())

11458

return;

11459

11460

if (ExistingInputs.empty()) {

11461

// Map any dwords with inputs from them into the right half.

11462

for (int Input : IncomingInputs) {

11463

// If the source half mask maps over the inputs, turn those into

11464

// swaps and use the swapped lane.

11465

if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {

11466

if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {

11467

SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =

11468

Input - SourceOffset;

11469

// We have to swap the uses in our half mask in one sweep.

11470

for (int &M : HalfMask)

11471

if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)

11472

M = Input;

11473

else if (M == Input)

11474

M = SourceHalfMask[Input - SourceOffset] + SourceOffset;

11475

} else {

11476

assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==(static_cast <bool> (SourceHalfMask[SourceHalfMask[Input
- SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11478, __extension__ __PRETTY_FUNCTION__))

11477

Input - SourceOffset &&(static_cast <bool> (SourceHalfMask[SourceHalfMask[Input
- SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11478, __extension__ __PRETTY_FUNCTION__))

11478

"Previous placement doesn't match!")(static_cast <bool> (SourceHalfMask[SourceHalfMask[Input
- SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11478, __extension__ __PRETTY_FUNCTION__));

11479

}

11480

// Note that this correctly re-maps both when we do a swap and when

11481

// we observe the other side of the swap above. We rely on that to

11482

// avoid swapping the members of the input list directly.

11483

Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;

11484

}

11485

11486

// Map the input's dword into the correct half.

11487

if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)

11488

PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;

11489

else

11490

assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==(static_cast <bool> (PSHUFDMask[(Input - SourceOffset +
DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11492, __extension__ __PRETTY_FUNCTION__))

11491

Input / 2 &&(static_cast <bool> (PSHUFDMask[(Input - SourceOffset +
DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11492, __extension__ __PRETTY_FUNCTION__))

11492

"Previous placement doesn't match!")(static_cast <bool> (PSHUFDMask[(Input - SourceOffset +
DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"
) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11492, __extension__ __PRETTY_FUNCTION__));

11493

}

11494

11495

// And just directly shift any other-half mask elements to be same-half

11496

// as we will have mirrored the dword containing the element into the

11497

// same position within that half.

11498

for (int &M : HalfMask)

11499

if (M >= SourceOffset && M < SourceOffset + 4) {

11500

M = M - SourceOffset + DestOffset;

11501

assert(M >= 0 && "This should never wrap below zero!")(static_cast <bool> (M >= 0 && "This should never wrap below zero!"
) ? void (0) : __assert_fail ("M >= 0 && \"This should never wrap below zero!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11501, __extension__ __PRETTY_FUNCTION__));

11502

}

11503

return;

11504

}

11505

11506

// Ensure we have the input in a viable dword of its current half. This

11507

// is particularly tricky because the original position may be clobbered

11508

// by inputs being moved and *staying* in that half.

11509

if (IncomingInputs.size() == 1) {

11510

if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {

11511

int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +

11512

SourceOffset;

11513

SourceHalfMask[InputFixed - SourceOffset] =

11514

IncomingInputs[0] - SourceOffset;

11515

std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],

11516

InputFixed);

11517

IncomingInputs[0] = InputFixed;

11518

}

11519

} else if (IncomingInputs.size() == 2) {

11520

if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||

11521

isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {

11522

// We have two non-adjacent or clobbered inputs we need to extract from

11523

// the source half. To do this, we need to map them into some adjacent

11524

// dword slot in the source mask.

11525

int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,

11526

IncomingInputs[1] - SourceOffset};

11527

11528

// If there is a free slot in the source half mask adjacent to one of

11529

// the inputs, place the other input in it. We use (Index XOR 1) to

11530

// compute an adjacent index.

11531

if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&

11532

SourceHalfMask[InputsFixed[0] ^ 1] < 0) {

11533

SourceHalfMask[InputsFixed[0]] = InputsFixed[0];

11534

SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];

11535

InputsFixed[1] = InputsFixed[0] ^ 1;

11536

} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&

11537

SourceHalfMask[InputsFixed[1] ^ 1] < 0) {

11538

SourceHalfMask[InputsFixed[1]] = InputsFixed[1];

11539

SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];

11540

InputsFixed[0] = InputsFixed[1] ^ 1;

11541

} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&

11542

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {

11543

// The two inputs are in the same DWord but it is clobbered and the

11544

// adjacent DWord isn't used at all. Move both inputs to the free

11545

// slot.

11546

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];

11547

SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];

11548

InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);

11549

InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;

11550

} else {

11551

// The only way we hit this point is if there is no clobbering

11552

// (because there are no off-half inputs to this half) and there is no

11553

// free slot adjacent to one of the inputs. In this case, we have to

11554

// swap an input with a non-input.

11555

for (int i = 0; i < 4; ++i)

11556

assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&(static_cast <bool> ((SourceHalfMask[i] < 0 || SourceHalfMask
[i] == i) && "We can't handle any clobbers here!") ? void
(0) : __assert_fail ("(SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && \"We can't handle any clobbers here!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11557, __extension__ __PRETTY_FUNCTION__))

11557

"We can't handle any clobbers here!")(static_cast <bool> ((SourceHalfMask[i] < 0 || SourceHalfMask
[i] == i) && "We can't handle any clobbers here!") ? void
(0) : __assert_fail ("(SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && \"We can't handle any clobbers here!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11557, __extension__ __PRETTY_FUNCTION__));

11558

assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&(static_cast <bool> (InputsFixed[1] != (InputsFixed[0] ^
1) && "Cannot have adjacent inputs here!") ? void (0
) : __assert_fail ("InputsFixed[1] != (InputsFixed[0] ^ 1) && \"Cannot have adjacent inputs here!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11559, __extension__ __PRETTY_FUNCTION__))

11559

"Cannot have adjacent inputs here!")(static_cast <bool> (InputsFixed[1] != (InputsFixed[0] ^
1) && "Cannot have adjacent inputs here!") ? void (0
) : __assert_fail ("InputsFixed[1] != (InputsFixed[0] ^ 1) && \"Cannot have adjacent inputs here!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11559, __extension__ __PRETTY_FUNCTION__));

11560

11561

SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];

11562

SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

11563

11564

// We also have to update the final source mask in this case because

11565

// it may need to undo the above swap.

11566

for (int &M : FinalSourceHalfMask)

11567

if (M == (InputsFixed[0] ^ 1) + SourceOffset)

11568

M = InputsFixed[1] + SourceOffset;

11569

else if (M == InputsFixed[1] + SourceOffset)

11570

M = (InputsFixed[0] ^ 1) + SourceOffset;

11571

11572

InputsFixed[1] = InputsFixed[0] ^ 1;

11573

}

11574

11575

// Point everything at the fixed inputs.

11576

for (int &M : HalfMask)

11577

if (M == IncomingInputs[0])

11578

M = InputsFixed[0] + SourceOffset;

11579

else if (M == IncomingInputs[1])

11580

M = InputsFixed[1] + SourceOffset;

11581

11582

IncomingInputs[0] = InputsFixed[0] + SourceOffset;

11583

IncomingInputs[1] = InputsFixed[1] + SourceOffset;

11584

}

11585

} else {

11586

llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11586);

11587

}

11588

11589

// Now hoist the DWord down to the right half.

11590

int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;

11591

assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free")(static_cast <bool> (PSHUFDMask[FreeDWord] < 0 &&
"DWord not free") ? void (0) : __assert_fail ("PSHUFDMask[FreeDWord] < 0 && \"DWord not free\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11591, __extension__ __PRETTY_FUNCTION__));

11592

PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;

11593

for (int &M : HalfMask)

11594

for (int Input : IncomingInputs)

11595

if (M == Input)

11596

M = FreeDWord * 2 + Input % 2;

11597

};

11598

moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,

11599

/*SourceOffset*/ 4, /*DestOffset*/ 0);

11600

moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,

11601

/*SourceOffset*/ 0, /*DestOffset*/ 4);

11602

11603

// Now enact all the shuffles we've computed to move the inputs into their

11604

// target half.

11605

if (!isNoopShuffleMask(PSHUFLMask))

11606

V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,

11607

getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));

11608

if (!isNoopShuffleMask(PSHUFHMask))

11609

V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

11610

getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));

11611

if (!isNoopShuffleMask(PSHUFDMask))

11612

V = DAG.getBitcast(

11613

VT,

11614

DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),

11615

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

11616

11617

// At this point, each half should contain all its inputs, and we can then

11618

// just shuffle them into their final position.

11619

assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&(static_cast <bool> (count_if(LoMask, [](int M) { return
M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"
) ? void (0) : __assert_fail ("count_if(LoMask, [](int M) { return M >= 4; }) == 0 && \"Failed to lift all the high half inputs to the low mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11620, __extension__ __PRETTY_FUNCTION__))

11620

"Failed to lift all the high half inputs to the low mask!")(static_cast <bool> (count_if(LoMask, [](int M) { return
M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"
) ? void (0) : __assert_fail ("count_if(LoMask, [](int M) { return M >= 4; }) == 0 && \"Failed to lift all the high half inputs to the low mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11620, __extension__ __PRETTY_FUNCTION__));

11621

assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&(static_cast <bool> (count_if(HiMask, [](int M) { return
M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"
) ? void (0) : __assert_fail ("count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && \"Failed to lift all the low half inputs to the high mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11622, __extension__ __PRETTY_FUNCTION__))

11622

"Failed to lift all the low half inputs to the high mask!")(static_cast <bool> (count_if(HiMask, [](int M) { return
M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"
) ? void (0) : __assert_fail ("count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && \"Failed to lift all the low half inputs to the high mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11622, __extension__ __PRETTY_FUNCTION__));

11623

11624

// Do a half shuffle for the low mask.

11625

if (!isNoopShuffleMask(LoMask))

11626

V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,

11627

getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

11628

11629

// Do a half shuffle with the high mask after shifting its values down.

11630

for (int &M : HiMask)

11631

if (M >= 0)

11632

M -= 4;

11633

if (!isNoopShuffleMask(HiMask))

11634

V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

11635

getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

11636

11637

return V;

11638

}

11639

11640

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the

11641

/// blend if only one input is used.

11642

static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(

11643

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

11644

const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,

11645

bool &V2InUse) {

11646

SDValue V1Mask[16];

11647

SDValue V2Mask[16];

11648

V1InUse = false;

11649

V2InUse = false;

11650

11651

int Size = Mask.size();

11652

int Scale = 16 / Size;

11653

for (int i = 0; i < 16; ++i) {

11654

if (Mask[i / Scale] < 0) {

11655

V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);

11656

} else {

11657

const int ZeroMask = 0x80;

11658

int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale

11659

: ZeroMask;

11660

int V2Idx = Mask[i / Scale] < Size

11661

? ZeroMask

11662

: (Mask[i / Scale] - Size) * Scale + i % Scale;

11663

if (Zeroable[i / Scale])

11664

V1Idx = V2Idx = ZeroMask;

11665

V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);

11666

V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);

11667

V1InUse |= (ZeroMask != V1Idx);

11668

V2InUse |= (ZeroMask != V2Idx);

11669

}

11670

}

11671

11672

if (V1InUse)

11673

V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,

11674

DAG.getBitcast(MVT::v16i8, V1),

11675

DAG.getBuildVector(MVT::v16i8, DL, V1Mask));

11676

if (V2InUse)

11677

V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,

11678

DAG.getBitcast(MVT::v16i8, V2),

11679

DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

11680

11681

// If we need shuffled inputs from both, blend the two.

11682

SDValue V;

11683

if (V1InUse && V2InUse)

11684

V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);

11685

else

11686

V = V1InUse ? V1 : V2;

11687

11688

// Cast the result back to the correct type.

11689

return DAG.getBitcast(VT, V);

11690

}

11691

11692

/// \brief Generic lowering of 8-lane i16 shuffles.

11693

///

11694

/// This handles both single-input shuffles and combined shuffle/blends with

11695

/// two inputs. The single input shuffles are immediately delegated to

11696

/// a dedicated lowering routine.

11697

///

11698

/// The blends are lowered in one of three fundamental ways. If there are few

11699

/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle

11700

/// of the input is significantly cheaper when lowered as an interleaving of

11701

/// the two inputs, try to interleave them. Otherwise, blend the low and high

11702

/// halves of the inputs separately (making them have relatively few inputs)

11703

/// and then concatenate them.

11704

static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

11705

const APInt &Zeroable,

11706

SDValue V1, SDValue V2,

11707

const X86Subtarget &Subtarget,

11708

SelectionDAG &DAG) {

11709

assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v8i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v8i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11709, __extension__ __PRETTY_FUNCTION__));

11710

assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11710, __extension__ __PRETTY_FUNCTION__));

11711

assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast <bool> (Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11711, __extension__ __PRETTY_FUNCTION__));

11712

11713

// Whenever we can lower this as a zext, that instruction is strictly faster

11714

// than any alternative.

11715

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

11716

DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

11717

return ZExt;

11718

11719

int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

11720

11721

if (NumV2Inputs == 0) {

11722

// Check for being able to broadcast a single element.

11723

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

11724

DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))

11725

return Broadcast;

11726

11727

// Try to use shift instructions.

11728

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,

11729

Zeroable, Subtarget, DAG))

11730

return Shift;

11731

11732

// Use dedicated unpack instructions for masks that match their pattern.

11733

if (SDValue V =

11734

lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))

11735

return V;

11736

11737

// Use dedicated pack instructions for masks that match their pattern.

11738

if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,

11739

DAG, Subtarget))

11740

return V;

11741

11742

// Try to use byte rotation instructions.

11743

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,

11744

Mask, Subtarget, DAG))

11745

return Rotate;

11746

11747

// Make a copy of the mask so it can be modified.

11748

SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());

11749

return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,

11750

MutableMask, Subtarget,

11751

DAG);

11752

}

11753

11754

assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&(static_cast <bool> (llvm::any_of(Mask, [](int M) { return
M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? void (0) : __assert_fail ("llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11756, __extension__ __PRETTY_FUNCTION__))

11755

"All single-input shuffles should be canonicalized to be V1-input "(static_cast <bool> (llvm::any_of(Mask, [](int M) { return
M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? void (0) : __assert_fail ("llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11756, __extension__ __PRETTY_FUNCTION__))

11756

"shuffles.")(static_cast <bool> (llvm::any_of(Mask, [](int M) { return
M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input "
"shuffles.") ? void (0) : __assert_fail ("llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && \"All single-input shuffles should be canonicalized to be V1-input \" \"shuffles.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11756, __extension__ __PRETTY_FUNCTION__));

11757

11758

// Try to use shift instructions.

11759

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,

11760

Zeroable, Subtarget, DAG))

11761

return Shift;

11762

11763

// See if we can use SSE4A Extraction / Insertion.

11764

if (Subtarget.hasSSE4A())

11765

if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,

11766

Zeroable, DAG))

11767

return V;

11768

11769

// There are special ways we can lower some single-element blends.

11770

if (NumV2Inputs == 1)

11771

if (SDValue V = lowerVectorShuffleAsElementInsertion(

11772

DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

11773

return V;

11774

11775

// We have different paths for blend lowering, but they all must use the

11776

// *exact* same predicate.

11777

bool IsBlendSupported = Subtarget.hasSSE41();

11778

if (IsBlendSupported)

11779

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,

11780

Zeroable, Subtarget, DAG))

11781

return Blend;

11782

11783

if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,

11784

Zeroable, DAG))

11785

return Masked;

11786

11787

// Use dedicated unpack instructions for masks that match their pattern.

11788

if (SDValue V =

11789

lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))

11790

return V;

11791

11792

// Use dedicated pack instructions for masks that match their pattern.

11793

if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,

11794

Subtarget))

11795

return V;

11796

11797

// Try to use byte rotation instructions.

11798

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

11799

DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))

11800

return Rotate;

11801

11802

if (SDValue BitBlend =

11803

lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))

11804

return BitBlend;

11805

11806

// Try to lower by permuting the inputs into an unpack instruction.

11807

if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,

11808

V2, Mask, DAG))

11809

return Unpack;

11810

11811

// If we can't directly blend but can use PSHUFB, that will be better as it

11812

// can both shuffle and set up the inefficient blend.

11813

if (!IsBlendSupported && Subtarget.hasSSSE3()) {

11814

bool V1InUse, V2InUse;

11815

return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,

11816

Zeroable, DAG, V1InUse, V2InUse);

11817

}

11818

11819

// We can always bit-blend if we have to so the fallback strategy is to

11820

// decompose into single-input permutes and blends.

11821

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,

11822

Mask, DAG);

11823

}

11824

11825

/// \brief Check whether a compaction lowering can be done by dropping even

11826

/// elements and compute how many times even elements must be dropped.

11827

///

11828

/// This handles shuffles which take every Nth element where N is a power of

11829

/// two. Example shuffle masks:

11830

///

11831

/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14

11832

/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30

11833

/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12

11834

/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28

11835

/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8

11836

/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24

11837

///

11838

/// Any of these lanes can of course be undef.

11839

///

11840

/// This routine only supports N <= 3.

11841

/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here

11842

/// for larger N.

11843

///

11844

/// \returns N above, or the number of times even elements must be dropped if

11845

/// there is such a number. Otherwise returns zero.

11846

static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,

11847

bool IsSingleInput) {

11848

// The modulus for the shuffle vector entries is based on whether this is

11849

// a single input or not.

11850

int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);

11851

assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&(static_cast <bool> (isPowerOf2_32((uint32_t)ShuffleModulus
) && "We should only be called with masks with a power-of-2 size!"
) ? void (0) : __assert_fail ("isPowerOf2_32((uint32_t)ShuffleModulus) && \"We should only be called with masks with a power-of-2 size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11852, __extension__ __PRETTY_FUNCTION__))

11852

"We should only be called with masks with a power-of-2 size!")(static_cast <bool> (isPowerOf2_32((uint32_t)ShuffleModulus
) && "We should only be called with masks with a power-of-2 size!"
) ? void (0) : __assert_fail ("isPowerOf2_32((uint32_t)ShuffleModulus) && \"We should only be called with masks with a power-of-2 size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11852, __extension__ __PRETTY_FUNCTION__));

11853

11854

uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

11855

11856

// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,

11857

// and 2^3 simultaneously. This is because we may have ambiguity with

11858

// partially undef inputs.

11859

bool ViableForN[3] = {true, true, true};

11860

11861

for (int i = 0, e = Mask.size(); i < e; ++i) {

11862

// Ignore undef lanes, we'll optimistically collapse them to the pattern we

11863

// want.

11864

if (Mask[i] < 0)

11865

continue;

11866

11867

bool IsAnyViable = false;

11868

for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)

11869

if (ViableForN[j]) {

11870

uint64_t N = j + 1;

11871

11872

// The shuffle mask must be equal to (i * 2^N) % M.

11873

if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))

11874

IsAnyViable = true;

11875

else

11876

ViableForN[j] = false;

11877

}

11878

// Early exit if we exhaust the possible powers of two.

11879

if (!IsAnyViable)

11880

break;

11881

}

11882

11883

for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)

11884

if (ViableForN[j])

11885

return j + 1;

11886

11887

// Return 0 as there is no viable power of two.

11888

return 0;

11889

}

11890

11891

/// \brief Generic lowering of v16i8 shuffles.

11892

///

11893

/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to

11894

/// detect any complexity reducing interleaving. If that doesn't help, it uses

11895

/// UNPCK to spread the i8 elements across two i16-element vectors, and uses

11896

/// the existing lowering for v8i16 blends on each half, finally PACK-ing them

11897

/// back together.

11898

static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

11899

const APInt &Zeroable,

11900

SDValue V1, SDValue V2,

11901

const X86Subtarget &Subtarget,

11902

SelectionDAG &DAG) {

11903

assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v16i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v16i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11903, __extension__ __PRETTY_FUNCTION__));

11904

assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v16i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v16i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11904, __extension__ __PRETTY_FUNCTION__));

11905

assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")(static_cast <bool> (Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 16 && \"Unexpected mask size for v16 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11905, __extension__ __PRETTY_FUNCTION__));

11906

11907

// Try to use shift instructions.

11908

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,

11909

Zeroable, Subtarget, DAG))

11910

return Shift;

11911

11912

// Try to use byte rotation instructions.

11913

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

11914

DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

11915

return Rotate;

11916

11917

// Use dedicated pack instructions for masks that match their pattern.

11918

if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,

11919

Subtarget))

11920

return V;

11921

11922

// Try to use a zext lowering.

11923

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

11924

DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

11925

return ZExt;

11926

11927

// See if we can use SSE4A Extraction / Insertion.

11928

if (Subtarget.hasSSE4A())

11929

if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,

11930

Zeroable, DAG))

11931

return V;

11932

11933

int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

11934

11935

// For single-input shuffles, there are some nicer lowering tricks we can use.

11936

if (NumV2Elements == 0) {

11937

// Check for being able to broadcast a single element.

11938

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

11939

DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

11940

return Broadcast;

11941

11942

// Check whether we can widen this to an i16 shuffle by duplicating bytes.

11943

// Notably, this handles splat and partial-splat shuffles more efficiently.

11944

// However, it only makes sense if the pre-duplication shuffle simplifies

11945

// things significantly. Currently, this means we need to be able to

11946

// express the pre-duplication shuffle as an i16 shuffle.

11947

11948

// FIXME: We should check for other patterns which can be widened into an

11949

// i16 shuffle as well.

11950

auto canWidenViaDuplication = [](ArrayRef<int> Mask) {

11951

for (int i = 0; i < 16; i += 2)

11952

if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])

11953

return false;

11954

11955

return true;

11956

};

11957

auto tryToWidenViaDuplication = [&]() -> SDValue {

11958

if (!canWidenViaDuplication(Mask))

11959

return SDValue();

11960

SmallVector<int, 4> LoInputs;

11961

copy_if(Mask, std::back_inserter(LoInputs),

11962

[](int M) { return M >= 0 && M < 8; });

11963

std::sort(LoInputs.begin(), LoInputs.end());

11964

LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),

11965

LoInputs.end());

11966

SmallVector<int, 4> HiInputs;

11967

copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });

11968

std::sort(HiInputs.begin(), HiInputs.end());

11969

HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),

11970

HiInputs.end());

11971

11972

bool TargetLo = LoInputs.size() >= HiInputs.size();

11973

ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;

11974

ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

11975

11976

int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};

11977

SmallDenseMap<int, int, 8> LaneMap;

11978

for (int I : InPlaceInputs) {

11979

PreDupI16Shuffle[I/2] = I/2;

11980

LaneMap[I] = I;

11981

}

11982

int j = TargetLo ? 0 : 4, je = j + 4;

11983

for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {

11984

// Check if j is already a shuffle of this input. This happens when

11985

// there are two adjacent bytes after we move the low one.

11986

if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {

11987

// If we haven't yet mapped the input, search for a slot into which

11988

// we can map it.

11989

while (j < je && PreDupI16Shuffle[j] >= 0)

11990

++j;

11991

11992

if (j == je)

11993

// We can't place the inputs into a single half with a simple i16 shuffle, so bail.

11994

return SDValue();

11995

11996

// Map this input with the i16 shuffle.

11997

PreDupI16Shuffle[j] = MovingInputs[i] / 2;

11998

}

11999

12000

// Update the lane map based on the mapping we ended up with.

12001

LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;

12002

}

12003

V1 = DAG.getBitcast(

12004

MVT::v16i8,

12005

DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),

12006

DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

12007

12008

// Unpack the bytes to form the i16s that will be shuffled into place.

12009

V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,

12010

MVT::v16i8, V1, V1);

12011

12012

int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};

12013

for (int i = 0; i < 16; ++i)

12014

if (Mask[i] >= 0) {

12015

int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);

12016

assert(MappedMask < 8 && "Invalid v8 shuffle mask!")(static_cast <bool> (MappedMask < 8 && "Invalid v8 shuffle mask!"
) ? void (0) : __assert_fail ("MappedMask < 8 && \"Invalid v8 shuffle mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12016, __extension__ __PRETTY_FUNCTION__));

12017

if (PostDupI16Shuffle[i / 2] < 0)

12018

PostDupI16Shuffle[i / 2] = MappedMask;

12019

else

12020

assert(PostDupI16Shuffle[i / 2] == MappedMask &&(static_cast <bool> (PostDupI16Shuffle[i / 2] == MappedMask
&& "Conflicting entries in the original shuffle!") ?
void (0) : __assert_fail ("PostDupI16Shuffle[i / 2] == MappedMask && \"Conflicting entries in the original shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12021, __extension__ __PRETTY_FUNCTION__))

12021

"Conflicting entries in the original shuffle!")(static_cast <bool> (PostDupI16Shuffle[i / 2] == MappedMask
&& "Conflicting entries in the original shuffle!") ?
void (0) : __assert_fail ("PostDupI16Shuffle[i / 2] == MappedMask && \"Conflicting entries in the original shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12021, __extension__ __PRETTY_FUNCTION__));

12022

}

12023

return DAG.getBitcast(

12024

MVT::v16i8,

12025

DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),

12026

DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));

12027

};

12028

if (SDValue V = tryToWidenViaDuplication())

12029

return V;

12030

}

12031

12032

if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,

12033

Zeroable, DAG))

12034

return Masked;

12035

12036

// Use dedicated unpack instructions for masks that match their pattern.

12037

if (SDValue V =

12038

lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))

12039

return V;

12040

12041

// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly

12042

// with PSHUFB. It is important to do this before we attempt to generate any

12043

// blends but after all of the single-input lowerings. If the single input

12044

// lowerings can find an instruction sequence that is faster than a PSHUFB, we

12045

// want to preserve that and we can DAG combine any longer sequences into

12046

// a PSHUFB in the end. But once we start blending from multiple inputs,

12047

// the complexity of DAG combining bad patterns back into PSHUFB is too high,

12048

// and there are *very* few patterns that would actually be faster than the

12049

// PSHUFB approach because of its ability to zero lanes.

12050

12051

// FIXME: The only exceptions to the above are blends which are exact

12052

// interleavings with direct instructions supporting them. We currently don't

12053

// handle those well here.

12054

if (Subtarget.hasSSSE3()) {

12055

bool V1InUse = false;

12056

bool V2InUse = false;

12057

12058

SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(

12059

DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

12060

12061

// If both V1 and V2 are in use and we can use a direct blend or an unpack,

12062

// do so. This avoids using them to handle blends-with-zero which is

12063

// important as a single pshufb is significantly faster for that.

12064

if (V1InUse && V2InUse) {

12065

if (Subtarget.hasSSE41())

12066

if (SDValue Blend = lowerVectorShuffleAsBlend(

12067

DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

12068

return Blend;

12069

12070

// We can use an unpack to do the blending rather than an or in some

12071

// cases. Even though the or may be (very minorly) more efficient, we

12072

// preference this lowering because there are common cases where part of

12073

// the complexity of the shuffles goes away when we do the final blend as

12074

// an unpack.

12075

// FIXME: It might be worth trying to detect if the unpack-feeding

12076

// shuffles will both be pshufb, in which case we shouldn't bother with

12077

// this.

12078

if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(

12079

DL, MVT::v16i8, V1, V2, Mask, DAG))

12080

return Unpack;

12081

}

12082

12083

return PSHUFB;

12084

}

12085

12086

// There are special ways we can lower some single-element blends.

12087

if (NumV2Elements == 1)

12088

if (SDValue V = lowerVectorShuffleAsElementInsertion(

12089

DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

12090

return V;

12091

12092

if (SDValue BitBlend =

12093

lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))

12094

return BitBlend;

12095

12096

// Check whether a compaction lowering can be done. This handles shuffles

12097

// which take every Nth element for some even N. See the helper function for

12098

// details.

12099

12100

// We special case these as they can be particularly efficiently handled with

12101

// the PACKUSB instruction on x86 and they show up in common patterns of

12102

// rearranging bytes to truncate wide elements.

12103

bool IsSingleInput = V2.isUndef();

12104

if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {

12105

// NumEvenDrops is the power of two stride of the elements. Another way of

12106

// thinking about it is that we need to drop the even elements this many

12107

// times to get the original input.

12108

12109

// First we need to zero all the dropped bytes.

12110

assert(NumEvenDrops <= 3 &&(static_cast <bool> (NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."
) ? void (0) : __assert_fail ("NumEvenDrops <= 3 && \"No support for dropping even elements more than 3 times.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12111, __extension__ __PRETTY_FUNCTION__))

12111

"No support for dropping even elements more than 3 times.")(static_cast <bool> (NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."
) ? void (0) : __assert_fail ("NumEvenDrops <= 3 && \"No support for dropping even elements more than 3 times.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12111, __extension__ __PRETTY_FUNCTION__));

12112

// We use the mask type to pick which bytes are preserved based on how many

12113

// elements are dropped.

12114

MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };

12115

SDValue ByteClearMask = DAG.getBitcast(

12116

MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));

12117

V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);

12118

if (!IsSingleInput)

12119

V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

12120

12121

// Now pack things back together.

12122

V1 = DAG.getBitcast(MVT::v8i16, V1);

12123

V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);

12124

SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);

12125

for (int i = 1; i < NumEvenDrops; ++i) {

12126

Result = DAG.getBitcast(MVT::v8i16, Result);

12127

Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);

12128

}

12129

12130

return Result;

12131

}

12132

12133

// Handle multi-input cases by blending single-input shuffles.

12134

if (NumV2Elements > 0)

12135

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,

12136

Mask, DAG);

12137

12138

// The fallback path for single-input shuffles widens this into two v8i16

12139

// vectors with unpacks, shuffles those, and then pulls them back together

12140

// with a pack.

12141

SDValue V = V1;

12142

12143

std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};

12144

std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};

12145

for (int i = 0; i < 16; ++i)

12146

if (Mask[i] >= 0)

12147

(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

12148

12149

SDValue VLoHalf, VHiHalf;

12150

// Check if any of the odd lanes in the v16i8 are used. If not, we can mask

12151

// them out and avoid using UNPCK{L,H} to extract the elements of V as

12152

// i16s.

12153

if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&

12154

none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {

12155

// Use a mask to drop the high bytes.

12156

VLoHalf = DAG.getBitcast(MVT::v8i16, V);

12157

VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,

12158

DAG.getConstant(0x00FF, DL, MVT::v8i16));

12159

12160

// This will be a single vector shuffle instead of a blend so nuke VHiHalf.

12161

VHiHalf = DAG.getUNDEF(MVT::v8i16);

12162

12163

// Squash the masks to point directly into VLoHalf.

12164

for (int &M : LoBlendMask)

12165

if (M >= 0)

12166

M /= 2;

12167

for (int &M : HiBlendMask)

12168

if (M >= 0)

12169

M /= 2;

12170

} else {

12171

// Otherwise just unpack the low half of V into VLoHalf and the high half into

12172

// VHiHalf so that we can blend them as i16s.

12173

SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

12174

12175

VLoHalf = DAG.getBitcast(

12176

MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));

12177

VHiHalf = DAG.getBitcast(

12178

MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));

12179

}

12180

12181

SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);

12182

SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

12183

12184

return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);

12185

}

12186

12187

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.

12188

///

12189

/// This routine breaks down the specific type of 128-bit shuffle and

12190

/// dispatches to the lowering routines accordingly.

12191

static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

12192

MVT VT, SDValue V1, SDValue V2,

12193

const APInt &Zeroable,

12194

const X86Subtarget &Subtarget,

12195

SelectionDAG &DAG) {

12196

switch (VT.SimpleTy) {

12197

case MVT::v2i64:

12198

return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12199

case MVT::v2f64:

12200

return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12201

case MVT::v4i32:

12202

return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12203

case MVT::v4f32:

12204

return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12205

case MVT::v8i16:

12206

return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12207

case MVT::v16i8:

12208

return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

12209

12210

default:

12211

llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12211);

12212

}

12213

}

12214

12215

/// \brief Generic routine to split vector shuffle into half-sized shuffles.

12216

///

12217

/// This routine just extracts two subvectors, shuffles them independently, and

12218

/// then concatenates them back together. This should work effectively with all

12219

/// AVX vector shuffle types.

12220

static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,

12221

SDValue V2, ArrayRef<int> Mask,

12222

SelectionDAG &DAG) {

12223

assert(VT.getSizeInBits() >= 256 &&(static_cast <bool> (VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!") ? void (0) : __assert_fail
("VT.getSizeInBits() >= 256 && \"Only for 256-bit or wider vector shuffles!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12224, __extension__ __PRETTY_FUNCTION__))

12224

"Only for 256-bit or wider vector shuffles!")(static_cast <bool> (VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!") ? void (0) : __assert_fail
("VT.getSizeInBits() >= 256 && \"Only for 256-bit or wider vector shuffles!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12224, __extension__ __PRETTY_FUNCTION__));

12225

assert(V1.getSimpleValueType() == VT && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == VT &&
"Bad operand type!") ? void (0) : __assert_fail ("V1.getSimpleValueType() == VT && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12225, __extension__ __PRETTY_FUNCTION__));

12226

assert(V2.getSimpleValueType() == VT && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == VT &&
"Bad operand type!") ? void (0) : __assert_fail ("V2.getSimpleValueType() == VT && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12226, __extension__ __PRETTY_FUNCTION__));

12227

12228

ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);

12229

ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

12230

12231

int NumElements = VT.getVectorNumElements();

12232

int SplitNumElements = NumElements / 2;

12233

MVT ScalarVT = VT.getVectorElementType();

12234

MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

12235

12236

// Rather than splitting build-vectors, just build two narrower build

12237

// vectors. This helps shuffling with splats and zeros.

12238

auto SplitVector = [&](SDValue V) {

12239

V = peekThroughBitcasts(V);

12240

12241

MVT OrigVT = V.getSimpleValueType();

12242

int OrigNumElements = OrigVT.getVectorNumElements();

12243

int OrigSplitNumElements = OrigNumElements / 2;

12244

MVT OrigScalarVT = OrigVT.getVectorElementType();

12245

MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

12246

12247

SDValue LoV, HiV;

12248

12249

auto *BV = dyn_cast<BuildVectorSDNode>(V);

12250

if (!BV) {

12251

LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,

12252

DAG.getIntPtrConstant(0, DL));

12253

HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,

12254

DAG.getIntPtrConstant(OrigSplitNumElements, DL));

12255

} else {

12256

12257

SmallVector<SDValue, 16> LoOps, HiOps;

12258

for (int i = 0; i < OrigSplitNumElements; ++i) {

12259

LoOps.push_back(BV->getOperand(i));

12260

HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));

12261

}

12262

LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);

12263

HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);

12264

}

12265

return std::make_pair(DAG.getBitcast(SplitVT, LoV),

12266

DAG.getBitcast(SplitVT, HiV));

12267

};

12268

12269

SDValue LoV1, HiV1, LoV2, HiV2;

12270

std::tie(LoV1, HiV1) = SplitVector(V1);

12271

std::tie(LoV2, HiV2) = SplitVector(V2);

12272

12273

// Now create two 4-way blends of these half-width vectors.

12274

auto HalfBlend = [&](ArrayRef<int> HalfMask) {

12275

bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;

12276

SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);

12277

SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);

12278

SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);

12279

for (int i = 0; i < SplitNumElements; ++i) {

12280

int M = HalfMask[i];

12281

if (M >= NumElements) {

12282

if (M >= NumElements + SplitNumElements)

12283

UseHiV2 = true;

12284

else

12285

UseLoV2 = true;

12286

V2BlendMask[i] = M - NumElements;

12287

BlendMask[i] = SplitNumElements + i;

12288

} else if (M >= 0) {

12289

if (M >= SplitNumElements)

12290

UseHiV1 = true;

12291

else

12292

UseLoV1 = true;

12293

V1BlendMask[i] = M;

12294

BlendMask[i] = i;

12295

}

12296

}

12297

12298

// Because the lowering happens after all combining takes place, we need to

12299

// manually combine these blend masks as much as possible so that we create

12300

// a minimal number of high-level vector shuffle nodes.

12301

12302

// First try just blending the halves of V1 or V2.

12303

if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)

12304

return DAG.getUNDEF(SplitVT);

12305

if (!UseLoV2 && !UseHiV2)

12306

return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);

12307

if (!UseLoV1 && !UseHiV1)

12308

return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

12309

12310

SDValue V1Blend, V2Blend;

12311

if (UseLoV1 && UseHiV1) {

12312

V1Blend =

12313

DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);

12314

} else {

12315

// We only use half of V1 so map the usage down into the final blend mask.

12316

V1Blend = UseLoV1 ? LoV1 : HiV1;

12317

for (int i = 0; i < SplitNumElements; ++i)

12318

if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)

12319

BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);

12320

}

12321

if (UseLoV2 && UseHiV2) {

12322

V2Blend =

12323

DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

12324

} else {

12325

// We only use half of V2 so map the usage down into the final blend mask.

12326

V2Blend = UseLoV2 ? LoV2 : HiV2;

12327

for (int i = 0; i < SplitNumElements; ++i)

12328

if (BlendMask[i] >= SplitNumElements)

12329

BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);

12330

}

12331

return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);

12332

};

12333

SDValue Lo = HalfBlend(LoMask);

12334

SDValue Hi = HalfBlend(HiMask);

12335

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

12336

}

12337

12338

/// \brief Either split a vector in halves or decompose the shuffles and the

12339

/// blend.

12340

///

12341

/// This is provided as a good fallback for many lowerings of non-single-input

12342

/// shuffles with more than one 128-bit lane. In those cases, we want to select

12343

/// between splitting the shuffle into 128-bit components and stitching those

12344

/// back together vs. extracting the single-input shuffles and blending those

12345

/// results.

12346

static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,

12347

SDValue V1, SDValue V2,

12348

ArrayRef<int> Mask,

12349

SelectionDAG &DAG) {

12350

assert(!V2.isUndef() && "This routine must not be used to lower single-input "(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12351, __extension__ __PRETTY_FUNCTION__))

12351

"shuffles as it could then recurse on itself.")(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12351, __extension__ __PRETTY_FUNCTION__));

12352

int Size = Mask.size();

12353

12354

// If this can be modeled as a broadcast of two elements followed by a blend,

12355

// prefer that lowering. This is especially important because broadcasts can

12356

// often fold with memory operands.

12357

auto DoBothBroadcast = [&] {

12358

int V1BroadcastIdx = -1, V2BroadcastIdx = -1;

12359

for (int M : Mask)

12360

if (M >= Size) {

12361

if (V2BroadcastIdx < 0)

12362

V2BroadcastIdx = M - Size;

12363

else if (M - Size != V2BroadcastIdx)

12364

return false;

12365

} else if (M >= 0) {

12366

if (V1BroadcastIdx < 0)

12367

V1BroadcastIdx = M;

12368

else if (M != V1BroadcastIdx)

12369

return false;

12370

}

12371

return true;

12372

};

12373

if (DoBothBroadcast())

12374

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,

12375

DAG);

12376

12377

// If the inputs all stem from a single 128-bit lane of each input, then we

12378

// split them rather than blending because the split will decompose to

12379

// unusually few instructions.

12380

int LaneCount = VT.getSizeInBits() / 128;

12381

int LaneSize = Size / LaneCount;

12382

SmallBitVector LaneInputs[2];

12383

LaneInputs[0].resize(LaneCount, false);

12384

LaneInputs[1].resize(LaneCount, false);

12385

for (int i = 0; i < Size; ++i)

12386

if (Mask[i] >= 0)

12387

LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;

12388

if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)

12389

return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

12390

12391

// Otherwise, just fall back to decomposed shuffles and a blend. This requires

12392

// that the decomposed single-input shuffles don't end up here.

12393

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);

12394

}

12395

12396

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as

12397

/// a permutation and blend of those lanes.

12398

///

12399

/// This essentially blends the out-of-lane inputs to each lane into the lane

12400

/// from a permuted copy of the vector. This lowering strategy results in four

12401

/// instructions in the worst case for a single-input cross lane shuffle which

12402

/// is lower than any other fully general cross-lane shuffle strategy I'm aware

12403

/// of. Special cases for each particular shuffle pattern should be handled

12404

/// prior to trying this lowering.

12405

static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,

12406

SDValue V1, SDValue V2,

12407

ArrayRef<int> Mask,

12408

SelectionDAG &DAG,

12409

const X86Subtarget &Subtarget) {

12410

// FIXME: This should probably be generalized for 512-bit vectors as well.

12411

assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!")(static_cast <bool> (VT.is256BitVector() && "Only for 256-bit vector shuffles!"
) ? void (0) : __assert_fail ("VT.is256BitVector() && \"Only for 256-bit vector shuffles!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12411, __extension__ __PRETTY_FUNCTION__));

12412

int Size = Mask.size();

12413

int LaneSize = Size / 2;

12414

12415

// If there are only inputs from one 128-bit lane, splitting will in fact be

12416

// less expensive. The flags track whether the given lane contains an element

12417

// that crosses to another lane.

12418

if (!Subtarget.hasAVX2()) {

12419

bool LaneCrossing[2] = {false, false};

12420

for (int i = 0; i < Size; ++i)

12421

if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)

12422

LaneCrossing[(Mask[i] % Size) / LaneSize] = true;

12423

if (!LaneCrossing[0] || !LaneCrossing[1])

12424

return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

12425

} else {

12426

bool LaneUsed[2] = {false, false};

12427

for (int i = 0; i < Size; ++i)

12428

if (Mask[i] >= 0)

12429

LaneUsed[(Mask[i] / LaneSize)] = true;

12430

if (!LaneUsed[0] || !LaneUsed[1])

12431

return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

12432

}

12433

12434

assert(V2.isUndef() &&(static_cast <bool> (V2.isUndef() && "This last part of this routine only works on single input shuffles"
) ? void (0) : __assert_fail ("V2.isUndef() && \"This last part of this routine only works on single input shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12435, __extension__ __PRETTY_FUNCTION__))

12435

"This last part of this routine only works on single input shuffles")(static_cast <bool> (V2.isUndef() && "This last part of this routine only works on single input shuffles"
) ? void (0) : __assert_fail ("V2.isUndef() && \"This last part of this routine only works on single input shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12435, __extension__ __PRETTY_FUNCTION__));

12436

12437

SmallVector<int, 32> FlippedBlendMask(Size);

12438

for (int i = 0; i < Size; ++i)

12439

FlippedBlendMask[i] =

12440

Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)

12441

? Mask[i]

12442

: Mask[i] % LaneSize +

12443

(i / LaneSize) * LaneSize + Size);

12444

12445

// Flip the vector, and blend the results which should now be in-lane.

12446

MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;

12447

SDValue Flipped = DAG.getBitcast(PVT, V1);

12448

Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),

12449

{ 2, 3, 0, 1 });

12450

Flipped = DAG.getBitcast(VT, Flipped);

12451

return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);

12452

}

12453

12454

/// \brief Handle lowering 2-lane 128-bit shuffles.

12455

static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,

12456

SDValue V2, ArrayRef<int> Mask,

12457

const APInt &Zeroable,

12458

const X86Subtarget &Subtarget,

12459

SelectionDAG &DAG) {

12460

// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.

12461

if (Subtarget.hasAVX2() && V2.isUndef())

12462

return SDValue();

12463

12464

SmallVector<int, 4> WidenedMask;

12465

if (!canWidenShuffleElements(Mask, WidenedMask))

12466

return SDValue();

12467

12468

// TODO: If minimizing size and one of the inputs is a zero vector and the

12469

// the zero vector has only one use, we could use a VPERM2X128 to save the

12470

// instruction bytes needed to explicitly generate the zero vector.

12471

12472

// Blends are faster and handle all the non-lane-crossing cases.

12473

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,

12474

Zeroable, Subtarget, DAG))

12475

return Blend;

12476

12477

bool IsLowZero = (Zeroable & 0x3) == 0x3;

12478

bool IsHighZero = (Zeroable & 0xc) == 0xc;

12479

12480

// If either input operand is a zero vector, use VPERM2X128 because its mask

12481

// allows us to replace the zero input with an implicit zero.

12482

if (!IsLowZero && !IsHighZero) {

12483

// Check for patterns which can be matched with a single insert of a 128-bit

12484

// subvector.

12485

bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});

12486

if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

12487

12488

// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,

12489

// this will likely become vinsertf128 which can't fold a 256-bit memop.

12490

if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {

12491

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),

12492

VT.getVectorNumElements() / 2);

12493

SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,

12494

DAG.getIntPtrConstant(0, DL));

12495

SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,

12496

OnlyUsesV1 ? V1 : V2,

12497

DAG.getIntPtrConstant(0, DL));

12498

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);

12499

}

12500

}

12501

12502

// Try to use SHUF128 if possible.

12503

if (Subtarget.hasVLX()) {

12504

if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {

12505

unsigned PermMask = ((WidenedMask[0] % 2) << 0) |

12506

((WidenedMask[1] % 2) << 1);

12507

return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,

12508

DAG.getConstant(PermMask, DL, MVT::i8));

12509

}

12510

}

12511

}

12512

12513

// Otherwise form a 128-bit permutation. After accounting for undefs,

12514

// convert the 64-bit shuffle mask selection values into 128-bit

12515

// selection bits by dividing the indexes by 2 and shifting into positions

12516

// defined by a vperm2*128 instruction's immediate control byte.

12517

12518

// The immediate permute control byte looks like this:

12519

// [1:0] - select 128 bits from sources for low half of destination

12520

// [2] - ignore

12521

// [3] - zero low half of destination

12522

// [5:4] - select 128 bits from sources for high half of destination

12523

// [6] - ignore

12524

// [7] - zero high half of destination

12525

12526

assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?")(static_cast <bool> (WidenedMask[0] >= 0 && WidenedMask
[1] >= 0 && "Undef half?") ? void (0) : __assert_fail
("WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && \"Undef half?\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12526, __extension__ __PRETTY_FUNCTION__));

12527

12528

unsigned PermMask = 0;

12529

PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);

12530

PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

12531

12532

// Check the immediate mask and replace unused sources with undef.

12533

if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)

12534

V1 = DAG.getUNDEF(VT);

12535

if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)

12536

V2 = DAG.getUNDEF(VT);

12537

12538

return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,

12539

DAG.getConstant(PermMask, DL, MVT::i8));

12540

}

12541

12542

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then

12543

/// shuffling each lane.

12544

///

12545

/// This will only succeed when the result of fixing the 128-bit lanes results

12546

/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in

12547

/// each 128-bit lanes. This handles many cases where we can quickly blend away

12548

/// the lane crosses early and then use simpler shuffles within each lane.

12549

///

12550

/// FIXME: It might be worthwhile at some point to support this without

12551

/// requiring the 128-bit lane-relative shuffles to be repeating, but currently

12552

/// in x86 only floating point has interesting non-repeating shuffles, and even

12553

/// those are still *marginally* more expensive.

12554

static SDValue lowerVectorShuffleByMerging128BitLanes(

12555

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

12556

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

12557

assert(!V2.isUndef() && "This is only useful with multiple inputs.")(static_cast <bool> (!V2.isUndef() && "This is only useful with multiple inputs."
) ? void (0) : __assert_fail ("!V2.isUndef() && \"This is only useful with multiple inputs.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12557, __extension__ __PRETTY_FUNCTION__));

12558

12559

int Size = Mask.size();

12560

int LaneSize = 128 / VT.getScalarSizeInBits();

12561

int NumLanes = Size / LaneSize;

12562

assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.")(static_cast <bool> (NumLanes > 1 && "Only handles 256-bit and wider shuffles."
) ? void (0) : __assert_fail ("NumLanes > 1 && \"Only handles 256-bit and wider shuffles.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12562, __extension__ __PRETTY_FUNCTION__));

12563

12564

// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also

12565

// check whether the in-128-bit lane shuffles share a repeating pattern.

12566

SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);

12567

SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);

12568

for (int i = 0; i < Size; ++i) {

12569

if (Mask[i] < 0)

12570

continue;

12571

12572

int j = i / LaneSize;

12573

12574

if (Lanes[j] < 0) {

12575

// First entry we've seen for this lane.

12576

Lanes[j] = Mask[i] / LaneSize;

12577

} else if (Lanes[j] != Mask[i] / LaneSize) {

12578

// This doesn't match the lane selected previously!

12579

return SDValue();

12580

}

12581

12582

// Check that within each lane we have a consistent shuffle mask.

12583

int k = i % LaneSize;

12584

if (InLaneMask[k] < 0) {

12585

InLaneMask[k] = Mask[i] % LaneSize;

12586

} else if (InLaneMask[k] != Mask[i] % LaneSize) {

12587

// This doesn't fit a repeating in-lane mask.

12588

return SDValue();

12589

}

12590

}

12591

12592

// First shuffle the lanes into place.

12593

MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,

12594

VT.getSizeInBits() / 64);

12595

SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);

12596

for (int i = 0; i < NumLanes; ++i)

12597

if (Lanes[i] >= 0) {

12598

LaneMask[2 * i + 0] = 2*Lanes[i] + 0;

12599

LaneMask[2 * i + 1] = 2*Lanes[i] + 1;

12600

}

12601

12602

V1 = DAG.getBitcast(LaneVT, V1);

12603

V2 = DAG.getBitcast(LaneVT, V2);

12604

SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

12605

12606

// Cast it back to the type we actually want.

12607

LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

12608

12609

// Now do a simple shuffle that isn't lane crossing.

12610

SmallVector<int, 8> NewMask((unsigned)Size, -1);

12611

for (int i = 0; i < Size; ++i)

12612

if (Mask[i] >= 0)

12613

NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;

12614

assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&(static_cast <bool> (!is128BitLaneCrossingShuffleMask(VT
, NewMask) && "Must not introduce lane crosses at this point!"
) ? void (0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, NewMask) && \"Must not introduce lane crosses at this point!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12615, __extension__ __PRETTY_FUNCTION__))

12615

"Must not introduce lane crosses at this point!")(static_cast <bool> (!is128BitLaneCrossingShuffleMask(VT
, NewMask) && "Must not introduce lane crosses at this point!"
) ? void (0) : __assert_fail ("!is128BitLaneCrossingShuffleMask(VT, NewMask) && \"Must not introduce lane crosses at this point!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12615, __extension__ __PRETTY_FUNCTION__));

12616

12617

return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);

12618

}

12619

12620

/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.

12621

/// This allows for fast cases such as subvector extraction/insertion

12622

/// or shuffling smaller vector types which can lower more efficiently.

12623

static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,

12624

SDValue V1, SDValue V2,

12625

ArrayRef<int> Mask,

12626

const X86Subtarget &Subtarget,

12627

SelectionDAG &DAG) {

12628

assert((VT.is256BitVector() || VT.is512BitVector()) &&(static_cast <bool> ((VT.is256BitVector() || VT.is512BitVector
()) && "Expected 256-bit or 512-bit vector") ? void (
0) : __assert_fail ("(VT.is256BitVector() || VT.is512BitVector()) && \"Expected 256-bit or 512-bit vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12629, __extension__ __PRETTY_FUNCTION__))

12629

"Expected 256-bit or 512-bit vector")(static_cast <bool> ((VT.is256BitVector() || VT.is512BitVector
()) && "Expected 256-bit or 512-bit vector") ? void (
0) : __assert_fail ("(VT.is256BitVector() || VT.is512BitVector()) && \"Expected 256-bit or 512-bit vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12629, __extension__ __PRETTY_FUNCTION__));

12630

12631

unsigned NumElts = VT.getVectorNumElements();

12632

unsigned HalfNumElts = NumElts / 2;

12633

MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

12634

12635

bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);

12636

bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);

12637

if (!UndefLower && !UndefUpper)

12638

return SDValue();

12639

12640

// Upper half is undef and lower half is whole upper subvector.

12641

// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>

12642

if (UndefUpper &&

12643

isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {

12644

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,

12645

DAG.getIntPtrConstant(HalfNumElts, DL));

12646

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,

12647

DAG.getIntPtrConstant(0, DL));

12648

}

12649

12650

// Lower half is undef and upper half is whole lower subvector.

12651

// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>

12652

if (UndefLower &&

12653

isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {

12654

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,

12655

DAG.getIntPtrConstant(0, DL));

12656

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,

12657

DAG.getIntPtrConstant(HalfNumElts, DL));

12658

}

12659

12660

// If the shuffle only uses two of the four halves of the input operands,

12661

// then extract them and perform the 'half' shuffle at half width.

12662

// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>

12663

int HalfIdx1 = -1, HalfIdx2 = -1;

12664

SmallVector<int, 8> HalfMask(HalfNumElts);

12665

unsigned Offset = UndefLower ? HalfNumElts : 0;

12666

for (unsigned i = 0; i != HalfNumElts; ++i) {

12667

int M = Mask[i + Offset];

12668

if (M < 0) {

12669

HalfMask[i] = M;

12670

continue;

12671

}

12672

12673

// Determine which of the 4 half vectors this element is from.

12674

// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.

12675

int HalfIdx = M / HalfNumElts;

12676

12677

// Determine the element index into its half vector source.

12678

int HalfElt = M % HalfNumElts;

12679

12680

// We can shuffle with up to 2 half vectors, set the new 'half'

12681

// shuffle mask accordingly.

12682

if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {

12683

HalfMask[i] = HalfElt;

12684

HalfIdx1 = HalfIdx;

12685

continue;

12686

}

12687

if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {

12688

HalfMask[i] = HalfElt + HalfNumElts;

12689

HalfIdx2 = HalfIdx;

12690

continue;

12691

}

12692

12693

// Too many half vectors referenced.

12694

return SDValue();

12695

}

12696

assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length")(static_cast <bool> (HalfMask.size() == HalfNumElts &&
"Unexpected shuffle mask length") ? void (0) : __assert_fail
("HalfMask.size() == HalfNumElts && \"Unexpected shuffle mask length\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12696, __extension__ __PRETTY_FUNCTION__));

12697

12698

// Only shuffle the halves of the inputs when useful.

12699

int NumLowerHalves =

12700

(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);

12701

int NumUpperHalves =

12702

(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

12703

12704

// uuuuXXXX - don't extract uppers just to insert again.

12705

if (UndefLower && NumUpperHalves != 0)

12706

return SDValue();

12707

12708

// XXXXuuuu - don't extract both uppers, instead shuffle and then extract.

12709

if (UndefUpper && NumUpperHalves == 2)

12710

return SDValue();

12711

12712

// AVX2 - XXXXuuuu - always extract lowers.

12713

if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {

12714

// AVX2 supports efficient immediate 64-bit element cross-lane shuffles.

12715

if (VT == MVT::v4f64 || VT == MVT::v4i64)

12716

return SDValue();

12717

// AVX2 supports variable 32-bit element cross-lane shuffles.

12718

if (VT == MVT::v8f32 || VT == MVT::v8i32) {

12719

// XXXXuuuu - don't extract lowers and uppers.

12720

if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)

12721

return SDValue();

12722

}

12723

}

12724

12725

// AVX512 - XXXXuuuu - always extract lowers.

12726

if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))

12727

return SDValue();

12728

12729

auto GetHalfVector = [&](int HalfIdx) {

12730

if (HalfIdx < 0)

12731

return DAG.getUNDEF(HalfVT);

12732

SDValue V = (HalfIdx < 2 ? V1 : V2);

12733

HalfIdx = (HalfIdx % 2) * HalfNumElts;

12734

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,

12735

DAG.getIntPtrConstant(HalfIdx, DL));

12736

};

12737

12738

SDValue Half1 = GetHalfVector(HalfIdx1);

12739

SDValue Half2 = GetHalfVector(HalfIdx2);

12740

SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);

12741

return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,

12742

DAG.getIntPtrConstant(Offset, DL));

12743

}

12744

12745

/// \brief Test whether the specified input (0 or 1) is in-place blended by the

12746

/// given mask.

12747

///

12748

/// This returns true if the elements from a particular input are already in the

12749

/// slot required by the given mask and require no permutation.

12750

static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {

12751

assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")(static_cast <bool> ((Input == 0 || Input == 1) &&
"Only two inputs to shuffles.") ? void (0) : __assert_fail (
"(Input == 0 || Input == 1) && \"Only two inputs to shuffles.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12751, __extension__ __PRETTY_FUNCTION__));

12752

int Size = Mask.size();

12753

for (int i = 0; i < Size; ++i)

12754

if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)

12755

return false;

12756

12757

return true;

12758

}

12759

12760

/// Handle case where shuffle sources are coming from the same 128-bit lane and

12761

/// every lane can be represented as the same repeating mask - allowing us to

12762

/// shuffle the sources with the repeating shuffle and then permute the result

12763

/// to the destination lanes.

12764

static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(

12765

const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,

12766

const X86Subtarget &Subtarget, SelectionDAG &DAG) {

12767

int NumElts = VT.getVectorNumElements();

12768

int NumLanes = VT.getSizeInBits() / 128;

12769

int NumLaneElts = NumElts / NumLanes;

12770

12771

// On AVX2 we may be able to just shuffle the lowest elements and then

12772

// broadcast the result.

12773

if (Subtarget.hasAVX2()) {

12774

for (unsigned BroadcastSize : {16, 32, 64}) {

12775

if (BroadcastSize <= VT.getScalarSizeInBits())

12776

continue;

12777

int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

12778

12779

// Attempt to match a repeating pattern every NumBroadcastElts,

12780

// accounting for UNDEFs but only references the lowest 128-bit

12781

// lane of the inputs.

12782

auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {

12783

for (int i = 0; i != NumElts; i += NumBroadcastElts)

12784

for (int j = 0; j != NumBroadcastElts; ++j) {

12785

int M = Mask[i + j];

12786

if (M < 0)

12787

continue;

12788

int &R = RepeatMask[j];

12789

if (0 != ((M % NumElts) / NumLaneElts))

12790

return false;

12791

if (0 <= R && R != M)

12792

return false;

12793

R = M;

12794

}

12795

return true;

12796

};

12797

12798

SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);

12799

if (!FindRepeatingBroadcastMask(RepeatMask))

12800

continue;

12801

12802

// Shuffle the (lowest) repeated elements in place for broadcast.

12803

SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

12804

12805

// Shuffle the actual broadcast.

12806

SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);

12807

for (int i = 0; i != NumElts; i += NumBroadcastElts)

12808

for (int j = 0; j != NumBroadcastElts; ++j)

12809

BroadcastMask[i + j] = j;

12810

return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),

12811

BroadcastMask);

12812

}

12813

}

12814

12815

// Bail if the shuffle mask doesn't cross 128-bit lanes.

12816

if (!is128BitLaneCrossingShuffleMask(VT, Mask))

12817

return SDValue();

12818

12819

// Bail if we already have a repeated lane shuffle mask.

12820

SmallVector<int, 8> RepeatedShuffleMask;

12821

if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))

12822

return SDValue();

12823

12824

// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes

12825

// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.

12826

int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;

12827

int NumSubLanes = NumLanes * SubLaneScale;

12828

int NumSubLaneElts = NumLaneElts / SubLaneScale;

12829

12830

// Check that all the sources are coming from the same lane and see if we can

12831

// form a repeating shuffle mask (local to each sub-lane). At the same time,

12832

// determine the source sub-lane for each destination sub-lane.

12833

int TopSrcSubLane = -1;

12834

SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);

12835

SmallVector<int, 8> RepeatedSubLaneMasks[2] = {

12836

SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),

12837

SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

12838

12839

for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {

12840

// Extract the sub-lane mask, check that it all comes from the same lane

12841

// and normalize the mask entries to come from the first lane.

12842

int SrcLane = -1;

12843

SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);

12844

for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {

12845

int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];

12846

if (M < 0)

12847

continue;

12848

int Lane = (M % NumElts) / NumLaneElts;

12849

if ((0 <= SrcLane) && (SrcLane != Lane))

12850

return SDValue();

12851

SrcLane = Lane;

12852

int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);

12853

SubLaneMask[Elt] = LocalM;

12854

}

12855

12856

// Whole sub-lane is UNDEF.

12857

if (SrcLane < 0)

12858

continue;

12859

12860

// Attempt to match against the candidate repeated sub-lane masks.

12861

for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {

12862

auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {

12863

for (int i = 0; i != NumSubLaneElts; ++i) {

12864

if (M1[i] < 0 || M2[i] < 0)

12865

continue;

12866

if (M1[i] != M2[i])

12867

return false;

12868

}

12869

return true;

12870

};

12871

12872

auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];

12873

if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))

12874

continue;

12875

12876

// Merge the sub-lane mask into the matching repeated sub-lane mask.

12877

for (int i = 0; i != NumSubLaneElts; ++i) {

12878

int M = SubLaneMask[i];

12879

if (M < 0)

12880

continue;

12881

assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&(static_cast <bool> ((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask
[i] == M) && "Unexpected mask element") ? void (0) : __assert_fail
("(RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12882, __extension__ __PRETTY_FUNCTION__))

12882

"Unexpected mask element")(static_cast <bool> ((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask
[i] == M) && "Unexpected mask element") ? void (0) : __assert_fail
("(RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && \"Unexpected mask element\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12882, __extension__ __PRETTY_FUNCTION__));

12883

RepeatedSubLaneMask[i] = M;

12884

}

12885

12886

// Track the top most source sub-lane - by setting the remaining to UNDEF

12887

// we can greatly simplify shuffle matching.

12888

int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;

12889

TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);

12890

Dst2SrcSubLanes[DstSubLane] = SrcSubLane;

12891

break;

12892

}

12893

12894

// Bail if we failed to find a matching repeated sub-lane mask.

12895

if (Dst2SrcSubLanes[DstSubLane] < 0)

12896

return SDValue();

12897

}

12898

assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&(static_cast <bool> (0 <= TopSrcSubLane && TopSrcSubLane
< NumSubLanes && "Unexpected source lane") ? void
(0) : __assert_fail ("0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && \"Unexpected source lane\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12899, __extension__ __PRETTY_FUNCTION__))

12899

"Unexpected source lane")(static_cast <bool> (0 <= TopSrcSubLane && TopSrcSubLane
< NumSubLanes && "Unexpected source lane") ? void
(0) : __assert_fail ("0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && \"Unexpected source lane\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12899, __extension__ __PRETTY_FUNCTION__));

12900

12901

// Create a repeating shuffle mask for the entire vector.

12902

SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);

12903

for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {

12904

int Lane = SubLane / SubLaneScale;

12905

auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];

12906

for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {

12907

int M = RepeatedSubLaneMask[Elt];

12908

if (M < 0)

12909

continue;

12910

int Idx = (SubLane * NumSubLaneElts) + Elt;

12911

RepeatedMask[Idx] = M + (Lane * NumLaneElts);

12912

}

12913

}

12914

SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

12915

12916

// Shuffle each source sub-lane to its destination.

12917

SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);

12918

for (int i = 0; i != NumElts; i += NumSubLaneElts) {

12919

int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];

12920

if (SrcSubLane < 0)

12921

continue;

12922

for (int j = 0; j != NumSubLaneElts; ++j)

12923

SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);

12924

}

12925

12926

return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),

12927

SubLaneMask);

12928

}

12929

12930

static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,

12931

unsigned &ShuffleImm,

12932

ArrayRef<int> Mask) {

12933

int NumElts = VT.getVectorNumElements();

12934

assert(VT.getScalarSizeInBits() == 64 &&(static_cast <bool> (VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? void (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12936, __extension__ __PRETTY_FUNCTION__))

12935

(NumElts == 2 || NumElts == 4 || NumElts == 8) &&(static_cast <bool> (VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? void (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12936, __extension__ __PRETTY_FUNCTION__))

12936

"Unexpected data type for VSHUFPD")(static_cast <bool> (VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"
) ? void (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && \"Unexpected data type for VSHUFPD\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 12936, __extension__ __PRETTY_FUNCTION__));

12937

12938

// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..

12939

// Mask for V4F64; 0/1, 4/5, 2/3, 6/7..

12940

ShuffleImm = 0;

12941

bool ShufpdMask = true;

12942

bool CommutableMask = true;

12943

for (int i = 0; i < NumElts; ++i) {

12944

if (Mask[i] == SM_SentinelUndef)

12945

continue;

12946

if (Mask[i] < 0)

12947

return false;

12948

int Val = (i & 6) + NumElts * (i & 1);

12949

int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);

12950

if (Mask[i] < Val || Mask[i] > Val + 1)

12951

ShufpdMask = false;

12952

if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)

12953

CommutableMask = false;

12954

ShuffleImm |= (Mask[i] % 2) << i;

12955

}

12956

12957

if (ShufpdMask)

12958

return true;

12959

if (CommutableMask) {

12960

std::swap(V1, V2);

12961

return true;

12962

}

12963

12964

return false;

12965

}

12966

12967

/// Lower a v2f64/v4f64/v8f64 shuffle to an X86ISD::SHUFP node when the mask
/// matches (see matchVectorShuffleWithSHUFPD); otherwise return an empty
/// SDValue.
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  // The matcher may swap V1/V2 (they are local copies here) and fills in the
  // immediate.
  unsigned ShufImm = 0;
  const bool Matched = matchVectorShuffleWithSHUFPD(VT, V1, V2, ShufImm, Mask);
  if (!Matched)
    return SDValue();

  SDValue ImmOp = DAG.getConstant(ShufImm, DL, MVT::i8);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
}

12980

12981

/// Lower a shuffle to a variable permute: X86ISD::VPERMV when \p V2 is undef,
/// X86ISD::VPERMV3 otherwise. The shuffle mask is materialised as a constant
/// integer vector with the same element width and count as \p VT.
static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  const MVT IdxEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  const MVT IdxVecVT = MVT::getVectorVT(IdxEltVT, VT.getVectorNumElements());

  // NOTE(review): the trailing 'true' presumably asks getConstVector to treat
  // negative mask entries as undef - confirm against its declaration.
  SDValue MaskNode = getConstVector(Mask, IdxVecVT, DAG, DL, true);

  // Note the differing operand order: the index vector comes first for the
  // unary form and in the middle for the two-input form.
  if (!V2.isUndef())
    return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);

  return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
}

12993

12994

/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.

12995

///

12996

/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2

12997

/// isn't available.

12998

static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

12999

const APInt &Zeroable,

13000

SDValue V1, SDValue V2,

13001

const X86Subtarget &Subtarget,

13002

SelectionDAG &DAG) {

13003

assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v4f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v4f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13003, __extension__ __PRETTY_FUNCTION__));

13004

assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v4f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v4f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13004, __extension__ __PRETTY_FUNCTION__));

13005

13006

13007

if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,

13008

Zeroable, Subtarget, DAG))

13009

return V;

13010

13011

if (V2.isUndef()) {

13012

// Check for being able to broadcast a single element.

13013

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(

13014

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

13015

return Broadcast;

13016

13017

// Use low duplicate instructions for masks that match their pattern.

13018

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))

13019

return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

13020

13021

if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {

13022

// Non-half-crossing single input shuffles can be lowered with an

13023

// interleaved permutation.

13024

unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |

13025

((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);

13026

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,

13027

DAG.getConstant(VPERMILPMask, DL, MVT::i8));

13028

}

13029

13030

// With AVX2 we have direct support for this permutation.

13031

if (Subtarget.hasAVX2())

13032

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,

13033

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

13034

13035

// Try to create an in-lane repeating shuffle mask and then shuffle the

13036

// the results into the target lanes.

13037

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13038

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

13039

return V;

13040

13041

// Otherwise, fall back.

13042

return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,

13043

DAG, Subtarget);

13044

}

13045

13046

// Use dedicated unpack instructions for masks that match their pattern.

13047

if (SDValue V =

13048

lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))

13049

return V;

13050

13051

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,

13052

Zeroable, Subtarget, DAG))

13053

return Blend;

13054

13055

// Check if the blend happens to exactly fit that of SHUFPD.

13056

if (SDValue Op =

13057

lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))

13058

return Op;

13059

13060

// Try to create an in-lane repeating shuffle mask and then shuffle the

13061

// the results into the target lanes.

13062

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13063

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

13064

return V;

13065

13066

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13067

// shuffle. However, if we have AVX2 and either inputs are already in place,

13068

// we will be able to shuffle even across lanes the other input in a single

13069

// instruction so skip this pattern.

13070

if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||

13071

isShuffleMaskInputInPlace(1, Mask))))

13072

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13073

DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

13074

return Result;

13075

// If we have VLX support, we can use VEXPAND.

13076

if (Subtarget.hasVLX())

13077

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,

13078

V1, V2, DAG, Subtarget))

13079

return V;

13080

13081

// If we have AVX2 then we always want to lower with a blend because an v4 we

13082

// can fully permute the elements.

13083

if (Subtarget.hasAVX2())

13084

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,

13085

Mask, DAG);

13086

13087

// Otherwise fall back on generic lowering.

13088

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);

13089

}

13090

13091

/// \brief Handle lowering of 4-lane 64-bit integer shuffles.

13092

///

13093

/// This routine is only called when we have AVX2 and thus a reasonable

13094

/// instruction set for v4i64 shuffling..

13095

static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13096

const APInt &Zeroable,

13097

SDValue V1, SDValue V2,

13098

const X86Subtarget &Subtarget,

13099

SelectionDAG &DAG) {

13100

assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v4i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v4i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13100, __extension__ __PRETTY_FUNCTION__));

13101

assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v4i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v4i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13101, __extension__ __PRETTY_FUNCTION__));

13102

13103

assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v4i64 with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13103, __extension__ __PRETTY_FUNCTION__));

13104

13105

if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,

13106

Zeroable, Subtarget, DAG))

13107

return V;

13108

13109

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,

13110

Zeroable, Subtarget, DAG))

13111

return Blend;

13112

13113

// Check for being able to broadcast a single element.

13114

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,

13115

Mask, Subtarget, DAG))

13116

return Broadcast;

13117

13118

if (V2.isUndef()) {

13119

// When the shuffle is mirrored between the 128-bit lanes of the unit, we

13120

// can use lower latency instructions that will operate on both lanes.

13121

SmallVector<int, 2> RepeatedMask;

13122

if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {

13123

SmallVector<int, 4> PSHUFDMask;

13124

scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);

13125

return DAG.getBitcast(

13126

MVT::v4i64,

13127

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,

13128

DAG.getBitcast(MVT::v8i32, V1),

13129

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

13130

}

13131

13132

// AVX2 provides a direct instruction for permuting a single input across

13133

// lanes.

13134

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,

13135

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

13136

}

13137

13138

// Try to use shift instructions.

13139

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,

13140

Zeroable, Subtarget, DAG))

13141

return Shift;

13142

13143

// If we have VLX support, we can use VALIGN or VEXPAND.

13144

if (Subtarget.hasVLX()) {

13145

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,

13146

Mask, Subtarget, DAG))

13147

return Rotate;

13148

13149

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,

13150

V1, V2, DAG, Subtarget))

13151

return V;

13152

}

13153

13154

// Try to use PALIGNR.

13155

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,

13156

Mask, Subtarget, DAG))

13157

return Rotate;

13158

13159

// Use dedicated unpack instructions for masks that match their pattern.

13160

if (SDValue V =

13161

lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))

13162

return V;

13163

13164

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13165

// shuffle. However, if we have AVX2 and either inputs are already in place,

13166

// we will be able to shuffle even across lanes the other input in a single

13167

// instruction so skip this pattern.

13168

if (!isShuffleMaskInputInPlace(0, Mask) &&

13169

!isShuffleMaskInputInPlace(1, Mask))

13170

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13171

DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))

13172

return Result;

13173

13174

// Otherwise fall back on generic blend lowering.

13175

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,

13176

Mask, DAG);

13177

}

13178

13179

/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.

13180

///

13181

/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2

13182

/// isn't available.

13183

static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13184

const APInt &Zeroable,

13185

SDValue V1, SDValue V2,

13186

const X86Subtarget &Subtarget,

13187

SelectionDAG &DAG) {

13188

assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v8f32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v8f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13188, __extension__ __PRETTY_FUNCTION__));

13189

assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8f32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13189, __extension__ __PRETTY_FUNCTION__));

13190

13191

13192

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,

13193

Zeroable, Subtarget, DAG))

13194

return Blend;

13195

13196

// Check for being able to broadcast a single element.

13197

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,

13198

Mask, Subtarget, DAG))

13199

return Broadcast;

13200

13201

// If the shuffle mask is repeated in each 128-bit lane, we have many more

13202

// options to efficiently lower the shuffle.

13203

SmallVector<int, 4> RepeatedMask;

13204

if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {

13205

assert(RepeatedMask.size() == 4 &&(static_cast <bool> (RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!") ? void (0) : __assert_fail
("RepeatedMask.size() == 4 && \"Repeated masks must be half the mask width!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13206, __extension__ __PRETTY_FUNCTION__))

13206

"Repeated masks must be half the mask width!")(static_cast <bool> (RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!") ? void (0) : __assert_fail
("RepeatedMask.size() == 4 && \"Repeated masks must be half the mask width!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13206, __extension__ __PRETTY_FUNCTION__));

13207

13208

// Use even/odd duplicate instructions for masks that match their pattern.

13209

if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))

13210

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);

13211

if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))

13212

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

13213

13214

if (V2.isUndef())

13215

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,

13216

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

13217

13218

// Use dedicated unpack instructions for masks that match their pattern.

13219

if (SDValue V =

13220

lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))

13221

return V;

13222

13223

// Otherwise, fall back to a SHUFPS sequence. Here it is important that we

13224

// have already handled any direct blends.

13225

return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);

13226

}

13227

13228

// Try to create an in-lane repeating shuffle mask and then shuffle the

13229

// the results into the target lanes.

13230

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13231

DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))

13232

return V;

13233

13234

// If we have a single input shuffle with different shuffle patterns in the

13235

// two 128-bit lanes use the variable mask to VPERMILPS.

13236

if (V2.isUndef()) {

13237

SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);

13238

if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))

13239

return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

13240

13241

if (Subtarget.hasAVX2())

13242

return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

13243

13244

// Otherwise, fall back.

13245

return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,

13246

DAG, Subtarget);

13247

}

13248

13249

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13250

// shuffle.

13251

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13252

DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))

13253

return Result;

13254

// If we have VLX support, we can use VEXPAND.

13255

if (Subtarget.hasVLX())

13256

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,

13257

V1, V2, DAG, Subtarget))

13258

return V;

13259

13260

// For non-AVX512 if the Mask is of 16bit elements in lane then try to split

13261

// since after split we get a more efficient code using vpunpcklwd and

13262

// vpunpckhwd instrs than vblend.

13263

if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))

13264

if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,

13265

Mask, DAG))

13266

return V;

13267

13268

// If we have AVX2 then we always want to lower with a blend because at v8 we

13269

// can fully permute the elements.

13270

if (Subtarget.hasAVX2())

13271

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,

13272

Mask, DAG);

13273

13274

// Otherwise fall back on generic lowering.

13275

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);

13276

}

13277

13278

/// \brief Handle lowering of 8-lane 32-bit integer shuffles.

13279

///

13280

/// This routine is only called when we have AVX2 and thus a reasonable

13281

/// instruction set for v8i32 shuffling..

13282

static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13283

const APInt &Zeroable,

13284

SDValue V1, SDValue V2,

13285

const X86Subtarget &Subtarget,

13286

SelectionDAG &DAG) {

13287

assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v8i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13287, __extension__ __PRETTY_FUNCTION__));

13288

assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13288, __extension__ __PRETTY_FUNCTION__));

13289

13290

assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v8i32 with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13290, __extension__ __PRETTY_FUNCTION__));

13291

13292

// Whenever we can lower this as a zext, that instruction is strictly faster

13293

// than any alternative. It also allows us to fold memory operands into the

13294

// shuffle in many cases.

13295

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

13296

DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

13297

return ZExt;

13298

13299

// For non-AVX512 if the Mask is of 16bit elements in lane then try to split

13300

// since after split we get a more efficient code than vblend by using

13301

// vpunpcklwd and vpunpckhwd instrs.

13302

if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&

13303

!Subtarget.hasAVX512())

13304

if (SDValue V =

13305

lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))

13306

return V;

13307

13308

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,

13309

Zeroable, Subtarget, DAG))

13310

return Blend;

13311

13312

// Check for being able to broadcast a single element.

13313

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,

13314

Mask, Subtarget, DAG))

13315

return Broadcast;

13316

13317

// If the shuffle mask is repeated in each 128-bit lane we can use more

13318

// efficient instructions that mirror the shuffles across the two 128-bit

13319

// lanes.

13320

SmallVector<int, 4> RepeatedMask;

13321

bool Is128BitLaneRepeatedShuffle =

13322

is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);

13323

if (Is128BitLaneRepeatedShuffle) {

13324

assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")(static_cast <bool> (RepeatedMask.size() == 4 &&
"Unexpected repeated mask size!") ? void (0) : __assert_fail
("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13324, __extension__ __PRETTY_FUNCTION__));

13325

if (V2.isUndef())

13326

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,

13327

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

13328

13329

// Use dedicated unpack instructions for masks that match their pattern.

13330

if (SDValue V =

13331

lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))

13332

return V;

13333

}

13334

13335

// Try to use shift instructions.

13336

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,

13337

Zeroable, Subtarget, DAG))

13338

return Shift;

13339

13340

// If we have VLX support, we can use VALIGN or EXPAND.

13341

if (Subtarget.hasVLX()) {

13342

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,

13343

Mask, Subtarget, DAG))

13344

return Rotate;

13345

13346

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,

13347

V1, V2, DAG, Subtarget))

13348

return V;

13349

}

13350

13351

// Try to use byte rotation instructions.

13352

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

13353

DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))

13354

return Rotate;

13355

13356

// Try to create an in-lane repeating shuffle mask and then shuffle the

13357

// results into the target lanes.

13358

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13359

DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))

13360

return V;

13361

13362

// If the shuffle patterns aren't repeated but it is a single input, directly

13363

// generate a cross-lane VPERMD instruction.

13364

if (V2.isUndef()) {

13365

SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);

13366

return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);

13367

}

13368

13369

// Assume that a single SHUFPS is faster than an alternative sequence of

13370

// multiple instructions (even if the CPU has a domain penalty).

13371

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

13372

if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {

13373

SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);

13374

SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);

13375

SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,

13376

CastV1, CastV2, DAG);

13377

return DAG.getBitcast(MVT::v8i32, ShufPS);

13378

}

13379

13380

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13381

// shuffle.

13382

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13383

DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))

13384

return Result;

13385

13386

// Otherwise fall back on generic blend lowering.

13387

return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,

13388

Mask, DAG);

13389

}

13390

13391

/// \brief Handle lowering of 16-lane 16-bit integer shuffles.

13392

///

13393

/// This routine is only called when we have AVX2 and thus a reasonable

13394

/// instruction set for v16i16 shuffling..

13395

static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13396

const APInt &Zeroable,

13397

SDValue V1, SDValue V2,

13398

const X86Subtarget &Subtarget,

13399

SelectionDAG &DAG) {

13400

assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v16i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v16i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13400, __extension__ __PRETTY_FUNCTION__));

13401

assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v16i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v16i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13401, __extension__ __PRETTY_FUNCTION__));

13402

13403

assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v16i16 with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13403, __extension__ __PRETTY_FUNCTION__));

13404

13405

// Whenever we can lower this as a zext, that instruction is strictly faster

13406

// than any alternative. It also allows us to fold memory operands into the

13407

// shuffle in many cases.

13408

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

13409

DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

13410

return ZExt;

13411

13412

// Check for being able to broadcast a single element.

13413

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,

13414

Mask, Subtarget, DAG))

13415

return Broadcast;

13416

13417

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,

13418

Zeroable, Subtarget, DAG))

13419

return Blend;

13420

13421

// Use dedicated unpack instructions for masks that match their pattern.

13422

if (SDValue V =

13423

lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))

13424

return V;

13425

13426

// Use dedicated pack instructions for masks that match their pattern.

13427

if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,

13428

Subtarget))

13429

return V;

13430

13431

// Try to use shift instructions.

13432

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,

13433

Zeroable, Subtarget, DAG))

13434

return Shift;

13435

13436

// Try to use byte rotation instructions.

13437

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

13438

DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))

13439

return Rotate;

13440

13441

// Try to create an in-lane repeating shuffle mask and then shuffle the

13442

// the results into the target lanes.

13443

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13444

DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))

13445

return V;

13446

13447

if (V2.isUndef()) {

13448

// There are no generalized cross-lane shuffle operations available on i16

13449

// element types.

13450

if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))

13451

return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,

13452

Mask, DAG, Subtarget);

13453

13454

SmallVector<int, 8> RepeatedMask;

13455

if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {

13456

// As this is a single-input shuffle, the repeated mask should be

13457

// a strictly valid v8i16 mask that we can pass through to the v8i16

13458

// lowering to handle even the v16 case.

13459

return lowerV8I16GeneralSingleInputVectorShuffle(

13460

DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);

13461

}

13462

}

13463

13464

if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(

13465

DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))

13466

return PSHUFB;

13467

13468

// AVX512BWVL can lower to VPERMW.

13469

if (Subtarget.hasBWI() && Subtarget.hasVLX())

13470

return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

13471

13472

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13473

// shuffle.

13474

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13475

DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))

13476

return Result;

13477

13478

// Otherwise fall back on generic lowering.

13479

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);

13480

}

13481

13482

/// \brief Handle lowering of 32-lane 8-bit integer shuffles.

13483

///

13484

/// This routine is only called when we have AVX2 and thus a reasonable

13485

/// instruction set for v32i8 shuffling..

13486

static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13487

const APInt &Zeroable,

13488

SDValue V1, SDValue V2,

13489

const X86Subtarget &Subtarget,

13490

SelectionDAG &DAG) {

13491

assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v32i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v32i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13491, __extension__ __PRETTY_FUNCTION__));

13492

assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v32i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v32i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13492, __extension__ __PRETTY_FUNCTION__));

13493

assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")(static_cast <bool> (Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 32 && \"Unexpected mask size for v32 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13493, __extension__ __PRETTY_FUNCTION__));

13494

assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v32i8 with AVX2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13494, __extension__ __PRETTY_FUNCTION__));

13495

13496

// Whenever we can lower this as a zext, that instruction is strictly faster

13497

// than any alternative. It also allows us to fold memory operands into the

13498

// shuffle in many cases.

13499

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

13500

DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

13501

return ZExt;

13502

13503

// Check for being able to broadcast a single element.

13504

if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,

13505

Mask, Subtarget, DAG))

13506

return Broadcast;

13507

13508

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,

13509

Zeroable, Subtarget, DAG))

13510

return Blend;

13511

13512

// Use dedicated unpack instructions for masks that match their pattern.

13513

if (SDValue V =

13514

lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))

13515

return V;

13516

13517

// Use dedicated pack instructions for masks that match their pattern.

13518

if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,

13519

Subtarget))

13520

return V;

13521

13522

// Try to use shift instructions.

13523

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,

13524

Zeroable, Subtarget, DAG))

13525

return Shift;

13526

13527

// Try to use byte rotation instructions.

13528

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

13529

DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))

13530

return Rotate;

13531

13532

// Try to create an in-lane repeating shuffle mask and then shuffle the

13533

// the results into the target lanes.

13534

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

13535

DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))

13536

return V;

13537

13538

// There are no generalized cross-lane shuffle operations available on i8

13539

// element types.

13540

if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))

13541

return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,

13542

DAG, Subtarget);

13543

13544

if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(

13545

DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))

13546

return PSHUFB;

13547

13548

// Try to simplify this by merging 128-bit lanes to enable a lane-based

13549

// shuffle.

13550

if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(

13551

DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))

13552

return Result;

13553

13554

// Otherwise fall back on generic lowering.

13555

return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);

13556

}

13557

13558

/// \brief High-level routine to lower various 256-bit x86 vector shuffles.

13559

///

13560

/// This routine either breaks down the specific type of a 256-bit x86 vector

13561

/// shuffle or splits it into two 128-bit shuffles and fuses the results back

13562

/// together based on the available instructions.

13563

static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13564

MVT VT, SDValue V1, SDValue V2,

13565

const APInt &Zeroable,

13566

const X86Subtarget &Subtarget,

13567

SelectionDAG &DAG) {

13568

// If we have a single input to the zero element, insert that into V1 if we

13569

// can do so cheaply.

13570

int NumElts = VT.getVectorNumElements();

13571

int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

13572

13573

if (NumV2Elements == 1 && Mask[0] >= NumElts)

13574

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

13575

DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))

13576

return Insertion;

13577

13578

// Handle special cases where the lower or upper half is UNDEF.

13579

if (SDValue V =

13580

lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))

13581

return V;

13582

13583

// There is a really nice hard cut-over between AVX1 and AVX2 that means we

13584

// can check for those subtargets here and avoid much of the subtarget

13585

// querying in the per-vector-type lowering routines. With AVX1 we have

13586

// essentially *zero* ability to manipulate a 256-bit vector with integer

13587

// types. Since we'll use floating point types there eventually, just

13588

// immediately cast everything to a float and operate entirely in that domain.

13589

if (VT.isInteger() && !Subtarget.hasAVX2()) {

13590

int ElementBits = VT.getScalarSizeInBits();

13591

if (ElementBits < 32) {

13592

// No floating point type available, if we can't use the bit operations

13593

// for masking/blending then decompose into 128-bit vectors.

13594

if (SDValue V =

13595

lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))

13596

return V;

13597

if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))

13598

return V;

13599

return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

13600

}

13601

13602

MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),

13603

VT.getVectorNumElements());

13604

V1 = DAG.getBitcast(FpVT, V1);

13605

V2 = DAG.getBitcast(FpVT, V2);

13606

return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));

13607

}

13608

13609

switch (VT.SimpleTy) {

13610

case MVT::v4f64:

13611

return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13612

case MVT::v4i64:

13613

return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13614

case MVT::v8f32:

13615

return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13616

case MVT::v8i32:

13617

return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13618

case MVT::v16i16:

13619

return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13620

case MVT::v32i8:

13621

return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

13622

13623

default:

13624

llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13624);

13625

}

13626

}

13627

13628

/// \brief Try to lower a vector shuffle as a 128-bit shuffles.

13629

static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,

13630

ArrayRef<int> Mask, SDValue V1,

13631

SDValue V2, SelectionDAG &DAG) {

13632

assert(VT.getScalarSizeInBits() == 64 &&(static_cast <bool> (VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.") ? void (
0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13633, __extension__ __PRETTY_FUNCTION__))

13633

"Unexpected element type size for 128bit shuffle.")(static_cast <bool> (VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.") ? void (
0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13633, __extension__ __PRETTY_FUNCTION__));

13634

13635

// To handle 256 bit vector requires VLX and most probably

13636

// function lowerV2X128VectorShuffle() is better solution.

13637

assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.")(static_cast <bool> (VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."
) ? void (0) : __assert_fail ("VT.is512BitVector() && \"Unexpected vector size for 512bit shuffle.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13637, __extension__ __PRETTY_FUNCTION__));

13638

13639

SmallVector<int, 4> WidenedMask;

13640

if (!canWidenShuffleElements(Mask, WidenedMask))

13641

return SDValue();

13642

13643

// Check for patterns which can be matched with a single insert of a 256-bit

13644

// subvector.

13645

bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,

13646

{0, 1, 2, 3, 0, 1, 2, 3});

13647

if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,

13648

{0, 1, 2, 3, 8, 9, 10, 11})) {

13649

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);

13650

SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,

13651

DAG.getIntPtrConstant(0, DL));

13652

SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,

13653

OnlyUsesV1 ? V1 : V2,

13654

DAG.getIntPtrConstant(0, DL));

13655

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);

13656

}

13657

13658

assert(WidenedMask.size() == 4)(static_cast <bool> (WidenedMask.size() == 4) ? void (0
) : __assert_fail ("WidenedMask.size() == 4", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13658, __extension__ __PRETTY_FUNCTION__));

13659

13660

// See if this is an insertion of the lower 128-bits of V2 into V1.

13661

bool IsInsert = true;

13662

int V2Index = -1;

13663

for (int i = 0; i < 4; ++i) {

13664

assert(WidenedMask[i] >= -1)(static_cast <bool> (WidenedMask[i] >= -1) ? void (0
) : __assert_fail ("WidenedMask[i] >= -1", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13664, __extension__ __PRETTY_FUNCTION__));

13665

if (WidenedMask[i] < 0)

13666

continue;

13667

13668

// Make sure all V1 subvectors are in place.

13669

if (WidenedMask[i] < 4) {

13670

if (WidenedMask[i] != i) {

13671

IsInsert = false;

13672

break;

13673

}

13674

} else {

13675

// Make sure we only have a single V2 index and its the lowest 128-bits.

13676

if (V2Index >= 0 || WidenedMask[i] != 4) {

13677

IsInsert = false;

13678

break;

13679

}

13680

V2Index = i;

13681

}

13682

}

13683

if (IsInsert && V2Index >= 0) {

13684

MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);

13685

SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,

13686

DAG.getIntPtrConstant(0, DL));

13687

return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);

13688

}

13689

13690

// Try to lower to to vshuf64x2/vshuf32x4.

13691

SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

13692

unsigned PermMask = 0;

13693

// Insure elements came from the same Op.

13694

for (int i = 0; i < 4; ++i) {

13695

13696

if (WidenedMask[i] < 0)

13697

continue;

13698

13699

SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;

13700

unsigned OpIndex = i / 2;

13701

if (Ops[OpIndex].isUndef())

13702

Ops[OpIndex] = Op;

13703

else if (Ops[OpIndex] != Op)

13704

return SDValue();

13705

13706

// Convert the 128-bit shuffle mask selection values into 128-bit selection

13707

// bits defined by a vshuf64x2 instruction's immediate control byte.

13708

PermMask |= (WidenedMask[i] % 4) << (i * 2);

13709

}

13710

13711

return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],

13712

DAG.getConstant(PermMask, DL, MVT::i8));

13713

}

13714

13715

/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.

13716

static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13717

const APInt &Zeroable,

13718

SDValue V1, SDValue V2,

13719

const X86Subtarget &Subtarget,

13720

SelectionDAG &DAG) {

13721

assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v8f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v8f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13721, __extension__ __PRETTY_FUNCTION__));

13722

assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8f64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8f64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13722, __extension__ __PRETTY_FUNCTION__));

13723

13724

13725

if (V2.isUndef()) {

13726

// Use low duplicate instructions for masks that match their pattern.

13727

if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))

13728

return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

13729

13730

if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {

13731

// Non-half-crossing single input shuffles can be lowered with an

13732

// interleaved permutation.

13733

unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |

13734

((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |

13735

((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |

13736

((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);

13737

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,

13738

DAG.getConstant(VPERMILPMask, DL, MVT::i8));

13739

}

13740

13741

SmallVector<int, 4> RepeatedMask;

13742

if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))

13743

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,

13744

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

13745

}

13746

13747

if (SDValue Shuf128 =

13748

lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))

13749

return Shuf128;

13750

13751

if (SDValue Unpck =

13752

lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))

13753

return Unpck;

13754

13755

// Check if the blend happens to exactly fit that of SHUFPD.

13756

if (SDValue Op =

13757

lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))

13758

return Op;

13759

13760

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,

13761

V2, DAG, Subtarget))

13762

return V;

13763

13764

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,

13765

Zeroable, Subtarget, DAG))

13766

return Blend;

13767

13768

return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);

13769

}

13770

13771

/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.

13772

static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13773

const APInt &Zeroable,

13774

SDValue V1, SDValue V2,

13775

const X86Subtarget &Subtarget,

13776

SelectionDAG &DAG) {

13777

assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v16f32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v16f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13777, __extension__ __PRETTY_FUNCTION__));

13778

assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v16f32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v16f32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13778, __extension__ __PRETTY_FUNCTION__));

13779

13780

13781

// If the shuffle mask is repeated in each 128-bit lane, we have many more

13782

// options to efficiently lower the shuffle.

13783

SmallVector<int, 4> RepeatedMask;

13784

if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {

13785

13786

13787

// Use even/odd duplicate instructions for masks that match their pattern.

13788

if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))

13789

return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);

13790

if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))

13791

return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

13792

13793

if (V2.isUndef())

13794

return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,

13795

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

13796

13797

// Use dedicated unpack instructions for masks that match their pattern.

13798

if (SDValue Unpck =

13799

lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))

13800

return Unpck;

13801

13802

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,

13803

Zeroable, Subtarget, DAG))

13804

return Blend;

13805

13806

// Otherwise, fall back to a SHUFPS sequence.

13807

return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);

13808

}

13809

13810

// If we have a single input shuffle with different shuffle patterns in the

13811

// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.

13812

if (V2.isUndef() &&

13813

!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {

13814

SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);

13815

return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);

13816

}

13817

13818

// If we have AVX512F support, we can use VEXPAND.

13819

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,

13820

V1, V2, DAG, Subtarget))

13821

return V;

13822

13823

return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);

13824

}

13825

13826

/// \brief Handle lowering of 8-lane 64-bit integer shuffles.

13827

static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13828

const APInt &Zeroable,

13829

SDValue V1, SDValue V2,

13830

const X86Subtarget &Subtarget,

13831

SelectionDAG &DAG) {

13832

assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v8i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v8i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13832, __extension__ __PRETTY_FUNCTION__));

13833

assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8i64
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8i64 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13833, __extension__ __PRETTY_FUNCTION__));

13834

13835

13836

if (V2.isUndef()) {

13837

// When the shuffle is mirrored between the 128-bit lanes of the unit, we

13838

// can use lower latency instructions that will operate on all four

13839

// 128-bit lanes.

13840

SmallVector<int, 2> Repeated128Mask;

13841

if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {

13842

SmallVector<int, 4> PSHUFDMask;

13843

scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);

13844

return DAG.getBitcast(

13845

MVT::v8i64,

13846

DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,

13847

DAG.getBitcast(MVT::v16i32, V1),

13848

getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

13849

}

13850

13851

SmallVector<int, 4> Repeated256Mask;

13852

if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))

13853

return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,

13854

getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));

13855

}

13856

13857

if (SDValue Shuf128 =

13858

lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))

13859

return Shuf128;

13860

13861

// Try to use shift instructions.

13862

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,

13863

Zeroable, Subtarget, DAG))

13864

return Shift;

13865

13866

// Try to use VALIGN.

13867

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,

13868

Mask, Subtarget, DAG))

13869

return Rotate;

13870

13871

// Try to use PALIGNR.

13872

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,

13873

Mask, Subtarget, DAG))

13874

return Rotate;

13875

13876

if (SDValue Unpck =

13877

lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))

13878

return Unpck;

13879

// If we have AVX512F support, we can use VEXPAND.

13880

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,

13881

V2, DAG, Subtarget))

13882

return V;

13883

13884

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,

13885

Zeroable, Subtarget, DAG))

13886

return Blend;

13887

13888

return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);

13889

}

13890

13891

/// \brief Handle lowering of 16-lane 32-bit integer shuffles.

13892

static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13893

const APInt &Zeroable,

13894

SDValue V1, SDValue V2,

13895

const X86Subtarget &Subtarget,

13896

SelectionDAG &DAG) {

13897

assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v16i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v16i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13897, __extension__ __PRETTY_FUNCTION__));

13898

assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v16i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v16i32 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13898, __extension__ __PRETTY_FUNCTION__));

13899

13900

13901

// Whenever we can lower this as a zext, that instruction is strictly faster

13902

// than any alternative. It also allows us to fold memory operands into the

13903

// shuffle in many cases.

13904

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

13905

DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

13906

return ZExt;

13907

13908

// If the shuffle mask is repeated in each 128-bit lane we can use more

13909

// efficient instructions that mirror the shuffles across the four 128-bit

13910

// lanes.

13911

SmallVector<int, 4> RepeatedMask;

13912

bool Is128BitLaneRepeatedShuffle =

13913

is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);

13914

if (Is128BitLaneRepeatedShuffle) {

13915

13916

if (V2.isUndef())

13917

return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,

13918

getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

13919

13920

// Use dedicated unpack instructions for masks that match their pattern.

13921

if (SDValue V =

13922

lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))

13923

return V;

13924

}

13925

13926

// Try to use shift instructions.

13927

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,

13928

Zeroable, Subtarget, DAG))

13929

return Shift;

13930

13931

// Try to use VALIGN.

13932

if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,

13933

Mask, Subtarget, DAG))

13934

return Rotate;

13935

13936

// Try to use byte rotation instructions.

13937

if (Subtarget.hasBWI())

13938

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

13939

DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))

13940

return Rotate;

13941

13942

// Assume that a single SHUFPS is faster than using a permv shuffle.

13943

// If some CPU is harmed by the domain switch, we can fix it in a later pass.

13944

if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {

13945

SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);

13946

SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);

13947

SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,

13948

CastV1, CastV2, DAG);

13949

return DAG.getBitcast(MVT::v16i32, ShufPS);

13950

}

13951

// If we have AVX512F support, we can use VEXPAND.

13952

if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,

13953

V1, V2, DAG, Subtarget))

13954

return V;

13955

13956

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,

13957

Zeroable, Subtarget, DAG))

13958

return Blend;

13959

return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);

13960

}

13961

13962

/// \brief Handle lowering of 32-lane 16-bit integer shuffles.

13963

static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

13964

const APInt &Zeroable,

13965

SDValue V1, SDValue V2,

13966

const X86Subtarget &Subtarget,

13967

SelectionDAG &DAG) {

13968

assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v32i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v32i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13968, __extension__ __PRETTY_FUNCTION__));

13969

assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v32i16
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v32i16 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13969, __extension__ __PRETTY_FUNCTION__));

13970

13971

assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")(static_cast <bool> (Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"
) ? void (0) : __assert_fail ("Subtarget.hasBWI() && \"We can only lower v32i16 with AVX-512-BWI!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 13971, __extension__ __PRETTY_FUNCTION__));

13972

13973

// Whenever we can lower this as a zext, that instruction is strictly faster

13974

// than any alternative. It also allows us to fold memory operands into the

13975

// shuffle in many cases.

13976

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

13977

DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

13978

return ZExt;

13979

13980

// Use dedicated unpack instructions for masks that match their pattern.

13981

if (SDValue V =

13982

lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))

13983

return V;

13984

13985

// Try to use shift instructions.

13986

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,

13987

Zeroable, Subtarget, DAG))

13988

return Shift;

13989

13990

// Try to use byte rotation instructions.

13991

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

13992

DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))

13993

return Rotate;

13994

13995

if (V2.isUndef()) {

13996

SmallVector<int, 8> RepeatedMask;

13997

if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {

13998

// As this is a single-input shuffle, the repeated mask should be

13999

// a strictly valid v8i16 mask that we can pass through to the v8i16

14000

// lowering to handle even the v32 case.

14001

return lowerV8I16GeneralSingleInputVectorShuffle(

14002

DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);

14003

}

14004

}

14005

14006

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,

14007

Zeroable, Subtarget, DAG))

14008

return Blend;

14009

14010

return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);

14011

}

14012

14013

/// \brief Handle lowering of 64-lane 8-bit integer shuffles.

14014

static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

14015

const APInt &Zeroable,

14016

SDValue V1, SDValue V2,

14017

const X86Subtarget &Subtarget,

14018

SelectionDAG &DAG) {

14019

assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")(static_cast <bool> (V1.getSimpleValueType() == MVT::v64i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V1.getSimpleValueType() == MVT::v64i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14019, __extension__ __PRETTY_FUNCTION__));

14020

assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v64i8
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v64i8 && \"Bad operand type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14020, __extension__ __PRETTY_FUNCTION__));

14021

assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!")(static_cast <bool> (Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 64 && \"Unexpected mask size for v64 shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14021, __extension__ __PRETTY_FUNCTION__));

14022

assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!")(static_cast <bool> (Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"
) ? void (0) : __assert_fail ("Subtarget.hasBWI() && \"We can only lower v64i8 with AVX-512-BWI!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14022, __extension__ __PRETTY_FUNCTION__));

14023

14024

// Whenever we can lower this as a zext, that instruction is strictly faster

14025

// than any alternative. It also allows us to fold memory operands into the

14026

// shuffle in many cases.

14027

if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(

14028

DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

14029

return ZExt;

14030

14031

// Use dedicated unpack instructions for masks that match their pattern.

14032

if (SDValue V =

14033

lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))

14034

return V;

14035

14036

// Try to use shift instructions.

14037

if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,

14038

Zeroable, Subtarget, DAG))

14039

return Shift;

14040

14041

// Try to use byte rotation instructions.

14042

if (SDValue Rotate = lowerVectorShuffleAsByteRotate(

14043

DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))

14044

return Rotate;

14045

14046

if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(

14047

DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))

14048

return PSHUFB;

14049

14050

// VBMI can use VPERMV/VPERMV3 byte shuffles.

14051

if (Subtarget.hasVBMI())

14052

return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

14053

14054

// Try to create an in-lane repeating shuffle mask and then shuffle the

14055

// the results into the target lanes.

14056

if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(

14057

DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))

14058

return V;

14059

14060

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,

14061

Zeroable, Subtarget, DAG))

14062

return Blend;

14063

14064

// FIXME: Implement direct support for this type!

14065

return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);

14066

}

14067

14068

/// \brief High-level routine to lower various 512-bit x86 vector shuffles.

14069

///

14070

/// This routine either breaks down the specific type of a 512-bit x86 vector

14071

/// shuffle or splits it into two 256-bit shuffles and fuses the results back

14072

/// together based on the available instructions.

14073

static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

14074

MVT VT, SDValue V1, SDValue V2,

14075

const APInt &Zeroable,

14076

const X86Subtarget &Subtarget,

14077

SelectionDAG &DAG) {

14078

assert(Subtarget.hasAVX512() &&(static_cast <bool> (Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14079, __extension__ __PRETTY_FUNCTION__))

14079

"Cannot lower 512-bit vectors w/ basic ISA!")(static_cast <bool> (Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14079, __extension__ __PRETTY_FUNCTION__));

14080

14081

// If we have a single input to the zero element, insert that into V1 if we

14082

// can do so cheaply.

14083

int NumElts = Mask.size();

14084

int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

14085

14086

if (NumV2Elements == 1 && Mask[0] >= NumElts)

14087

if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(

14088

DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))

14089

return Insertion;

14090

14091

// Handle special cases where the lower or upper half is UNDEF.

14092

if (SDValue V =

14093

lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))

14094

return V;

14095

14096

// Check for being able to broadcast a single element.

14097

if (SDValue Broadcast =

14098

lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))

14099

return Broadcast;

14100

14101

// Dispatch to each element type for lowering. If we don't have support for

14102

// specific element type shuffles at 512 bits, immediately split them and

14103

// lower them. Each lowering routine of a given type is allowed to assume that

14104

// the requisite ISA extensions for that element type are available.

14105

switch (VT.SimpleTy) {

14106

case MVT::v8f64:

14107

return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14108

case MVT::v16f32:

14109

return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14110

case MVT::v8i64:

14111

return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14112

case MVT::v16i32:

14113

return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14114

case MVT::v32i16:

14115

return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14116

case MVT::v64i8:

14117

return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

14118

14119

default:

14120

llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14120);

14121

}

14122

}

14123

14124

// Lower vXi1 vector shuffles.

14125

// There is no dedicated instruction on AVX-512 that shuffles the masks.

14126

// The only way to shuffle bits is to sign-extend the mask vector to SIMD

14127

// vector, shuffle and then truncate it back.

14128

static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,

14129

MVT VT, SDValue V1, SDValue V2,

14130

const X86Subtarget &Subtarget,

14131

SelectionDAG &DAG) {

14132

assert(Subtarget.hasAVX512() &&(static_cast <bool> (Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14133, __extension__ __PRETTY_FUNCTION__))

14133

"Cannot lower 512-bit vectors w/o basic ISA!")(static_cast <bool> (Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14133, __extension__ __PRETTY_FUNCTION__));

14134

MVT ExtVT;

14135

switch (VT.SimpleTy) {

14136

default:

14137

llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14137);

14138

case MVT::v2i1:

14139

ExtVT = MVT::v2i64;

14140

break;

14141

case MVT::v4i1:

14142

ExtVT = MVT::v4i32;

14143

break;

14144

case MVT::v8i1:

14145

ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL

14146

break;

14147

case MVT::v16i1:

14148

ExtVT = MVT::v16i32;

14149

break;

14150

case MVT::v32i1:

14151

ExtVT = MVT::v32i16;

14152

break;

14153

case MVT::v64i1:

14154

ExtVT = MVT::v64i8;

14155

break;

14156

}

14157

14158

if (ISD::isBuildVectorAllZeros(V1.getNode()))

14159

V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);

14160

else if (ISD::isBuildVectorAllOnes(V1.getNode()))

14161

V1 = getOnesVector(ExtVT, DAG, DL);

14162

else

14163

V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

14164

14165

if (V2.isUndef())

14166

V2 = DAG.getUNDEF(ExtVT);

14167

else if (ISD::isBuildVectorAllZeros(V2.getNode()))

14168

V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);

14169

else if (ISD::isBuildVectorAllOnes(V2.getNode()))

14170

V2 = getOnesVector(ExtVT, DAG, DL);

14171

else

14172

V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

14173

14174

SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);

14175

// i1 was sign extended we can use X86ISD::CVT2MASK.

14176

int NumElems = VT.getVectorNumElements();

14177

if ((Subtarget.hasBWI() && (NumElems >= 32)) ||

14178

(Subtarget.hasDQI() && (NumElems < 32)))

14179

return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);

14180

14181

return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);

14182

}

14183

14184

/// Helper function that returns true if the shuffle mask should be

14185

/// commuted to improve canonicalization.

14186

static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {

14187

int NumElements = Mask.size();

14188

14189

int NumV1Elements = 0, NumV2Elements = 0;

14190

for (int M : Mask)

14191

if (M < 0)

14192

continue;

14193

else if (M < NumElements)

14194

++NumV1Elements;

14195

else

14196

++NumV2Elements;

14197

14198

// Commute the shuffle as needed such that more elements come from V1 than

14199

// V2. This allows us to match the shuffle pattern strictly on how many

14200

// elements come from V1 without handling the symmetric cases.

14201

if (NumV2Elements > NumV1Elements)

14202

return true;

14203

14204

assert(NumV1Elements > 0 && "No V1 indices")(static_cast <bool> (NumV1Elements > 0 && "No V1 indices"
) ? void (0) : __assert_fail ("NumV1Elements > 0 && \"No V1 indices\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14204, __extension__ __PRETTY_FUNCTION__));

14205

14206

if (NumV2Elements == 0)

14207

return false;

14208

14209

// When the number of V1 and V2 elements are the same, try to minimize the

14210

// number of uses of V2 in the low half of the vector. When that is tied,

14211

// ensure that the sum of indices for V1 is equal to or lower than the sum

14212

// indices for V2. When those are equal, try to ensure that the number of odd

14213

// indices for V1 is lower than the number of odd indices for V2.

14214

if (NumV1Elements == NumV2Elements) {

14215

int LowV1Elements = 0, LowV2Elements = 0;

14216

for (int M : Mask.slice(0, NumElements / 2))

14217

if (M >= NumElements)

14218

++LowV2Elements;

14219

else if (M >= 0)

14220

++LowV1Elements;

14221

if (LowV2Elements > LowV1Elements)

14222

return true;

14223

if (LowV2Elements == LowV1Elements) {

14224

int SumV1Indices = 0, SumV2Indices = 0;

14225

for (int i = 0, Size = Mask.size(); i < Size; ++i)

14226

if (Mask[i] >= NumElements)

14227

SumV2Indices += i;

14228

else if (Mask[i] >= 0)

14229

SumV1Indices += i;

14230

if (SumV2Indices < SumV1Indices)

14231

return true;

14232

if (SumV2Indices == SumV1Indices) {

14233

int NumV1OddIndices = 0, NumV2OddIndices = 0;

14234

for (int i = 0, Size = Mask.size(); i < Size; ++i)

14235

if (Mask[i] >= NumElements)

14236

NumV2OddIndices += i % 2;

14237

else if (Mask[i] >= 0)

14238

NumV1OddIndices += i % 2;

14239

if (NumV2OddIndices < NumV1OddIndices)

14240

return true;

14241

}

14242

}

14243

}

14244

14245

return false;

14246

}

14247

14248

/// \brief Top-level lowering for x86 vector shuffles.

14249

///

14250

/// This handles decomposition, canonicalization, and lowering of all x86

14251

/// vector shuffles. Most of the specific lowering strategies are encapsulated

14252

/// above in helper routines. The canonicalization attempts to widen shuffles

14253

/// to involve fewer lanes of wider elements, consolidate symmetric patterns

14254

/// s.t. only one of the two inputs needs to be tested, etc.

14255

static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,

14256

SelectionDAG &DAG) {

14257

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

14258

ArrayRef<int> Mask = SVOp->getMask();

14259

SDValue V1 = Op.getOperand(0);

14260

SDValue V2 = Op.getOperand(1);

14261

MVT VT = Op.getSimpleValueType();

14262

int NumElements = VT.getVectorNumElements();

14263

SDLoc DL(Op);

14264

bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

14265

14266

assert((VT.getSizeInBits() != 64 || Is1BitVector) &&(static_cast <bool> ((VT.getSizeInBits() != 64 || Is1BitVector
) && "Can't lower MMX shuffles") ? void (0) : __assert_fail
("(VT.getSizeInBits() != 64 || Is1BitVector) && \"Can't lower MMX shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14267, __extension__ __PRETTY_FUNCTION__))

14267

"Can't lower MMX shuffles")(static_cast <bool> ((VT.getSizeInBits() != 64 || Is1BitVector
) && "Can't lower MMX shuffles") ? void (0) : __assert_fail
("(VT.getSizeInBits() != 64 || Is1BitVector) && \"Can't lower MMX shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14267, __extension__ __PRETTY_FUNCTION__));

14268

14269

bool V1IsUndef = V1.isUndef();

14270

bool V2IsUndef = V2.isUndef();

14271

if (V1IsUndef && V2IsUndef)

14272

return DAG.getUNDEF(VT);

14273

14274

// When we create a shuffle node we put the UNDEF node to second operand,

14275

// but in some cases the first operand may be transformed to UNDEF.

14276

// In this case we should just commute the node.

14277

if (V1IsUndef)

14278

return DAG.getCommutedVectorShuffle(*SVOp);

14279

14280

// Check for non-undef masks pointing at an undef vector and make the masks

14281

// undef as well. This makes it easier to match the shuffle based solely on

14282

// the mask.

14283

if (V2IsUndef)

14284

for (int M : Mask)

14285

if (M >= NumElements) {

14286

SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());

14287

for (int &M : NewMask)

14288

if (M >= NumElements)

14289

M = -1;

14290

return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);

14291

}

14292

14293

// Check for illegal shuffle mask element index values.

14294

int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;

14295

assert(llvm::all_of(Mask,(static_cast <bool> (llvm::all_of(Mask, [&](int M) {
return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index") ? void (0) : __assert_fail ("llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14297, __extension__ __PRETTY_FUNCTION__))

14296

[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&(static_cast <bool> (llvm::all_of(Mask, [&](int M) {
return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index") ? void (0) : __assert_fail ("llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14297, __extension__ __PRETTY_FUNCTION__))

14297

"Out of bounds shuffle index")(static_cast <bool> (llvm::all_of(Mask, [&](int M) {
return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index") ? void (0) : __assert_fail ("llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && \"Out of bounds shuffle index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14297, __extension__ __PRETTY_FUNCTION__));

14298

14299

// We actually see shuffles that are entirely re-arrangements of a set of

14300

// zero inputs. This mostly happens while decomposing complex shuffles into

14301

// simple ones. Directly lower these as a buildvector of zeros.

14302

APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

14303

if (Zeroable.isAllOnesValue())

14304

return getZeroVector(VT, Subtarget, DAG, DL);

14305

14306

// Try to collapse shuffles into using a vector type with fewer elements but

14307

// wider element types. We cap this to not form integers or floating point

14308

// elements wider than 64 bits, but it might be interesting to form i128

14309

// integers to handle flipping the low and high halves of AVX 256-bit vectors.

14310

SmallVector<int, 16> WidenedMask;

14311

if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&

14312

canWidenShuffleElements(Mask, WidenedMask)) {

14313

MVT NewEltVT = VT.isFloatingPoint()

14314

? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)

14315

: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);

14316

MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);

14317

// Make sure that the new vector type is legal. For example, v2f64 isn't

14318

// legal on SSE1.

14319

if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {

14320

V1 = DAG.getBitcast(NewVT, V1);

14321

V2 = DAG.getBitcast(NewVT, V2);

14322

return DAG.getBitcast(

14323

VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));

14324

}

14325

}

14326

14327

// Commute the shuffle if it will improve canonicalization.

14328

if (canonicalizeShuffleMaskWithCommute(Mask))

14329

return DAG.getCommutedVectorShuffle(*SVOp);

14330

14331

// For each vector width, delegate to a specialized lowering routine.

14332

if (VT.is128BitVector())

14333

return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,

14334

DAG);

14335

14336

if (VT.is256BitVector())

14337

return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,

14338

DAG);

14339

14340

if (VT.is512BitVector())

14341

return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,

14342

DAG);

14343

14344

if (Is1BitVector)

14345

return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

14346

14347

llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14347);

14348

}

14349

14350

/// \brief Try to lower a VSELECT instruction to a vector shuffle.

14351

static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,

14352

const X86Subtarget &Subtarget,

14353

SelectionDAG &DAG) {

14354

SDValue Cond = Op.getOperand(0);

14355

SDValue LHS = Op.getOperand(1);

14356

SDValue RHS = Op.getOperand(2);

14357

SDLoc dl(Op);

14358

MVT VT = Op.getSimpleValueType();

14359

14360

if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))

14361

return SDValue();

14362

auto *CondBV = cast<BuildVectorSDNode>(Cond);

14363

14364

// Only non-legal VSELECTs reach this lowering, convert those into generic

14365

// shuffles and re-use the shuffle lowering path for blends.

14366

SmallVector<int, 32> Mask;

14367

for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {

14368

SDValue CondElt = CondBV->getOperand(i);

14369

Mask.push_back(

14370

isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)

14371

: -1);

14372

}

14373

return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);

14374

}

14375

14376

/// Custom lowering for VSELECT.
///
/// \returns \p Op itself when the node can be matched as is, a replacement
/// node when it is rewritten (blend shuffle or mask-based 512-bit select), or
/// a null SDValue when the node should be expanded.
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue TrueVal = Op.getOperand(1);
  SDValue FalseVal = Op.getOperand(2);

  // A vselect where all conditions and data are constants can be optimized
  // into a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(TrueVal.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(FalseVal.getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Cond.getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1
  // condition into an i1 condition so that we can use the mask-based 512-bit
  // blend instructions.
  if (VT.getSizeInBits() == 512) {
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against itself (tests for zero).
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, TrueVal, FalseVal);
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return a
  // null value.
  switch (VT.SimpleTy) {
  case MVT::v8i16:
  case MVT::v16i16:
    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    return Subtarget.hasAVX2() ? Op : SDValue();

  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;
  }
}

14440

14441

/// Lowering of EXTRACT_VECTOR_ELT from a 128-bit vector; the caller in this
/// file only invokes it when the subtarget has SSE4.1.
///
/// \returns a replacement node, \p Op itself when it can be matched as is, or
/// a null SDValue when this routine does not handle the node.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Vec.getSimpleValueType().is128BitVector())
    return SDValue();

  // 8-bit result: PEXTRB producing i32, then truncate to the result type.
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();

    SDNode *User = *Op.getNode()->use_begin();
    bool IsWorthwhileStore =
        User->getOpcode() == ISD::STORE && !isNullConstant(Idx);
    bool IsBitcastToI32 = User->getOpcode() == ISD::BITCAST &&
                          User->getValueType(0) == MVT::i32;
    if (!IsWorthwhileStore && !IsBitcastToI32)
      return SDValue();

    // Extract as an i32 from the v4i32 view and bitcast back to f32.
    SDValue IntVec = DAG.getBitcast(MVT::v4i32, Vec);
    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, IntVec, Idx);
    return DAG.getBitcast(MVT::f32, Extract);
  }

  // ExtractPS/pextrq works with constant index.
  if ((VT == MVT::i32 || VT == MVT::i64) && isa<ConstantSDNode>(Idx))
    return Op;

  return SDValue();
}

14482

14483

/// Extract one bit from mask vector, like v16i1 or v8i1.

14484

/// AVX-512 feature.

14485

static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,

14486

const X86Subtarget &Subtarget) {

14487

SDValue Vec = Op.getOperand(0);

14488

SDLoc dl(Vec);

14489

MVT VecVT = Vec.getSimpleValueType();

14490

SDValue Idx = Op.getOperand(1);

14491

MVT EltVT = Op.getSimpleValueType();

14492

14493

assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&(static_cast <bool> ((VecVT.getVectorNumElements() <=
16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"
) ? void (0) : __assert_fail ("(VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && \"Unexpected vector type in ExtractBitFromMaskVector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14494, __extension__ __PRETTY_FUNCTION__))

14494

"Unexpected vector type in ExtractBitFromMaskVector")(static_cast <bool> ((VecVT.getVectorNumElements() <=
16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"
) ? void (0) : __assert_fail ("(VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && \"Unexpected vector type in ExtractBitFromMaskVector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14494, __extension__ __PRETTY_FUNCTION__));

14495

14496

// variable index can't be handled in mask registers,

14497

// extend vector to VR512/128

14498

if (!isa<ConstantSDNode>(Idx)) {

14499

unsigned NumElts = VecVT.getVectorNumElements();

14500

// Extending v8i1/v16i1 to 512-bit get better performance on KNL

14501

// than extending to 128/256bit.

14502

unsigned VecSize = (NumElts <= 4 ? 128 : 512);

14503

MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);

14504

SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);

14505

SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,

14506

ExtVT.getVectorElementType(), Ext, Idx);

14507

return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);

14508

}

14509

14510

// If the kshift instructions of the correct width aren't natively supported

14511

// then we need to promote the vector to the native size to get the correct

14512

// zeroing behavior.

14513

if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||

14514

(VecVT.getVectorNumElements() < 8)) {

14515

VecVT = MVT::v16i1;

14516

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,

14517

DAG.getUNDEF(VecVT),

14518

Vec,

14519

DAG.getIntPtrConstant(0, dl));

14520

}

14521

14522

// Use kshiftlw/rw instruction.

14523

unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

14524

unsigned MaxShift = VecVT.getVectorNumElements() - 1;

14525

if (MaxShift - IdxVal)

14526

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,

14527

DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));

14528

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,

14529

DAG.getConstant(MaxShift, dl, MVT::i8));

14530

return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,

14531

DAG.getIntPtrConstant(0, dl));

14532

}

14533

14534

/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// \returns a replacement node, \p Op itself when it can be matched as is, or
/// a null SDValue when the default handling should be used.
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  MVT VecVT = Vec.getSimpleValueType();

  // Mask vectors have their own lowering.
  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
  if (!IdxC) {
    // Its more profitable to go through memory (1 cycles throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
    // IACA tool was used to get performance estimation
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example : extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4

    return SDValue();
  }

  unsigned IdxVal = IdxC->getZExtValue();

  // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
  // holding the element and then extract the element from that.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);

    MVT EltVT = VecVT.getVectorElementType();
    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    SDValue NewIdx = DAG.getConstant(IdxVal, dl, MVT::i32);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       NewIdx);
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();
  const unsigned ResultBits = VT.getSizeInBits();

  if (ResultBits == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) {
      SDValue DWord =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                      DAG.getBitcast(MVT::v4i32, Vec), Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DWord);
    }

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to stack.
  if (ResultBits == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      // Shift the wanted byte of the dword down to bits [7:0].
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    // Shift the wanted byte of the word down to bits [7:0].
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (ResultBits == 32) {
    // Element 0 can be matched directly.
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (ResultBits == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

14678

14679

/// Insert one bit to mask vector, like v16i1 or v8i1.

14680

/// AVX-512 feature.

14681

static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,

14682

const X86Subtarget &Subtarget) {

14683

SDLoc dl(Op);

14684

SDValue Vec = Op.getOperand(0);

14685

SDValue Elt = Op.getOperand(1);

14686

SDValue Idx = Op.getOperand(2);

14687

MVT VecVT = Vec.getSimpleValueType();

14688

14689

if (!isa<ConstantSDNode>(Idx)) {

14690

// Non constant index. Extend source and destination,

14691

// insert element and then truncate the result.

14692

MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);

14693

MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);

14694

SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,

14695

DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),

14696

DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);

14697

return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);

14698

}

14699

14700

unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

14701

unsigned NumElems = VecVT.getVectorNumElements();

14702

14703

// If the kshift instructions of the correct width aren't natively supported

14704

// then we need to promote the vector to the native size to get the correct

14705

// zeroing behavior.

14706

if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {

14707

// Need to promote to v16i1, do the insert, then extract back.

14708

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,

14709

DAG.getUNDEF(MVT::v16i1), Vec,

14710

DAG.getIntPtrConstant(0, dl));

14711

Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);

14712

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,

14713

DAG.getIntPtrConstant(0, dl));

14714

}

14715

14716

SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);

14717

14718

if (Vec.isUndef()) {

14719

if (IdxVal)

14720

EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,

14721

DAG.getConstant(IdxVal, dl, MVT::i8));

14722

return EltInVec;

14723

}

14724

14725

// Insertion of one bit into first position

14726

if (IdxVal == 0 ) {

14727

// Clean top bits of vector.

14728

EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,

14729

DAG.getConstant(NumElems - 1, dl, MVT::i8));

14730

EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,

14731

DAG.getConstant(NumElems - 1, dl, MVT::i8));

14732

// Clean the first bit in source vector.

14733

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,

14734

DAG.getConstant(1 , dl, MVT::i8));

14735

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,

14736

DAG.getConstant(1, dl, MVT::i8));

14737

14738

return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);

14739

}

14740

// Insertion of one bit into last position

14741

if (IdxVal == NumElems - 1) {

14742

// Move the bit to the last position inside the vector.

14743

EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,

14744

DAG.getConstant(IdxVal, dl, MVT::i8));

14745

// Clean the last bit in the source vector.

14746

Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,

14747

DAG.getConstant(1, dl, MVT::i8));

14748

Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,

14749

DAG.getConstant(1 , dl, MVT::i8));

14750

14751

return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);

14752

}

14753

14754

// Move the current value of the bit to be replace to bit 0.

14755

SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,

14756

DAG.getConstant(IdxVal, dl, MVT::i8));

14757

// Xor with the new bit.

14758

Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);

14759

// Shift to MSB, filling bottom bits with 0.

14760

Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,

14761

DAG.getConstant(NumElems - 1, dl, MVT::i8));

14762

// Shift to the final position, filling upper bits with 0.

14763

Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,

14764

DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));

14765

// Xor with original vector to cancel out the original bit value that's still

14766

// present.

14767

return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);

14768

}

14769

14770

/// Custom lowering for INSERT_VECTOR_ELT.
///
/// vXi1 vectors are forwarded to InsertBitToMaskVector. For every other type
/// the insertion index (operand 2) must be a ConstantSDNode; otherwise an
/// empty SDValue is returned. The remaining strategies are tried in order:
///  1. inserting a zero / all-ones element as a shuffle against a constant
///     vector (SSE4.1, element width >= 16 bits),
///  2. for 256/512-bit vectors, either a BLENDI into element 0 or an
///     extract-128 / insert-element / insert-128 sequence,
///  3. PINSRW / PINSRB for v8i16 / v16i8,
///  4. with SSE4.1: BLENDI or INSERTPS for f32 elements, and returning \p Op
///     unchanged for i32 / i64 elements.
/// Anything left over yields an empty SDValue.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const MVT VT = Op.getSimpleValueType();
  const MVT EltVT = VT.getVectorElementType();

  // Mask vectors have their own lowering.
  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  // Nothing below handles a non-constant insertion index.
  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
  if (!IdxC)
    return SDValue();
  const unsigned IdxVal = IdxC->getZExtValue();

  const bool IsZeroElt = X86::isZeroNode(Elt);
  const bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(Elt);

  // When the inserted element is zero or all-ones, a blend shuffle with a
  // rematerializable constant vector is preferred over an integer insertion.
  if (Subtarget.hasSSE41() && EltVT.getSizeInBits() >= 16 &&
      (IsZeroElt || IsAllOnesElt)) {
    const unsigned NumElts = VT.getVectorNumElements();
    SmallVector<int, 8> BlendMask;
    for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
      // Lane IdxVal is taken from the second shuffle input (the constant
      // vector); every other lane keeps the value from Vec.
      if (Lane == IdxVal)
        BlendMask.push_back(Lane + NumElts);
      else
        BlendMask.push_back(Lane);
    }
    SDValue CstVector;
    if (IsZeroElt)
      CstVector = getZeroVector(VT, Subtarget, DAG, dl);
    else
      CstVector = getOnesVector(VT, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, Vec, CstVector, BlendMask);
  }

  // Vectors wider than 128 bits: operate on the 128-bit chunk holding the
  // element and put that chunk back afterwards.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // Inserting into element 0 of a 256-bit vector can be done with a blend
    // when AVX/AVX2 supports the element type.
    // TODO: It is worthwhile to cast integer to floating point and back
    // and incur a domain crossing penalty if that's what we'll end up
    // doing anyway after extracting to a 128-bit vector.
    const bool BlendableEltTy =
        (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
        (Subtarget.hasAVX2() && EltVT == MVT::i32);
    if (IdxVal == 0 && VT.is256BitVector() && BlendableEltTy) {
      SDValue EltVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Elt);
      SDValue BlendImm = DAG.getIntPtrConstant(1, dl);
      return DAG.getNode(X86ISD::BLENDI, dl, VT, Vec, EltVec, BlendImm);
    }

    // Pull out the 128-bit chunk that contains the target element.
    SDValue Chunk = extract128BitVector(Vec, IdxVal, DAG, dl);

    // NumEltsIn128 is a power of two, so masking is equivalent to a modulo.
    const unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    const unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    // Insert within the chunk, then write the chunk back.
    SDValue ChunkIdx = DAG.getConstant(IdxIn128, dl, MVT::i32);
    Chunk = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Chunk.getValueType(),
                        Chunk, Elt, ChunkIdx);
    return insert128BitVector(Vec, Chunk, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // pinsr{b,w} take a GR32 as their second argument, so widen the element to
  // i32. SSE4.1 is required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT != MVT::v8i16) {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    } else {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    }

    if (Elt.getValueType() != MVT::i32)
      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Elt);
    if (Idx.getValueType() != MVT::i32)
      Idx = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, Vec, Elt, Idx);
  }

  // Everything that follows needs SSE4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  if (EltVT == MVT::f32) {
    // INSERTPS immediate layout:
    //   bits [7:6] source select - always zero here; the DAG combiner may
    //              later fold an extract_elt index into them, e.g.
    //              (insert (extract, 3), 2) puts the '3' there.
    //   bits [5:4] destination select - the incoming insertion index.
    //   bits [3:0] zero mask - the DAG combiner may fold a bitwise AND or an
    //              insert of float 0.0 into them.
    //
    // For an insertion into the low 32 bits a blend with immediate is
    // preferred: blends are simpler operations in hardware and never perform
    // worse than insertps. The exception is optimizing for size with a
    // foldable load, because blendps has no 32-bit memory operand form.
    const bool MinSize =
        DAG.getMachineFunction().getFunction()->optForMinSize();
    const bool PreferBlend = IdxVal == 0 && !(MinSize && MayFoldLoad(Elt));

    const unsigned Opc = PreferBlend ? X86ISD::BLENDI : X86ISD::INSERTPS;
    const unsigned Imm = PreferBlend ? 1u : IdxVal << 4;

    SDValue ImmOp = DAG.getIntPtrConstant(Imm, dl);
    // The scalar is wrapped in a vector for both node kinds.
    SDValue EltVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Elt);
    return DAG.getNode(Opc, dl, VT, Vec, EltVec, ImmOp);
  }

  // PINSR* works with constant index, so the node can be kept as is.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return Op;

  return SDValue();
}

14895

14896

/// Custom lowering for SCALAR_TO_VECTOR.
///
/// A zero scalar becomes a zero vector. Results that are not 128 bits wide
/// are built as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an undef
/// vector of the full type. v4i32 is returned untouched; every other 128-bit
/// type is produced by any-extending the scalar to i32, building a v4i32
/// SCALAR_TO_VECTOR and bitcasting to the requested type.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  const MVT OpVT = Op.getSimpleValueType();
  SDValue Scalar = Op.getOperand(0);

  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further combines.
  if (X86::isZeroNode(Scalar))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  if (!OpVT.is128BitVector()) {
    // Wider result: first form the 128-bit vector with the same element type.
    const unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    const unsigned NumElts128 = OpVT.getVectorNumElements() / SizeFactor;
    const MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), NumElts128);

    SDValue Narrow = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Scalar);

    // Then place it into the low 128 bits of an undef wide vector.
    SDValue Undef = DAG.getUNDEF(OpVT);
    return insert128BitVector(Undef, Narrow, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt);
  return DAG.getBitcast(OpVT, AsV4I32);
}

14929

14930

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a

14931

// simple superregister reference or explicit instructions to insert

14932

// the upper bits of a vector.

14933

static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,

14934

SelectionDAG &DAG) {

14935

assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1)(static_cast <bool> (Op.getSimpleValueType().getVectorElementType
() == MVT::i1) ? void (0) : __assert_fail ("Op.getSimpleValueType().getVectorElementType() == MVT::i1"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14935, __extension__ __PRETTY_FUNCTION__));

14936

14937

return insert1BitVector(Op, DAG, Subtarget);

14938

}

14939

14940

// Returns the appropriate wrapper opcode for a global reference.

14941

unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {

14942

// References to absolute symbols are never PC-relative.

14943

if (GV && GV->isAbsoluteSymbolRef())

14944

return X86ISD::Wrapper;

14945

14946

CodeModel::Model M = getTargetMachine().getCodeModel();

14947

if (Subtarget.isPICStyleRIPRel() &&

14948

(M == CodeModel::Small || M == CodeModel::Kernel))

14949

return X86ISD::WrapperRIP;

14950

14951

return X86ISD::Wrapper;

14952

}

14953

14954

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as

14955

// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is

14956

// one of the above mentioned nodes. It has to be wrapped because otherwise

14957

// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only

14958

// be used to form addressing mode. These wrapped nodes will be selected

14959

// into MOV32ri.

14960

SDValue

14961

X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {

14962

ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

14963

14964

// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the

14965

// global base reg.

14966

unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

14967

14968

auto PtrVT = getPointerTy(DAG.getDataLayout());

14969

SDValue Result = DAG.getTargetConstantPool(

14970

CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);

14971

SDLoc DL(CP);

14972

Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

14973

// With PIC, the address is actually $g + Offset.

14974

if (OpFlag) {

14975

Result =

14976

DAG.getNode(ISD::ADD, DL, PtrVT,

14977

DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

14978

}

14979

14980

return Result;

14981

}

14982

14983

// Lower a JumpTable node to a wrapped TargetJumpTable, adding the global base
// register when the local-reference classification yields a non-zero flag.
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  auto *TableNode = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg; a non-zero flag marks that case.
  const unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TargetJT =
      DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, OpFlag);

  SDLoc DL(TableNode);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
  if (!OpFlag)
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
}

15003

15004

// Lower an ExternalSymbol node: wrap the TargetExternalSymbol, add the global
// base register for 32-bit position-independent code, and load through the
// stub when the reference flag says the symbol is reached via one.
SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg. The flag is classified against the enclosing module.
  MachineFunction &MF = DAG.getMachineFunction();
  const Module *Mod = MF.getFunction()->getParent();
  const unsigned char OpFlag =
      Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TargetSym = DAG.getTargetExternalSymbol(SymName, PtrVT, OpFlag);

  SDLoc DL(Op);
  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetSym);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
    Addr = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag)) {
    Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  return Addr;
}

15034

15035

// Lower a BlockAddress node to a wrapped TargetBlockAddress, adding the global
// base register when the reference is relative to the PIC base.
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  const unsigned char OpFlags = Subtarget.classifyBlockAddressReference();

  auto *BANode = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BANode->getBlockAddress();
  const int64_t Offset = BANode->getOffset();

  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Create the TargetBlockAddress node and wrap it.
  SDValue TargetBA = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, TargetBA);

  if (!isGlobalRelativeToPICBase(OpFlags))
    return Addr;

  // With PIC, the address is actually $g + Offset.
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
  return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
}

15055

15056

// Build the lowered address of GV + Offset.
//
// The constant offset is folded into the TargetGlobalAddress node only for a
// flag-less (direct) reference whose offset suits the code model; otherwise
// it is applied with an explicit ADD at the end. In between, the address is
// wrapped, made relative to the PIC base when required, and loaded from the
// stub when the reference flag asks for it.
SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  const unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  const CodeModel::Model CM = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Create the TargetGlobalAddress node, folding in the constant offset if
  // it is legal (a direct static reference to the global).
  const bool FoldOffset = OpFlags == X86II::MO_NO_FLAG &&
                          X86::isOffsetSuitableForCodeModel(Offset, CM);
  SDValue Addr =
      FoldOffset ? DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset)
                 : DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  const int64_t LeftoverOffset = FoldOffset ? 0 : Offset;

  Addr = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Addr);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags)) {
    Addr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (LeftoverOffset != 0) {
    SDValue OffsetC = DAG.getConstant(LeftoverOffset, dl, PtrVT);
    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, Addr, OffsetC);
  }

  return Addr;
}

15096

15097

// Node-based entry point: unpack the global and its offset from the
// GlobalAddress node and defer to the (GV, dl, Offset, DAG) overload.
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  auto *GANode = cast<GlobalAddressSDNode>(Op);
  return LowerGlobalAddress(GANode->getGlobal(), SDLoc(Op),
                            GANode->getOffset(), DAG);
}

15103

15104

// Emit a TLSADDR node (or TLSBASEADDR when LocalDynamic is set) for GA on
// Chain and return a copy of ReturnReg taken after it.
//
// InFlag is optional: when non-null, the glue it points to becomes an extra
// operand of the node. OperandFlags is attached to the TargetGlobalAddress
// operand. The function's frame info is marked as adjusting the stack and
// having calls.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);

  X86ISD::NodeType CallType = X86ISD::TLSADDR;
  if (LocalDynamic)
    CallType = X86ISD::TLSBASEADDR;

  if (!InFlag) {
    SDValue Ops[] = {Chain, TGA};
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = {Chain, TGA, *InFlag};
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  // Result 1 of the node is its glue; it ties the register copy to the node.
  SDValue Glue = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
}

15134

15135

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit

15136

static SDValue

15137

LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,

15138

const EVT PtrVT) {

15139

SDValue InFlag;

15140

SDLoc dl(GA); // ? function entry point might be better

15141

SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,

15142

DAG.getNode(X86ISD::GlobalBaseReg,

15143

SDLoc(), PtrVT), InFlag);

15144

InFlag = Chain.getValue(1);

15145

15146

return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);

15147

}

15148

15149

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit

15150

static SDValue

15151

LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,

15152

const EVT PtrVT) {

15153

return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,

15154

X86::RAX, X86II::MO_TLSGD);

15155

}

15156

15157

// Lower ISD::GlobalTLSAddress using the "local dynamic" model: obtain the
// start address of this module's TLS block via a TLSBASEADDR node and add the
// variable's x@dtpoff offset to it. Each call also bumps the function's count
// of local-dynamic TLS accesses.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  DAG.getMachineFunction()
      .getInfo<X86MachineFunctionInfo>()
      ->incNumLocalDynamicTLSAccesses();

  // Get the start address of the TLS block for this module.
  SDValue Base;
  if (!is64Bit) {
    // 32-bit: the global base register is copied into EBX first.
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase,
                                     SDValue());
    SDValue Glue = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  } else {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), X86II::MO_DTPOFF);
  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

15195

15196

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.

15197

static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,

15198

const EVT PtrVT, TLSModel::Model model,

15199

bool is64Bit, bool isPIC) {

15200

SDLoc dl(GA);

15201

15202

// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).

15203

Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),

15204

is64Bit ? 257 : 256));

15205

15206

SDValue ThreadPointer =

15207

DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),

15208

MachinePointerInfo(Ptr));

15209

15210

unsigned char OperandFlags = 0;

15211

// Most TLS accesses are not RIP relative, even on x86-64. One exception is

15212

// initialexec.

15213

unsigned WrapperKind = X86ISD::Wrapper;

15214

if (model == TLSModel::LocalExec) {

15215

OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;

15216

} else if (model == TLSModel::InitialExec) {

15217

if (is64Bit) {

15218

OperandFlags = X86II::MO_GOTTPOFF;

15219

WrapperKind = X86ISD::WrapperRIP;

15220

} else {

15221

OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;

15222

}

15223

} else {

15224

llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15224);

15225

}

15226

15227

// emit "addl x@ntpoff,%eax" (local exec)

15228

// or "addl x@indntpoff,%eax" (initial exec)

15229

// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)

15230

SDValue TGA =

15231

DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),

15232

GA->getOffset(), OperandFlags);

15233

SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

15234

15235

if (model == TLSModel::InitialExec) {

15236

if (isPIC && !is64Bit) {

15237

Offset = DAG.getNode(ISD::ADD, dl, PtrVT,

15238

DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),

15239

Offset);

15240

}

15241

15242

Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,

15243

MachinePointerInfo::getGOT(DAG.getMachineFunction()));

15244

}

15245

15246

// The address of the thread local variable is the add of the thread

15247

// pointer with the offset of the variable.

15248

return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);

15249

}

15250

15251

// Lower a GlobalTLSAddress node.
//
// Emulated TLS, when enabled in the target options, takes precedence.
// Otherwise the lowering depends on the target:
//  - ELF: dispatch on the TLS model selected for the global.
//  - Darwin: a single model built around an X86ISD::TLSCALL node.
//  - Windows (MSVC, Itanium, GNU): the implicit TLS scheme based on the
//    thread's TLS array and the module's _tls_index.
// Any other target is unreachable.
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  const bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    const TLSModel::Model Model = DAG.getTarget().getTLSModel(GV);
    const bool Is64Bit = Subtarget.is64Bit();
    switch (Model) {
    case TLSModel::GeneralDynamic:
      return Is64Bit ? LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT)
                     : LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Is64Bit);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, Model, Is64Bit,
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned WrapperKind = X86ISD::Wrapper;
    if (Subtarget.isPICStyleRIPRel())
      WrapperKind = X86ISD::WrapperRIP;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    const bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    const unsigned char OpFlag =
        PIC32 ? X86II::MO_TLVP_PIC_BASE : X86II::MO_TLVP;

    SDLoc DL(Op);
    SDValue TGA =
        DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0),
                                   GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, TGA);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32) {
      SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Offset);
    }

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue EntryChain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue SeqStart = DAG.getCALLSEQ_START(EntryChain, 0, 0, DL);
    SDValue Args[] = {SeqStart, Offset};
    SDValue TLSCall = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    // The call sequence end consumes the glue produced by the TLSCALL node.
    SDValue SeqEnd = DAG.getCALLSEQ_END(
        TLSCall, DAG.getIntPtrConstant(0, DL, true),
        DAG.getIntPtrConstant(0, DL, true), TLSCall.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    const unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(SeqEnd, DL, Reg, PtrVT, SeqEnd.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();
    const bool Is64Bit = Subtarget.is64Bit();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(
        Is64Bit ? Type::getInt8PtrTy(*DAG.getContext(), 256)
                : Type::getInt32PtrTy(*DAG.getContext(), 257));

    SDValue TlsArray;
    if (Is64Bit)
      TlsArray = DAG.getIntPtrConstant(0x58, dl);
    else if (Subtarget.isTargetWindowsGNU())
      TlsArray = DAG.getIntPtrConstant(0x2C, dl);
    else
      TlsArray = DAG.getExternalSymbol("_tls_array", PtrVT);

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    // Address of this module's slot within the thread's TLS array. For the
    // local-exec model the array base is used as is, without consulting
    // _tls_index.
    SDValue SlotAddr = ThreadPointer;
    if (GV->getThreadLocalMode() != GlobalVariable::LocalExecTLSModel) {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Is64Bit)
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      // Scale the index by the pointer size (as a shift amount).
      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      SlotAddr = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    SDValue TlsBase =
        DAG.getLoad(PtrVT, dl, Chain, SlotAddr, MachinePointerInfo());

    // Get the offset of start of .tls section.
    SDValue TGA =
        DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                   GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, TlsBase, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

15397

15398

/// Lower SRA_PARTS and friends, which return two i32 values

15399

/// and take a 2 x i32 value to shift plus a shift amount.

15400

static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {

15401

assert(Op.getNumOperands() == 3 && "Not a double-shift!")(static_cast <bool> (Op.getNumOperands() == 3 &&
"Not a double-shift!") ? void (0) : __assert_fail ("Op.getNumOperands() == 3 && \"Not a double-shift!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15401, __extension__ __PRETTY_FUNCTION__));

15402

MVT VT = Op.getSimpleValueType();

15403

unsigned VTBits = VT.getSizeInBits();

15404

SDLoc dl(Op);

15405

bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;

15406

SDValue ShOpLo = Op.getOperand(0);

15407

SDValue ShOpHi = Op.getOperand(1);

15408

SDValue ShAmt = Op.getOperand(2);

15409

// X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the

15410

// generic ISD nodes haven't. Insert an AND to be safe, it's optimized away

15411

// during isel.

15412

SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,

15413

DAG.getConstant(VTBits - 1, dl, MVT::i8));

15414

SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,

15415

DAG.getConstant(VTBits - 1, dl, MVT::i8))

15416

: DAG.getConstant(0, dl, VT);

15417

15418

SDValue Tmp2, Tmp3;

15419

if (Op.getOpcode() == ISD::SHL_PARTS) {

15420

Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);

15421

Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);

15422

} else {

15423

Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);

15424

Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);

15425

}

15426

15427

// If the shift amount is larger or equal than the width of a part we can't

15428

// rely on the results of shld/shrd. Insert a test and select the appropriate

15429

// values for large shift amounts.

15430

SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,

15431

DAG.getConstant(VTBits, dl, MVT::i8));

15432

SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,

15433

AndNode, DAG.getConstant(0, dl, MVT::i8));

15434

15435

SDValue Hi, Lo;

15436

SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);

15437

SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };

15438

SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

15439

15440

if (Op.getOpcode() == ISD::SHL_PARTS) {

15441

Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);

15442

Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);

15443

} else {

15444

Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);

15445

Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);

15446

}

15447

15448

SDValue Ops[2] = { Lo, Hi };

15449

return DAG.getMergeValues(Ops, dl);

15450

}

15451

15452

/// Custom lowering for SINT_TO_FP. Vector sources are handled by a few
/// special cases (an empty SDValue is returned for the rest); scalar sources
/// are either returned unchanged or spilled to a stack slot and converted
/// through BuildFILD.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Input = Op.getOperand(0);
  MVT InVT = Input.getSimpleValueType();
  MVT OutVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (InVT.isVector()) {
    // v2i32 -> v2f64: widen the source to v4i32 with undef upper elements.
    if (InVT == MVT::v2i32 && OutVT == MVT::v2f64) {
      SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Input,
                                    DAG.getUNDEF(InVT));
      return DAG.getNode(X86ISD::CVTSI2P, DL, OutVT, Widened);
    }

    // Nothing else is custom lowered except vectors of i1.
    if (InVT.getVectorElementType() != MVT::i1)
      return SDValue();

    // Mask vectors are sign extended to an integer vector first: a legal v2i1
    // goes to v2i64, everything else to a vector of i32 with the same number
    // of elements.
    MVT ExtVT;
    if (InVT == MVT::v2i1 && TLI.isTypeLegal(InVT))
      ExtVT = MVT::v2i64;
    else
      ExtVT = MVT::getVectorVT(MVT::i32, InVT.getVectorNumElements());
    SDValue Extended = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Input);
    return DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), Extended);
  }

  assert(InVT <= MVT::i64 && InVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  SDValue Spill = Op.getOperand(0);
  if (ResultInSSE) {
    // These are really Legal; return the operand so the caller accepts it as
    // Legal.
    if (InVT == MVT::i32)
      return Op;
    if (InVT == MVT::i64) {
      if (Subtarget.is64Bit())
        return Op;
      // Bitcasting to f64 here allows us to do a single 64-bit store from
      // an SSE register, avoiding the store forwarding penalty that would
      // come with two 32-bit stores.
      Spill = DAG.getBitcast(MVT::f64, Spill);
    }
  }

  // Store the integer to a fresh stack object whose size and alignment both
  // equal the source width in bytes, then convert it via BuildFILD.
  const unsigned Bytes = InVT.getSizeInBits() / 8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  const int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Bytes, false);
  SDValue Slot = DAG.getFrameIndex(FrameIdx, PtrVT);
  SDValue StoreChain = DAG.getStore(
      DAG.getEntryNode(), DL, Spill, Slot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
  return BuildFILD(Op, InVT, StoreChain, Slot, DAG);
}

15507

15508

/// Build an FILD of the integer of type \p SrcVT found at \p StackSlot.
/// \p StackSlot is either a frame index or a load node, in which case the
/// load's memory operand and address operand are reused. When the result type
/// of \p Op lives in an SSE register, the FILD result is additionally stored
/// to a new stack object with X86ISD::FST and loaded back from there.
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());

  // The SSE variant produces an f64 plus glue that ties it to the FST below.
  SDVTList FildTys = ResultInSSE
                         ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
                         : DAG.getVTList(Op.getValueType(), MVT::Other);

  const unsigned SrcBytes = SrcVT.getSizeInBits() / 8;

  MachineMemOperand *LoadMMO;
  if (auto *FrameNode = dyn_cast<FrameIndexSDNode>(StackSlot)) {
    const int SrcFI = FrameNode->getIndex();
    LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SrcFI),
        MachineMemOperand::MOLoad, SrcBytes, SrcBytes);
  } else {
    // Not a frame index: the slot is a load; take over its memory operand and
    // use its address operand as the slot.
    LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }

  SDValue FildOps[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(
      ResultInSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, FildTys, FildOps,
      SrcVT, LoadMMO);

  if (!ResultInSSE)
    return Result;

  Chain = Result.getValue(1);
  SDValue Glue = Result.getValue(2);

  // FIXME: Currently the FST is flagged to the FILD_FLAG. This
  // shouldn't be necessary except that RFP cannot be live across
  // multiple blocks. When stackifier is fixed, they can be uncoupled.
  MachineFunction &MF = DAG.getMachineFunction();
  const unsigned ResBytes = Op.getValueSizeInBits() / 8;
  const int ResFI = MF.getFrameInfo().CreateStackObject(ResBytes, ResBytes,
                                                        false);
  auto PtrVT = getPointerTy(MF.getDataLayout());
  SDValue ResSlot = DAG.getFrameIndex(ResFI, PtrVT);
  SDVTList FstTys = DAG.getVTList(MVT::Other);
  SDValue FstOps[] = {
    Chain, Result, ResSlot, DAG.getValueType(Op.getValueType()), Glue
  };
  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), ResFI),
      MachineMemOperand::MOStore, ResBytes, ResBytes);

  // Store the x87 value in the destination format and reload it.
  Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, FstTys, FstOps,
                                  Op.getValueType(), StoreMMO);
  Result = DAG.getLoad(
      Op.getValueType(), DL, Chain, ResSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), ResFI));

  return Result;
}

15567

15568

/// 64-bit unsigned integer to double expansion.

15569

SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,

15570

SelectionDAG &DAG) const {

15571

// This algorithm is not obvious. Here it is what we're trying to output:

15572

15573

movq %rax, %xmm0

15574

punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }

15575

subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }

15576

#ifdef __SSE3__

15577

haddpd %xmm0, %xmm0

15578

#else

15579

pshufd $0x4e, %xmm0, %xmm1

15580

addpd %xmm1, %xmm0

15581

#endif

15582

15583

15584

SDLoc dl(Op);

15585

LLVMContext *Context = DAG.getContext();

15586

15587

// Build some magic constants.

15588

static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };

15589

Constant *C0 = ConstantDataVector::get(*Context, CV0);

15590

auto PtrVT = getPointerTy(DAG.getDataLayout());

15591

SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

15592

15593

SmallVector<Constant*,2> CV1;

15594

CV1.push_back(

15595

ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),

15596

APInt(64, 0x4330000000000000ULL))));

15597

CV1.push_back(

15598

ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),

15599

APInt(64, 0x4530000000000000ULL))));

15600

Constant *C1 = ConstantVector::get(CV1);

15601

SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

15602

15603

// Load the 64-bit value into an XMM register.

15604

SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,

15605

Op.getOperand(0));

15606

SDValue CLod0 =

15607

DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,

15608

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

15609

/* Alignment = */ 16);

15610

SDValue Unpck1 =

15611

getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

15612

15613

SDValue CLod1 =

15614

DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,

15615

MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),

15616

/* Alignment = */ 16);

15617

SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);

15618

// TODO: Are there any fast-math-flags to propagate here?

15619

SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

15620

SDValue Result;

15621

15622

if (Subtarget.hasSSE3()) {

15623

// FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.

15624

Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);

15625

} else {

15626

SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);

15627

SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});

15628

Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,

15629

DAG.getBitcast(MVT::v2f64, Shuffle), Sub);

15630

}

15631

15632

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,

15633

DAG.getIntPtrConstant(0, dl));

15634

}

15635

15636

/// 32-bit unsigned integer to float expansion.

15637

SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,

15638

SelectionDAG &DAG) const {

15639

SDLoc dl(Op);

15640

// FP constant to bias correct the final result.

15641

SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,

15642

MVT::f64);

15643

15644

// Load the 32-bit value into an XMM register.

15645

SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,

15646

Op.getOperand(0));

15647

15648

// Zero out the upper parts of the register.

15649

Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

15650

15651

Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,

15652

DAG.getBitcast(MVT::v2f64, Load),

15653

DAG.getIntPtrConstant(0, dl));

15654

15655

// Or the load with the bias.

15656

SDValue Or = DAG.getNode(

15657

ISD::OR, dl, MVT::v2i64,

15658

DAG.getBitcast(MVT::v2i64,

15659

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),

15660

DAG.getBitcast(MVT::v2i64,

15661

DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));

15662

Or =

15663

DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,

15664

DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

15665

15666

// Subtract the bias.

15667

// TODO: Are there any fast-math-flags to propagate here?

15668

SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

15669

15670

// Handle final rounding.

15671

MVT DestVT = Op.getSimpleValueType();

15672

15673

if (DestVT.bitsLT(MVT::f64))

15674

return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,

15675

DAG.getIntPtrConstant(0, dl));

15676

if (DestVT.bitsGT(MVT::f64))

15677

return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

15678

15679

// Handle final rounding.

15680

return Sub;

15681

}

15682

15683

/// Lower a v2i32 -> v2f64 UINT_TO_FP. Returns an empty SDValue when the
/// result type is not v2f64. With AVX-512 a single X86ISD::CVTUI2P is
/// emitted; otherwise the value is split into 16-bit halves that are
/// converted with X86ISD::CVTSI2P and recombined.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue Src = Op.getOperand(0);
  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
                             DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue Scale = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Upper half moved down into the low bits, and lower half with its upper
  // bits cleared.
  SDValue UpperHalf = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
  SDValue LowerHalf = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);

  // Convert the upper half and scale it back into position, then convert the
  // lower half.
  SDValue UpperFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, UpperHalf);
  UpperFP = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, UpperFP, Scale);
  SDValue LowerFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LowerHalf);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, UpperFP, LowerFP);
}

15717

15718

/// Lower v4i32 -> v4f32 and v8i32 -> v8f32 UINT_TO_FP. Returns an empty
/// SDValue when unsafe-fp-math is enabled or when the result type is not the
/// float vector matching the source.
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;

  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue Src = Op->getOperand(0);
  MVT IntVT = Src.getSimpleValueType();
  const bool Is128Bit = IntVT == MVT::v4i32;
  MVT FloatVT = Is128Bit ? MVT::v4f32 : MVT::v8f32;

  // If we convert to something other than the supported type, e.g. to v4f64,
  // abort early.
  if (FloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((IntVT == MVT::v4i32 || IntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // Both the #ifdef and the #else variants share:
  // - the splat constants 0x4b000000 and 0x53000000
  // - the shift v >> 16
  SDValue LowMagic = DAG.getConstant(0x4b000000, DL, IntVT);
  SDValue HighMagic = DAG.getConstant(0x53000000, DL, IntVT);
  SDValue SixteenSplat = DAG.getConstant(16, DL, IntVT);
  SDValue SrcShifted = DAG.getNode(ISD::SRL, DL, IntVT, Src, SixteenSplat);

  SDValue LowWord, HighWord;
  if (!Subtarget.hasSSE41()) {
    SDValue WordMask = DAG.getConstant(0xffff, DL, IntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue Masked = DAG.getNode(ISD::AND, DL, IntVT, Src, WordMask);
    LowWord = DAG.getNode(ISD::OR, DL, IntVT, Masked, LowMagic);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    HighWord = DAG.getNode(ISD::OR, DL, IntVT, SrcShifted, HighMagic);
  } else {
    MVT WordVecVT = Is128Bit ? MVT::v8i16 : MVT::v16i16;

    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue LowMagicWords = DAG.getBitcast(WordVecVT, LowMagic);
    SDValue SrcWords = DAG.getBitcast(WordVecVT, Src);
    // The blend result is bitcast to float right away below, so it is not
    // cast back to the integer vector type here.
    LowWord = DAG.getNode(X86ISD::BLENDI, DL, WordVecVT, SrcWords,
                          LowMagicWords, DAG.getConstant(0xaa, DL, MVT::i32));

    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue HighMagicWords = DAG.getBitcast(WordVecVT, HighMagic);
    SDValue ShiftedWords = DAG.getBitcast(WordVecVT, SrcShifted);
    // Likewise left in the 16-bit element type.
    HighWord = DAG.getNode(X86ISD::BLENDI, DL, WordVecVT, ShiftedWords,
                           HighMagicWords, DAG.getConstant(0xaa, DL, MVT::i32));
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue NegBias = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, FloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighAsFloat = DAG.getBitcast(FloatVT, HighWord);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue HighFP = DAG.getNode(ISD::FADD, DL, FloatVT, HighAsFloat, NegBias);

  //     return (float4) lo + fhi;
  SDValue LowAsFloat = DAG.getBitcast(FloatVT, LowWord);
  return DAG.getNode(ISD::FADD, DL, FloatVT, LowAsFloat, HighFP);
}

15811

15812

/// Custom lowering for vector UINT_TO_FP. Vectors of i1 are zero extended and
/// converted again; v2i32, v4i32 and v8i32 sources are dispatched to their
/// dedicated helpers. Any other source type is unreachable.
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT InVT = Src.getSimpleValueType();
  SDLoc DL(Op);

  if (InVT.getVectorElementType() == MVT::i1) {
    // v2i1 is widened to v2i64; every other mask vector is widened to a
    // vector of i32 with the same number of elements.
    MVT ExtVT;
    if (InVT == MVT::v2i1)
      ExtVT = MVT::v2i64;
    else
      ExtVT = MVT::getVectorVT(MVT::i32, InVT.getVectorNumElements());
    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
    return DAG.getNode(ISD::UINT_TO_FP, DL, Op.getValueType(), Extended);
  }

  if (InVT == MVT::v2i32)
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, DL);

  if (InVT == MVT::v4i32 || InVT == MVT::v8i32) {
    assert(!Subtarget.hasAVX512());
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  }

  llvm_unreachable("Custom UINT_TO_FP is not supported!");
}

15838

15839

/// Custom lowering for UINT_TO_FP. Vector results go to lowerUINT_TO_FP_vec.
/// Scalar sources are returned unchanged, handed to the SSE expansions,
/// declined with an empty SDValue, or converted through a 64-bit stack buffer
/// and an FILD.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  MVT FromVT = Src.getSimpleValueType();
  MVT ToVT = Op.getSimpleValueType();
  const bool FromI32 = FromVT == MVT::i32;
  const bool FromI64 = FromVT == MVT::i64;

  // Conversions from unsigned i32 to f32/f64 are legal,
  // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(ToVT) &&
      (FromI32 || (FromI64 && Subtarget.is64Bit())))
    return Op;

  if (FromI64 && ToVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (FromI32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget.is64Bit() && FromI64 && ToVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue Buffer = DAG.CreateStackTemporary(MVT::i64);

  if (FromI32) {
    // Store the value at offset 0 and a zero i32 at offset 4, then load the
    // buffer as a 64-bit integer.
    SDValue UpperAddr = DAG.getMemBasePlusOffset(Buffer, 4, DL);
    SDValue StoreLow = DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(0),
                                    Buffer, MachinePointerInfo());
    SDValue StoreHigh =
        DAG.getStore(StoreLow, DL, DAG.getConstant(0, DL, MVT::i32), UpperAddr,
                     MachinePointerInfo());
    return BuildFILD(Op, MVT::i64, StoreHigh, Buffer, DAG);
  }

  assert(FromVT == MVT::i64 && "Unexpected type in UINT_TO_FP");

  SDValue Spill = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    Spill = DAG.getBitcast(MVT::f64, Spill);
  }
  SDValue SpillStore = DAG.getStore(DAG.getEntryNode(), DL, Spill, Buffer,
                                    MachinePointerInfo());

  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  const int BufferFI = cast<FrameIndexSDNode>(Buffer)->getIndex();
  MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), BufferFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList FildTys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue FildOps[] = { SpillStore, Buffer, DAG.getValueType(MVT::i64) };
  SDValue SignedResult = DAG.getMemIntrinsicNode(X86ISD::FILD, DL, FildTys,
                                                 FildOps, MVT::i64, LoadMMO);

  APInt FudgeBits(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue IsNegative = DAG.getSetCC(
      DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(0), DAG.getConstant(0, DL, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgeAddr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FudgeBits.zext(64)), PtrVT);

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue OffsetZero = DAG.getIntPtrConstant(0, DL);
  SDValue OffsetFour = DAG.getIntPtrConstant(4, DL);
  SDValue FudgeOffset = DAG.getSelect(DL, OffsetZero.getValueType(), IsNegative,
                                      OffsetZero, OffsetFour);
  FudgeAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FudgeAddr, FudgeOffset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue FudgeValue = DAG.getExtLoad(
      ISD::EXTLOAD, DL, MVT::f80, DAG.getEntryNode(), FudgeAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      /* Alignment = */ 4);

  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Corrected =
      DAG.getNode(ISD::FADD, DL, MVT::f80, SignedResult, FudgeValue);
  return DAG.getNode(ISD::FP_ROUND, DL, ToVT, Corrected,
                     DAG.getIntPtrConstant(0, DL));
}

15930

15931

// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation

15932

// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),

15933

// just return an <SDValue(), SDValue()> pair.

15934

// Otherwise it is assumed to be a conversion from one of f32, f64 or f80

15935

// to i16, i32 or i64, and we lower it to a legal sequence.

15936

// If lowered to the final integer result we return a <result, SDValue()> pair.

15937

// Otherwise we lower it to a sequence ending with a FIST, return a

15938

// <FIST, StackSlot> pair, and the caller is responsible for loading

15939

// the final integer result from StackSlot.

15940

std::pair<SDValue,SDValue>

15941

X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,

15942

bool IsSigned, bool IsReplace) const {

15943

SDLoc DL(Op);

15944

15945

EVT DstTy = Op.getValueType();

15946

EVT TheVT = Op.getOperand(0).getValueType();

15947

auto PtrVT = getPointerTy(DAG.getDataLayout());

15948

15949

if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {

15950

// f16 must be promoted before using the lowering in this routine.

15951

// fp128 does not use this lowering.

15952

return std::make_pair(SDValue(), SDValue());

15953

}

15954

15955

// If using FIST to compute an unsigned i64, we'll need some fixup

15956

// to handle values above the maximum signed i64. A FIST is always

15957

// used for the 32-bit subtarget, but also for f80 on a 64-bit target.

15958

bool UnsignedFixup = !IsSigned &&

15959

DstTy == MVT::i64 &&

15960

(!Subtarget.is64Bit() ||

15961

!isScalarFPTypeInSSEReg(TheVT));

15962

15963

if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {

15964

// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.

15965

// The low 32 bits of the fist result will have the correct uint32 result.

15966

assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT")(static_cast <bool> (DstTy == MVT::i32 && "Unexpected FP_TO_UINT"
) ? void (0) : __assert_fail ("DstTy == MVT::i32 && \"Unexpected FP_TO_UINT\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15966, __extension__ __PRETTY_FUNCTION__));

15967

DstTy = MVT::i64;

15968

}

15969

15970

assert(DstTy.getSimpleVT() <= MVT::i64 &&(static_cast <bool> (DstTy.getSimpleVT() <= MVT::i64
&& DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"
) ? void (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15972, __extension__ __PRETTY_FUNCTION__))

15971

DstTy.getSimpleVT() >= MVT::i16 &&(static_cast <bool> (DstTy.getSimpleVT() <= MVT::i64
&& DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"
) ? void (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15972, __extension__ __PRETTY_FUNCTION__))

15972

"Unknown FP_TO_INT to lower!")(static_cast <bool> (DstTy.getSimpleVT() <= MVT::i64
&& DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"
) ? void (0) : __assert_fail ("DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && \"Unknown FP_TO_INT to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15972, __extension__ __PRETTY_FUNCTION__));

15973

15974

// These are really Legal.

15975

if (DstTy == MVT::i32 &&

15976

isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))

15977

return std::make_pair(SDValue(), SDValue());

15978

if (Subtarget.is64Bit() &&

15979

DstTy == MVT::i64 &&

15980

isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))

15981

return std::make_pair(SDValue(), SDValue());

15982

15983

// We lower FP->int64 into FISTP64 followed by a load from a temporary

15984

// stack slot.

15985

MachineFunction &MF = DAG.getMachineFunction();

15986

unsigned MemSize = DstTy.getSizeInBits()/8;

15987

int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);

15988

SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

15989

15990

unsigned Opc;

15991

switch (DstTy.getSimpleVT().SimpleTy) {

15992

default: llvm_unreachable("Invalid FP_TO_SINT to lower!")::llvm::llvm_unreachable_internal("Invalid FP_TO_SINT to lower!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15992);

15993

case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;

15994

case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;

15995

case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;

15996

}

15997

15998

SDValue Chain = DAG.getEntryNode();

15999

SDValue Value = Op.getOperand(0);

16000

SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

16001

16002

if (UnsignedFixup) {

16003

16004

// Conversion to unsigned i64 is implemented with a select,

16005

// depending on whether the source value fits in the range

16006

// of a signed i64. Let Thresh be the FP equivalent of

16007

// 0x8000000000000000ULL.

16008

16009

// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;

16010

// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);

16011

// Fist-to-mem64 FistSrc

16012

// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent

16013

// to XOR'ing the high 32 bits with Adjust.

16014

16015

// Being a power of 2, Thresh is exactly representable in all FP formats.

16016

// For X87 we'd like to use the smallest FP type for this constant, but

16017

// for DAG type consistency we have to match the FP operand type.

16018

16019

APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));

16020

LLVM_ATTRIBUTE_UNUSED__attribute__((__unused__)) APFloat::opStatus Status = APFloat::opOK;

16021

bool LosesInfo = false;

16022

if (TheVT == MVT::f64)

16023

// The rounding mode is irrelevant as the conversion should be exact.

16024

Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,

16025

&LosesInfo);

16026

else if (TheVT == MVT::f80)

16027

Status = Thresh.convert(APFloat::x87DoubleExtended(),

16028

APFloat::rmNearestTiesToEven, &LosesInfo);

16029

16030

assert(Status == APFloat::opOK && !LosesInfo &&(static_cast <bool> (Status == APFloat::opOK &&
!LosesInfo && "FP conversion should have been exact"
) ? void (0) : __assert_fail ("Status == APFloat::opOK && !LosesInfo && \"FP conversion should have been exact\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16031, __extension__ __PRETTY_FUNCTION__))

16031

"FP conversion should have been exact")(static_cast <bool> (Status == APFloat::opOK &&
!LosesInfo && "FP conversion should have been exact"
) ? void (0) : __assert_fail ("Status == APFloat::opOK && !LosesInfo && \"FP conversion should have been exact\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16031, __extension__ __PRETTY_FUNCTION__));

16032

16033

SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

16034

16035

SDValue Cmp = DAG.getSetCC(DL,

16036

getSetCCResultType(DAG.getDataLayout(),

16037

*DAG.getContext(), TheVT),

16038

Value, ThreshVal, ISD::SETLT);

16039

Adjust = DAG.getSelect(DL, MVT::i32, Cmp,

16040

DAG.getConstant(0, DL, MVT::i32),

16041

DAG.getConstant(0x80000000, DL, MVT::i32));

16042

SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);

16043

Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),

16044

*DAG.getContext(), TheVT),

16045

Value, ThreshVal, ISD::SETLT);

16046

Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);

16047

}

16048

16049

// FIXME This causes a redundant load/store if the SSE-class value is already

16050

// in memory, such as if it is on the callstack.

16051

if (isScalarFPTypeInSSEReg(TheVT)) {

16052

assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!")(static_cast <bool> (DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"
) ? void (0) : __assert_fail ("DstTy == MVT::i64 && \"Invalid FP_TO_SINT to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16052, __extension__ __PRETTY_FUNCTION__));

16053

Chain = DAG.getStore(Chain, DL, Value, StackSlot,

16054

MachinePointerInfo::getFixedStack(MF, SSFI));

16055

SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);

16056

SDValue Ops[] = {

16057

Chain, StackSlot, DAG.getValueType(TheVT)

16058

};

16059

16060

MachineMemOperand *MMO =

16061

MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),

16062

MachineMemOperand::MOLoad, MemSize, MemSize);

16063

Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);

16064

Chain = Value.getValue(1);

16065

SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);

16066

StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

16067

}

16068

16069

MachineMemOperand *MMO =

16070

MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),

16071

MachineMemOperand::MOStore, MemSize, MemSize);

16072

16073

if (UnsignedFixup) {

16074

16075

// Insert the FIST, load its result as two i32's,

16076

// and XOR the high i32 with Adjust.

16077

16078

SDValue FistOps[] = { Chain, Value, StackSlot };

16079

SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),

16080

FistOps, DstTy, MMO);

16081

16082

SDValue Low32 =

16083

DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());

16084

SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

16085

16086

SDValue High32 =

16087

DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());

16088

High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

16089

16090

if (Subtarget.is64Bit()) {

16091

// Join High32 and Low32 into a 64-bit result.

16092

// (High32 << 32) | Low32

16093

Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);

16094

High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);

16095

High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,

16096

DAG.getConstant(32, DL, MVT::i8));

16097

SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);

16098

return std::make_pair(Result, SDValue());

16099

}

16100

16101

SDValue ResultOps[] = { Low32, High32 };

16102

16103

SDValue pair = IsReplace

16104

? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)

16105

: DAG.getMergeValues(ResultOps, DL);

16106

return std::make_pair(pair, SDValue());

16107

} else {

16108

// Build the FP_TO_INT*_IN_MEM

16109

SDValue Ops[] = { Chain, Value, StackSlot };

16110

SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),

16111

Ops, DstTy, MMO);

16112

return std::make_pair(FIST, StackSlot);

16113

}

16114

}

16115

16116

/// Custom-lower a vector ZERO_EXTEND / ANY_EXTEND on AVX targets.
///
/// Only the (result type, source type) pairs enumerated below are handled;
/// any other pair yields an empty SDValue. If the subtarget reports Int256,
/// a single X86ISD::VZEXT node is emitted. Otherwise the source is unpacked
/// into a low and a high half (interleaved with zero when \p Op is a
/// ZERO_EXTEND, with undef otherwise), each half is bitcast to the
/// half-length result type, and the two halves are concatenated.
///
/// \param Op        The extend node being lowered.
/// \param DAG       The DAG used to create the replacement nodes.
/// \param Subtarget Queried for Int256 support.
/// \returns The lowered value, or an empty SDValue for unhandled type pairs.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  const MVT ResVT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(0);
  const MVT SrcVT = Src.getSimpleValueType();
  SDLoc dl(Op);

  // The complete list of extensions this routine knows how to lower.
  const bool IsHandledPair =
      (ResVT == MVT::v4i64 && SrcVT == MVT::v4i32) ||
      (ResVT == MVT::v8i32 && SrcVT == MVT::v8i16) ||
      (ResVT == MVT::v16i16 && SrcVT == MVT::v16i8) ||
      (ResVT == MVT::v8i64 && SrcVT == MVT::v8i32) ||
      (ResVT == MVT::v8i64 && SrcVT == MVT::v8i16) ||
      (ResVT == MVT::v16i32 && SrcVT == MVT::v16i16) ||
      (ResVT == MVT::v16i32 && SrcVT == MVT::v16i8) ||
      (ResVT == MVT::v32i16 && SrcVT == MVT::v32i8);
  if (!IsHandledPair)
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, ResVT, Src);

  // AVX without Int256: build the result from two 128-bit unpacks.
  //
  //   v8i16 -> v8i32
  //     unpack-low  (vpunpcklwd) gives the 4 lower elements as v4i32,
  //     unpack-high (vpunpckhwd) gives the 4 upper elements as v4i32.
  //   v4i32 -> v4i64
  //     unpack-low  (vpunpckldq) gives the 2 lower elements as v2i64,
  //     unpack-high (vpunpckhdq) gives the 2 upper elements as v2i64.
  //   The halves are then concatenated.
  SDValue ZeroVec = getZeroVector(SrcVT, Subtarget, DAG, dl);
  SDValue UndefVec = DAG.getUNDEF(SrcVT);
  // A zero extension must interleave with zeros; otherwise the upper bits
  // are don't-care and undef is used.
  const bool IsZeroExtend = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue Filler = IsZeroExtend ? ZeroVec : UndefVec;
  SDValue LoHalf = getUnpackl(DAG, dl, SrcVT, Src, Filler);
  SDValue HiHalf = getUnpackh(DAG, dl, SrcVT, Src, Filler);

  const MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                      ResVT.getVectorNumElements() / 2);

  LoHalf = DAG.getBitcast(HalfVT, LoHalf);
  HiHalf = DAG.getBitcast(HalfVT, HiHalf);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, LoHalf, HiHalf);
}

16163

16164

/// Lower an extension whose source is a vector of i1 by selecting, per
/// element, between a splat of 1 and a zero vector.
///
/// The select may be performed in a wider type than the requested result:
///  - without BWI, element types of 16 bits or fewer are first produced as
///    i32 and truncated afterwards;
///  - without VLX, a non-512-bit type is widened to 512 bits (the mask is
///    inserted into an undef mask of the wider length) and the low
///    subvector is extracted afterwards.
///
/// \param Op        The extend node; operand 0 must have i1 elements.
/// \param Subtarget Queried for BWI / VLX support.
/// \param DAG       The DAG used to create the replacement nodes.
/// \returns The lowered value of \p Op's result type.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  const MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned EltCount = VT.getVectorNumElements();

  // Promote the element type to i32 when it is i8/i16 and BWI is missing.
  const bool PromoteElts =
      !Subtarget.hasBWI() && VT.getVectorElementType().getSizeInBits() <= 16;
  const MVT ExtVT = PromoteElts ? MVT::getVectorVT(MVT::i32, EltCount) : VT;

  // Widen to 512 bits when VLX is missing.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    EltCount *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, EltCount);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), EltCount);
  }

  SDValue OnesVec = DAG.getConstant(1, DL, WideVT);
  SDValue ZeroVec = getZeroVector(WideVT, Subtarget, DAG, DL);

  SDValue Result = DAG.getSelect(DL, WideVT, In, OnesVec, ZeroVec);

  // Undo the element promotion, if any. EltCount already reflects any
  // widening done above, so the truncated type keeps the widened length.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(VT.getVectorElementType(), EltCount);
    Result = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, Result);
  }

  // Undo the widening, if any, by taking the low subvector.
  if (WideVT != VT)
    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
                         DAG.getIntPtrConstant(0, DL));

  return Result;
}

16209

16210

/// Custom-lower ISD::ANY_EXTEND.
///
/// Sources with i1 elements are handed to LowerZERO_EXTEND_Mask. Otherwise,
/// when the subtarget reports Fp256, LowerAVXExtend is tried. An empty
/// SDValue is returned when neither path produces a value.
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();

  if (SrcVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  if (!Subtarget.hasFp256())
    return SDValue();

  // LowerAVXExtend itself returns an empty SDValue for unsupported types,
  // so its result can be forwarded directly.
  return LowerAVXExtend(Op, DAG, Subtarget);
}

16224

16225

/// Custom-lower ISD::ZERO_EXTEND.
///
/// Sources with i1 elements are handed to LowerZERO_EXTEND_Mask. Otherwise,
/// when the subtarget reports Fp256, LowerAVXExtend is tried. If nothing
/// was produced, an empty SDValue is returned; in asserts builds this path
/// additionally checks that the node is not a 128-bit to 256-bit extension
/// with matching element counts.
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  const MVT SVT = Op.getOperand(0).getSimpleValueType();

  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  if (Subtarget.hasFp256()) {
    SDValue Extended = LowerAVXExtend(Op, DAG, Subtarget);
    if (Extended.getNode())
      return Extended;
  }

  assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
         Op.getSimpleValueType().getVectorNumElements() !=
         SVT.getVectorNumElements());
  return SDValue();
}

16242

16243

/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.

16244

/// It makes use of the fact that vectors with enough leading sign/zero bits

16245

/// prevent the PACKSS/PACKUS from saturating the results.

16246

/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates

16247

/// within each 128-bit lane.

16248

static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,

16249

const SDLoc &DL, SelectionDAG &DAG,

16250

const X86Subtarget &Subtarget) {

16251

assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&(static_cast <bool> ((Opcode == X86ISD::PACKSS || Opcode
== X86ISD::PACKUS) && "Unexpected PACK opcode") ? void
(0) : __assert_fail ("(Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && \"Unexpected PACK opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16252, __extension__ __PRETTY_FUNCTION__))

16252

"Unexpected PACK opcode")(static_cast <bool> ((Opcode == X86ISD::PACKSS || Opcode
== X86ISD::PACKUS) && "Unexpected PACK opcode") ? void
(0) : __assert_fail ("(Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && \"Unexpected PACK opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16252, __extension__ __PRETTY_FUNCTION__));

16253

16254

// Requires SSE2 but AVX512 has fast truncate.

16255

if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())

16256

return SDValue();

16257

16258

EVT SrcVT = In.getValueType();

16259

16260

// No truncation required, we might get here due to recursive calls.

16261

if (SrcVT == DstVT)

16262

return In;

16263

16264

// We only support vector truncation to 128bits or greater from a

16265

// 256bits or greater source.

16266

unsigned DstSizeInBits = DstVT.getSizeInBits();

16267

unsigned SrcSizeInBits = SrcVT.getSizeInBits();

16268

if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)

16269

return SDValue();

16270

16271

LLVMContext &Ctx = *DAG.getContext();

16272

unsigned NumElems = SrcVT.getVectorNumElements();

16273

assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation")(static_cast <bool> (DstVT.getVectorNumElements() == NumElems
&& "Illegal truncation") ? void (0) : __assert_fail (
"DstVT.getVectorNumElements() == NumElems && \"Illegal truncation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16273, __extension__ __PRETTY_FUNCTION__));

16274

assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation")(static_cast <bool> (SrcSizeInBits > DstSizeInBits &&
"Illegal truncation") ? void (0) : __assert_fail ("SrcSizeInBits > DstSizeInBits && \"Illegal truncation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16274, __extension__ __PRETTY_FUNCTION__));

16275

16276

EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

16277

16278

// Extract lower/upper subvectors.

16279

unsigned NumSubElts = NumElems / 2;

16280

SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

16281

SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

16282

16283

// Pack to the largest type possible:

16284

// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.

16285

EVT InVT = MVT::i16, OutVT = MVT::i8;

16286

if (DstVT.getScalarSizeInBits() > 8 &&

16287

(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {

16288

InVT = MVT::i32;

16289

OutVT = MVT::i16;

16290

}

16291

16292

unsigned SubSizeInBits = SrcSizeInBits / 2;

16293

InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());

16294

OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

16295

16296

// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.

16297

if (SrcVT.is256BitVector()) {

16298

Lo = DAG.getBitcast(InVT, Lo);

16299

Hi = DAG.getBitcast(InVT, Hi);

16300

SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

16301

return DAG.getBitcast(DstVT, Res);

16302

}

16303

16304

// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.

16305

// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).

16306

if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {

16307

Lo = DAG.getBitcast(InVT, Lo);

16308

Hi = DAG.getBitcast(InVT, Hi);

16309

SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

16310

16311

// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),

16312

// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).

16313

Res = DAG.getBitcast(MVT::v4i64, Res);

16314

Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

16315

16316

if (DstVT.is256BitVector())

16317

return DAG.getBitcast(DstVT, Res);

16318

16319

// If 512bit -> 128bit truncate another stage.

16320

EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

16321

Res = DAG.getBitcast(PackedVT, Res);

16322

return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);

16323

}

16324

16325

// Recursively pack lower/upper subvectors, concat result and pack again.

16326

assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater")(static_cast <bool> (SrcSizeInBits >= 512 &&
"Expected 512-bit vector or greater") ? void (0) : __assert_fail
("SrcSizeInBits >= 512 && \"Expected 512-bit vector or greater\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16326, __extension__ __PRETTY_FUNCTION__));

16327

EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);

16328

Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);

16329

Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

16330

16331

PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

16332

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);

16333

return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);

16334

}

16335

16336

/// Lower a vector truncate whose result has i1 elements.
///
/// The least significant bit of every source element is shifted into the
/// most significant bit position; the mask is then formed with
/// X86ISD::CVT2MASK (8/16-bit elements with BWI) or X86ISD::TESTM.
///
/// \param Op        The truncate node; its result type must have i1
///                  elements.
/// \param DAG       The DAG used to create nodes.
/// \param Subtarget Queried for BWI support.
/// \returns The lowered mask value.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {

  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift amount that moves bit 0 of an element to its sign bit.
  unsigned MSBShift = InVT.getScalarSizeInBits() - 1;

  const bool HasNarrowElts = InVT.getScalarSizeInBits() <= 16;
  if (HasNarrowElts && Subtarget.hasBWI()) {
    // Legal; will select VPMOVB2M / VPMOVW2M. Packed byte shifts are not
    // supported natively, so the shift is done on a vector of words.
    const MVT WordVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
    SDValue Shifted = DAG.getNode(ISD::SHL, DL, WordVT,
                                  DAG.getBitcast(WordVT, In),
                                  DAG.getConstant(MSBShift, DL, WordVT));
    Shifted = DAG.getBitcast(InVT, Shifted);
    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shifted);
  }

  if (HasNarrowElts) {
    // No BWI: use TESTD/Q on the input extended to a 512-bit vector with the
    // same element count.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    const unsigned EltCount = InVT.getVectorNumElements();
    const MVT WideVT =
        MVT::getVectorVT(MVT::getIntegerVT(512 / EltCount), EltCount);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, In);
    InVT = WideVT;
    MSBShift = InVT.getScalarSizeInBits() - 1;
  }

  SDValue Shifted = DAG.getNode(ISD::SHL, DL, InVT, In,
                                DAG.getConstant(MSBShift, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, Shifted, Shifted);
}

16373

16374

/// Custom-lower a vector ISD::TRUNCATE.
///
/// Strategies are tried in this order:
///  1. i1 result elements: delegate to LowerTruncateVecI1.
///  2. AVX512: emit X86ISD::VTRUNC (sign-extending v16i16 to v16i32 first
///     when BWI is unavailable).
///  3. PACKSS, when the input has enough sign bits.
///  4. PACKUS, when the input has enough known leading zero bits.
///  5. Dedicated shuffle sequences for v4i64->v4i32 and v8i32->v8i16.
///  6. A generic even-element shuffle for the remaining 256-bit to 128-bit
///     truncations.
/// An empty SDValue is returned when none of these apply.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  const MVT InVT = In.getSimpleValueType();
  const unsigned SrcEltBits = InVT.getScalarSizeInBits();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    SDValue TruncSrc = In;
    // Word to byte truncation exists only under BWI; without it, go
    // through v16i32 for v16i16 -> v16i8.
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI())
      TruncSrc = getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG);
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, TruncSrc);
  }

  // PACKSS is usable when the sign bits of every element extend all the way
  // down to the packed/truncated value.
  unsigned PackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  if ((SrcEltBits - PackedBits) < DAG.ComputeNumSignBits(In)) {
    SDValue Packed =
        truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
    if (Packed.getNode())
      return Packed;
  }

  // PACKUS is usable when the leading zero bits of every element extend all
  // the way down to the packed/truncated value. Before SSE4.1 only PACKUSWB
  // is available, so only 8 bits can be kept.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  if (!Subtarget.hasSSE41())
    PackedBits = 8;
  if ((SrcEltBits - PackedBits) <= Known.countMinLeadingZeros()) {
    SDValue Packed =
        truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
    if (Packed.getNode())
      return Packed;
  }

  if (VT == MVT::v4i32 && InVT == MVT::v4i64) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int EvenDWordsWide[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, EvenDWordsWide);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    // Otherwise split into two v2i64 halves and pick the even dwords of
    // both with a two-input shuffle.
    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                                 DAG.getIntPtrConstant(0, DL));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                                 DAG.getIntPtrConstant(2, DL));
    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);
    static const int EvenDWords[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, LoHalf, HiHalf, EvenDWords);
  }

  if (VT == MVT::v8i16 && InVT == MVT::v8i32) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask: keep the low two bytes of every dword, packed into
      // the low 8 bytes of each 128-bit lane.
      static const int LowWordBytes256[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                            -1, -1, -1, -1, -1, -1, -1, -1,
                                            16, 17, 20, 21, 24, 25, 28, 29,
                                            -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, LowWordBytes256);
      In = DAG.getBitcast(MVT::v4i64, In);

      // Bring the two populated qwords together in the low 128 bits.
      static const int GatherQWords[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, GatherQWords);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                                 DAG.getIntPtrConstant(0, DL));

    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                                 DAG.getIntPtrConstant(4, DL));

    LoHalf = DAG.getBitcast(MVT::v16i8, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v16i8, HiHalf);

    // The PSHUFB mask:
    static const int LowWordBytes128[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                          -1, -1, -1, -1, -1, -1, -1, -1};

    LoHalf = DAG.getVectorShuffle(MVT::v16i8, DL, LoHalf, LoHalf,
                                  LowWordBytes128);
    HiHalf = DAG.getVectorShuffle(MVT::v16i8, DL, HiHalf, HiHalf,
                                  LowWordBytes128);

    LoHalf = DAG.getBitcast(MVT::v4i32, LoHalf);
    HiHalf = DAG.getBitcast(MVT::v4i32, HiHalf);

    // The MOVLHPS Mask:
    static const int JoinLowQWords[] = {0, 1, 4, 5};
    SDValue Joined =
        DAG.getVectorShuffle(MVT::v4i32, DL, LoHalf, HiHalf, JoinLowQWords);
    return DAG.getBitcast(MVT::v8i16, Joined);
  }

  // Everything below handles V256 -> V128 truncation with shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

  const unsigned NumElems = VT.getVectorNumElements();
  const MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  // Truncation shuffle mask: the first NumElems lanes pick every other
  // element of the reinterpreted input; the rest stay undefined (-1).
  SmallVector<int, 16> TruncMask(NumElems * 2, -1);
  for (unsigned Idx = 0; Idx < NumElems; ++Idx)
    TruncMask[Idx] = Idx * 2;

  In = DAG.getBitcast(NVT, In);
  SDValue Shuffled = DAG.getVectorShuffle(NVT, DL, In, In, TruncMask);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffled,
                     DAG.getIntPtrConstant(0, DL));
}

16498

16499

/// Custom-lower ISD::FP_TO_SINT / ISD::FP_TO_UINT.
///
/// Vector results: only v2f32 -> v2i64 is handled, by widening the source
/// to v4f32 with undef upper elements and emitting X86ISD::CVTTP2SI or
/// X86ISD::CVTTP2UI; other vector types yield an empty SDValue.
///
/// Scalar results: FP_TO_INTHelper does the work. If it produces no node,
/// \p Op is returned unchanged (the node is supposed to be Legal). If it
/// also returns a stack slot, the result is loaded from that slot;
/// otherwise the node it returned is the result.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  const MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);

    if (!(VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32))
      return SDValue();

    // Pad the two-element source out to a full v4f32 before converting.
    SDValue UndefHalf = DAG.getUNDEF(MVT::v2f32);
    SDValue Widened =
        DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, UndefHalf);
    const unsigned CvtOpc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
    return DAG.getNode(CvtOpc, dl, VT, Widened);
  }

  assert(!VT.isVector());

  const std::pair<SDValue, SDValue> Helped =
      FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false);
  SDValue FistNode = Helped.first;
  SDValue ResultSlot = Helped.second;

  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FistNode.getNode())
    return Op;

  // No stack slot: the node is the result.
  if (!ResultSlot.getNode())
    return FistNode;

  // Load the result.
  return DAG.getLoad(VT, SDLoc(Op), FistNode, ResultSlot,
                     MachinePointerInfo());
}

16532

16533

/// Custom-lower ISD::FP_EXTEND from v2f32.
///
/// The source is concatenated with an undef v2f32 to form a v4f32, which is
/// then fed to X86ISD::VFPEXT producing the node's result type.
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  const MVT ResVT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);
  const MVT SVT = Src.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  SDValue UndefHalf = DAG.getUNDEF(SVT);
  SDValue Widened =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src, UndefHalf);
  return DAG.getNode(X86ISD::VFPEXT, DL, ResVT, Widened);
}

16545

16546

/// The only differences between FABS and FNEG are the mask and the logic op.

16547

/// FNEG also has a folding opportunity for FNEG(FABS(x)).

16548

static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {

16549

assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&(static_cast <bool> ((Op.getOpcode() == ISD::FABS || Op
.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."
) ? void (0) : __assert_fail ("(Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && \"Wrong opcode for lowering FABS or FNEG.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16550, __extension__ __PRETTY_FUNCTION__))

16550

"Wrong opcode for lowering FABS or FNEG.")(static_cast <bool> ((Op.getOpcode() == ISD::FABS || Op
.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."
) ? void (0) : __assert_fail ("(Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && \"Wrong opcode for lowering FABS or FNEG.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16550, __extension__ __PRETTY_FUNCTION__));

16551

16552

bool IsFABS = (Op.getOpcode() == ISD::FABS);

16553

16554

// If this is a FABS and it has an FNEG user, bail out to fold the combination

16555

// into an FNABS. We'll lower the FABS after that if it is still in use.

16556

if (IsFABS)

16557

for (SDNode *User : Op->uses())

16558

if (User->getOpcode() == ISD::FNEG)

16559

return Op;

16560

16561

SDLoc dl(Op);

16562

MVT VT = Op.getSimpleValueType();

16563

16564

bool IsF128 = (VT == MVT::f128);

16565

16566

// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to

16567

// decide if we should generate a 16-byte constant mask when we only need 4 or

16568

// 8 bytes for the scalar case.

16569

16570

MVT LogicVT;

16571

MVT EltVT;

16572

16573

if (VT.isVector()) {

16574

LogicVT = VT;

16575

EltVT = VT.getVectorElementType();

16576

} else if (IsF128) {

16577

// SSE instructions are used for optimized f128 logical operations.

16578

LogicVT = MVT::f128;

16579

EltVT = VT;

16580

} else {

16581

// There are no scalar bitwise logical SSE/AVX instructions, so we

16582

// generate a 16-byte vector constant and logic op even for the scalar case.

16583

// Using a 16-byte mask allows folding the load of the mask with

16584

// the logic op, so it can save (~4 bytes) on code size.

16585

LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

16586

EltVT = VT;

16587

}

16588

16589

unsigned EltBits = EltVT.getSizeInBits();

16590

// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...

16591

APInt MaskElt =

16592

IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);

16593

const fltSemantics &Sem =

16594

EltVT == MVT::f64 ? APFloat::IEEEdouble() :

16595

(IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

16596

SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

16597

16598

SDValue Op0 = Op.getOperand(0);

16599

bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);

16600

unsigned LogicOp =

16601

IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;

16602

SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

16603

16604

if (VT.isVector() || IsF128)

16605

return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

16606

16607

// For the scalar case extend to a 128-bit vector, perform the logic op,

16608

// and extract the scalar result back out.

16609

Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);

16610

SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

16611

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,

16612

DAG.getIntPtrConstant(0, dl));

16613

}

16614

16615

/// Lower ISD::FCOPYSIGN using the X86 FP bitwise-logic nodes: keep only the
/// sign bit of operand 1, clear the sign bit of operand 0, and OR the two
/// results together.
///
/// \param Op  The FCOPYSIGN node (operand 0 = magnitude, operand 1 = sign).
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns   A value of the same type as \p Op.
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);

  // Bring the sign operand to the result type: extend it when it is
  // narrower...
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // ...and round it when it is wider.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign =
        DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  // Pick the APFloat semantics matching the element type.
  MVT EltVT = VT.getScalarType();
  const fltSemantics *Semantics = nullptr;
  if (EltVT == MVT::f64)
    Semantics = &APFloat::IEEEdouble();
  else if (IsF128)
    Semantics = &APFloat::IEEEquad();
  else
    Semantics = &APFloat::IEEEsingle();

  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  const bool IsFakeVector = !(VT.isVector() || IsF128);
  MVT LogicVT = VT;
  if (IsFakeVector) {
    if (VT == MVT::f64)
      LogicVT = MVT::v2f64;
    else
      LogicVT = MVT::v4f32;
  }

  // Wraps a scalar into the 128-bit logic type when the scalar path is taken;
  // values that already have the logic type pass through untouched.
  auto ToLogicVT = [&](SDValue V) {
    return IsFakeVector ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, V)
                        : V;
  };

  // The mask constants are automatically splatted for vector types.
  const APInt SignBitOnly = APInt::getSignMask(VT.getScalarSizeInBits());
  SDValue SignMask =
      DAG.getConstantFP(APFloat(*Semantics, SignBitOnly), dl, LogicVT);
  SDValue MagMask =
      DAG.getConstantFP(APFloat(*Semantics, ~SignBitOnly), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  SDValue SignBit =
      DAG.getNode(X86ISD::FAND, dl, LogicVT, ToLogicVT(Sign), SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (auto *MagC = dyn_cast<ConstantFPSDNode>(Mag)) {
    // Constant magnitude: fold the sign clearing into the constant itself.
    APFloat AbsVal = MagC->getValueAPF();
    AbsVal.clearSign();
    MagBits = DAG.getConstantFP(AbsVal, dl, LogicVT);
  } else {
    // Non-constant magnitude: AND out the sign bit.
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, ToLogicVT(Mag), MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  if (!IsFakeVector)
    return Or;

  // Scalar path: pull element 0 back out of the 128-bit vector.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                     DAG.getIntPtrConstant(0, dl));
}

16684

16685

/// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
///
/// The f32/f64 operand is placed in a 128-bit vector, MOVMSK produces an i32,
/// the i32 is zero-extended or truncated to the result type, and the result is
/// masked with 1.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue N0 = Op.getOperand(0);

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Vector type that holds the scalar operand.
  MVT VecVT = MVT::v2f64;
  if (OpVT == MVT::f32)
    VecVT = MVT::v4f32;

  SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  SDValue SignBits = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, AsVector);
  SDValue Resized = DAG.getZExtOrTrunc(SignBits, dl, VT);
  return DAG.getNode(ISD::AND, dl, VT, Resized, DAG.getConstant(1, dl, VT));
}

16702

16703

// Check whether an OR'd tree is PTEST-able.

16704

static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,

16705

SelectionDAG &DAG) {

16706

assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.")(static_cast <bool> (Op.getOpcode() == ISD::OR &&
"Only check OR'd tree.") ? void (0) : __assert_fail ("Op.getOpcode() == ISD::OR && \"Only check OR'd tree.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16706, __extension__ __PRETTY_FUNCTION__));

16707

16708

if (!Subtarget.hasSSE41())

16709

return SDValue();

16710

16711

if (!Op->hasOneUse())

16712

return SDValue();

16713

16714

SDNode *N = Op.getNode();

16715

SDLoc DL(N);

16716

16717

SmallVector<SDValue, 8> Opnds;

16718

DenseMap<SDValue, unsigned> VecInMap;

16719

SmallVector<SDValue, 8> VecIns;

16720

EVT VT = MVT::Other;

16721

16722

// Recognize a special case where a vector is casted into wide integer to

16723

// test all 0s.

16724

Opnds.push_back(N->getOperand(0));

16725

Opnds.push_back(N->getOperand(1));

16726

16727

for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {

16728

SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;

16729

// BFS traverse all OR'd operands.

16730

if (I->getOpcode() == ISD::OR) {

16731

Opnds.push_back(I->getOperand(0));

16732

Opnds.push_back(I->getOperand(1));

16733

// Re-evaluate the number of nodes to be traversed.

16734

e += 2; // 2 more nodes (LHS and RHS) are pushed.

16735

continue;

16736

}

16737

16738

// Quit if a non-EXTRACT_VECTOR_ELT

16739

if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)

16740

return SDValue();

16741

16742

// Quit if without a constant index.

16743

SDValue Idx = I->getOperand(1);

16744

if (!isa<ConstantSDNode>(Idx))

16745

return SDValue();

16746

16747

SDValue ExtractedFromVec = I->getOperand(0);

16748

DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);

16749

if (M == VecInMap.end()) {

16750

VT = ExtractedFromVec.getValueType();

16751

// Quit if not 128/256-bit vector.

16752

if (!VT.is128BitVector() && !VT.is256BitVector())

16753

return SDValue();

16754

// Quit if not the same type.

16755

if (VecInMap.begin() != VecInMap.end() &&

16756

VT != VecInMap.begin()->first.getValueType())

16757

return SDValue();

16758

M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;

16759

VecIns.push_back(ExtractedFromVec);

16760

}

16761

M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();

16762

}

16763

16764

assert((VT.is128BitVector() || VT.is256BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
()) && "Not extracted from 128-/256-bit vector.") ? void
(0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Not extracted from 128-/256-bit vector.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16765, __extension__ __PRETTY_FUNCTION__))

16765

"Not extracted from 128-/256-bit vector.")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
()) && "Not extracted from 128-/256-bit vector.") ? void
(0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Not extracted from 128-/256-bit vector.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16765, __extension__ __PRETTY_FUNCTION__));

16766

16767

unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

16768

16769

for (DenseMap<SDValue, unsigned>::const_iterator

16770

I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {

16771

// Quit if not all elements are used.

16772

if (I->second != FullMask)

16773

return SDValue();

16774

}

16775

16776

MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

16777

16778

// Cast all vectors into TestVT for PTEST.

16779

for (unsigned i = 0, e = VecIns.size(); i < e; ++i)

16780

VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

16781

16782

// If more than one full vector is evaluated, OR them first before PTEST.

16783

for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {

16784

// Each iteration will OR 2 nodes and append the result until there is only

16785

// 1 node left, i.e. the final OR'd value of all vectors.

16786

SDValue LHS = VecIns[Slot];

16787

SDValue RHS = VecIns[Slot + 1];

16788

VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));

16789

}

16790

16791

return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());

16792

}

16793

16794

/// \brief return true if \c Op has a use that doesn't just read flags.

16795

static bool hasNonFlagsUse(SDValue Op) {

16796

for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;

16797

++UI) {

16798

SDNode *User = *UI;

16799

unsigned UOpNo = UI.getOperandNo();

16800

if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {

16801

// Look pass truncate.

16802

UOpNo = User->use_begin().getOperandNo();

16803

User = *User->use_begin();

16804

}

16805

16806

if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&

16807

!(User->getOpcode() == ISD::SELECT && UOpNo == 0))

16808

return true;

16809

}

16810

return false;

16811

}

16812

16813

// Emit KTEST instruction for bit vectors on AVX-512

16814

static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,

16815

const X86Subtarget &Subtarget) {

16816

if (Op.getOpcode() == ISD::BITCAST) {

16817

auto hasKTEST = [&](MVT VT) {

16818

unsigned SizeInBits = VT.getSizeInBits();

16819

return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||

16820

(Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));

16821

};

16822

SDValue Op0 = Op.getOperand(0);

16823

MVT Op0VT = Op0.getValueType().getSimpleVT();

16824

if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&

16825

hasKTEST(Op0VT))

16826

return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);

16827

}

16828

return SDValue();

16829

}

16830

16831

/// Emit nodes that will be selected as "test Op0,Op0", or something

16832

/// equivalent.

16833

SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,

16834

SelectionDAG &DAG) const {

16835

if (Op.getValueType() == MVT::i1) {

16836

SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);

16837

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,

16838

DAG.getConstant(0, dl, MVT::i8));

16839

}

16840

// CF and OF aren't always set the way we want. Determine which

16841

// of these we need.

16842

bool NeedCF = false;

16843

bool NeedOF = false;

16844

switch (X86CC) {

16845

default: break;

16846

case X86::COND_A: case X86::COND_AE:

16847

case X86::COND_B: case X86::COND_BE:

16848

NeedCF = true;

16849

break;

16850

case X86::COND_G: case X86::COND_GE:

16851

case X86::COND_L: case X86::COND_LE:

16852

case X86::COND_O: case X86::COND_NO: {

16853

// Check if we really need to set the

16854

// Overflow flag. If NoSignedWrap is present

16855

// that is not actually needed.

16856

switch (Op->getOpcode()) {

16857

case ISD::ADD:

16858

case ISD::SUB:

16859

case ISD::MUL:

16860

case ISD::SHL:

16861

if (Op.getNode()->getFlags().hasNoSignedWrap())

16862

break;

16863

LLVM_FALLTHROUGH[[clang::fallthrough]];

16864

default:

16865

NeedOF = true;

16866

break;

16867

}

16868

break;

16869

}

16870

}

16871

// See if we can use the EFLAGS value from the operand instead of

16872

// doing a separate TEST. TEST always sets OF and CF to 0, so unless

16873

// we prove that the arithmetic won't overflow, we can't use OF or CF.

16874

if (Op.getResNo() != 0 || NeedOF || NeedCF) {

16875

// Emit KTEST for bit vectors

16876

if (auto Node = EmitKTEST(Op, DAG, Subtarget))

16877

return Node;

16878

// Emit a CMP with 0, which is the TEST pattern.

16879

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,

16880

DAG.getConstant(0, dl, Op.getValueType()));

16881

}

16882

unsigned Opcode = 0;

16883

unsigned NumOperands = 0;

16884

16885

// Truncate operations may prevent the merge of the SETCC instruction

16886

// and the arithmetic instruction before it. Attempt to truncate the operands

16887

// of the arithmetic instruction and use a reduced bit-width instruction.

16888

bool NeedTruncation = false;

16889

SDValue ArithOp = Op;

16890

if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {

16891

SDValue Arith = Op->getOperand(0);

16892

// Both the trunc and the arithmetic op need to have one user each.

16893

if (Arith->hasOneUse())

16894

switch (Arith.getOpcode()) {

16895

default: break;

16896

case ISD::ADD:

16897

case ISD::SUB:

16898

case ISD::AND:

16899

case ISD::OR:

16900

case ISD::XOR: {

16901

NeedTruncation = true;

16902

ArithOp = Arith;

16903

}

16904

}

16905

}

16906

16907

// Sometimes flags can be set either with an AND or with an SRL/SHL

16908

// instruction. SRL/SHL variant should be preferred for masks longer than this

16909

// number of bits.

16910

const int ShiftToAndMaxMaskWidth = 32;

16911

const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

16912

16913

// NOTICE: In the code below we use ArithOp to hold the arithmetic operation

16914

// which may be the result of a CAST. We use the variable 'Op', which is the

16915

// non-casted variable when we check for possible users.

16916

switch (ArithOp.getOpcode()) {

16917

case ISD::ADD:

16918

// We only want to rewrite this as a target-specific node with attached

16919

// flags if there is a reasonable chance of either using that to do custom

16920

// instructions selection that can fold some of the memory operands, or if

16921

// only the flags are used. If there are other uses, leave the node alone

16922

// and emit a test instruction.

16923

for (SDNode::use_iterator UI = Op.getNode()->use_begin(),

16924

UE = Op.getNode()->use_end(); UI != UE; ++UI)

16925

if (UI->getOpcode() != ISD::CopyToReg &&

16926

UI->getOpcode() != ISD::SETCC &&

16927

UI->getOpcode() != ISD::STORE)

16928

goto default_case;

16929

16930

if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {

16931

// An add of one will be selected as an INC.

16932

if (C->isOne() &&

16933

(!Subtarget.slowIncDec() ||

16934

DAG.getMachineFunction().getFunction()->optForSize())) {

16935

Opcode = X86ISD::INC;

16936

NumOperands = 1;

16937

break;

16938

}

16939

16940

// An add of negative one (subtract of one) will be selected as a DEC.

16941

if (C->isAllOnesValue() &&

16942

(!Subtarget.slowIncDec() ||

16943

DAG.getMachineFunction().getFunction()->optForSize())) {

16944

Opcode = X86ISD::DEC;

16945

NumOperands = 1;

16946

break;

16947

}

16948

}

16949

16950

// Otherwise use a regular EFLAGS-setting add.

16951

Opcode = X86ISD::ADD;

16952

NumOperands = 2;

16953

break;

16954

case ISD::SHL:

16955

case ISD::SRL:

16956

// If we have a constant logical shift that's only used in a comparison

16957

// against zero turn it into an equivalent AND. This allows turning it into

16958

// a TEST instruction later.

16959

if (ZeroCheck && Op->hasOneUse() &&

16960

isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {

16961

EVT VT = Op.getValueType();

16962

unsigned BitWidth = VT.getSizeInBits();

16963

unsigned ShAmt = Op->getConstantOperandVal(1);

16964

if (ShAmt >= BitWidth) // Avoid undefined shifts.

16965

break;

16966

APInt Mask = ArithOp.getOpcode() == ISD::SRL

16967

? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)

16968

: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);

16969

if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))

16970

break;

16971

Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),

16972

DAG.getConstant(Mask, dl, VT));

16973

}

16974

break;

16975

16976

case ISD::AND:

16977

// If the primary 'and' result isn't used, don't bother using X86ISD::AND,

16978

// because a TEST instruction will be better. However, AND should be

16979

// preferred if the instruction can be combined into ANDN.

16980

if (!hasNonFlagsUse(Op)) {

16981

SDValue Op0 = ArithOp->getOperand(0);

16982

SDValue Op1 = ArithOp->getOperand(1);

16983

EVT VT = ArithOp.getValueType();

16984

bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);

16985

bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;

16986

bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

16987

16988

// If we cannot select an ANDN instruction, check if we can replace

16989

// AND+IMM64 with a shift before giving up. This is possible for masks

16990

// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.

16991

if (!isProperAndn) {

16992

if (!ZeroCheck)

16993

break;

16994

16995

assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized")(static_cast <bool> (!isa<ConstantSDNode>(Op0) &&
"AND node isn't canonicalized") ? void (0) : __assert_fail (
"!isa<ConstantSDNode>(Op0) && \"AND node isn't canonicalized\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 16995, __extension__ __PRETTY_FUNCTION__));

16996

auto *CN = dyn_cast<ConstantSDNode>(Op1);

16997

if (!CN)

16998

break;

16999

17000

const APInt &Mask = CN->getAPIntValue();

17001

if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))

17002

break; // Prefer TEST instruction.

17003

17004

unsigned BitWidth = Mask.getBitWidth();

17005

unsigned LeadingOnes = Mask.countLeadingOnes();

17006

unsigned TrailingZeros = Mask.countTrailingZeros();

17007

17008

if (LeadingOnes + TrailingZeros == BitWidth) {

17009

assert(TrailingZeros < VT.getSizeInBits() &&(static_cast <bool> (TrailingZeros < VT.getSizeInBits
() && "Shift amount should be less than the type width"
) ? void (0) : __assert_fail ("TrailingZeros < VT.getSizeInBits() && \"Shift amount should be less than the type width\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17010, __extension__ __PRETTY_FUNCTION__))

17010

"Shift amount should be less than the type width")(static_cast <bool> (TrailingZeros < VT.getSizeInBits
() && "Shift amount should be less than the type width"
) ? void (0) : __assert_fail ("TrailingZeros < VT.getSizeInBits() && \"Shift amount should be less than the type width\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17010, __extension__ __PRETTY_FUNCTION__));

17011

MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);

17012

SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);

17013

Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);

17014

break;

17015

}

17016

17017

unsigned LeadingZeros = Mask.countLeadingZeros();

17018

unsigned TrailingOnes = Mask.countTrailingOnes();

17019

17020

if (LeadingZeros + TrailingOnes == BitWidth) {

17021

assert(LeadingZeros < VT.getSizeInBits() &&(static_cast <bool> (LeadingZeros < VT.getSizeInBits
() && "Shift amount should be less than the type width"
) ? void (0) : __assert_fail ("LeadingZeros < VT.getSizeInBits() && \"Shift amount should be less than the type width\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17022, __extension__ __PRETTY_FUNCTION__))

17022

"Shift amount should be less than the type width")(static_cast <bool> (LeadingZeros < VT.getSizeInBits
() && "Shift amount should be less than the type width"
) ? void (0) : __assert_fail ("LeadingZeros < VT.getSizeInBits() && \"Shift amount should be less than the type width\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17022, __extension__ __PRETTY_FUNCTION__));

17023

MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);

17024

SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);

17025

Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);

17026

break;

17027

}

17028

17029

break;

17030

}

17031

}

17032

LLVM_FALLTHROUGH[[clang::fallthrough]];

17033

case ISD::SUB:

17034

case ISD::OR:

17035

case ISD::XOR:

17036

// Similar to ISD::ADD above, check if the uses will preclude useful

17037

// lowering of the target-specific node.

17038

for (SDNode::use_iterator UI = Op.getNode()->use_begin(),

17039

UE = Op.getNode()->use_end(); UI != UE; ++UI)

17040

if (UI->getOpcode() != ISD::CopyToReg &&

17041

UI->getOpcode() != ISD::SETCC &&

17042

UI->getOpcode() != ISD::STORE)

17043

goto default_case;

17044

17045

// Otherwise use a regular EFLAGS-setting instruction.

17046

switch (ArithOp.getOpcode()) {

17047

default: llvm_unreachable("unexpected operator!")::llvm::llvm_unreachable_internal("unexpected operator!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17047);

17048

case ISD::SUB: Opcode = X86ISD::SUB; break;

17049

case ISD::XOR: Opcode = X86ISD::XOR; break;

17050

case ISD::AND: Opcode = X86ISD::AND; break;

17051

case ISD::OR: {

17052

if (!NeedTruncation && ZeroCheck) {

17053

if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))

17054

return EFLAGS;

17055

}

17056

Opcode = X86ISD::OR;

17057

break;

17058

}

17059

}

17060

17061

NumOperands = 2;

17062

break;

17063

case X86ISD::ADD:

17064

case X86ISD::SUB:

17065

case X86ISD::INC:

17066

case X86ISD::DEC:

17067

case X86ISD::OR:

17068

case X86ISD::XOR:

17069

case X86ISD::AND:

17070

return SDValue(Op.getNode(), 1);

17071

default:

17072

default_case:

17073

break;

17074

}

17075

17076

// If we found that truncation is beneficial, perform the truncation and

17077

// update 'Op'.

17078

if (NeedTruncation) {

17079

EVT VT = Op.getValueType();

17080

SDValue WideVal = Op->getOperand(0);

17081

EVT WideVT = WideVal.getValueType();

17082

unsigned ConvertedOp = 0;

17083

// Use a target machine opcode to prevent further DAGCombine

17084

// optimizations that may separate the arithmetic operations

17085

// from the setcc node.

17086

switch (WideVal.getOpcode()) {

17087

default: break;

17088

case ISD::ADD: ConvertedOp = X86ISD::ADD; break;

17089

case ISD::SUB: ConvertedOp = X86ISD::SUB; break;

17090

case ISD::AND: ConvertedOp = X86ISD::AND; break;

17091

case ISD::OR: ConvertedOp = X86ISD::OR; break;

17092

case ISD::XOR: ConvertedOp = X86ISD::XOR; break;

17093

}

17094

17095

if (ConvertedOp) {

17096

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

17097

if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {

17098

SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));

17099

SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));

17100

Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);

17101

}

17102

}

17103

}

17104

17105

if (Opcode == 0) {

17106

// Emit KTEST for bit vectors

17107

if (auto Node = EmitKTEST(Op, DAG, Subtarget))

17108

return Node;

17109

17110

// Emit a CMP with 0, which is the TEST pattern.

17111

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,

17112

DAG.getConstant(0, dl, Op.getValueType()));

17113

}

17114

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

17115

SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

17116

17117

SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);

17118

DAG.ReplaceAllUsesWith(Op, New);

17119

return SDValue(New.getNode(), 1);

17120

}

17121

17122

/// Emit nodes that will be selected as "cmp Op0,Op1", or something

17123

/// equivalent.

17124

SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,

17125

const SDLoc &dl, SelectionDAG &DAG) const {

17126

if (isNullConstant(Op1))

17127

return EmitTest(Op0, X86CC, dl, DAG);

17128

17129

assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&(static_cast <bool> (!(isa<ConstantSDNode>(Op1) &&
Op0.getValueType() == MVT::i1) && "Unexpected comparison operation for MVT::i1 operands"
) ? void (0) : __assert_fail ("!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && \"Unexpected comparison operation for MVT::i1 operands\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17130, __extension__ __PRETTY_FUNCTION__))

17130

"Unexpected comparison operation for MVT::i1 operands")(static_cast <bool> (!(isa<ConstantSDNode>(Op1) &&
Op0.getValueType() == MVT::i1) && "Unexpected comparison operation for MVT::i1 operands"
) ? void (0) : __assert_fail ("!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && \"Unexpected comparison operation for MVT::i1 operands\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17130, __extension__ __PRETTY_FUNCTION__));

17131

17132

if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||

17133

Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {

17134

// Only promote the compare up to I32 if it is a 16 bit operation

17135

// with an immediate. 16 bit immediates are to be avoided.

17136

if ((Op0.getValueType() == MVT::i16 &&

17137

(isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&

17138

!DAG.getMachineFunction().getFunction()->optForMinSize() &&

17139

!Subtarget.isAtom()) {

17140

unsigned ExtendOp =

17141

isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;

17142

Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);

17143

Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);

17144

}

17145

// Use SUB instead of CMP to enable CSE between SUB and CMP.

17146

SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);

17147

SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);

17148

return SDValue(Sub.getNode(), 1);

17149

}

17150

return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

17151

}

17152

17153

/// Convert a comparison if required by the subtarget.

17154

SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,

17155

SelectionDAG &DAG) const {

17156

// If the subtarget does not support the FUCOMI instruction, floating-point

17157

// comparisons have to be converted.

17158

if (Subtarget.hasCMov() ||

17159

Cmp.getOpcode() != X86ISD::CMP ||

17160

!Cmp.getOperand(0).getValueType().isFloatingPoint() ||

17161

!Cmp.getOperand(1).getValueType().isFloatingPoint())

17162

return Cmp;

17163

17164

// The instruction selector will select an FUCOM instruction instead of

17165

// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence

17166

// build an SDNode sequence that transfers the result from FPSW into EFLAGS:

17167

// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))

17168

SDLoc dl(Cmp);

17169

SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);

17170

SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);

17171

SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,

17172

DAG.getConstant(8, dl, MVT::i8));

17173

SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

17174

17175

// Some 64-bit targets lack SAHF support, but they do support FCOMI.

17176

assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?")(static_cast <bool> (Subtarget.hasLAHFSAHF() &&
"Target doesn't support SAHF or FCOMI?") ? void (0) : __assert_fail
("Subtarget.hasLAHFSAHF() && \"Target doesn't support SAHF or FCOMI?\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17176, __extension__ __PRETTY_FUNCTION__));

17177

return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);

17178

}

17179

17180

/// Check if replacement of SQRT with RSQRT should be disabled.

17181

bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {

17182

EVT VT = Op.getValueType();

17183

17184

// We never want to use both SQRT and RSQRT instructions for the same input.

17185

if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))

17186

return false;

17187

17188

if (VT.isVector())

17189

return Subtarget.hasFastVectorFSQRT();

17190

return Subtarget.hasFastScalarFSQRT();

17191

}

17192

17193

/// The minimum architected relative accuracy is 2^-12. We need one

17194

/// Newton-Raphson step to have a good float result (24 bits of precision).

17195

SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,

17196

SelectionDAG &DAG, int Enabled,

17197

int &RefinementSteps,

17198

bool &UseOneConstNR,

17199

bool Reciprocal) const {

17200

EVT VT = Op.getValueType();

17201

17202

// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.

17203

// TODO: Add support for AVX512 (v16f32).

17204

// It is likely not profitable to do this for f64 because a double-precision

17205

// rsqrt estimate with refinement on x86 prior to FMA requires at least 16

17206

// instructions: convert to single, rsqrtss, convert back to double, refine

17207

// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA

17208

// along with FMA, this could be a throughput win.

17209

// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32

17210

// after legalize types.

17211

if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||

17212

(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||

17213

(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||

17214

(VT == MVT::v8f32 && Subtarget.hasAVX())) {

17215

if (RefinementSteps == ReciprocalEstimate::Unspecified)

17216

RefinementSteps = 1;

17217

17218

UseOneConstNR = false;

17219

return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);

17220

}

17221

return SDValue();

17222

}

17223

17224

/// The minimum architected relative accuracy is 2^-12. We need one

17225

/// Newton-Raphson step to have a good float result (24 bits of precision).

17226

SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,

17227

int Enabled,

17228

int &RefinementSteps) const {

17229

EVT VT = Op.getValueType();

17230

17231

// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.

17232

// TODO: Add support for AVX512 (v16f32).

17233

// It is likely not profitable to do this for f64 because a double-precision

17234

// reciprocal estimate with refinement on x86 prior to FMA requires

17235

// 15 instructions: convert to single, rcpss, convert back to double, refine

17236

// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA

17237

// along with FMA, this could be a throughput win.

17238

17239

if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||

17240

(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||

17241

(VT == MVT::v8f32 && Subtarget.hasAVX())) {

17242

// Enable estimate codegen with 1 refinement step for vector division.

17243

// Scalar division estimates are disabled because they break too much

17244

// real-world code. These defaults are intended to match GCC behavior.

17245

if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)

17246

return SDValue();

17247

17248

if (RefinementSteps == ReciprocalEstimate::Unspecified)

17249

RefinementSteps = 1;

17250

17251

return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);

17252

}

17253

return SDValue();

17254

}

17255

17256

/// If we have at least two divisions that use the same divisor, convert to

17257

/// multiplication by a reciprocal. This may need to be adjusted for a given

17258

/// CPU if a division's cost is not at least twice the cost of a multiplication.

17259

/// This is because we still need one division to calculate the reciprocal and

17260

/// then we need two multiplies by that reciprocal as replacements for the

17261

/// original divisions.

17262

unsigned X86TargetLowering::combineRepeatedFPDivisors() const {

17263

return 2;

17264

}

17265

17266

/// Helper for creating a X86ISD::SETCC node.

17267

static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,

17268

SelectionDAG &DAG) {

17269

return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,

17270

DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);

17271

}

17272

17273

/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition

17274

/// according to equal/not-equal condition code \p CC.

17275

static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,

17276

const SDLoc &dl, SelectionDAG &DAG) {

17277

// If Src is i8, promote it to i32 with any_extend. There is no i8 BT

17278

// instruction. Since the shift amount is in-range-or-undefined, we know

17279

// that doing a bittest on the i32 value is ok. We extend to i32 because

17280

// the encoding for the i16 version is larger than the i32 version.

17281

// Also promote i16 to i32 for performance / code size reason.

17282

if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)

17283

Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

17284

17285

// See if we can use the 32-bit instruction instead of the 64-bit one for a

17286

// shorter encoding. Since the former takes the modulo 32 of BitNo and the

17287

// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is

17288

// known to be zero.

17289

if (Src.getValueType() == MVT::i64 &&

17290

DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))

17291

Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

17292

17293

// If the operand types disagree, extend the shift amount to match. Since

17294

// BT ignores high bits (like shifts) we can use anyextend.

17295

if (Src.getValueType() != BitNo.getValueType())

17296

BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

17297

17298

SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);

17299

X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;

17300

return getSETCC(Cond, BT, dl , DAG);

17301

}

17302

17303

/// Result of 'and' is compared against zero. Change to a BT node if possible.

17304

static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,

17305

const SDLoc &dl, SelectionDAG &DAG) {

17306

assert(And.getOpcode() == ISD::AND && "Expected AND node!")(static_cast <bool> (And.getOpcode() == ISD::AND &&
"Expected AND node!") ? void (0) : __assert_fail ("And.getOpcode() == ISD::AND && \"Expected AND node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17306, __extension__ __PRETTY_FUNCTION__));

17307

SDValue Op0 = And.getOperand(0);

17308

SDValue Op1 = And.getOperand(1);

17309

if (Op0.getOpcode() == ISD::TRUNCATE)

17310

Op0 = Op0.getOperand(0);

17311

if (Op1.getOpcode() == ISD::TRUNCATE)

17312

Op1 = Op1.getOperand(0);

17313

17314

SDValue LHS, RHS;

17315

if (Op1.getOpcode() == ISD::SHL)

17316

std::swap(Op0, Op1);

17317

if (Op0.getOpcode() == ISD::SHL) {

17318

if (isOneConstant(Op0.getOperand(0))) {

17319

// If we looked past a truncate, check that it's only truncating away

17320

// known zeros.

17321

unsigned BitWidth = Op0.getValueSizeInBits();

17322

unsigned AndBitWidth = And.getValueSizeInBits();

17323

if (BitWidth > AndBitWidth) {

17324

KnownBits Known;

17325

DAG.computeKnownBits(Op0, Known);

17326

if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)

17327

return SDValue();

17328

}

17329

LHS = Op1;

17330

RHS = Op0.getOperand(1);

17331

}

17332

} else if (Op1.getOpcode() == ISD::Constant) {

17333

ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);

17334

uint64_t AndRHSVal = AndRHS->getZExtValue();

17335

SDValue AndLHS = Op0;

17336

17337

if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {

17338

LHS = AndLHS.getOperand(0);

17339

RHS = AndLHS.getOperand(1);

17340

}

17341

17342

// Use BT if the immediate can't be encoded in a TEST instruction.

17343

if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {

17344

LHS = AndLHS;

17345

RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());

17346

}

17347

}

17348

17349

if (LHS.getNode())

17350

return getBitTestCondition(LHS, RHS, CC, dl, DAG);

17351

17352

return SDValue();

17353

}

17354

17355

/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask

17356

/// CMPs.

17357

static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,

17358

SDValue &Op1) {

17359

unsigned SSECC;

17360

bool Swap = false;

17361

17362

// SSE Condition code mapping:

17363

// 0 - EQ

17364

// 1 - LT

17365

// 2 - LE

17366

// 3 - UNORD

17367

// 4 - NEQ

17368

// 5 - NLT

17369

// 6 - NLE

17370

// 7 - ORD

17371

switch (SetCCOpcode) {

17372

default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17372);

17373

case ISD::SETOEQ:

17374

case ISD::SETEQ: SSECC = 0; break;

17375

case ISD::SETOGT:

17376

case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH[[clang::fallthrough]];

17377

case ISD::SETLT:

17378

case ISD::SETOLT: SSECC = 1; break;

17379

case ISD::SETOGE:

17380

case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH[[clang::fallthrough]];

17381

case ISD::SETLE:

17382

case ISD::SETOLE: SSECC = 2; break;

17383

case ISD::SETUO: SSECC = 3; break;

17384

case ISD::SETUNE:

17385

case ISD::SETNE: SSECC = 4; break;

17386

case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH[[clang::fallthrough]];

17387

case ISD::SETUGE: SSECC = 5; break;

17388

case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH[[clang::fallthrough]];

17389

case ISD::SETUGT: SSECC = 6; break;

17390

case ISD::SETO: SSECC = 7; break;

17391

case ISD::SETUEQ: SSECC = 8; break;

17392

case ISD::SETONE: SSECC = 12; break;

17393

}

17394

if (Swap)

17395

std::swap(Op0, Op1);

17396

17397

return SSECC;

17398

}

17399

17400

/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then

17401

/// concatenate the result back.

17402

static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {

17403

MVT VT = Op.getSimpleValueType();

17404

17405

assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&(static_cast <bool> (VT.is256BitVector() && Op.
getOpcode() == ISD::SETCC && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17406, __extension__ __PRETTY_FUNCTION__))

17406

"Unsupported value type for operation")(static_cast <bool> (VT.is256BitVector() && Op.
getOpcode() == ISD::SETCC && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17406, __extension__ __PRETTY_FUNCTION__));

17407

17408

unsigned NumElems = VT.getVectorNumElements();

17409

SDLoc dl(Op);

17410

SDValue CC = Op.getOperand(2);

17411

17412

// Extract the LHS vectors

17413

SDValue LHS = Op.getOperand(0);

17414

SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);

17415

SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

17416

17417

// Extract the RHS vectors

17418

SDValue RHS = Op.getOperand(1);

17419

SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);

17420

SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

17421

17422

// Issue the operation on the smaller types and concatenate the result back

17423

MVT EltVT = VT.getVectorElementType();

17424

MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

17425

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

17426

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),

17427

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));

17428

}

17429

17430

/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic.
///
/// Each supported predicate has a direct boolean identity (see the per-case
/// comments); signed and unsigned orderings share the same expansion here.
/// Any other predicate is a caller bug and hits llvm_unreachable instead of
/// falling off the end of the function.
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();

  // Bitwise complements of both operands, built as XOR with all-ones.
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default:
    // Without this, an unhandled predicate would return no value at all.
    llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}

17472

17473

/// Lower an integer vector SETCC whose result is a vector of i1 to the
/// mask-producing compare nodes.
///
/// SETEQ and SETGT/SETLT map to X86ISD::PCMPEQM / X86ISD::PCMPGTM (SETLT by
/// swapping the operands). The remaining predicates use X86ISD::CMPM, or
/// X86ISD::CMPMU for the unsigned ones, with a condition-code immediate.
/// An equality/inequality test of (and A, B) against zero becomes
/// X86ISD::TESTNM / X86ISD::TESTM instead. Any predicate outside the handled
/// set is a caller bug and hits llvm_unreachable.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  // Initialized so that the reads below are well defined on the paths that
  // select a dedicated opcode and never assign an immediate.
  unsigned SSECC = 0;
  switch (SetCCOpcode) {
  default:
    // Without this, SSECC would be consumed without ever being chosen.
    llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break;     // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);

  // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to
  // TESTM|NM.
  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
    SDValue A = peekThroughBitcasts(Op0);
    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
        ISD::isBuildVectorAllZeros(Op1.getNode())) {
      MVT VT0 = Op0.getSimpleValueType();
      // Bitcast both AND operands back to the compare's operand type.
      SDValue AndOp0 = DAG.getBitcast(VT0, A.getOperand(0));
      SDValue AndOp1 = DAG.getBitcast(VT0, A.getOperand(1));
      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM
                                                : X86ISD::TESTM,
                         dl, VT, AndOp0, AndOp1);
    }
  }

  // A dedicated equality / greater-than opcode needs no immediate.
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);

  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}

17525

17526

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second

17527

/// operand \p Op1. If non-trivial (for example because it's not constant)

17528

/// return an empty value.

17529

static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,

17530

SelectionDAG &DAG) {

17531

BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());

17532

if (!BV)

17533

return SDValue();

17534

17535

MVT VT = Op1.getSimpleValueType();

17536

MVT EVT = VT.getVectorElementType();

17537

unsigned n = VT.getVectorNumElements();

17538

SmallVector<SDValue, 8> ULTOp1;

17539

17540

for (unsigned i = 0; i < n; ++i) {

17541

ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));

17542

if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)

17543

return SDValue();

17544

17545

// Avoid underflow.

17546

APInt Val = Elt->getAPIntValue();

17547

if (Val == 0)

17548

return SDValue();

17549

17550

ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));

17551

}

17552

17553

return DAG.getBuildVector(VT, dl, ULTOp1);

17554

}

17555

17556

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,

17557

SelectionDAG &DAG) {

17558

SDValue Op0 = Op.getOperand(0);

17559

SDValue Op1 = Op.getOperand(1);

17560

SDValue CC = Op.getOperand(2);

17561

MVT VT = Op.getSimpleValueType();

17562

ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();

17563

bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();

17564

SDLoc dl(Op);

17565

17566

if (isFP) {

17567

#ifndef NDEBUG

17568

MVT EltVT = Op0.getSimpleValueType().getVectorElementType();

17569

assert(EltVT == MVT::f32 || EltVT == MVT::f64)(static_cast <bool> (EltVT == MVT::f32 || EltVT == MVT::
f64) ? void (0) : __assert_fail ("EltVT == MVT::f32 || EltVT == MVT::f64"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17569, __extension__ __PRETTY_FUNCTION__));

17570

#endif

17571

17572

unsigned Opc;

17573

if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {

17574

assert(VT.getVectorNumElements() <= 16)(static_cast <bool> (VT.getVectorNumElements() <= 16
) ? void (0) : __assert_fail ("VT.getVectorNumElements() <= 16"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17574, __extension__ __PRETTY_FUNCTION__));

17575

Opc = X86ISD::CMPM;

17576

} else {

17577

Opc = X86ISD::CMPP;

17578

// The SSE/AVX packed FP comparison nodes are defined with a

17579

// floating-point vector result that matches the operand type. This allows

17580

// them to work with an SSE1 target (integer vector types are not legal).

17581

VT = Op0.getSimpleValueType();

17582

}

17583

17584

// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),

17585

// emit two comparisons and a logic op to tie them together.

17586

SDValue Cmp;

17587

unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);

17588

if (SSECC >= 8 && !Subtarget.hasAVX()) {

17589

// LLVM predicate is SETUEQ or SETONE.

17590

unsigned CC0, CC1;

17591

unsigned CombineOpc;

17592

if (Cond == ISD::SETUEQ) {

17593

CC0 = 3; // UNORD

17594

CC1 = 0; // EQ

17595

CombineOpc = X86ISD::FOR;

17596

} else {

17597

assert(Cond == ISD::SETONE)(static_cast <bool> (Cond == ISD::SETONE) ? void (0) : __assert_fail
("Cond == ISD::SETONE", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17597, __extension__ __PRETTY_FUNCTION__));

17598

CC0 = 7; // ORD

17599

CC1 = 4; // NEQ

17600

CombineOpc = X86ISD::FAND;

17601

}

17602

17603

SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,

17604

DAG.getConstant(CC0, dl, MVT::i8));

17605

SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,

17606

DAG.getConstant(CC1, dl, MVT::i8));

17607

Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);

17608

} else {

17609

// Handle all other FP comparisons here.

17610

Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,

17611

DAG.getConstant(SSECC, dl, MVT::i8));

17612

}

17613

17614

// If this is SSE/AVX CMPP, bitcast the result back to integer to match the

17615

// result type of SETCC. The bitcast is expected to be optimized away

17616

// during combining/isel.

17617

if (Opc == X86ISD::CMPP)

17618

Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

17619

17620

return Cmp;

17621

}

17622

17623

MVT VTOp0 = Op0.getSimpleValueType();

17624

assert(VTOp0 == Op1.getSimpleValueType() &&(static_cast <bool> (VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!") ? void (0) : __assert_fail
("VTOp0 == Op1.getSimpleValueType() && \"Expected operands with same type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17625, __extension__ __PRETTY_FUNCTION__))

17625

"Expected operands with same type!")(static_cast <bool> (VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!") ? void (0) : __assert_fail
("VTOp0 == Op1.getSimpleValueType() && \"Expected operands with same type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17625, __extension__ __PRETTY_FUNCTION__));

17626

assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&(static_cast <bool> (VT.getVectorNumElements() == VTOp0
.getVectorNumElements() && "Invalid number of packed elements for source and destination!"
) ? void (0) : __assert_fail ("VT.getVectorNumElements() == VTOp0.getVectorNumElements() && \"Invalid number of packed elements for source and destination!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17627, __extension__ __PRETTY_FUNCTION__))

17627

"Invalid number of packed elements for source and destination!")(static_cast <bool> (VT.getVectorNumElements() == VTOp0
.getVectorNumElements() && "Invalid number of packed elements for source and destination!"
) ? void (0) : __assert_fail ("VT.getVectorNumElements() == VTOp0.getVectorNumElements() && \"Invalid number of packed elements for source and destination!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17627, __extension__ __PRETTY_FUNCTION__));

17628

17629

if (VT.is128BitVector() && VTOp0.is256BitVector()) {

17630

// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type

17631

// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the

17632

// legalizer firstly checks if the first operand in input to the setcc has

17633

// a legal type. If so, then it promotes the return type to that same type.

17634

// Otherwise, the return type is promoted to the 'next legal type' which,

17635

// for a vector of MVT::i1 is always a 128-bit integer vector type.

17636

17637

// We reach this code only if the following two conditions are met:

17638

// 1. Both return type and operand type have been promoted to wider types

17639

// by the type legalizer.

17640

// 2. The original operand type has been promoted to a 256-bit vector.

17641

17642

// Note that condition 2. only applies for AVX targets.

17643

SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);

17644

return DAG.getZExtOrTrunc(NewOp, dl, VT);

17645

}

17646

17647

// The non-AVX512 code below works under the assumption that source and

17648

// destination types are the same.

17649

assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&(static_cast <bool> ((Subtarget.hasAVX512() || (VT == VTOp0
)) && "Value types for source and destination must be the same!"
) ? void (0) : __assert_fail ("(Subtarget.hasAVX512() || (VT == VTOp0)) && \"Value types for source and destination must be the same!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17650, __extension__ __PRETTY_FUNCTION__))

17650

"Value types for source and destination must be the same!")(static_cast <bool> ((Subtarget.hasAVX512() || (VT == VTOp0
)) && "Value types for source and destination must be the same!"
) ? void (0) : __assert_fail ("(Subtarget.hasAVX512() || (VT == VTOp0)) && \"Value types for source and destination must be the same!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17650, __extension__ __PRETTY_FUNCTION__));

17651

17652

// Break 256-bit integer vector compare into smaller ones.

17653

if (VT.is256BitVector() && !Subtarget.hasInt256())

17654

return Lower256IntVSETCC(Op, DAG);

17655

17656

// Operands are boolean (vectors of i1)

17657

MVT OpVT = Op1.getSimpleValueType();

17658

if (OpVT.getVectorElementType() == MVT::i1)

17659

return LowerBoolVSETCC_AVX512(Op, DAG);

17660

17661

// The result is boolean, but operands are int/float

17662

if (VT.getVectorElementType() == MVT::i1) {

17663

// In AVX-512 architecture setcc returns mask with i1 elements,

17664

// But there is no compare instruction for i8 and i16 elements in KNL.

17665

// In this case use SSE compare

17666

bool UseAVX512Inst =

17667

(OpVT.is512BitVector() ||

17668

OpVT.getScalarSizeInBits() >= 32 ||

17669

(Subtarget.hasBWI() && Subtarget.hasVLX()));

17670

17671

if (UseAVX512Inst)

17672

return LowerIntVSETCC_AVX512(Op, DAG);

17673

17674

return DAG.getNode(ISD::TRUNCATE, dl, VT,

17675

DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));

17676

}

17677

17678

// Lower using XOP integer comparisons.

17679

if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||

17680

VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {

17681

// Translate compare code to XOP PCOM compare mode.

17682

unsigned CmpMode = 0;

17683

switch (Cond) {

17684

17685

case ISD::SETULT:

17686

case ISD::SETLT: CmpMode = 0x00; break;

17687

case ISD::SETULE:

17688

case ISD::SETLE: CmpMode = 0x01; break;

17689

case ISD::SETUGT:

17690

case ISD::SETGT: CmpMode = 0x02; break;

17691

case ISD::SETUGE:

17692

case ISD::SETGE: CmpMode = 0x03; break;

17693

case ISD::SETEQ: CmpMode = 0x04; break;

17694

case ISD::SETNE: CmpMode = 0x05; break;

17695

}

17696

17697

// Are we comparing unsigned or signed integers?

17698

unsigned Opc =

17699

ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

17700

17701

return DAG.getNode(Opc, dl, VT, Op0, Op1,

17702

DAG.getConstant(CmpMode, dl, MVT::i8));

17703

}

17704

17705

// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.

17706

// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.

17707

if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {

17708

SDValue BC0 = peekThroughBitcasts(Op0);

17709

if (BC0.getOpcode() == ISD::AND) {

17710

APInt UndefElts;

17711

SmallVector<APInt, 64> EltBits;

17712

if (getTargetConstantBitsFromNode(BC0.getOperand(1),

17713

VT.getScalarSizeInBits(), UndefElts,

17714

EltBits, false, false)) {

17715

if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {

17716

Cond = ISD::SETEQ;

17717

Op1 = DAG.getBitcast(VT, BC0.getOperand(1));

17718

}

17719

}

17720

}

17721

}

17722

17723

// We are handling one of the integer comparisons here. Since SSE only has

17724

// GT and EQ comparisons for integer, swapping operands and multiple

17725

// operations may be required for some comparisons.

17726

unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ

17727

: X86ISD::PCMPGT;

17728

bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||

17729

Cond == ISD::SETGE || Cond == ISD::SETUGE;

17730

bool Invert = Cond == ISD::SETNE ||

17731

(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

17732

17733

// If both operands are known non-negative, then an unsigned compare is the

17734

// same as a signed compare and there's no need to flip signbits.

17735

// TODO: We could check for more general simplifications here since we're

17736

// computing known bits.

17737

bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&

17738

!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

17739

17740

// Special case: Use min/max operations for SETULE/SETUGE

17741

MVT VET = VT.getVectorElementType();

17742

bool HasMinMax =

17743

(Subtarget.hasAVX512() && VET == MVT::i64) ||

17744

(Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||

17745

(Subtarget.hasSSE2() && (VET == MVT::i8));

17746

bool MinMax = false;

17747

if (HasMinMax) {

17748

switch (Cond) {

17749

default: break;

17750

case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;

17751

case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;

17752

}

17753

17754

if (MinMax)

17755

Swap = Invert = FlipSigns = false;

17756

}

17757

17758

bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);

17759

bool Subus = false;

17760

if (!MinMax && HasSubus) {

17761

// As another special case, use PSUBUS[BW] when it's profitable. E.g. for

17762

// Op0 u<= Op1:

17763

// t = psubus Op0, Op1

17764

// pcmpeq t, <0..0>

17765

switch (Cond) {

17766

default: break;

17767

case ISD::SETULT: {

17768

// If the comparison is against a constant we can turn this into a

17769

// setule. With psubus, setule does not require a swap. This is

17770

// beneficial because the constant in the register is no longer

17771

// destructed as the destination so it can be hoisted out of a loop.

17772

// Only do this pre-AVX since vpcmp* is no longer destructive.

17773

if (Subtarget.hasAVX())

17774

break;

17775

if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {

17776

Op1 = ULEOp1;

17777

Subus = true; Invert = false; Swap = false;

17778

}

17779

break;

17780

}

17781

// Psubus is better than flip-sign because it requires no inversion.

17782

case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;

17783

case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;

17784

}

17785

17786

if (Subus) {

17787

Opc = X86ISD::SUBUS;

17788

FlipSigns = false;

17789

}

17790

}

17791

17792

if (Swap)

17793

std::swap(Op0, Op1);

17794

17795

// Check that the operation in question is available (most are plain SSE2,

17796

// but PCMPGTQ and PCMPEQQ have different requirements).

17797

if (VT == MVT::v2i64) {

17798

if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {

17799

assert(Subtarget.hasSSE2() && "Don't know how to lower!")(static_cast <bool> (Subtarget.hasSSE2() && "Don't know how to lower!"
) ? void (0) : __assert_fail ("Subtarget.hasSSE2() && \"Don't know how to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17799, __extension__ __PRETTY_FUNCTION__));

17800

17801

// First cast everything to the right type.

17802

Op0 = DAG.getBitcast(MVT::v4i32, Op0);

17803

Op1 = DAG.getBitcast(MVT::v4i32, Op1);

17804

17805

// Since SSE has no unsigned integer comparisons, we need to flip the sign

17806

// bits of the inputs before performing those operations. The lower

17807

// compare is always unsigned.

17808

SDValue SB;

17809

if (FlipSigns) {

17810

SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);

17811

} else {

17812

SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);

17813

SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);

17814

SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});

17815

}

17816

Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);

17817

Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

17818

17819

// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))

17820

SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);

17821

SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

17822

17823

// Create masks for only the low parts/high parts of the 64 bit integers.

17824

static const int MaskHi[] = { 1, 1, 3, 3 };

17825

static const int MaskLo[] = { 0, 0, 2, 2 };

17826

SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);

17827

SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);

17828

SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

17829

17830

SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);

17831

Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

17832

17833

if (Invert)

17834

Result = DAG.getNOT(dl, Result, MVT::v4i32);

17835

17836

return DAG.getBitcast(VT, Result);

17837

}

17838

17839

if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {

17840

// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with

17841

// pcmpeqd + pshufd + pand.

17842

assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!")(static_cast <bool> (Subtarget.hasSSE2() && !FlipSigns
&& "Don't know how to lower!") ? void (0) : __assert_fail
("Subtarget.hasSSE2() && !FlipSigns && \"Don't know how to lower!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 17842, __extension__ __PRETTY_FUNCTION__));

17843

17844

// First cast everything to the right type.

17845

Op0 = DAG.getBitcast(MVT::v4i32, Op0);

17846

Op1 = DAG.getBitcast(MVT::v4i32, Op1);

17847

17848

// Do the compare.

17849

SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

17850

17851

// Make sure the lower and upper halves are both all-ones.

17852

static const int Mask[] = { 1, 0, 3, 2 };

17853

SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);

17854

Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

17855

17856

if (Invert)

17857

Result = DAG.getNOT(dl, Result, MVT::v4i32);

17858

17859

return DAG.getBitcast(VT, Result);

17860

}

17861

}

17862

17863

// Since SSE has no unsigned integer comparisons, we need to flip the sign

17864

// bits of the inputs before performing those operations.

17865

if (FlipSigns) {

17866

MVT EltVT = VT.getVectorElementType();

17867

SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,

17868

VT);

17869

Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);

17870

Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);

17871

}

17872

17873

SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

17874

17875

// If the logical-not of the result is required, perform that now.

17876

if (Invert)

17877

Result = DAG.getNOT(dl, Result, VT);

17878

17879

if (MinMax)

17880

Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

17881

17882

if (Subus)

17883

Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,

17884

getZeroVector(VT, Subtarget, DAG, dl));

17885

17886

return Result;

17887

}

17888

17889

/// Lower a SETCC node. Vector results are forwarded to LowerVSETCC; scalar
/// results must be i8 and are turned into an X86 SETCC that reads EFLAGS.
/// Returns an empty SDValue when the condition code cannot be translated.
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();

  // Vector compares have a dedicated lowering.
  if (VT.isVector())
    return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");

  SDLoc dl(Op);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Both special cases below only apply to (in)equality tests.
  const bool IsEqOrNe = CC == ISD::SETEQ || CC == ISD::SETNE;
  const bool RHSIsZero = isNullConstant(Op1);

  // Try to form a bit test:
  //   (X & (1 << N)) == 0   -> BT(X, N)
  //   ((X >>u N) & 1) != 0  -> BT(X, N)
  //   ((X >>s N) & 1) != 0  -> BT(X, N)
  if (IsEqOrNe && RHSIsZero && Op0.getOpcode() == ISD::AND &&
      Op0.hasOneUse()) {
    if (SDValue BitTest = LowerAndToBT(Op0, CC, dl, DAG))
      return BitTest;
  }

  // X == 0, X == 1, X != 0 or X != 1 where X is already an X86 SETCC: either
  // hand back X itself or rebuild it with the opposite condition.
  if (IsEqOrNe && (RHSIsZero || isOneConstant(Op1)) &&
      Op0.getOpcode() == X86ISD::SETCC) {
    X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);

    // "X != 0" and "X == 1" keep the sense of X; the other two flip it.
    if ((CC == ISD::SETNE) == RHSIsZero)
      return Op0;

    CCode = X86::GetOppositeBranchCondition(CCode);
    return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
  }

  // General case: translate the condition, emit the compare and read the
  // flags it produces.
  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return getSETCC(X86CC, EFLAGS, dl, DAG);
}

17938

17939

/// Lower SETCCCARRY (operands: LHS, RHS, incoming carry, condition code) to
/// an X86 SBB whose flag result feeds an X86 SETCC.
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  // Recreate the carry if needed: add all-ones to the incoming carry value
  // and use the flag result (value #1) of that ADD below.
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  SDVTList CarryVTs = DAG.getVTList(CarryVT, MVT::i32);
  SDValue AllOnes = DAG.getConstant(NegOne, DL, CarryVT);
  Carry = DAG.getNode(X86ISD::ADD, DL, CarryVTs, Carry, AllOnes);
  SDValue CarryFlag = Carry.getValue(1);

  // Subtract with borrow and test the resulting flags.
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, CarryFlag);
  return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}

17959

17960

/// Return true if opcode is a X86 logical comparison.

17961

static bool isX86LogicalCmp(SDValue Op) {

17962

unsigned Opc = Op.getOpcode();

17963

if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||

17964

Opc == X86ISD::SAHF)

17965

return true;

17966

if (Op.getResNo() == 1 &&

17967

(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||

17968

Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||

17969

Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||

17970

Opc == X86ISD::XOR || Opc == X86ISD::AND))

17971

return true;

17972

17973

if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)

17974

return true;

17975

17976

return false;

17977

}

17978

17979

/// Return true if \p V is a TRUNCATE whose source operand is known to be zero
/// in every bit that the truncation drops.
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Src = V.getOperand(0);
  unsigned SrcBits = Src.getValueSizeInBits();
  unsigned DstBits = V.getValueSizeInBits();

  // Mask covering exactly the high bits removed by the truncate.
  APInt DroppedBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
  return DAG.MaskedValueIsZero(Src, DroppedBits);
}

17988

17989

/// Lower a SELECT node (operands: condition, true value, false value).
///
/// A series of special cases is tried in order (scalar FP selects through SSE
/// or AVX-512 nodes, vXi1 mask selects, selects between -1/0 and a value,
/// overflow-op conditions, ...). Whatever is left becomes an X86 CMOV driven
/// by a condition code and a flag-producing node.
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;
  // Cleared as soon as a flag-producing node has been found for Cond.
  bool AddTest = true;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  const bool IsSSEScalarFP =
      (Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
      (Subtarget.hasSSE1() && VT == MVT::f32);
  if (Cond.getOpcode() == ISD::SETCC && IsSSEScalarFP &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    unsigned SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (Subtarget.hasAVX512()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
      assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
    }

    if (SSECC < 8 || Subtarget.hasAVX()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.

      if (Subtarget.hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.

        const bool IsF32 = VT == MVT::f32;
        MVT VecVT = IsF32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = IsF32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }

      // (~Cmp & Op2) | (Cmp & Op1)
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  // Selects between vXi1 masks: when both operands have a scalar form (a
  // constant build vector converted to an integer, or the source of a
  // bitcast), select between the scalars and convert the result back.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    auto GetScalarForm = [&DAG](SDValue Vec) -> SDValue {
      if (ISD::isBuildVectorOfConstantSDNodes(Vec.getNode()))
        return ConvertI1VectorToInteger(Vec, DAG);
      if (Vec.getOpcode() == ISD::BITCAST && Vec.getOperand(0))
        return Vec.getOperand(0);
      return SDValue();
    };

    SDValue Op1Scalar = GetScalarForm(Op1);
    SDValue Op2Scalar = GetScalarForm(Op2);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      // Sizes differ: view the scalar as v8i1 and take the low subvector.
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  // v4i1/v2i1: insert both operands into undef v8i1 vectors, select on v8i1
  // and extract the low part again.
  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    const bool Op1IsAllOnes = isAllOnesConstant(Op1);
    const bool Op2IsAllOnes = isAllOnesConstant(Op2);

    if ((Op1IsAllOnes || Op2IsAllOnes) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      // Y is the select operand that is not (necessarily) all-ones.
      SDValue Y = Op2IsAllOnes ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (Op1IsAllOnes == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

      // Compare x against 1 instead of 0.
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (Op1IsAllOnes != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Match Op2 being an XOR or OR with one of its operands equal to Op1:
      // ( a , a op b) || ( b , a op b)
      const unsigned Op2Opc = Op2.getOpcode();
      if ((Op2Opc == ISD::XOR || Op2Opc == ISD::OR) &&
          (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
        // Src1 is the operand of Op2 that is not Op1; Src2 is Op1 itself.
        SDValue Src1 =
            Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
        SDValue Src2 = Op1;

        // we need mask of all zeros or ones with same size of the other
        // operands.
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;

        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT ResVT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (ResVT.isFloatingPoint() && !ResVT.isVector() &&
        !isScalarFPTypeInSSEReg(ResVT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    // The condition is the overflow bit of an arithmetic-with-overflow node:
    // emit the matching X86 node and select on its flag result.
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }

    // UMUL has two value results followed by the flags; every other node
    // here has one value result followed by the flags.
    const bool IsUMulO = CondOpcode == ISD::UMULO;
    SDVTList VTs;
    if (IsUMulO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
    Cond = X86Op.getValue(IsUMulO ? 2 : 1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  // Nothing produced flags so far: test the condition value against zero.
  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
                                 CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}

18316

18317

/// Lower a sign extension of the vector operand of \p Op to its (wider
/// element) result type. The work may be done on a temporarily extended
/// and/or widened vector type, which is then truncated/extracted back to the
/// requested result type.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);

  MVT VTElt = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  // Without BWI, results with i8/i16 elements are computed with i32 elements
  // first and truncated afterwards.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

  // Without VLX, widen to 512 bits: scale the element count and place the
  // input in the low part of an undef mask vector of that many elements.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    MVT WideInVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideInVT,
                     DAG.getUNDEF(WideInVT), In, DAG.getIntPtrConstant(0, dl));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  // Use VSEXT when DQI (elements of 32 bits or more) or BWI (elements of 16
  // bits or fewer) is available; otherwise select between all-ones and zero.
  const unsigned WideEltBits = WideVT.getVectorElementType().getSizeInBits();
  const bool UseVSEXT = (Subtarget.hasDQI() && WideEltBits >= 32) ||
                        (Subtarget.hasBWI() && WideEltBits <= 16);

  SDValue V;
  if (UseVSEXT) {
    V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
  } else {
    SDValue NegOne = getOnesVector(WideVT, DAG, dl);
    SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
  }

  // Truncate if we had to extend i16/i8 above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(VTElt, NumElts);
    V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
                    DAG.getIntPtrConstant(0, dl));

  return V;
}

18368

18369

// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.

18370

// For sign extend this needs to handle all vector sizes and SSE4.1 and

18371

// non-SSE4.1 targets. For zero extend this should only handle inputs of

18372

// MVT::v64i8 when BWI is not supported, but AVX512 is.

18373

static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,

18374

const X86Subtarget &Subtarget,

18375

SelectionDAG &DAG) {

18376

SDValue In = Op->getOperand(0);

18377

MVT VT = Op->getSimpleValueType(0);

18378

MVT InVT = In.getSimpleValueType();

18379

assert(VT.getSizeInBits() == InVT.getSizeInBits())(static_cast <bool> (VT.getSizeInBits() == InVT.getSizeInBits
()) ? void (0) : __assert_fail ("VT.getSizeInBits() == InVT.getSizeInBits()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18379, __extension__ __PRETTY_FUNCTION__));

18380

18381

MVT SVT = VT.getVectorElementType();

18382

MVT InSVT = InVT.getVectorElementType();

18383

assert(SVT.getSizeInBits() > InSVT.getSizeInBits())(static_cast <bool> (SVT.getSizeInBits() > InSVT.getSizeInBits
()) ? void (0) : __assert_fail ("SVT.getSizeInBits() > InSVT.getSizeInBits()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18383, __extension__ __PRETTY_FUNCTION__));

18384

18385

if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)

18386

return SDValue();

18387

if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)

18388

return SDValue();

18389

if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&

18390

!(VT.is256BitVector() && Subtarget.hasInt256()) &&

18391

!(VT.is512BitVector() && Subtarget.hasAVX512()))

18392

return SDValue();

18393

18394

SDLoc dl(Op);

18395

18396

// For 256-bit vectors, we only need the lower (128-bit) half of the input.

18397

// For 512-bit vectors, we need 128-bits or 256-bits.

18398

if (VT.getSizeInBits() > 128) {

18399

// Input needs to be at least the same number of elements as output, and

18400

// at least 128-bits.

18401

int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();

18402

In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));

18403

}

18404

18405

assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||(static_cast <bool> ((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG
|| InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"
) ? void (0) : __assert_fail ("(Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && \"Zero extend only for v64i8 input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18406, __extension__ __PRETTY_FUNCTION__))

18406

InVT == MVT::v64i8) && "Zero extend only for v64i8 input!")(static_cast <bool> ((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG
|| InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"
) ? void (0) : __assert_fail ("(Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && \"Zero extend only for v64i8 input!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18406, __extension__ __PRETTY_FUNCTION__));

18407

18408

// SSE41 targets can use the pmovsx* instructions directly for 128-bit results,

18409

// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still

18410

// need to be handled here for 256/512-bit results.

18411

if (Subtarget.hasInt256()) {

18412

assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension")(static_cast <bool> (VT.getSizeInBits() > 128 &&
"Unexpected 128-bit vector extension") ? void (0) : __assert_fail
("VT.getSizeInBits() > 128 && \"Unexpected 128-bit vector extension\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18412, __extension__ __PRETTY_FUNCTION__));

18413

unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?

18414

X86ISD::VSEXT : X86ISD::VZEXT;

18415

return DAG.getNode(ExtOpc, dl, VT, In);

18416

}

18417

18418

// We should only get here for sign extend.

18419

assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&(static_cast <bool> (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
&& "Unexpected opcode!") ? void (0) : __assert_fail (
"Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18420, __extension__ __PRETTY_FUNCTION__))

18420

"Unexpected opcode!")(static_cast <bool> (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
&& "Unexpected opcode!") ? void (0) : __assert_fail (
"Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18420, __extension__ __PRETTY_FUNCTION__));

18421

18422

// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.

18423

SDValue Curr = In;

18424

MVT CurrVT = InVT;

18425

18426

// As SRAI is only available on i16/i32 types, we expand only up to i32

18427

// and handle i64 separately.

18428

while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {

18429

Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);

18430

MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);

18431

CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);

18432

Curr = DAG.getBitcast(CurrVT, Curr);

18433

}

18434

18435

SDValue SignExt = Curr;

18436

if (CurrVT != InVT) {

18437

unsigned SignExtShift =

18438

CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();

18439

SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,

18440

DAG.getConstant(SignExtShift, dl, MVT::i8));

18441

}

18442

18443

if (CurrVT == VT)

18444

return SignExt;

18445

18446

if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {

18447

SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,

18448

DAG.getConstant(31, dl, MVT::i8));

18449

SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});

18450

return DAG.getBitcast(VT, Ext);

18451

}

18452

18453

return SDValue();

18454

}

18455

18456

/// Lower a vector SIGN_EXTEND. Inputs with i1 elements are forwarded to
/// LowerSIGN_EXTEND_Mask. Otherwise only the result/input type pairs listed
/// below are handled; any other pair yields an empty SDValue.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  // True when this node extends from From to To.
  auto Extends = [&](MVT To, MVT From) { return VT == To && InVT == From; };

  const bool IsHandledPair =
      Extends(MVT::v4i64, MVT::v4i32) ||
      Extends(MVT::v8i32, MVT::v8i16) ||
      Extends(MVT::v16i16, MVT::v16i8) ||
      Extends(MVT::v8i64, MVT::v8i32) ||
      Extends(MVT::v8i64, MVT::v8i16) ||
      Extends(MVT::v16i32, MVT::v16i16) ||
      Extends(MVT::v16i32, MVT::v16i8) ||
      Extends(MVT::v32i16, MVT::v32i8);
  if (!IsHandledPair)
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode
  // Sign extend  v8i16 to v8i32 and
  //              v4i32 to v4i64
  //
  // Divide input vector into two parts
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT

  unsigned NumElems = InVT.getVectorNumElements();
  unsigned HalfElems = NumElems / 2;
  SDValue Undef = DAG.getUNDEF(InVT);

  // Masks that move the low / high half of the input into the low lanes and
  // leave the remaining lanes undefined (-1).
  SmallVector<int,8> LoMask(NumElems, -1);
  SmallVector<int,8> HiMask(NumElems, -1);
  for (unsigned i = 0; i != HalfElems; ++i) {
    LoMask[i] = i;
    HiMask[i] = i + HalfElems;
  }

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, LoMask);
  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, HiMask);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  // Extend each half in-register and glue the halves back together.
  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

18511

18512

// Lower truncating store. We need a special lowering to vXi1 vectors

18513

static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,

18514

SelectionDAG &DAG) {

18515

StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());

18516

SDLoc dl(St);

18517

EVT MemVT = St->getMemoryVT();

18518

assert(St->isTruncatingStore() && "We only custom truncating store.")(static_cast <bool> (St->isTruncatingStore() &&
"We only custom truncating store.") ? void (0) : __assert_fail
("St->isTruncatingStore() && \"We only custom truncating store.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18518, __extension__ __PRETTY_FUNCTION__));

18519

assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&(static_cast <bool> (MemVT.isVector() && MemVT.
getVectorElementType() == MVT::i1 && "Expected truncstore of i1 vector"
) ? void (0) : __assert_fail ("MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && \"Expected truncstore of i1 vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18520, __extension__ __PRETTY_FUNCTION__))

18520

"Expected truncstore of i1 vector")(static_cast <bool> (MemVT.isVector() && MemVT.
getVectorElementType() == MVT::i1 && "Expected truncstore of i1 vector"
) ? void (0) : __assert_fail ("MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && \"Expected truncstore of i1 vector\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18520, __extension__ __PRETTY_FUNCTION__));

18521

18522

SDValue Op = St->getValue();

18523

MVT OpVT = Op.getValueType().getSimpleVT();

18524

unsigned NumElts = OpVT.getVectorNumElements();

18525

if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||

18526

NumElts == 16) {

18527

// Truncate and store - everything is legal

18528

Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);

18529

if (MemVT.getSizeInBits() < 8)

18530

Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,

18531

DAG.getUNDEF(MVT::v8i1), Op,

18532

DAG.getIntPtrConstant(0, dl));

18533

return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),

18534

St->getMemOperand());

18535

}

18536

18537

// A subset, assume that we have only AVX-512F

18538

if (NumElts <= 8) {

18539

if (NumElts < 8) {

18540

// Extend to 8-elts vector

18541

MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);

18542

Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,

18543

DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));

18544

}

18545

Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);

18546

return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),

18547

St->getMemOperand());

18548

}

18549

// v32i8

18550

assert(OpVT == MVT::v32i8 && "Unexpected operand type")(static_cast <bool> (OpVT == MVT::v32i8 && "Unexpected operand type"
) ? void (0) : __assert_fail ("OpVT == MVT::v32i8 && \"Unexpected operand type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18550, __extension__ __PRETTY_FUNCTION__));

18551

// Divide the vector into 2 parts and store each part separately

18552

SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,

18553

DAG.getIntPtrConstant(0, dl));

18554

Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);

18555

SDValue BasePtr = St->getBasePtr();

18556

SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,

18557

St->getMemOperand());

18558

SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,

18559

DAG.getIntPtrConstant(16, dl));

18560

Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

18561

18562

SDValue BasePtrHi =

18563

DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,

18564

DAG.getConstant(2, dl, BasePtr.getValueType()));

18565

18566

SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,

18567

BasePtrHi, St->getMemOperand());

18568

return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);

18569

}

18570

18571

/// Custom lowering for an extending load whose memory type is a vector of i1.
///
/// A ZEXTLOAD is extended with ISD::ZERO_EXTEND; every other extension type
/// is extended with ISD::SIGN_EXTEND. In all paths the users of the original
/// load's chain result (value #1) are redirected to the chain of the newly
/// created load(s). Strategies, in order:
///  1. (BWI and >= 32 elements), (DQI and < 16 elements), or exactly 16
///     elements: load the mask directly and extend it. Results with fewer
///     than 8 elements load a v8i1, extend to 8 elements and extract the low
///     subvector.
///  2. At most 8 elements: load an 8-bit integer, bitcast it to v8i1, extend,
///     and extract the low subvector when fewer than 8 elements are wanted.
///  3. Otherwise the result must be v32i8: load two v16i1 values (at the base
///     pointer and at base pointer + 2), extend each to v16i8 and concatenate.
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
         "Expected i1 vector load");

  unsigned ExtOpcode = ISD::SIGN_EXTEND;
  if (Ld->getExtensionType() == ISD::ZEXTLOAD)
    ExtOpcode = ISD::ZERO_EXTEND;

  MVT VT = Op.getValueType().getSimpleVT();
  const unsigned EltCount = VT.getVectorNumElements();

  // Loads a value of type Ty from the original address, reusing the original
  // chain and memory operand.
  auto LoadFromBase = [&](EVT Ty) {
    return DAG.getLoad(Ty, dl, Ld->getChain(), Ld->getBasePtr(),
                       Ld->getMemOperand());
  };
  // Points every user of the old load's chain at NewChain instead.
  auto RedirectChainUsers = [&](SDValue NewChain) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
  };

  const bool LoadAndExtendIsLegal = (Subtarget.hasBWI() && EltCount >= 32) ||
                                    (Subtarget.hasDQI() && EltCount < 16) ||
                                    EltCount == 16;
  if (LoadAndExtendIsLegal) {
    if (EltCount < 8) {
      // Load a full v8i1, extend all 8 lanes, keep only the low lanes.
      SDValue Load = LoadFromBase(MVT::v8i1);
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
      RedirectChainUsers(Load.getValue(1));
      MVT WideVT = MVT::getVectorVT(VT.getScalarType(), 8);
      SDValue Widened = DAG.getNode(ExtOpcode, dl, WideVT, Load);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Widened,
                         DAG.getIntPtrConstant(0, dl));
    }

    // Load the mask with its memory type and extend it to the result type.
    SDValue Load = LoadFromBase(MemVT);
    RedirectChainUsers(Load.getValue(1));
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
  }

  if (EltCount <= 8) {
    // From here on only AVX-512F is assumed: load the bits as an i8 scalar
    // and reinterpret them as a v8i1 mask.
    const unsigned BitsToLoad = 8;
    MVT ScalarTy = MVT::getIntegerVT(BitsToLoad);
    SDValue Load = LoadFromBase(ScalarTy);
    RedirectChainUsers(Load.getValue(1));

    MVT MaskVT = MVT::getVectorVT(MVT::i1, BitsToLoad);
    SDValue Mask = DAG.getBitcast(MaskVT, Load);

    if (EltCount == 8)
      return DAG.getNode(ExtOpcode, dl, VT, Mask);

    // Fewer than 8 elements (e.g. v4i1, v2i1): extend all 8 lanes and then
    // take the low subvector.
    MVT WideVT = MVT::getVectorVT(VT.getScalarType(), 8);
    SDValue Widened = DAG.getNode(ExtOpcode, dl, WideVT, Mask);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Widened,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(VT == MVT::v32i8 && "Unexpected extload type");

  // Split into two v16i1 loads: one at the base pointer, one at base + 2.
  SDValue Base = Ld->getBasePtr();
  SDValue LoLoad = LoadFromBase(MVT::v16i1);

  SDValue HiAddr = DAG.getNode(ISD::ADD, dl, Base.getValueType(), Base,
                               DAG.getConstant(2, dl, Base.getValueType()));
  SDValue HiLoad = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), HiAddr,
                               Ld->getMemOperand());

  // Merge the chains of both loads and hand the result to the old chain users.
  SDValue LoadChains[] = {LoLoad.getValue(1), HiLoad.getValue(1)};
  SDValue JoinedChain =
      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
  RedirectChainUsers(JoinedChain);

  // Extend each half to v16i8 and concatenate into the v32i8 result.
  SDValue LoExt = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoLoad);
  SDValue HiExt = DAG.getNode(ExtOpcode, dl, MVT::v16i8, HiLoad);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, LoExt, HiExt);
}

18663

18664

// Lower vector extended loads using a shuffle. If SSSE3 is not available we

18665

// may emit an illegal shuffle but the expansion is still better than scalar

18666

// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise

18667

// we'll emit a shuffle and a arithmetic shift.

18668

// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.

18669

// TODO: It is possible to support ZExt by zeroing the undef values during

18670

// the shuffle phase or after the shuffle.

18671

static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,

18672

SelectionDAG &DAG) {

18673

MVT RegVT = Op.getSimpleValueType();

18674

assert(RegVT.isVector() && "We only custom lower vector sext loads.")(static_cast <bool> (RegVT.isVector() && "We only custom lower vector sext loads."
) ? void (0) : __assert_fail ("RegVT.isVector() && \"We only custom lower vector sext loads.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18674, __extension__ __PRETTY_FUNCTION__));

18675

assert(RegVT.isInteger() &&(static_cast <bool> (RegVT.isInteger() && "We only custom lower integer vector sext loads."
) ? void (0) : __assert_fail ("RegVT.isInteger() && \"We only custom lower integer vector sext loads.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18676, __extension__ __PRETTY_FUNCTION__))

18676

"We only custom lower integer vector sext loads.")(static_cast <bool> (RegVT.isInteger() && "We only custom lower integer vector sext loads."
) ? void (0) : __assert_fail ("RegVT.isInteger() && \"We only custom lower integer vector sext loads.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18676, __extension__ __PRETTY_FUNCTION__));

18677

18678

// Nothing useful we can do without SSE2 shuffles.

18679

assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.")(static_cast <bool> (Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."
) ? void (0) : __assert_fail ("Subtarget.hasSSE2() && \"We only custom lower sext loads with SSE2.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18679, __extension__ __PRETTY_FUNCTION__));

18680

18681

LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());

18682

SDLoc dl(Ld);

18683

EVT MemVT = Ld->getMemoryVT();

18684

if (MemVT.getScalarType() == MVT::i1)

18685

return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

18686

18687

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

18688

unsigned RegSz = RegVT.getSizeInBits();

18689

18690

ISD::LoadExtType Ext = Ld->getExtensionType();

18691

18692

assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)(static_cast <bool> ((Ext == ISD::EXTLOAD || Ext == ISD
::SEXTLOAD) && "Only anyext and sext are currently implemented."
) ? void (0) : __assert_fail ("(Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && \"Only anyext and sext are currently implemented.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18693, __extension__ __PRETTY_FUNCTION__))

18693

&& "Only anyext and sext are currently implemented.")(static_cast <bool> ((Ext == ISD::EXTLOAD || Ext == ISD
::SEXTLOAD) && "Only anyext and sext are currently implemented."
) ? void (0) : __assert_fail ("(Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && \"Only anyext and sext are currently implemented.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18693, __extension__ __PRETTY_FUNCTION__));

18694

assert(MemVT != RegVT && "Cannot extend to the same type")(static_cast <bool> (MemVT != RegVT && "Cannot extend to the same type"
) ? void (0) : __assert_fail ("MemVT != RegVT && \"Cannot extend to the same type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18694, __extension__ __PRETTY_FUNCTION__));

18695

assert(MemVT.isVector() && "Must load a vector from memory")(static_cast <bool> (MemVT.isVector() && "Must load a vector from memory"
) ? void (0) : __assert_fail ("MemVT.isVector() && \"Must load a vector from memory\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18695, __extension__ __PRETTY_FUNCTION__));

18696

18697

unsigned NumElems = RegVT.getVectorNumElements();

18698

unsigned MemSz = MemVT.getSizeInBits();

18699

assert(RegSz > MemSz && "Register size must be greater than the mem size")(static_cast <bool> (RegSz > MemSz && "Register size must be greater than the mem size"
) ? void (0) : __assert_fail ("RegSz > MemSz && \"Register size must be greater than the mem size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18699, __extension__ __PRETTY_FUNCTION__));

18700

18701

if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {

18702

// The only way in which we have a legal 256-bit vector result but not the

18703

// integer 256-bit operations needed to directly lower a sextload is if we

18704

// have AVX1 but not AVX2. In that case, we can always emit a sextload to

18705

// a 128-bit vector and a normal sign_extend to 256-bits that should get

18706

// correctly legalized. We do this late to allow the canonical form of

18707

// sextload to persist throughout the rest of the DAG combiner -- it wants

18708

// to fold together any extensions it can, and so will fuse a sign_extend

18709

// of an sextload into a sextload targeting a wider value.

18710

SDValue Load;

18711

if (MemSz == 128) {

18712

// Just switch this to a normal load.

18713

assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "(static_cast <bool> (TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector " "type!") ? void (0) : __assert_fail
("TLI.isTypeLegal(MemVT) && \"If the memory type is a 128-bit type, \" \"it must be a legal 128-bit vector \" \"type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18715, __extension__ __PRETTY_FUNCTION__))

18714

"it must be a legal 128-bit vector "(static_cast <bool> (TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector " "type!") ? void (0) : __assert_fail
("TLI.isTypeLegal(MemVT) && \"If the memory type is a 128-bit type, \" \"it must be a legal 128-bit vector \" \"type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18715, __extension__ __PRETTY_FUNCTION__))

18715

"type!")(static_cast <bool> (TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector " "type!") ? void (0) : __assert_fail
("TLI.isTypeLegal(MemVT) && \"If the memory type is a 128-bit type, \" \"it must be a legal 128-bit vector \" \"type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18715, __extension__ __PRETTY_FUNCTION__));

18716

Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),

18717

Ld->getPointerInfo(), Ld->getAlignment(),

18718

Ld->getMemOperand()->getFlags());

18719

} else {

18720

assert(MemSz < 128 &&(static_cast <bool> (MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"
) ? void (0) : __assert_fail ("MemSz < 128 && \"Can't extend a type wider than 128 bits to a 256 bit vector!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18721, __extension__ __PRETTY_FUNCTION__))

18721

"Can't extend a type wider than 128 bits to a 256 bit vector!")(static_cast <bool> (MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"
) ? void (0) : __assert_fail ("MemSz < 128 && \"Can't extend a type wider than 128 bits to a 256 bit vector!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18721, __extension__ __PRETTY_FUNCTION__));

18722

// Do an sext load to a 128-bit vector type. We want to use the same

18723

// number of elements, but elements half as wide. This will end up being

18724

// recursively lowered by this routine, but will succeed as we definitely

18725

// have all the necessary features if we're using AVX1.

18726

EVT HalfEltVT =

18727

EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);

18728

EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);

18729

Load =

18730

DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),

18731

Ld->getPointerInfo(), MemVT, Ld->getAlignment(),

18732

Ld->getMemOperand()->getFlags());

18733

}

18734

18735

// Replace chain users with the new chain.

18736

18737

DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

18738

18739

// Finally, do a normal sign-extend to the desired register.

18740

return DAG.getSExtOrTrunc(Load, dl, RegVT);

18741

}

18742

18743

// All sizes must be a power of two.

18744

assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&(static_cast <bool> (isPowerOf2_32(RegSz * MemSz * NumElems
) && "Non-power-of-two elements are not custom lowered!"
) ? void (0) : __assert_fail ("isPowerOf2_32(RegSz * MemSz * NumElems) && \"Non-power-of-two elements are not custom lowered!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18745, __extension__ __PRETTY_FUNCTION__))

18745

"Non-power-of-two elements are not custom lowered!")(static_cast <bool> (isPowerOf2_32(RegSz * MemSz * NumElems
) && "Non-power-of-two elements are not custom lowered!"
) ? void (0) : __assert_fail ("isPowerOf2_32(RegSz * MemSz * NumElems) && \"Non-power-of-two elements are not custom lowered!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18745, __extension__ __PRETTY_FUNCTION__));

18746

18747

// Attempt to load the original value using scalar loads.

18748

// Find the largest scalar type that divides the total loaded size.

18749

MVT SclrLoadTy = MVT::i8;

18750

for (MVT Tp : MVT::integer_valuetypes()) {

18751

if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {

18752

SclrLoadTy = Tp;

18753

}

18754

}

18755

18756

// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.

18757

if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&

18758

(64 <= MemSz))

18759

SclrLoadTy = MVT::f64;

18760

18761

// Calculate the number of scalar loads that we need to perform

18762

// in order to load our vector from memory.

18763

unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

18764

18765

assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&(static_cast <bool> ((Ext != ISD::SEXTLOAD || NumLoads ==
1) && "Can only lower sext loads with a single scalar load!"
) ? void (0) : __assert_fail ("(Ext != ISD::SEXTLOAD || NumLoads == 1) && \"Can only lower sext loads with a single scalar load!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18766, __extension__ __PRETTY_FUNCTION__))

18766

"Can only lower sext loads with a single scalar load!")(static_cast <bool> ((Ext != ISD::SEXTLOAD || NumLoads ==
1) && "Can only lower sext loads with a single scalar load!"
) ? void (0) : __assert_fail ("(Ext != ISD::SEXTLOAD || NumLoads == 1) && \"Can only lower sext loads with a single scalar load!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18766, __extension__ __PRETTY_FUNCTION__));

18767

18768

unsigned loadRegZize = RegSz;

18769

if (Ext == ISD::SEXTLOAD && RegSz >= 256)

18770

loadRegZize = 128;

18771

18772

// If we don't have BWI we won't be able to create the shuffle needed for

18773

// v8i8->v8i64.

18774

if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&

18775

MemVT == MVT::v8i8)

18776

loadRegZize = 128;

18777

18778

// Represent our vector as a sequence of elements which are the

18779

// largest scalar that we can load.

18780

EVT LoadUnitVecVT = EVT::getVectorVT(

18781

*DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());

18782

18783

// Represent the data using the same element type that is stored in

18784

// memory. In practice, we ''widen'' MemVT.

18785

EVT WideVecVT =

18786

EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),

18787

loadRegZize / MemVT.getScalarSizeInBits());

18788

18789

assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&(static_cast <bool> (WideVecVT.getSizeInBits() == LoadUnitVecVT
.getSizeInBits() && "Invalid vector type") ? void (0)
: __assert_fail ("WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && \"Invalid vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18790, __extension__ __PRETTY_FUNCTION__))

18790

"Invalid vector type")(static_cast <bool> (WideVecVT.getSizeInBits() == LoadUnitVecVT
.getSizeInBits() && "Invalid vector type") ? void (0)
: __assert_fail ("WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && \"Invalid vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18790, __extension__ __PRETTY_FUNCTION__));

18791

18792

// We can't shuffle using an illegal type.

18793

assert(TLI.isTypeLegal(WideVecVT) &&(static_cast <bool> (TLI.isTypeLegal(WideVecVT) &&
"We only lower types that form legal widened vector types") ?
void (0) : __assert_fail ("TLI.isTypeLegal(WideVecVT) && \"We only lower types that form legal widened vector types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18794, __extension__ __PRETTY_FUNCTION__))

18794

"We only lower types that form legal widened vector types")(static_cast <bool> (TLI.isTypeLegal(WideVecVT) &&
"We only lower types that form legal widened vector types") ?
void (0) : __assert_fail ("TLI.isTypeLegal(WideVecVT) && \"We only lower types that form legal widened vector types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18794, __extension__ __PRETTY_FUNCTION__));

18795

18796

SmallVector<SDValue, 8> Chains;

18797

SDValue Ptr = Ld->getBasePtr();

18798

SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,

18799

TLI.getPointerTy(DAG.getDataLayout()));

18800

SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

18801

18802

for (unsigned i = 0; i < NumLoads; ++i) {

18803

// Perform a single load.

18804

SDValue ScalarLoad =

18805

DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),

18806

Ld->getAlignment(), Ld->getMemOperand()->getFlags());

18807

Chains.push_back(ScalarLoad.getValue(1));

18808

// Create the first element type using SCALAR_TO_VECTOR in order to avoid

18809

// another round of DAGCombining.

18810

if (i == 0)

18811

Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);

18812

else

18813

Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,

18814

ScalarLoad, DAG.getIntPtrConstant(i, dl));

18815

18816

Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

18817

}

18818

18819

SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

18820

18821

// Bitcast the loaded value to a vector of the original element type, in

18822

// the size of the target vector type.

18823

SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);

18824

unsigned SizeRatio = RegSz / MemSz;

18825

18826

if (Ext == ISD::SEXTLOAD) {

18827

// If we have SSE4.1, we can directly emit a VSEXT node.

18828

if (Subtarget.hasSSE41()) {

18829

SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);

18830

DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);

18831

return Sext;

18832

}

18833

18834

// Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest

18835

// lanes.

18836

assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&(static_cast <bool> (TLI.isOperationLegalOrCustom(ISD::
SIGN_EXTEND_VECTOR_INREG, RegVT) && "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"
) ? void (0) : __assert_fail ("TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && \"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18837, __extension__ __PRETTY_FUNCTION__))

18837

"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!")(static_cast <bool> (TLI.isOperationLegalOrCustom(ISD::
SIGN_EXTEND_VECTOR_INREG, RegVT) && "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"
) ? void (0) : __assert_fail ("TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && \"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18837, __extension__ __PRETTY_FUNCTION__));

18838

18839

SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);

18840

DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);

18841

return Shuff;

18842

}

18843

18844

if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&

18845

MemVT == MVT::v8i8) {

18846

SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);

18847

DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);

18848

return Sext;

18849

}

18850

18851

// Redistribute the loaded elements into the different locations.

18852

SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);

18853

for (unsigned i = 0; i != NumElems; ++i)

18854

ShuffleVec[i * SizeRatio] = i;

18855

18856

SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,

18857

DAG.getUNDEF(WideVecVT), ShuffleVec);

18858

18859

// Bitcast to the requested type.

18860

Shuff = DAG.getBitcast(RegVT, Shuff);

18861

DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);

18862

return Shuff;

18863

}

18864

18865

/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes

18866

/// each of which has no other use apart from the AND / OR.

18867

static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {

18868

Opc = Op.getOpcode();

18869

if (Opc != ISD::OR && Opc != ISD::AND)

18870

return false;

18871

return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&

18872

Op.getOperand(0).hasOneUse() &&

18873

Op.getOperand(1).getOpcode() == X86ISD::SETCC &&

18874

Op.getOperand(1).hasOneUse());

18875

}

18876

18877

/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the

18878

/// SETCC node has a single use.

18879

static bool isXor1OfSetCC(SDValue Op) {

18880

if (Op.getOpcode() != ISD::XOR)

18881

return false;

18882

if (isOneConstant(Op.getOperand(1)))

18883

return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&

18884

Op.getOperand(0).hasOneUse();

18885

return false;

18886

}

18887

18888

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {

18889

bool addTest = true;

18890

SDValue Chain = Op.getOperand(0);

18891

SDValue Cond = Op.getOperand(1);

18892

SDValue Dest = Op.getOperand(2);

18893

SDLoc dl(Op);

18894

SDValue CC;

18895

bool Inverted = false;

18896

18897

if (Cond.getOpcode() == ISD::SETCC) {

18898

// Check for setcc([su]{add,sub,mul}o == 0).

18899

if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&

18900

isNullConstant(Cond.getOperand(1)) &&

18901

Cond.getOperand(0).getResNo() == 1 &&

18902

(Cond.getOperand(0).getOpcode() == ISD::SADDO ||

18903

Cond.getOperand(0).getOpcode() == ISD::UADDO ||

18904

Cond.getOperand(0).getOpcode() == ISD::SSUBO ||

18905

Cond.getOperand(0).getOpcode() == ISD::USUBO ||

18906

Cond.getOperand(0).getOpcode() == ISD::SMULO ||

18907

Cond.getOperand(0).getOpcode() == ISD::UMULO)) {

18908

Inverted = true;

18909

Cond = Cond.getOperand(0);

18910

} else {

18911

if (SDValue NewCond = LowerSETCC(Cond, DAG))

18912

Cond = NewCond;

18913

}

18914

}

18915

#if 0

18916

// FIXME: LowerXALUO doesn't handle these!!

18917

else if (Cond.getOpcode() == X86ISD::ADD ||

18918

Cond.getOpcode() == X86ISD::SUB ||

18919

Cond.getOpcode() == X86ISD::SMUL ||

18920

Cond.getOpcode() == X86ISD::UMUL)

18921

Cond = LowerXALUO(Cond, DAG);

18922

#endif

18923

18924

// Look pass (and (setcc_carry (cmp ...)), 1).

18925

if (Cond.getOpcode() == ISD::AND &&

18926

Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&

18927

isOneConstant(Cond.getOperand(1)))

18928

Cond = Cond.getOperand(0);

18929

18930

// If condition flag is set by a X86ISD::CMP, then use it as the condition

18931

// setting operand in place of the X86ISD::SETCC.

18932

unsigned CondOpcode = Cond.getOpcode();

18933

if (CondOpcode == X86ISD::SETCC ||

18934

CondOpcode == X86ISD::SETCC_CARRY) {

18935

CC = Cond.getOperand(0);

18936

18937

SDValue Cmp = Cond.getOperand(1);

18938

unsigned Opc = Cmp.getOpcode();

18939

// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??

18940

if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {

18941

Cond = Cmp;

18942

addTest = false;

18943

} else {

18944

switch (cast<ConstantSDNode>(CC)->getZExtValue()) {

18945

default: break;

18946

case X86::COND_O:

18947

case X86::COND_B:

18948

// These can only come from an arithmetic instruction with overflow,

18949

// e.g. SADDO, UADDO.

18950

Cond = Cond.getOperand(1);

18951

addTest = false;

18952

break;

18953

}

18954

}

18955

}

18956

CondOpcode = Cond.getOpcode();

18957

if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||

18958

CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||

18959

((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&

18960

Cond.getOperand(0).getValueType() != MVT::i8)) {

18961

SDValue LHS = Cond.getOperand(0);

18962

SDValue RHS = Cond.getOperand(1);

18963

unsigned X86Opcode;

18964

unsigned X86Cond;

18965

SDVTList VTs;

18966

// Keep this in sync with LowerXALUO, otherwise we might create redundant

18967

// instructions that can't be removed afterwards (i.e. X86ISD::ADD and

18968

// X86ISD::INC).

18969

switch (CondOpcode) {

18970

case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;

18971

case ISD::SADDO:

18972

if (isOneConstant(RHS)) {

18973

X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;

18974

break;

18975

}

18976

X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;

18977

case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;

18978

case ISD::SSUBO:

18979

if (isOneConstant(RHS)) {

18980

X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;

18981

break;

18982

}

18983

X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;

18984

case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;

18985

case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;

18986

18987

}

18988

if (Inverted)

18989

X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);

18990

if (CondOpcode == ISD::UMULO)

18991

VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),

18992

MVT::i32);

18993

else

18994

VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

18995

18996

SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

18997

18998

if (CondOpcode == ISD::UMULO)

18999

Cond = X86Op.getValue(2);

19000

else

19001

Cond = X86Op.getValue(1);

19002

19003

CC = DAG.getConstant(X86Cond, dl, MVT::i8);

19004

addTest = false;

19005

} else {

19006

unsigned CondOpc;

19007

if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {

19008

SDValue Cmp = Cond.getOperand(0).getOperand(1);

19009

if (CondOpc == ISD::OR) {

19010

// Also, recognize the pattern generated by an FCMP_UNE. We can emit

19011

// two branches instead of an explicit OR instruction with a

19012

// separate test.

19013

if (Cmp == Cond.getOperand(1).getOperand(1) &&

19014

isX86LogicalCmp(Cmp)) {

19015

CC = Cond.getOperand(0).getOperand(0);

19016

Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),

19017

Chain, Dest, CC, Cmp);

19018

CC = Cond.getOperand(1).getOperand(0);

19019

Cond = Cmp;

19020

addTest = false;

19021

}

19022

} else { // ISD::AND

19023

// Also, recognize the pattern generated by an FCMP_OEQ. We can emit

19024

// two branches instead of an explicit AND instruction with a

19025

// separate test. However, we only do this if this block doesn't

19026

// have a fall-through edge, because this requires an explicit

19027

// jmp when the condition is false.

19028

if (Cmp == Cond.getOperand(1).getOperand(1) &&

19029

isX86LogicalCmp(Cmp) &&

19030

Op.getNode()->hasOneUse()) {

19031

X86::CondCode CCode =

19032

(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);

19033

CCode = X86::GetOppositeBranchCondition(CCode);

19034

CC = DAG.getConstant(CCode, dl, MVT::i8);

19035

SDNode *User = *Op.getNode()->use_begin();

19036

// Look for an unconditional branch following this conditional branch.

19037

// We need this because we need to reverse the successors in order

19038

// to implement FCMP_OEQ.

19039

if (User->getOpcode() == ISD::BR) {

19040

SDValue FalseBB = User->getOperand(1);

19041

SDNode *NewBR =

19042

DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);

19043

assert(NewBR == User)(static_cast <bool> (NewBR == User) ? void (0) : __assert_fail
("NewBR == User", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19043, __extension__ __PRETTY_FUNCTION__));

19044

(void)NewBR;

19045

Dest = FalseBB;

19046

19047

Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),

19048

Chain, Dest, CC, Cmp);

19049

X86::CondCode CCode =

19050

(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);

19051

CCode = X86::GetOppositeBranchCondition(CCode);

19052

CC = DAG.getConstant(CCode, dl, MVT::i8);

19053

Cond = Cmp;

19054

addTest = false;

19055

}

19056

}

19057

}

19058

} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {

19059

// Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.

19060

// It should be transformed during dag combiner except when the condition

19061

// is set by an arithmetic-with-overflow node.

19062

X86::CondCode CCode =

19063

(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);

19064

CCode = X86::GetOppositeBranchCondition(CCode);

19065

CC = DAG.getConstant(CCode, dl, MVT::i8);

19066

Cond = Cond.getOperand(0).getOperand(1);

19067

addTest = false;

19068

} else if (Cond.getOpcode() == ISD::SETCC &&

19069

cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {

19070

// For FCMP_OEQ, we can emit

19071

// two branches instead of an explicit AND instruction with a

19072

// separate test. However, we only do this if this block doesn't

19073

// have a fall-through edge, because this requires an explicit

19074

// jmp when the condition is false.

19075

if (Op.getNode()->hasOneUse()) {

19076

SDNode *User = *Op.getNode()->use_begin();

19077

// Look for an unconditional branch following this conditional branch.

19078

// We need this because we need to reverse the successors in order

19079

// to implement FCMP_OEQ.

19080

if (User->getOpcode() == ISD::BR) {

19081

SDValue FalseBB = User->getOperand(1);

19082

SDNode *NewBR =

19083

DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);

19084

19085

(void)NewBR;

19086

Dest = FalseBB;

19087

19088

SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,

19089

Cond.getOperand(0), Cond.getOperand(1));

19090

Cmp = ConvertCmpIfNecessary(Cmp, DAG);

19091

CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);

19092

Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),

19093

Chain, Dest, CC, Cmp);

19094

CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);

19095

Cond = Cmp;

19096

addTest = false;

19097

}

19098

}

19099

} else if (Cond.getOpcode() == ISD::SETCC &&

19100

cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {

19101

// For FCMP_UNE, we can emit

19102

// two branches instead of an explicit AND instruction with a

19103

// separate test. However, we only do this if this block doesn't

19104

// have a fall-through edge, because this requires an explicit

19105

// jmp when the condition is false.

19106

if (Op.getNode()->hasOneUse()) {

19107

SDNode *User = *Op.getNode()->use_begin();

19108

// Look for an unconditional branch following this conditional branch.

19109

// We need this because we need to reverse the successors in order

19110

// to implement FCMP_UNE.

19111

if (User->getOpcode() == ISD::BR) {

19112

SDValue FalseBB = User->getOperand(1);

19113

SDNode *NewBR =

19114

DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);

19115

19116

(void)NewBR;

19117

19118

SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,

19119

Cond.getOperand(0), Cond.getOperand(1));

19120

Cmp = ConvertCmpIfNecessary(Cmp, DAG);

19121

CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);

19122

Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),

19123

Chain, Dest, CC, Cmp);

19124

CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);

19125

Cond = Cmp;

19126

addTest = false;

19127

Dest = FalseBB;

19128

}

19129

}

19130

}

19131

}

19132

19133

if (addTest) {

19134

// Look past the truncate if the high bits are known zero.

19135

if (isTruncWithZeroHighBitsInput(Cond, DAG))

19136

Cond = Cond.getOperand(0);

19137

19138

// We know the result of AND is compared against zero. Try to match

19139

// it to BT.

19140

if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {

19141

if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {

19142

CC = NewSetCC.getOperand(0);

19143

Cond = NewSetCC.getOperand(1);

19144

addTest = false;

19145

}

19146

}

19147

}

19148

19149

if (addTest) {

19150

X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;

19151

CC = DAG.getConstant(X86Cond, dl, MVT::i8);

19152

Cond = EmitTest(Cond, X86Cond, dl, DAG);

19153

}

19154

Cond = ConvertCmpIfNecessary(Cond, DAG);

19155

return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),

19156

Chain, Dest, CC, Cond);

19157

}

19158

19159

// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.

19160

// Calls to _alloca are needed to probe the stack when allocating more than 4k

19161

// bytes in one go. Touching the stack at 4K increments is necessary to ensure

19162

// that the guard pages used by the OS virtual memory manager are allocated in

19163

// correct sequence.

19164

SDValue

19165

X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

19166

SelectionDAG &DAG) const {

19167

MachineFunction &MF = DAG.getMachineFunction();

19168

bool SplitStack = MF.shouldSplitStack();

19169

bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();

19170

bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||

19171

SplitStack || EmitStackProbe;

19172

SDLoc dl(Op);

19173

19174

// Get the inputs.

19175

SDNode *Node = Op.getNode();

19176

SDValue Chain = Op.getOperand(0);

19177

SDValue Size = Op.getOperand(1);

19178

unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();

19179

EVT VT = Node->getValueType(0);

19180

19181

// Chain the dynamic stack allocation so that it doesn't modify the stack

19182

// pointer when other instructions are using the stack.

19183

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

19184

19185

bool Is64Bit = Subtarget.is64Bit();

19186

MVT SPTy = getPointerTy(DAG.getDataLayout());

19187

19188

SDValue Result;

19189

if (!Lower) {

19190

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

19191

unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();

19192

assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19193, __extension__ __PRETTY_FUNCTION__))

19193

" not tell us which reg is the stack pointer!")(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19193, __extension__ __PRETTY_FUNCTION__));

19194

19195

SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);

19196

Chain = SP.getValue(1);

19197

const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

19198

unsigned StackAlign = TFI.getStackAlignment();

19199

Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value

19200

if (Align > StackAlign)

19201

Result = DAG.getNode(ISD::AND, dl, VT, Result,

19202

DAG.getConstant(-(uint64_t)Align, dl, VT));

19203

Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain

19204

} else if (SplitStack) {

19205

MachineRegisterInfo &MRI = MF.getRegInfo();

19206

19207

if (Is64Bit) {

19208

// The 64 bit implementation of segmented stacks needs to clobber both r10

19209

// r11. This makes it impossible to use it along with nested parameters.

19210

const Function *F = MF.getFunction();

19211

for (const auto &A : F->args()) {

19212

if (A.hasNestAttr())

19213

report_fatal_error("Cannot use segmented stacks with functions that "

19214

"have nested arguments.");

19215

}

19216

}

19217

19218

const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);

19219

unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);

19220

Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);

19221

Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,

19222

DAG.getRegister(Vreg, SPTy));

19223

} else {

19224

SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

19225

Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);

19226

MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

19227

19228

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

19229

unsigned SPReg = RegInfo->getStackRegister();

19230

SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);

19231

Chain = SP.getValue(1);

19232

19233

if (Align) {

19234

SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),

19235

DAG.getConstant(-(uint64_t)Align, dl, VT));

19236

Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);

19237

}

19238

19239

Result = SP;

19240

}

19241

19242

Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),

19243

DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

19244

19245

SDValue Ops[2] = {Result, Chain};

19246

return DAG.getMergeValues(Ops, dl);

19247

}

19248

19249

SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {

19250

MachineFunction &MF = DAG.getMachineFunction();

19251

auto PtrVT = getPointerTy(MF.getDataLayout());

19252

X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

19253

19254

const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

19255

SDLoc DL(Op);

19256

19257

if (!Subtarget.is64Bit() ||

19258

Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {

19259

// vastart just stores the address of the VarArgsFrameIndex slot into the

19260

// memory location argument.

19261

SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

19262

return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),

19263

MachinePointerInfo(SV));

19264

}

19265

19266

// __va_list_tag:

19267

// gp_offset (0 - 6 * 8)

19268

// fp_offset (48 - 48 + 8 * 16)

19269

// overflow_arg_area (point to parameters coming in memory).

19270

// reg_save_area

19271

SmallVector<SDValue, 8> MemOps;

19272

SDValue FIN = Op.getOperand(1);

19273

// Store gp_offset

19274

SDValue Store = DAG.getStore(

19275

Op.getOperand(0), DL,

19276

DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,

19277

MachinePointerInfo(SV));

19278

MemOps.push_back(Store);

19279

19280

// Store fp_offset

19281

FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);

19282

Store = DAG.getStore(

19283

Op.getOperand(0), DL,

19284

DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,

19285

MachinePointerInfo(SV, 4));

19286

MemOps.push_back(Store);

19287

19288

// Store ptr to overflow_arg_area

19289

FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));

19290

SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

19291

Store =

19292

DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));

19293

MemOps.push_back(Store);

19294

19295

// Store ptr to reg_save_area.

19296

FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(

19297

Subtarget.isTarget64BitLP64() ? 8 : 4, DL));

19298

SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);

19299

Store = DAG.getStore(

19300

Op.getOperand(0), DL, RSFIN, FIN,

19301

MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));

19302

MemOps.push_back(Store);

19303

return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);

19304

}

19305

19306

SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {

19307

assert(Subtarget.is64Bit() &&(static_cast <bool> (Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"LowerVAARG only handles 64-bit va_arg!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19308, __extension__ __PRETTY_FUNCTION__))

19308

"LowerVAARG only handles 64-bit va_arg!")(static_cast <bool> (Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"LowerVAARG only handles 64-bit va_arg!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19308, __extension__ __PRETTY_FUNCTION__));

19309

assert(Op.getNumOperands() == 4)(static_cast <bool> (Op.getNumOperands() == 4) ? void (
0) : __assert_fail ("Op.getNumOperands() == 4", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19309, __extension__ __PRETTY_FUNCTION__));

19310

19311

MachineFunction &MF = DAG.getMachineFunction();

19312

if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))

19313

// The Win64 ABI uses char* instead of a structure.

19314

return DAG.expandVAArg(Op.getNode());

19315

19316

SDValue Chain = Op.getOperand(0);

19317

SDValue SrcPtr = Op.getOperand(1);

19318

const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

19319

unsigned Align = Op.getConstantOperandVal(3);

19320

SDLoc dl(Op);

19321

19322

EVT ArgVT = Op.getNode()->getValueType(0);

19323

Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

19324

uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

19325

uint8_t ArgMode;

19326

19327

// Decide which area this value should be read from.

19328

// TODO: Implement the AMD64 ABI in its entirety. This simple

19329

// selection mechanism works only for the basic types.

19330

if (ArgVT == MVT::f80) {

19331

llvm_unreachable("va_arg for f80 not yet implemented")::llvm::llvm_unreachable_internal("va_arg for f80 not yet implemented"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19331);

19332

} else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {

19333

ArgMode = 2; // Argument passed in XMM register. Use fp_offset.

19334

} else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {

19335

ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.

19336

} else {

19337

llvm_unreachable("Unhandled argument type in LowerVAARG")::llvm::llvm_unreachable_internal("Unhandled argument type in LowerVAARG"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19337);

19338

}

19339

19340

if (ArgMode == 2) {

19341

// Sanity Check: Make sure using fp_offset makes sense.

19342

assert(!Subtarget.useSoftFloat() &&(static_cast <bool> (!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat
)) && Subtarget.hasSSE1()) ? void (0) : __assert_fail
("!Subtarget.useSoftFloat() && !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19344, __extension__ __PRETTY_FUNCTION__))

19343

!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&(static_cast <bool> (!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat
)) && Subtarget.hasSSE1()) ? void (0) : __assert_fail
("!Subtarget.useSoftFloat() && !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19344, __extension__ __PRETTY_FUNCTION__))

19344

Subtarget.hasSSE1())(static_cast <bool> (!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat
)) && Subtarget.hasSSE1()) ? void (0) : __assert_fail
("!Subtarget.useSoftFloat() && !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19344, __extension__ __PRETTY_FUNCTION__));

19345

}

19346

19347

// Insert VAARG_64 node into the DAG

19348

// VAARG_64 returns two values: Variable Argument Address, Chain

19349

SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),

19350

DAG.getConstant(ArgMode, dl, MVT::i8),

19351

DAG.getConstant(Align, dl, MVT::i32)};

19352

SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);

19353

SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,

19354

VTs, InstOps, MVT::i64,

19355

MachinePointerInfo(SV),

19356

/*Align=*/0,

19357

/*Volatile=*/false,

19358

/*ReadMem=*/true,

19359

/*WriteMem=*/true);

19360

Chain = VAARG.getValue(1);

19361

19362

// Load the next argument and return it

19363

return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());

19364

}

19365

19366

static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,

19367

SelectionDAG &DAG) {

19368

// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,

19369

// where a va_list is still an i8*.

19370

assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19370, __extension__ __PRETTY_FUNCTION__));

19371

if (Subtarget.isCallingConvWin64(

19372

DAG.getMachineFunction().getFunction()->getCallingConv()))

19373

// Probably a Win64 va_copy.

19374

return DAG.expandVACopy(Op.getNode());

19375

19376

SDValue Chain = Op.getOperand(0);

19377

SDValue DstPtr = Op.getOperand(1);

19378

SDValue SrcPtr = Op.getOperand(2);

19379

const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();

19380

const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

19381

SDLoc DL(Op);

19382

19383

return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,

19384

DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,

19385

false, false,

19386

MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));

19387

}

19388

19389

/// Handle vector element shifts where the shift amount is a constant.

19390

/// Takes immediate version of shift as input.

19391

static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,

19392

SDValue SrcOp, uint64_t ShiftAmt,

19393

SelectionDAG &DAG) {

19394

MVT ElementType = VT.getVectorElementType();

19395

19396

// Bitcast the source vector to the output type, this is mainly necessary for

19397

// vXi8/vXi64 shifts.

19398

if (VT != SrcOp.getSimpleValueType())

19399

SrcOp = DAG.getBitcast(VT, SrcOp);

19400

19401

// Fold this packed shift into its first operand if ShiftAmt is 0.

19402

if (ShiftAmt == 0)

19403

return SrcOp;

19404

19405

// Check for ShiftAmt >= element width

19406

if (ShiftAmt >= ElementType.getSizeInBits()) {

19407

if (Opc == X86ISD::VSRAI)

19408

ShiftAmt = ElementType.getSizeInBits() - 1;

19409

else

19410

return DAG.getConstant(0, dl, VT);

19411

}

19412

19413

assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)(static_cast <bool> ((Opc == X86ISD::VSHLI || Opc == X86ISD
::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"
) ? void (0) : __assert_fail ("(Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && \"Unknown target vector shift-by-constant node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19414, __extension__ __PRETTY_FUNCTION__))

19414

&& "Unknown target vector shift-by-constant node")(static_cast <bool> ((Opc == X86ISD::VSHLI || Opc == X86ISD
::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"
) ? void (0) : __assert_fail ("(Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && \"Unknown target vector shift-by-constant node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19414, __extension__ __PRETTY_FUNCTION__));

19415

19416

// Fold this packed vector shift into a build vector if SrcOp is a

19417

// vector of Constants or UNDEFs.

19418

if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {

19419

SmallVector<SDValue, 8> Elts;

19420

unsigned NumElts = SrcOp->getNumOperands();

19421

ConstantSDNode *ND;

19422

19423

switch(Opc) {

19424

default: llvm_unreachable("Unknown opcode!")::llvm::llvm_unreachable_internal("Unknown opcode!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19424);

19425

case X86ISD::VSHLI:

19426

for (unsigned i=0; i!=NumElts; ++i) {

19427

SDValue CurrentOp = SrcOp->getOperand(i);

19428

if (CurrentOp->isUndef()) {

19429

Elts.push_back(CurrentOp);

19430

continue;

19431

}

19432

ND = cast<ConstantSDNode>(CurrentOp);

19433

const APInt &C = ND->getAPIntValue();

19434

Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));

19435

}

19436

break;

19437

case X86ISD::VSRLI:

19438

for (unsigned i=0; i!=NumElts; ++i) {

19439

SDValue CurrentOp = SrcOp->getOperand(i);

19440

if (CurrentOp->isUndef()) {

19441

Elts.push_back(CurrentOp);

19442

continue;

19443

}

19444

ND = cast<ConstantSDNode>(CurrentOp);

19445

const APInt &C = ND->getAPIntValue();

19446

Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));

19447

}

19448

break;

19449

case X86ISD::VSRAI:

19450

for (unsigned i=0; i!=NumElts; ++i) {

19451

SDValue CurrentOp = SrcOp->getOperand(i);

19452

if (CurrentOp->isUndef()) {

19453

Elts.push_back(CurrentOp);

19454

continue;

19455

}

19456

ND = cast<ConstantSDNode>(CurrentOp);

19457

const APInt &C = ND->getAPIntValue();

19458

Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));

19459

}

19460

break;

19461

}

19462

19463

return DAG.getBuildVector(VT, dl, Elts);

19464

}

19465

19466

return DAG.getNode(Opc, dl, VT, SrcOp,

19467

DAG.getConstant(ShiftAmt, dl, MVT::i8));

19468

}

19469

19470

/// Handle vector element shifts where the shift amount may or may not be a

19471

/// constant. Takes immediate version of shift as input.

19472

static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,

19473

SDValue SrcOp, SDValue ShAmt,

19474

const X86Subtarget &Subtarget,

19475

SelectionDAG &DAG) {

19476

MVT SVT = ShAmt.getSimpleValueType();

19477

assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!")(static_cast <bool> ((SVT == MVT::i32 || SVT == MVT::i64
) && "Unexpected value type!") ? void (0) : __assert_fail
("(SVT == MVT::i32 || SVT == MVT::i64) && \"Unexpected value type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19477, __extension__ __PRETTY_FUNCTION__));

19478

19479

// Catch shift-by-constant.

19480

if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))

19481

return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,

19482

CShAmt->getZExtValue(), DAG);

19483

19484

// Change opcode to non-immediate version

19485

switch (Opc) {

19486

default: llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19486);

19487

case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;

19488

case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;

19489

case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;

19490

}

19491

19492

// Need to build a vector containing shift amount.

19493

// SSE/AVX packed shifts only use the lower 64-bit of the shift count.

19494

// +=================+============+=======================================+

19495

// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |

19496

// +=================+============+=======================================+

19497

// | i64 | Yes, No | Use ShAmt as lowest elt |

19498

// | i32 | Yes | zero-extend in-reg |

19499

// | (i32 zext(i16)) | Yes | zero-extend in-reg |

19500

// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |

19501

// +=================+============+=======================================+

19502

19503

if (SVT == MVT::i64)

19504

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);

19505

else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&

19506

ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {

19507

ShAmt = ShAmt.getOperand(0);

19508

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);

19509

ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);

19510

} else if (Subtarget.hasSSE41() &&

19511

ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

19512

ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);

19513

ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);

19514

} else {

19515

SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),

19516

DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};

19517

ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);

19518

}

19519

19520

// The return type has to be a 128-bit type with the same element

19521

// type as the input type.

19522

MVT EltVT = VT.getVectorElementType();

19523

MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

19524

19525

ShAmt = DAG.getBitcast(ShVT, ShAmt);

19526

return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);

19527

}

19528

19529

/// \brief Return Mask with the necessary casting or extending

19530

/// for \p Mask according to \p MaskVT when lowering masking intrinsics

19531

static SDValue getMaskNode(SDValue Mask, MVT MaskVT,

19532

const X86Subtarget &Subtarget, SelectionDAG &DAG,

19533

const SDLoc &dl) {

19534

19535

if (isAllOnesConstant(Mask))

19536

return DAG.getConstant(1, dl, MaskVT);

19537

if (X86::isZeroNode(Mask))

19538

return DAG.getConstant(0, dl, MaskVT);

19539

19540

if (MaskVT.bitsGT(Mask.getSimpleValueType())) {

19541

// Mask should be extended

19542

Mask = DAG.getNode(ISD::ANY_EXTEND, dl,

19543

MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);

19544

}

19545

19546

if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {

19547

if (MaskVT == MVT::v64i1) {

19548

19549

// In case 32bit mode, bitcast i64 is illegal, extend/split it.

19550

SDValue Lo, Hi;

19551

Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,

19552

DAG.getConstant(0, dl, MVT::i32));

19553

Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,

19554

DAG.getConstant(1, dl, MVT::i32));

19555

19556

Lo = DAG.getBitcast(MVT::v32i1, Lo);

19557

Hi = DAG.getBitcast(MVT::v32i1, Hi);

19558

19559

return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);

19560

} else {

19561

// MaskVT require < 64bit. Truncate mask (should succeed in any case),

19562

// and bitcast.

19563

MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());

19564

return DAG.getBitcast(MaskVT,

19565

DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));

19566

}

19567

19568

} else {

19569

MVT BitcastVT = MVT::getVectorVT(MVT::i1,

19570

Mask.getSimpleValueType().getSizeInBits());

19571

// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements

19572

// are extracted by EXTRACT_SUBVECTOR.

19573

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,

19574

DAG.getBitcast(BitcastVT, Mask),

19575

DAG.getIntPtrConstant(0, dl));

19576

}

19577

}

19578

19579

/// \brief Return (and \p Op, \p Mask) for compare instructions or

19580

/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the

19581

/// necessary casting or extending for \p Mask when lowering masking intrinsics

19582

static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,

19583

SDValue PreservedSrc,

19584

const X86Subtarget &Subtarget,

19585

SelectionDAG &DAG) {

19586

MVT VT = Op.getSimpleValueType();

19587

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

19588

unsigned OpcodeSelect = ISD::VSELECT;

19589

SDLoc dl(Op);

19590

19591

if (isAllOnesConstant(Mask))

19592

return Op;

19593

19594

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

19595

19596

switch (Op.getOpcode()) {

19597

default: break;

19598

case X86ISD::CMPM:

19599

case X86ISD::CMPM_RND:

19600

case X86ISD::CMPMU:

19601

return DAG.getNode(ISD::AND, dl, VT, Op, VMask);

19602

case X86ISD::VFPCLASS:

19603

return DAG.getNode(ISD::OR, dl, VT, Op, VMask);

19604

case X86ISD::VTRUNC:

19605

case X86ISD::VTRUNCS:

19606

case X86ISD::VTRUNCUS:

19607

case X86ISD::CVTPS2PH:

19608

// We can't use ISD::VSELECT here because it is not always "Legal"

19609

// for the destination type. For example vpmovqb require only AVX512

19610

// and vselect that can operate on byte element type require BWI

19611

OpcodeSelect = X86ISD::SELECT;

19612

break;

19613

}

19614

if (PreservedSrc.isUndef())

19615

PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);

19616

return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);

19617

}

19618

19619

/// \brief Creates an SDNode for a predicated scalar operation.

19620

/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).

19621

/// The mask is coming as MVT::i8 and it should be transformed

19622

/// to MVT::v1i1 while lowering masking intrinsics.

19623

/// The main difference between ScalarMaskingNode and VectorMaskingNode is using

19624

/// "X86select" instead of "vselect". We just can't create the "vselect" node

19625

/// for a scalar instruction.

19626

static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,

19627

SDValue PreservedSrc,

19628

const X86Subtarget &Subtarget,

19629

SelectionDAG &DAG) {

19630

19631

if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))

19632

if (MaskConst->getZExtValue() & 0x1)

19633

return Op;

19634

19635

MVT VT = Op.getSimpleValueType();

19636

SDLoc dl(Op);

19637

19638

SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);

19639

if (Op.getOpcode() == X86ISD::FSETCCM ||

19640

Op.getOpcode() == X86ISD::FSETCCM_RND)

19641

return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

19642

if (Op.getOpcode() == X86ISD::VFPCLASSS)

19643

return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

19644

19645

if (PreservedSrc.isUndef())

19646

PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);

19647

return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);

19648

}

19649

19650

static int getSEHRegistrationNodeSize(const Function *Fn) {

19651

if (!Fn->hasPersonalityFn())

19652

report_fatal_error(

19653

"querying registration node size for function without personality");

19654

// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See

19655

// WinEHStatePass for the full struct definition.

19656

switch (classifyEHPersonality(Fn->getPersonalityFn())) {

19657

case EHPersonality::MSVC_X86SEH: return 24;

19658

case EHPersonality::MSVC_CXX: return 16;

19659

default: break;

19660

}

19661

report_fatal_error(

19662

"can only recover FP for 32-bit MSVC EH personality functions");

19663

}

19664

19665

/// When the MSVC runtime transfers control to us, either to an outlined

19666

/// function or when returning to a parent frame after catching an exception, we

19667

/// recover the parent frame pointer by doing arithmetic on the incoming EBP.

19668

/// Here's the math:

19669

/// RegNodeBase = EntryEBP - RegNodeSize

19670

/// ParentFP = RegNodeBase - ParentFrameOffset

19671

/// Subtracting RegNodeSize takes us to the offset of the registration node, and

19672

/// subtracting the offset (negative on x86) takes us back to the parent FP.

19673

static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,

19674

SDValue EntryEBP) {

19675

MachineFunction &MF = DAG.getMachineFunction();

19676

SDLoc dl;

19677

19678

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

19679

MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

19680

19681

// It's possible that the parent function no longer has a personality function

19682

// if the exceptional code was optimized away, in which case we just return

19683

// the incoming EBP.

19684

if (!Fn->hasPersonalityFn())

19685

return EntryEBP;

19686

19687

// Get an MCSymbol that will ultimately resolve to the frame offset of the EH

19688

// registration, or the .set_setframe offset.

19689

MCSymbol *OffsetSym =

19690

MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(

19691

GlobalValue::dropLLVMManglingEscape(Fn->getName()));

19692

SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);

19693

SDValue ParentFrameOffset =

19694

DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

19695

19696

// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after

19697

// prologue to RBP in the parent function.

19698

const X86Subtarget &Subtarget =

19699

static_cast<const X86Subtarget &>(DAG.getSubtarget());

19700

if (Subtarget.is64Bit())

19701

return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

19702

19703

int RegNodeSize = getSEHRegistrationNodeSize(Fn);

19704

// RegNodeBase = EntryEBP - RegNodeSize

19705

// ParentFP = RegNodeBase - ParentFrameOffset

19706

SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,

19707

DAG.getConstant(RegNodeSize, dl, PtrVT));

19708

return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);

19709

}

19710

19711

SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,

19712

SelectionDAG &DAG) const {

19713

// Helper to detect if the operand is CUR_DIRECTION rounding mode.

19714

auto isRoundModeCurDirection = [](SDValue Rnd) {

19715

if (!isa<ConstantSDNode>(Rnd))

19716

return false;

19717

19718

unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();

19719

return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;

19720

};

19721

19722

SDLoc dl(Op);

19723

unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

19724

MVT VT = Op.getSimpleValueType();

19725

const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);

19726

if (IntrData) {

19727

switch(IntrData->Type) {

19728

case INTR_TYPE_1OP:

19729

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));

19730

case INTR_TYPE_2OP:

19731

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),

19732

Op.getOperand(2));

19733

case INTR_TYPE_3OP:

19734

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),

19735

Op.getOperand(2), Op.getOperand(3));

19736

case INTR_TYPE_4OP:

19737

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),

19738

Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));

19739

case INTR_TYPE_1OP_MASK_RM: {

19740

SDValue Src = Op.getOperand(1);

19741

SDValue PassThru = Op.getOperand(2);

19742

SDValue Mask = Op.getOperand(3);

19743

SDValue RoundingMode;

19744

// We always add rounding mode to the Node.

19745

// If the rounding mode is not specified, we add the

19746

// "current direction" mode.

19747

if (Op.getNumOperands() == 4)

19748

RoundingMode =

19749

DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);

19750

else

19751

RoundingMode = Op.getOperand(4);

19752

assert(IntrData->Opc1 == 0 && "Unexpected second opcode!")(static_cast <bool> (IntrData->Opc1 == 0 && "Unexpected second opcode!"
) ? void (0) : __assert_fail ("IntrData->Opc1 == 0 && \"Unexpected second opcode!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19752, __extension__ __PRETTY_FUNCTION__));

19753

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,

19754

RoundingMode),

19755

Mask, PassThru, Subtarget, DAG);

19756

}

19757

case INTR_TYPE_1OP_MASK: {

19758

SDValue Src = Op.getOperand(1);

19759

SDValue PassThru = Op.getOperand(2);

19760

SDValue Mask = Op.getOperand(3);

19761

// We add rounding mode to the Node when

19762

// - RM Opcode is specified and

19763

// - RM is not "current direction".

19764

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

19765

if (IntrWithRoundingModeOpcode != 0) {

19766

SDValue Rnd = Op.getOperand(4);

19767

if (!isRoundModeCurDirection(Rnd)) {

19768

return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19769

dl, Op.getValueType(),

19770

Src, Rnd),

19771

Mask, PassThru, Subtarget, DAG);

19772

}

19773

}

19774

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),

19775

Mask, PassThru, Subtarget, DAG);

19776

}

19777

case INTR_TYPE_SCALAR_MASK: {

19778

SDValue Src1 = Op.getOperand(1);

19779

SDValue Src2 = Op.getOperand(2);

19780

SDValue passThru = Op.getOperand(3);

19781

SDValue Mask = Op.getOperand(4);

19782

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

19783

// There are 2 kinds of intrinsics in this group:

19784

// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands

19785

// (2) With rounding mode and sae - 7 operands.

19786

bool HasRounding = IntrWithRoundingModeOpcode != 0;

19787

if (Op.getNumOperands() == (5U + HasRounding)) {

19788

if (HasRounding) {

19789

SDValue Rnd = Op.getOperand(5);

19790

if (!isRoundModeCurDirection(Rnd))

19791

return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19792

dl, VT, Src1, Src2, Rnd),

19793

Mask, passThru, Subtarget, DAG);

19794

}

19795

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,

19796

Src2),

19797

Mask, passThru, Subtarget, DAG);

19798

}

19799

19800

assert(Op.getNumOperands() == (6U + HasRounding) &&(static_cast <bool> (Op.getNumOperands() == (6U + HasRounding
) && "Unexpected intrinsic form") ? void (0) : __assert_fail
("Op.getNumOperands() == (6U + HasRounding) && \"Unexpected intrinsic form\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19801, __extension__ __PRETTY_FUNCTION__))

19801

"Unexpected intrinsic form")(static_cast <bool> (Op.getNumOperands() == (6U + HasRounding
) && "Unexpected intrinsic form") ? void (0) : __assert_fail
("Op.getNumOperands() == (6U + HasRounding) && \"Unexpected intrinsic form\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19801, __extension__ __PRETTY_FUNCTION__));

19802

SDValue RoundingMode = Op.getOperand(5);

19803

if (HasRounding) {

19804

SDValue Sae = Op.getOperand(6);

19805

if (!isRoundModeCurDirection(Sae))

19806

return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19807

dl, VT, Src1, Src2,

19808

RoundingMode, Sae),

19809

Mask, passThru, Subtarget, DAG);

19810

}

19811

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,

19812

Src2, RoundingMode),

19813

Mask, passThru, Subtarget, DAG);

19814

}

19815

case INTR_TYPE_SCALAR_MASK_RM: {

19816

SDValue Src1 = Op.getOperand(1);

19817

SDValue Src2 = Op.getOperand(2);

19818

SDValue Src0 = Op.getOperand(3);

19819

SDValue Mask = Op.getOperand(4);

19820

// There are 2 kinds of intrinsics in this group:

19821

// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands

19822

// (2) With rounding mode and sae - 7 operands.

19823

if (Op.getNumOperands() == 6) {

19824

SDValue Sae = Op.getOperand(5);

19825

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,

19826

Sae),

19827

Mask, Src0, Subtarget, DAG);

19828

}

19829

assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form")(static_cast <bool> (Op.getNumOperands() == 7 &&
"Unexpected intrinsic form") ? void (0) : __assert_fail ("Op.getNumOperands() == 7 && \"Unexpected intrinsic form\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19829, __extension__ __PRETTY_FUNCTION__));

19830

SDValue RoundingMode = Op.getOperand(5);

19831

SDValue Sae = Op.getOperand(6);

19832

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,

19833

RoundingMode, Sae),

19834

Mask, Src0, Subtarget, DAG);

19835

}

19836

case INTR_TYPE_2OP_MASK:

19837

case INTR_TYPE_2OP_IMM8_MASK: {

19838

SDValue Src1 = Op.getOperand(1);

19839

SDValue Src2 = Op.getOperand(2);

19840

SDValue PassThru = Op.getOperand(3);

19841

SDValue Mask = Op.getOperand(4);

19842

19843

if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)

19844

Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

19845

19846

// We specify 2 possible opcodes for intrinsics with rounding modes.

19847

// First, we check if the intrinsic may have non-default rounding mode,

19848

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

19849

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

19850

if (IntrWithRoundingModeOpcode != 0) {

19851

SDValue Rnd = Op.getOperand(5);

19852

if (!isRoundModeCurDirection(Rnd)) {

19853

return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19854

dl, Op.getValueType(),

19855

Src1, Src2, Rnd),

19856

Mask, PassThru, Subtarget, DAG);

19857

}

19858

}

19859

// TODO: Intrinsics should have fast-math-flags to propagate.

19860

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),

19861

Mask, PassThru, Subtarget, DAG);

19862

}

19863

case INTR_TYPE_2OP_MASK_RM: {

19864

SDValue Src1 = Op.getOperand(1);

19865

SDValue Src2 = Op.getOperand(2);

19866

SDValue PassThru = Op.getOperand(3);

19867

SDValue Mask = Op.getOperand(4);

19868

// We specify 2 possible modes for intrinsics, with/without rounding

19869

// modes.

19870

// First, we check if the intrinsic have rounding mode (6 operands),

19871

// if not, we set rounding mode to "current".

19872

SDValue Rnd;

19873

if (Op.getNumOperands() == 6)

19874

Rnd = Op.getOperand(5);

19875

else

19876

Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);

19877

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

19878

Src1, Src2, Rnd),

19879

Mask, PassThru, Subtarget, DAG);

19880

}

19881

case INTR_TYPE_3OP_SCALAR_MASK: {

19882

SDValue Src1 = Op.getOperand(1);

19883

SDValue Src2 = Op.getOperand(2);

19884

SDValue Src3 = Op.getOperand(3);

19885

SDValue PassThru = Op.getOperand(4);

19886

SDValue Mask = Op.getOperand(5);

19887

19888

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

19889

if (IntrWithRoundingModeOpcode != 0) {

19890

SDValue Rnd = Op.getOperand(6);

19891

if (!isRoundModeCurDirection(Rnd))

19892

return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19893

dl, VT, Src1, Src2, Src3, Rnd),

19894

Mask, PassThru, Subtarget, DAG);

19895

}

19896

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,

19897

Src2, Src3),

19898

Mask, PassThru, Subtarget, DAG);

19899

}

19900

case INTR_TYPE_3OP_MASK_RM: {

19901

SDValue Src1 = Op.getOperand(1);

19902

SDValue Src2 = Op.getOperand(2);

19903

SDValue Imm = Op.getOperand(3);

19904

SDValue PassThru = Op.getOperand(4);

19905

SDValue Mask = Op.getOperand(5);

19906

// We specify 2 possible modes for intrinsics, with/without rounding

19907

// modes.

19908

// First, we check if the intrinsic have rounding mode (7 operands),

19909

// if not, we set rounding mode to "current".

19910

SDValue Rnd;

19911

if (Op.getNumOperands() == 7)

19912

Rnd = Op.getOperand(6);

19913

else

19914

Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);

19915

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

19916

Src1, Src2, Imm, Rnd),

19917

Mask, PassThru, Subtarget, DAG);

19918

}

19919

case INTR_TYPE_3OP_IMM8_MASK:

19920

case INTR_TYPE_3OP_MASK: {

19921

SDValue Src1 = Op.getOperand(1);

19922

SDValue Src2 = Op.getOperand(2);

19923

SDValue Src3 = Op.getOperand(3);

19924

SDValue PassThru = Op.getOperand(4);

19925

SDValue Mask = Op.getOperand(5);

19926

19927

if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)

19928

Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

19929

19930

// We specify 2 possible opcodes for intrinsics with rounding modes.

19931

// First, we check if the intrinsic may have non-default rounding mode,

19932

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

19933

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

19934

if (IntrWithRoundingModeOpcode != 0) {

19935

SDValue Rnd = Op.getOperand(6);

19936

if (!isRoundModeCurDirection(Rnd)) {

19937

return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

19938

dl, Op.getValueType(),

19939

Src1, Src2, Src3, Rnd),

19940

Mask, PassThru, Subtarget, DAG);

19941

}

19942

}

19943

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

19944

Src1, Src2, Src3),

19945

Mask, PassThru, Subtarget, DAG);

19946

}

19947

case VPERM_2OP_MASK : {

19948

SDValue Src1 = Op.getOperand(1);

19949

SDValue Src2 = Op.getOperand(2);

19950

SDValue PassThru = Op.getOperand(3);

19951

SDValue Mask = Op.getOperand(4);

19952

19953

// Swap Src1 and Src2 in the node creation

19954

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),

19955

Mask, PassThru, Subtarget, DAG);

19956

}

19957

case VPERM_3OP_MASKZ:

19958

case VPERM_3OP_MASK:{

19959

MVT VT = Op.getSimpleValueType();

19960

// Src2 is the PassThru

19961

SDValue Src1 = Op.getOperand(1);

19962

// PassThru needs to be the same type as the destination in order

19963

// to pattern match correctly.

19964

SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));

19965

SDValue Src3 = Op.getOperand(3);

19966

SDValue Mask = Op.getOperand(4);

19967

SDValue PassThru = SDValue();

19968

19969

// set PassThru element

19970

if (IntrData->Type == VPERM_3OP_MASKZ)

19971

PassThru = getZeroVector(VT, Subtarget, DAG, dl);

19972

else

19973

PassThru = Src2;

19974

19975

// Swap Src1 and Src2 in the node creation

19976

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,

19977

dl, Op.getValueType(),

19978

Src2, Src1, Src3),

19979

Mask, PassThru, Subtarget, DAG);

19980

}

19981

case FMA_OP_MASK3:

19982

case FMA_OP_MASKZ:

19983

case FMA_OP_MASK: {

19984

SDValue Src1 = Op.getOperand(1);

19985

SDValue Src2 = Op.getOperand(2);

19986

SDValue Src3 = Op.getOperand(3);

19987

SDValue Mask = Op.getOperand(4);

19988

MVT VT = Op.getSimpleValueType();

19989

SDValue PassThru = SDValue();

19990

19991

// set PassThru element

19992

if (IntrData->Type == FMA_OP_MASKZ)

19993

PassThru = getZeroVector(VT, Subtarget, DAG, dl);

19994

else if (IntrData->Type == FMA_OP_MASK3)

19995

PassThru = Src3;

19996

else

19997

PassThru = Src1;

19998

19999

// We specify 2 possible opcodes for intrinsics with rounding modes.

20000

// First, we check if the intrinsic may have non-default rounding mode,

20001

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

20002

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

20003

if (IntrWithRoundingModeOpcode != 0) {

20004

SDValue Rnd = Op.getOperand(5);

20005

if (!isRoundModeCurDirection(Rnd))

20006

return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

20007

dl, Op.getValueType(),

20008

Src1, Src2, Src3, Rnd),

20009

Mask, PassThru, Subtarget, DAG);

20010

}

20011

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,

20012

dl, Op.getValueType(),

20013

Src1, Src2, Src3),

20014

Mask, PassThru, Subtarget, DAG);

20015

}

20016

case FMA_OP_SCALAR_MASK:

20017

case FMA_OP_SCALAR_MASK3:

20018

case FMA_OP_SCALAR_MASKZ: {

20019

SDValue Src1 = Op.getOperand(1);

20020

SDValue Src2 = Op.getOperand(2);

20021

SDValue Src3 = Op.getOperand(3);

20022

SDValue Mask = Op.getOperand(4);

20023

MVT VT = Op.getSimpleValueType();

20024

SDValue PassThru = SDValue();

20025

20026

// set PassThru element

20027

if (IntrData->Type == FMA_OP_SCALAR_MASKZ)

20028

PassThru = getZeroVector(VT, Subtarget, DAG, dl);

20029

else if (IntrData->Type == FMA_OP_SCALAR_MASK3)

20030

PassThru = Src3;

20031

else

20032

PassThru = Src1;

20033

20034

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

20035

if (IntrWithRoundingModeOpcode != 0) {

20036

SDValue Rnd = Op.getOperand(5);

20037

if (!isRoundModeCurDirection(Rnd))

20038

return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,

20039

Op.getValueType(), Src1, Src2,

20040

Src3, Rnd),

20041

Mask, PassThru, Subtarget, DAG);

20042

}

20043

20044

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,

20045

Op.getValueType(), Src1, Src2,

20046

Src3),

20047

Mask, PassThru, Subtarget, DAG);

20048

}

20049

case IFMA_OP_MASKZ:

20050

case IFMA_OP_MASK: {

20051

SDValue Src1 = Op.getOperand(1);

20052

SDValue Src2 = Op.getOperand(2);

20053

SDValue Src3 = Op.getOperand(3);

20054

SDValue Mask = Op.getOperand(4);

20055

MVT VT = Op.getSimpleValueType();

20056

SDValue PassThru = Src1;

20057

20058

// set PassThru element

20059

if (IntrData->Type == IFMA_OP_MASKZ)

20060

PassThru = getZeroVector(VT, Subtarget, DAG, dl);

20061

20062

// Note we need to swizzle the operands to pass the multiply operands

20063

// first.

20064

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,

20065

dl, Op.getValueType(),

20066

Src2, Src3, Src1),

20067

Mask, PassThru, Subtarget, DAG);

20068

}

20069

case TERLOG_OP_MASK:

20070

case TERLOG_OP_MASKZ: {

20071

SDValue Src1 = Op.getOperand(1);

20072

SDValue Src2 = Op.getOperand(2);

20073

SDValue Src3 = Op.getOperand(3);

20074

SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));

20075

SDValue Mask = Op.getOperand(5);

20076

MVT VT = Op.getSimpleValueType();

20077

SDValue PassThru = Src1;

20078

// Set PassThru element.

20079

if (IntrData->Type == TERLOG_OP_MASKZ)

20080

PassThru = getZeroVector(VT, Subtarget, DAG, dl);

20081

20082

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

20083

Src1, Src2, Src3, Src4),

20084

Mask, PassThru, Subtarget, DAG);

20085

}

20086

case CVTPD2PS:

20087

// ISD::FP_ROUND has a second argument that indicates if the truncation

20088

// does not change the value. Set it to 0 since it can change.

20089

return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),

20090

DAG.getIntPtrConstant(0, dl));

20091

case CVTPD2PS_MASK: {

20092

SDValue Src = Op.getOperand(1);

20093

SDValue PassThru = Op.getOperand(2);

20094

SDValue Mask = Op.getOperand(3);

20095

// We add rounding mode to the Node when

20096

// - RM Opcode is specified and

20097

// - RM is not "current direction".

20098

unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;

20099

if (IntrWithRoundingModeOpcode != 0) {

20100

SDValue Rnd = Op.getOperand(4);

20101

if (!isRoundModeCurDirection(Rnd)) {

20102

return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,

20103

dl, Op.getValueType(),

20104

Src, Rnd),

20105

Mask, PassThru, Subtarget, DAG);

20106

}

20107

}

20108

assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!")(static_cast <bool> (IntrData->Opc0 == ISD::FP_ROUND
&& "Unexpected opcode!") ? void (0) : __assert_fail (
"IntrData->Opc0 == ISD::FP_ROUND && \"Unexpected opcode!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20108, __extension__ __PRETTY_FUNCTION__));

20109

// ISD::FP_ROUND has a second argument that indicates if the truncation

20110

// does not change the value. Set it to 0 since it can change.

20111

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,

20112

DAG.getIntPtrConstant(0, dl)),

20113

Mask, PassThru, Subtarget, DAG);

20114

}

20115

case FPCLASS: {

20116

// FPclass intrinsics with mask

20117

SDValue Src1 = Op.getOperand(1);

20118

MVT VT = Src1.getSimpleValueType();

20119

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

20120

SDValue Imm = Op.getOperand(2);

20121

SDValue Mask = Op.getOperand(3);

20122

MVT BitcastVT = MVT::getVectorVT(MVT::i1,

20123

Mask.getSimpleValueType().getSizeInBits());

20124

SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);

20125

SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,

20126

DAG.getConstant(0, dl, MaskVT),

20127

Subtarget, DAG);

20128

SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,

20129

DAG.getUNDEF(BitcastVT), FPclassMask,

20130

DAG.getIntPtrConstant(0, dl));

20131

return DAG.getBitcast(Op.getValueType(), Res);

20132

}

20133

case FPCLASSS: {

20134

SDValue Src1 = Op.getOperand(1);

20135

SDValue Imm = Op.getOperand(2);

20136

SDValue Mask = Op.getOperand(3);

20137

SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);

20138

SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,

20139

DAG.getConstant(0, dl, MVT::i1), Subtarget, DAG);

20140

return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,

20141

DAG.getIntPtrConstant(0, dl));

20142

}

20143

case CMP_MASK:

20144

case CMP_MASK_CC: {

20145

// Comparison intrinsics with masks.

20146

// Example of transformation:

20147

// (i8 (int_x86_avx512_mask_pcmpeq_q_128

20148

// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->

20149

// (i8 (bitcast

20150

// (v8i1 (insert_subvector undef,

20151

// (v2i1 (and (PCMPEQM %a, %b),

20152

// (extract_subvector

20153

// (v8i1 (bitcast %mask)), 0))), 0))))

20154

MVT VT = Op.getOperand(1).getSimpleValueType();

20155

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

20156

SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);

20157

MVT BitcastVT = MVT::getVectorVT(MVT::i1,

20158

Mask.getSimpleValueType().getSizeInBits());

20159

SDValue Cmp;

20160

if (IntrData->Type == CMP_MASK_CC) {

20161

SDValue CC = Op.getOperand(3);

20162

CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);

20163

// We specify 2 possible opcodes for intrinsics with rounding modes.

20164

// First, we check if the intrinsic may have non-default rounding mode,

20165

// (IntrData->Opc1 != 0), then we check the rounding mode operand.

20166

if (IntrData->Opc1 != 0) {

20167

SDValue Rnd = Op.getOperand(5);

20168

if (!isRoundModeCurDirection(Rnd))

20169

Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),

20170

Op.getOperand(2), CC, Rnd);

20171

}

20172

//default rounding mode

20173

if(!Cmp.getNode())

20174

Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),

20175

Op.getOperand(2), CC);

20176

20177

} else {

20178

assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!")(static_cast <bool> (IntrData->Type == CMP_MASK &&
"Unexpected intrinsic type!") ? void (0) : __assert_fail ("IntrData->Type == CMP_MASK && \"Unexpected intrinsic type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20178, __extension__ __PRETTY_FUNCTION__));

20179

Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),

20180

Op.getOperand(2));

20181

}

20182

SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,

20183

DAG.getConstant(0, dl, MaskVT),

20184

Subtarget, DAG);

20185

SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,

20186

DAG.getUNDEF(BitcastVT), CmpMask,

20187

DAG.getIntPtrConstant(0, dl));

20188

return DAG.getBitcast(Op.getValueType(), Res);

20189

}

20190

case CMP_MASK_SCALAR_CC: {

20191

SDValue Src1 = Op.getOperand(1);

20192

SDValue Src2 = Op.getOperand(2);

20193

SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));

20194

SDValue Mask = Op.getOperand(4);

20195

20196

SDValue Cmp;

20197

if (IntrData->Opc1 != 0) {

20198

SDValue Rnd = Op.getOperand(5);

20199

if (!isRoundModeCurDirection(Rnd))

20200

Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);

20201

}

20202

//default rounding mode

20203

if(!Cmp.getNode())

20204

Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

20205

20206

SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,

20207

DAG.getConstant(0, dl, MVT::i1),

20208

Subtarget, DAG);

20209

return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,

20210

DAG.getIntPtrConstant(0, dl));

20211

}

20212

case COMI: { // Comparison intrinsics

20213

ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;

20214

SDValue LHS = Op.getOperand(1);

20215

SDValue RHS = Op.getOperand(2);

20216

SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);

20217

SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);

20218

SDValue SetCC;

20219

switch (CC) {

20220

case ISD::SETEQ: { // (ZF = 0 and PF = 0)

20221

SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);

20222

SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);

20223

SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);

20224

break;

20225

}

20226

case ISD::SETNE: { // (ZF = 1 or PF = 1)

20227

SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);

20228

SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);

20229

SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);

20230

break;

20231

}

20232

case ISD::SETGT: // (CF = 0 and ZF = 0)

20233

SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);

20234

break;

20235

case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.

20236

SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);

20237

break;

20238

}

20239

case ISD::SETGE: // CF = 0

20240

SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);

20241

break;

20242

case ISD::SETLE: // The condition is opposite to GE. Swap the operands.

20243

SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);

20244

break;

20245

default:

20246

llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20246);

20247

}

20248

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

20249

}

20250

case COMI_RM: { // Comparison intrinsics with Sae

20251

SDValue LHS = Op.getOperand(1);

20252

SDValue RHS = Op.getOperand(2);

20253

unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();

20254

SDValue Sae = Op.getOperand(4);

20255

20256

SDValue FCmp;

20257

if (isRoundModeCurDirection(Sae))

20258

FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,

20259

DAG.getConstant(CondVal, dl, MVT::i8));

20260

else

20261

FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,

20262

DAG.getConstant(CondVal, dl, MVT::i8), Sae);

20263

return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,

20264

DAG.getIntPtrConstant(0, dl));

20265

}

20266

case VSHIFT:

20267

return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),

20268

Op.getOperand(1), Op.getOperand(2), Subtarget,

20269

DAG);

20270

case COMPRESS_EXPAND_IN_REG: {

20271

SDValue Mask = Op.getOperand(3);

20272

SDValue DataToCompress = Op.getOperand(1);

20273

SDValue PassThru = Op.getOperand(2);

20274

if (isAllOnesConstant(Mask)) // return data as is

20275

return Op.getOperand(1);

20276

20277

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

20278

DataToCompress),

20279

Mask, PassThru, Subtarget, DAG);

20280

}

20281

case BROADCASTM: {

20282

SDValue Mask = Op.getOperand(1);

20283

MVT MaskVT = MVT::getVectorVT(MVT::i1,

20284

Mask.getSimpleValueType().getSizeInBits());

20285

Mask = DAG.getBitcast(MaskVT, Mask);

20286

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);

20287

}

20288

case KUNPCK: {

20289

MVT VT = Op.getSimpleValueType();

20290

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

20291

20292

SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);

20293

SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);

20294

// Arguments should be swapped.

20295

SDValue Res = DAG.getNode(IntrData->Opc0, dl,

20296

MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),

20297

Src2, Src1);

20298

return DAG.getBitcast(VT, Res);

20299

}

20300

case MASK_BINOP: {

20301

MVT VT = Op.getSimpleValueType();

20302

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

20303

20304

SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);

20305

SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);

20306

SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);

20307

return DAG.getBitcast(VT, Res);

20308

}

20309

case FIXUPIMMS:

20310

case FIXUPIMMS_MASKZ:

20311

case FIXUPIMM:

20312

case FIXUPIMM_MASKZ:{

20313

SDValue Src1 = Op.getOperand(1);

20314

SDValue Src2 = Op.getOperand(2);

20315

SDValue Src3 = Op.getOperand(3);

20316

SDValue Imm = Op.getOperand(4);

20317

SDValue Mask = Op.getOperand(5);

20318

SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?

20319

Src1 : getZeroVector(VT, Subtarget, DAG, dl);

20320

// We specify 2 possible modes for intrinsics, with/without rounding

20321

// modes.

20322

// First, we check if the intrinsic have rounding mode (7 operands),

20323

// if not, we set rounding mode to "current".

20324

SDValue Rnd;

20325

if (Op.getNumOperands() == 7)

20326

Rnd = Op.getOperand(6);

20327

else

20328

Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);

20329

if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)

20330

return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

20331

Src1, Src2, Src3, Imm, Rnd),

20332

Mask, Passthru, Subtarget, DAG);

20333

else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ

20334

return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,

20335

Src1, Src2, Src3, Imm, Rnd),

20336

Mask, Passthru, Subtarget, DAG);

20337

}

20338

case CONVERT_TO_MASK: {

20339

MVT SrcVT = Op.getOperand(1).getSimpleValueType();

20340

MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());

20341

MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

20342

20343

SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,

20344

Op.getOperand(1));

20345

SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,

20346

DAG.getUNDEF(BitcastVT), CvtMask,

20347

DAG.getIntPtrConstant(0, dl));

20348

return DAG.getBitcast(Op.getValueType(), Res);

20349

}

20350

case ROUNDP: {

20351

assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode")(static_cast <bool> (IntrData->Opc0 == X86ISD::VRNDSCALE
&& "Unexpected opcode") ? void (0) : __assert_fail (
"IntrData->Opc0 == X86ISD::VRNDSCALE && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20351, __extension__ __PRETTY_FUNCTION__));

20352

// Clear the upper bits of the rounding immediate so that the legacy

20353

// intrinsic can't trigger the scaling behavior of VRNDSCALE.

20354

SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,

20355

Op.getOperand(2),

20356

DAG.getConstant(0xf, dl, MVT::i32));

20357

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

20358

Op.getOperand(1), RoundingMode);

20359

}

20360

case ROUNDS: {

20361

assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode")(static_cast <bool> (IntrData->Opc0 == X86ISD::VRNDSCALES
&& "Unexpected opcode") ? void (0) : __assert_fail (
"IntrData->Opc0 == X86ISD::VRNDSCALES && \"Unexpected opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20361, __extension__ __PRETTY_FUNCTION__));

20362

// Clear the upper bits of the rounding immediate so that the legacy

20363

// intrinsic can't trigger the scaling behavior of VRNDSCALE.

20364

SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,

20365

Op.getOperand(3),

20366

DAG.getConstant(0xf, dl, MVT::i32));

20367

return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),

20368

Op.getOperand(1), Op.getOperand(2), RoundingMode);

20369

}

20370

default:

20371

break;

20372

}

20373

}

20374

20375

switch (IntNo) {

20376

default: return SDValue(); // Don't custom lower most intrinsics.

20377

20378

case Intrinsic::x86_avx2_permd:

20379

case Intrinsic::x86_avx2_permps:

20380

// Operands intentionally swapped. Mask is last operand to intrinsic,

20381

// but second operand for node/instruction.

20382

return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),

20383

Op.getOperand(2), Op.getOperand(1));

20384

20385

// ptest and testp intrinsics. The intrinsic these come from are designed to

20386

// return an integer value, not just an instruction so lower it to the ptest

20387

// or testp pattern and a setcc for the result.

20388

case Intrinsic::x86_sse41_ptestz:

20389

case Intrinsic::x86_sse41_ptestc:

20390

case Intrinsic::x86_sse41_ptestnzc:

20391

case Intrinsic::x86_avx_ptestz_256:

20392

case Intrinsic::x86_avx_ptestc_256:

20393

case Intrinsic::x86_avx_ptestnzc_256:

20394

case Intrinsic::x86_avx_vtestz_ps:

20395

case Intrinsic::x86_avx_vtestc_ps:

20396

case Intrinsic::x86_avx_vtestnzc_ps:

20397

case Intrinsic::x86_avx_vtestz_pd:

20398

case Intrinsic::x86_avx_vtestc_pd:

20399

case Intrinsic::x86_avx_vtestnzc_pd:

20400

case Intrinsic::x86_avx_vtestz_ps_256:

20401

case Intrinsic::x86_avx_vtestc_ps_256:

20402

case Intrinsic::x86_avx_vtestnzc_ps_256:

20403

case Intrinsic::x86_avx_vtestz_pd_256:

20404

case Intrinsic::x86_avx_vtestc_pd_256:

20405

case Intrinsic::x86_avx_vtestnzc_pd_256: {

20406

bool IsTestPacked = false;

20407

X86::CondCode X86CC;

20408

switch (IntNo) {

20409

default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20409);

20410

case Intrinsic::x86_avx_vtestz_ps:

20411

case Intrinsic::x86_avx_vtestz_pd:

20412

case Intrinsic::x86_avx_vtestz_ps_256:

20413

case Intrinsic::x86_avx_vtestz_pd_256:

20414

IsTestPacked = true;

20415

LLVM_FALLTHROUGH[[clang::fallthrough]];

20416

case Intrinsic::x86_sse41_ptestz:

20417

case Intrinsic::x86_avx_ptestz_256:

20418

// ZF = 1

20419

X86CC = X86::COND_E;

20420

break;

20421

case Intrinsic::x86_avx_vtestc_ps:

20422

case Intrinsic::x86_avx_vtestc_pd:

20423

case Intrinsic::x86_avx_vtestc_ps_256:

20424

case Intrinsic::x86_avx_vtestc_pd_256:

20425

IsTestPacked = true;

20426

LLVM_FALLTHROUGH[[clang::fallthrough]];

20427

case Intrinsic::x86_sse41_ptestc:

20428

case Intrinsic::x86_avx_ptestc_256:

20429

// CF = 1

20430

X86CC = X86::COND_B;

20431

break;

20432

case Intrinsic::x86_avx_vtestnzc_ps:

20433

case Intrinsic::x86_avx_vtestnzc_pd:

20434

case Intrinsic::x86_avx_vtestnzc_ps_256:

20435

case Intrinsic::x86_avx_vtestnzc_pd_256:

20436

IsTestPacked = true;

20437

LLVM_FALLTHROUGH[[clang::fallthrough]];

20438

case Intrinsic::x86_sse41_ptestnzc:

20439

case Intrinsic::x86_avx_ptestnzc_256:

20440

// ZF and CF = 0

20441

X86CC = X86::COND_A;

20442

break;

20443

}

20444

20445

SDValue LHS = Op.getOperand(1);

20446

SDValue RHS = Op.getOperand(2);

20447

unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;

20448

SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);

20449

SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);

20450

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

20451

}

20452

case Intrinsic::x86_avx512_kortestz_w:

20453

case Intrinsic::x86_avx512_kortestc_w: {

20454

X86::CondCode X86CC =

20455

(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;

20456

SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));

20457

SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));

20458

SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);

20459

SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);

20460

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

20461

}

20462

20463

case Intrinsic::x86_avx512_knot_w: {

20464

SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));

20465

SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);

20466

SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);

20467

return DAG.getBitcast(MVT::i16, Res);

20468

}

20469

20470

case Intrinsic::x86_avx512_kandn_w: {

20471

SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));

20472

// Invert LHS for the not.

20473

LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,

20474

DAG.getConstant(1, dl, MVT::v16i1));

20475

SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));

20476

SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);

20477

return DAG.getBitcast(MVT::i16, Res);

20478

}

20479

20480

case Intrinsic::x86_avx512_kxnor_w: {

20481

SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));

20482

SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));

20483

SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);

20484

// Invert result for the not.

20485

Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,

20486

DAG.getConstant(1, dl, MVT::v16i1));

20487

return DAG.getBitcast(MVT::i16, Res);

20488

}

20489

20490

case Intrinsic::x86_sse42_pcmpistria128:

20491

case Intrinsic::x86_sse42_pcmpestria128:

20492

case Intrinsic::x86_sse42_pcmpistric128:

20493

case Intrinsic::x86_sse42_pcmpestric128:

20494

case Intrinsic::x86_sse42_pcmpistrio128:

20495

case Intrinsic::x86_sse42_pcmpestrio128:

20496

case Intrinsic::x86_sse42_pcmpistris128:

20497

case Intrinsic::x86_sse42_pcmpestris128:

20498

case Intrinsic::x86_sse42_pcmpistriz128:

20499

case Intrinsic::x86_sse42_pcmpestriz128: {

20500

unsigned Opcode;

20501

X86::CondCode X86CC;

20502

switch (IntNo) {

20503

default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20503); // Can't reach here.

20504

case Intrinsic::x86_sse42_pcmpistria128:

20505

Opcode = X86ISD::PCMPISTRI;

20506

X86CC = X86::COND_A;

20507

break;

20508

case Intrinsic::x86_sse42_pcmpestria128:

20509

Opcode = X86ISD::PCMPESTRI;

20510

X86CC = X86::COND_A;

20511

break;

20512

case Intrinsic::x86_sse42_pcmpistric128:

20513

Opcode = X86ISD::PCMPISTRI;

20514

X86CC = X86::COND_B;

20515

break;

20516

case Intrinsic::x86_sse42_pcmpestric128:

20517

Opcode = X86ISD::PCMPESTRI;

20518

X86CC = X86::COND_B;

20519

break;

20520

case Intrinsic::x86_sse42_pcmpistrio128:

20521

Opcode = X86ISD::PCMPISTRI;

20522

X86CC = X86::COND_O;

20523

break;

20524

case Intrinsic::x86_sse42_pcmpestrio128:

20525

Opcode = X86ISD::PCMPESTRI;

20526

X86CC = X86::COND_O;

20527

break;

20528

case Intrinsic::x86_sse42_pcmpistris128:

20529

Opcode = X86ISD::PCMPISTRI;

20530

X86CC = X86::COND_S;

20531

break;

20532

case Intrinsic::x86_sse42_pcmpestris128:

20533

Opcode = X86ISD::PCMPESTRI;

20534

X86CC = X86::COND_S;

20535

break;

20536

case Intrinsic::x86_sse42_pcmpistriz128:

20537

Opcode = X86ISD::PCMPISTRI;

20538

X86CC = X86::COND_E;

20539

break;

20540

case Intrinsic::x86_sse42_pcmpestriz128:

20541

Opcode = X86ISD::PCMPESTRI;

20542

X86CC = X86::COND_E;

20543

break;

20544

}

20545

SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());

20546

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

20547

SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);

20548

SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);

20549

return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);

20550

}

20551

20552

case Intrinsic::x86_sse42_pcmpistri128:

20553

case Intrinsic::x86_sse42_pcmpestri128: {

20554

unsigned Opcode;

20555

if (IntNo == Intrinsic::x86_sse42_pcmpistri128)

20556

Opcode = X86ISD::PCMPISTRI;

20557

else

20558

Opcode = X86ISD::PCMPESTRI;

20559

20560

SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());

20561

SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);

20562

return DAG.getNode(Opcode, dl, VTs, NewOps);

20563

}

20564

20565

case Intrinsic::eh_sjlj_lsda: {

20566

MachineFunction &MF = DAG.getMachineFunction();

20567

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

20568

MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

20569

auto &Context = MF.getMMI().getContext();

20570

MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +

20571

Twine(MF.getFunctionNumber()));

20572

return DAG.getNode(getGlobalWrapperKind(), dl, VT,

20573

DAG.getMCSymbol(S, PtrVT));

20574

}

20575

20576

case Intrinsic::x86_seh_lsda: {

20577

// Compute the symbol for the LSDA. We know it'll get emitted later.

20578

MachineFunction &MF = DAG.getMachineFunction();

20579

SDValue Op1 = Op.getOperand(1);

20580

auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());

20581

MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(

20582

GlobalValue::dropLLVMManglingEscape(Fn->getName()));

20583

20584

// Generate a simple absolute symbol reference. This intrinsic is only

20585

// supported on 32-bit Windows, which isn't PIC.

20586

SDValue Result = DAG.getMCSymbol(LSDASym, VT);

20587

return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);

20588

}

20589

20590

case Intrinsic::x86_seh_recoverfp: {

20591

SDValue FnOp = Op.getOperand(1);

20592

SDValue IncomingFPOp = Op.getOperand(2);

20593

GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);

20594

auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);

20595

if (!Fn)

20596

report_fatal_error(

20597

"llvm.x86.seh.recoverfp must take a function as the first argument");

20598

return recoverFramePointer(DAG, Fn, IncomingFPOp);

20599

}

20600

20601

case Intrinsic::localaddress: {

20602

// Returns one of the stack, base, or frame pointer registers, depending on

20603

// which is used to reference local variables.

20604

MachineFunction &MF = DAG.getMachineFunction();

20605

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

20606

unsigned Reg;

20607

if (RegInfo->hasBasePointer(MF))

20608

Reg = RegInfo->getBaseRegister();

20609

else // This function handles the SP or FP case.

20610

Reg = RegInfo->getPtrSizedFrameRegister(MF);

20611

return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);

20612

}

20613

}

20614

}

20615

20616

/// Build the machine node for an AVX2 gather intrinsic.
///
/// \p ScaleOp must be a ConstantSDNode; otherwise an empty SDValue is
/// returned and no node is created. The machine node \p Opc is created with
/// the operand order {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}
/// and the result types {Op's value type, Mask's value type, MVT::Other}.
///
/// \returns a merge of result 0 and result 2 (the MVT::Other result) of the
/// new machine node.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);

  // Scale must be constant.
  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
  SDVTList ResultTys =
      DAG.getVTList(Op.getValueType(), Mask.getValueType(), MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);

  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  const bool SrcUnused =
      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
  if (SrcUnused)
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

  SDValue GatherOps[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Gather = DAG.getMachineNode(Opc, dl, ResultTys, GatherOps);

  SDValue Merged[] = {SDValue(Gather, 0), SDValue(Gather, 2)};
  return DAG.getMergeValues(Merged, dl);
}

20640

20641

/// Build the machine node for a masked gather intrinsic.
///
/// \p ScaleOp must be a ConstantSDNode; otherwise an empty SDValue is
/// returned. \p Mask is converted with getMaskNode to a vXi1 type that has
/// one element per element of \p Index. The machine node \p Opc is created
/// with the operand order
/// {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}.
///
/// \returns a merge of result 0 and result 2 (the MVT::Other result) of the
/// new machine node.
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);

  // Scale must be constant.
  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);

  // One i1 mask lane per index lane.
  const unsigned NumIndexElts =
      Index.getSimpleValueType().getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);

  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  const bool SrcUnused =
      Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode());
  if (SrcUnused)
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

  SDValue GatherOps[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Gather = DAG.getMachineNode(Opc, dl, ResultTys, GatherOps);

  SDValue Merged[] = {SDValue(Gather, 0), SDValue(Gather, 2)};
  return DAG.getMergeValues(Merged, dl);
}

20668

20669

/// Build the machine node for a masked scatter intrinsic.
///
/// \p ScaleOp must be a ConstantSDNode; otherwise an empty SDValue is
/// returned. \p Mask is converted with getMaskNode to a vXi1 type that has
/// one element per element of \p Index. The machine node \p Opc is created
/// with the operand order
/// {Base, Scale, Index, Disp, Segment, VMask, Src, Chain} and the result
/// types {MaskVT, MVT::Other}.
///
/// \returns result 1 (the MVT::Other result) of the new machine node.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);

  // Scale must be constant.
  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);

  // One i1 mask lane per index lane.
  const unsigned NumIndexElts =
      Index.getSimpleValueType().getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  SDVTList ResultTys = DAG.getVTList(MaskVT, MVT::Other);
  SDValue ScatterOps[] = {Base,    Scale, Index, Disp,
                          Segment, VMask, Src,   Chain};
  SDNode *Scatter = DAG.getMachineNode(Opc, dl, ResultTys, ScatterOps);
  return SDValue(Scatter, 1);
}

20690

20691

/// Build the machine node for a masked gather/scatter prefetch intrinsic.
///
/// \p ScaleOp must be a ConstantSDNode; otherwise an empty SDValue is
/// returned. \p Mask is converted with getMaskNode to a vXi1 type that has
/// one element per element of \p Index. The machine node \p Opc is created
/// with the operand order {VMask, Base, Scale, Index, Disp, Segment, Chain}
/// and a single MVT::Other result.
///
/// \returns result 0 of the new machine node.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);

  // Scale must be constant.
  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();

  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);

  // One i1 mask lane per index lane.
  const unsigned NumIndexElts =
      Index.getSimpleValueType().getVectorNumElements();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  SDValue PrefetchOps[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Prefetch = DAG.getMachineNode(Opc, dl, MVT::Other, PrefetchOps);
  return SDValue(Prefetch, 0);
}

20710

20711

/// Handles the lowering of builtin intrinsic that return the value

20712

/// of the extended control register.

20713

static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,

20714

SelectionDAG &DAG,

20715

const X86Subtarget &Subtarget,

20716

SmallVectorImpl<SDValue> &Results) {

20717

assert(N->getNumOperands() == 3 && "Unexpected number of operands!")(static_cast <bool> (N->getNumOperands() == 3 &&
"Unexpected number of operands!") ? void (0) : __assert_fail
("N->getNumOperands() == 3 && \"Unexpected number of operands!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20717, __extension__ __PRETTY_FUNCTION__));

20718

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

20719

SDValue LO, HI;

20720

20721

// The ECX register is used to select the index of the XCR register to

20722

// return.

20723

SDValue Chain =

20724

DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));

20725

SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);

20726

Chain = SDValue(N1, 0);

20727

20728

// Reads the content of XCR and returns it in registers EDX:EAX.

20729

if (Subtarget.is64Bit()) {

20730

LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));

20731

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,

20732

LO.getValue(2));

20733

} else {

20734

LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));

20735

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,

20736

LO.getValue(2));

20737

}

20738

Chain = HI.getValue(1);

20739

20740

if (Subtarget.is64Bit()) {

20741

// Merge the two 32-bit values into a 64-bit one..

20742

SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,

20743

DAG.getConstant(32, DL, MVT::i8));

20744

Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));

20745

Results.push_back(Chain);

20746

return;

20747

}

20748

20749

// Use a buildpair to merge the two 32-bit values into a 64-bit one.

20750

SDValue Ops[] = { LO, HI };

20751

SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);

20752

Results.push_back(Pair);

20753

Results.push_back(Chain);

20754

}

20755

20756

/// Handles the lowering of builtin intrinsics that read performance monitor

20757

/// counters (x86_rdpmc).

20758

static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,

20759

SelectionDAG &DAG,

20760

const X86Subtarget &Subtarget,

20761

SmallVectorImpl<SDValue> &Results) {

20762

20763

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

20764

SDValue LO, HI;

20765

20766

// The ECX register is used to select the index of the performance counter

20767

// to read.

20768

SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,

20769

N->getOperand(2));

20770

SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

20771

20772

// Reads the content of a 64-bit performance counter and returns it in the

20773

// registers EDX:EAX.

20774

if (Subtarget.is64Bit()) {

20775

LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));

20776

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,

20777

LO.getValue(2));

20778

} else {

20779

LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));

20780

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,

20781

LO.getValue(2));

20782

}

20783

Chain = HI.getValue(1);

20784

20785

if (Subtarget.is64Bit()) {

20786

// The EAX register is loaded with the low-order 32 bits. The EDX register

20787

// is loaded with the supported high-order bits of the counter.

20788

SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,

20789

DAG.getConstant(32, DL, MVT::i8));

20790

Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));

20791

Results.push_back(Chain);

20792

return;

20793

}

20794

20795

// Use a buildpair to merge the two 32-bit values into a 64-bit one.

20796

SDValue Ops[] = { LO, HI };

20797

SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);

20798

Results.push_back(Pair);

20799

Results.push_back(Chain);

20800

}

20801

20802

/// Handles the lowering of builtin intrinsics that read the time stamp counter

20803

/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower

20804

/// READCYCLECOUNTER nodes.

20805

static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,

20806

SelectionDAG &DAG,

20807

const X86Subtarget &Subtarget,

20808

SmallVectorImpl<SDValue> &Results) {

20809

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

20810

SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));

20811

SDValue LO, HI;

20812

20813

// The processor's time-stamp counter (a 64-bit MSR) is stored into the

20814

// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR

20815

// and the EAX register is loaded with the low-order 32 bits.

20816

if (Subtarget.is64Bit()) {

20817

LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));

20818

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,

20819

LO.getValue(2));

20820

} else {

20821

LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));

20822

HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,

20823

LO.getValue(2));

20824

}

20825

SDValue Chain = HI.getValue(1);

20826

20827

if (Opcode == X86ISD::RDTSCP_DAG) {

20828

20829

20830

// Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into

20831

// the ECX register. Add 'ecx' explicitly to the chain.

20832

SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,

20833

HI.getValue(2));

20834

// Explicitly store the content of ECX at the location passed in input

20835

// to the 'rdtscp' intrinsic.

20836

Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),

20837

MachinePointerInfo());

20838

}

20839

20840

if (Subtarget.is64Bit()) {

20841

// The EDX register is loaded with the high-order 32 bits of the MSR, and

20842

// the EAX register is loaded with the low-order 32 bits.

20843

SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,

20844

DAG.getConstant(32, DL, MVT::i8));

20845

Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));

20846

Results.push_back(Chain);

20847

return;

20848

}

20849

20850

// Use a buildpair to merge the two 32-bit values into a 64-bit one.

20851

SDValue Ops[] = { LO, HI };

20852

SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);

20853

Results.push_back(Pair);

20854

Results.push_back(Chain);

20855

}

20856

20857

/// Custom lowering for READCYCLECOUNTER: emits the X86ISD::RDTSC_DAG sequence
/// via getReadTimeStampCounter and merges the values it produced into a
/// single result.
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc DL(Op);
  SmallVector<SDValue, 2> CounterAndChain;
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          CounterAndChain);
  return DAG.getMergeValues(CounterAndChain, DL);
}

20865

20866

/// Records the frame index named by operand 2 of \p Op as the function's EH
/// registration node (WinEHFuncInfo::EHRegNodeFrameIndex).
///
/// Reports a fatal error if the function has no WinEHFuncInfo, or if
/// operand 2 is not a FrameIndexSDNode.
///
/// \returns operand 0 of \p Op (the chain); no DAG nodes are created.
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);

  WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
  if (EHInfo == nullptr)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  if (auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode)) {
    EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
    // Return the chain operand without making any DAG nodes.
    return Chain;
  }

  report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
}

20883

20884

/// Records the frame index named by operand 2 of \p Op as the function's EH
/// guard slot (WinEHFuncInfo::EHGuardFrameIndex).
///
/// Reports a fatal error if the function has no WinEHFuncInfo, or if
/// operand 2 is not a FrameIndexSDNode.
///
/// \returns operand 0 of \p Op (the chain); no DAG nodes are created.
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);

  WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
  if (EHInfo == nullptr)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  if (auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard)) {
    EHInfo->EHGuardFrameIndex = FINode->getIndex();
    // Return the chain operand without making any DAG nodes.
    return Chain;
  }

  report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
}

20901

20902

/// Emit Truncating Store with signed or unsigned saturation.

20903

static SDValue

20904

EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,

20905

SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,

20906

SelectionDAG &DAG) {

20907

20908

SDVTList VTs = DAG.getVTList(MVT::Other);

20909

SDValue Undef = DAG.getUNDEF(Ptr.getValueType());

20910

SDValue Ops[] = { Chain, Val, Ptr, Undef };

20911

return SignedSat ?

20912

DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :

20913

DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);

20914

}

20915

20916

/// Emit Masked Truncating Store with signed or unsigned saturation.

20917

static SDValue

20918

EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,

20919

SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,

20920

MachineMemOperand *MMO, SelectionDAG &DAG) {

20921

20922

SDVTList VTs = DAG.getVTList(MVT::Other);

20923

SDValue Ops[] = { Chain, Ptr, Mask, Val };

20924

return SignedSat ?

20925

DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :

20926

DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);

20927

}

20928

20929

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,

20930

SelectionDAG &DAG) {

20931

unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

20932

20933

const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);

20934

if (!IntrData) {

20935

switch (IntNo) {

20936

case llvm::Intrinsic::x86_seh_ehregnode:

20937

return MarkEHRegistrationNode(Op, DAG);

20938

case llvm::Intrinsic::x86_seh_ehguard:

20939

return MarkEHGuard(Op, DAG);

20940

case llvm::Intrinsic::x86_flags_read_u32:

20941

case llvm::Intrinsic::x86_flags_read_u64:

20942

case llvm::Intrinsic::x86_flags_write_u32:

20943

case llvm::Intrinsic::x86_flags_write_u64: {

20944

// We need a frame pointer because this will get lowered to a PUSH/POP

20945

// sequence.

20946

MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();

20947

MFI.setHasCopyImplyingStackAdjustment(true);

20948

// Don't do anything here, we will expand these intrinsics out later

20949

// during ExpandISelPseudos in EmitInstrWithCustomInserter.

20950

return SDValue();

20951

}

20952

case Intrinsic::x86_lwpins32:

20953

case Intrinsic::x86_lwpins64: {

20954

SDLoc dl(Op);

20955

SDValue Chain = Op->getOperand(0);

20956

SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);

20957

SDValue LwpIns =

20958

DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),

20959

Op->getOperand(3), Op->getOperand(4));

20960

SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);

20961

SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);

20962

return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,

20963

LwpIns.getValue(1));

20964

}

20965

}

20966

return SDValue();

20967

}

20968

20969

SDLoc dl(Op);

20970

switch(IntrData->Type) {

20971

default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20971);

20972

case RDSEED:

20973

case RDRAND: {

20974

// Emit the node with the right value type.

20975

SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);

20976

SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

20977

20978

// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.

20979

// Otherwise return the value from Rand, which is always 0, casted to i32.

20980

SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),

20981

DAG.getConstant(1, dl, Op->getValueType(1)),

20982

DAG.getConstant(X86::COND_B, dl, MVT::i8),

20983

SDValue(Result.getNode(), 1) };

20984

SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

20985

20986

// Return { result, isValid, chain }.

20987

return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,

20988

SDValue(Result.getNode(), 2));

20989

}

20990

case GATHER_AVX2: {

20991

SDValue Chain = Op.getOperand(0);

20992

SDValue Src = Op.getOperand(2);

20993

SDValue Base = Op.getOperand(3);

20994

SDValue Index = Op.getOperand(4);

20995

SDValue Mask = Op.getOperand(5);

20996

SDValue Scale = Op.getOperand(6);

20997

return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,

20998

Scale, Chain, Subtarget);

20999

}

21000

case GATHER: {

21001

//gather(v1, mask, index, base, scale);

21002

SDValue Chain = Op.getOperand(0);

21003

SDValue Src = Op.getOperand(2);

21004

SDValue Base = Op.getOperand(3);

21005

SDValue Index = Op.getOperand(4);

21006

SDValue Mask = Op.getOperand(5);

21007

SDValue Scale = Op.getOperand(6);

21008

return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,

21009

Chain, Subtarget);

21010

}

21011

case SCATTER: {

21012

//scatter(base, mask, index, v1, scale);

21013

SDValue Chain = Op.getOperand(0);

21014

SDValue Base = Op.getOperand(2);

21015

SDValue Mask = Op.getOperand(3);

21016

SDValue Index = Op.getOperand(4);

21017

SDValue Src = Op.getOperand(5);

21018

SDValue Scale = Op.getOperand(6);

21019

return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,

21020

Scale, Chain, Subtarget);

21021

}

21022

case PREFETCH: {

21023

SDValue Hint = Op.getOperand(6);

21024

unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();

21025

assert((HintVal == 2 || HintVal == 3) &&(static_cast <bool> ((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3") ? void
(0) : __assert_fail ("(HintVal == 2 || HintVal == 3) && \"Wrong prefetch hint in intrinsic: should be 2 or 3\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21026, __extension__ __PRETTY_FUNCTION__))

21026

"Wrong prefetch hint in intrinsic: should be 2 or 3")(static_cast <bool> ((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3") ? void
(0) : __assert_fail ("(HintVal == 2 || HintVal == 3) && \"Wrong prefetch hint in intrinsic: should be 2 or 3\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21026, __extension__ __PRETTY_FUNCTION__));

21027

unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);

21028

SDValue Chain = Op.getOperand(0);

21029

SDValue Mask = Op.getOperand(2);

21030

SDValue Index = Op.getOperand(3);

21031

SDValue Base = Op.getOperand(4);

21032

SDValue Scale = Op.getOperand(5);

21033

return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,

21034

Subtarget);

21035

}

21036

// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).

21037

case RDTSC: {

21038

SmallVector<SDValue, 2> Results;

21039

getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,

21040

Results);

21041

return DAG.getMergeValues(Results, dl);

21042

}

21043

// Read Performance Monitoring Counters.

21044

case RDPMC: {

21045

SmallVector<SDValue, 2> Results;

21046

getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);

21047

return DAG.getMergeValues(Results, dl);

21048

}

21049

// Get Extended Control Register.

21050

case XGETBV: {

21051

SmallVector<SDValue, 2> Results;

21052

getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);

21053

return DAG.getMergeValues(Results, dl);

21054

}

21055

// XTEST intrinsics.

21056

case XTEST: {

21057

SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);

21058

SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

21059

21060

SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);

21061

SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);

21062

return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),

21063

Ret, SDValue(InTrans.getNode(), 1));

21064

}

21065

// ADC/ADCX/SBB

21066

case ADX: {

21067

SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

21068

SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);

21069

SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),

21070

DAG.getConstant(-1, dl, MVT::i8));

21071

SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),

21072

Op.getOperand(4), GenCF.getValue(1));

21073

SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),

21074

Op.getOperand(5), MachinePointerInfo());

21075

SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);

21076

SDValue Results[] = { SetCC, Store };

21077

return DAG.getMergeValues(Results, dl);

21078

}

21079

case COMPRESS_TO_MEM: {

21080

SDValue Mask = Op.getOperand(4);

21081

SDValue DataToCompress = Op.getOperand(3);

21082

SDValue Addr = Op.getOperand(2);

21083

SDValue Chain = Op.getOperand(0);

21084

MVT VT = DataToCompress.getSimpleValueType();

21085

21086

MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);

21087

assert(MemIntr && "Expected MemIntrinsicSDNode!")(static_cast <bool> (MemIntr && "Expected MemIntrinsicSDNode!"
) ? void (0) : __assert_fail ("MemIntr && \"Expected MemIntrinsicSDNode!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21087, __extension__ __PRETTY_FUNCTION__));

21088

21089

if (isAllOnesConstant(Mask)) // return just a store

21090

return DAG.getStore(Chain, dl, DataToCompress, Addr,

21091

MemIntr->getMemOperand());

21092

21093

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

21094

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

21095

21096

return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,

21097

MemIntr->getMemOperand(),

21098

false /* truncating */, true /* compressing */);

21099

}

21100

case TRUNCATE_TO_MEM_VI8:

21101

case TRUNCATE_TO_MEM_VI16:

21102

case TRUNCATE_TO_MEM_VI32: {

21103

SDValue Mask = Op.getOperand(4);

21104

SDValue DataToTruncate = Op.getOperand(3);

21105

SDValue Addr = Op.getOperand(2);

21106

SDValue Chain = Op.getOperand(0);

21107

21108

MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);

21109

21110

21111

EVT MemVT = MemIntr->getMemoryVT();

21112

21113

uint16_t TruncationOp = IntrData->Opc0;

21114

switch (TruncationOp) {

21115

case X86ISD::VTRUNC: {

21116

if (isAllOnesConstant(Mask)) // return just a truncate store

21117

return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,

21118

MemIntr->getMemOperand());

21119

21120

MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());

21121

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

21122

21123

return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,

21124

MemIntr->getMemOperand(), true /* truncating */);

21125

}

21126

case X86ISD::VTRUNCUS:

21127

case X86ISD::VTRUNCS: {

21128

bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);

21129

if (isAllOnesConstant(Mask))

21130

return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,

21131

MemIntr->getMemOperand(), DAG);

21132

21133

MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());

21134

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

21135

21136

return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,

21137

VMask, MemVT, MemIntr->getMemOperand(), DAG);

21138

}

21139

default:

21140

llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21140);

21141

}

21142

}

21143

21144

case EXPAND_FROM_MEM: {

21145

SDValue Mask = Op.getOperand(4);

21146

SDValue PassThru = Op.getOperand(3);

21147

SDValue Addr = Op.getOperand(2);

21148

SDValue Chain = Op.getOperand(0);

21149

MVT VT = Op.getSimpleValueType();

21150

21151

MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);

21152

21153

21154

if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.

21155

return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());

21156

if (X86::isZeroNode(Mask))

21157

return DAG.getUNDEF(VT);

21158

21159

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

21160

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

21161

return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,

21162

MemIntr->getMemOperand(), ISD::NON_EXTLOAD,

21163

true /* expanding */);

21164

}

21165

}

21166

}

21167

21168

/// Builds a load of the return address for the frame selected by the constant
/// depth in operand 0. Returns an empty SDValue when
/// verifyReturnAddressArgumentIsConstant() reports a problem.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  // Record on the frame info that the return address is taken.
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc Loc(Op);
  EVT PtrTy = getPointerTy(DAG.getDataLayout());

  SDValue RetAddrPtr;
  if (Depth == 0) {
    // Depth 0: load straight from the return-address frame index.
    RetAddrPtr = getReturnAddressFrameIndex(DAG);
  } else {
    // Depth > 0: take the frame address produced by LowerFRAMEADDR and add
    // one slot size to it.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue SlotSize = DAG.getConstant(
        Subtarget.getRegisterInfo()->getSlotSize(), Loc, PtrTy);
    RetAddrPtr = DAG.getNode(ISD::ADD, Loc, PtrTy, FrameAddr, SlotSize);
  }

  return DAG.getLoad(PtrTy, Loc, DAG.getEntryNode(), RetAddrPtr,
                     MachinePointerInfo());
}

21194

21195

/// Marks the return address as taken and returns the frame index node that
/// getReturnAddressFrameIndex() provides for it.
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  FrameInfo.setReturnAddressIsTaken(true);

  return getReturnAddressFrameIndex(DAG);
}

21200

21201

/// Produces the frame address requested by Op.
///
/// With Windows CFI the result is a frame index for a fixed object that is
/// created once and cached in X86MachineFunctionInfo. Otherwise the
/// pointer-sized frame register is copied out and then dereferenced once per
/// level of the constant depth in operand 0.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  const bool UsesWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  if (UsesWinCFI) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (FrameAddrIndex == 0) {
      // No index cached yet: create the fixed frame-address object (one slot,
      // offset 0, mutable) and remember its index.
      FrameAddrIndex = MFI.CreateFixedObject(RegInfo->getSlotSize(),
                                             /*Offset=*/0,
                                             /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
  SDLoc dl(Op); // FIXME probably not meaningful
  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");

  // Start at the current frame register, then load through it Depth times.
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  for (unsigned Level = 0; Level != Depth; ++Level)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

21238

21239

// FIXME? Maybe this could be a TableGen attribute on some registers and

21240

// this table could be generated automatically from RegInfo.

21241

unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,

21242

SelectionDAG &DAG) const {

21243

const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

21244

const MachineFunction &MF = DAG.getMachineFunction();

21245

21246

unsigned Reg = StringSwitch<unsigned>(RegName)

21247

.Case("esp", X86::ESP)

21248

.Case("rsp", X86::RSP)

21249

.Case("ebp", X86::EBP)

21250

.Case("rbp", X86::RBP)

21251

.Default(0);

21252

21253

if (Reg == X86::EBP || Reg == X86::RBP) {

21254

if (!TFI.hasFP(MF))

21255

report_fatal_error("register " + StringRef(RegName) +

21256

" is allocatable: function has no frame pointer");

21257

#ifndef NDEBUG

21258

else {

21259

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

21260

unsigned FrameReg =

21261

RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());

21262

assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&(static_cast <bool> ((FrameReg == X86::EBP || FrameReg ==
X86::RBP) && "Invalid Frame Register!") ? void (0) :
__assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21263, __extension__ __PRETTY_FUNCTION__))

21263

"Invalid Frame Register!")(static_cast <bool> ((FrameReg == X86::EBP || FrameReg ==
X86::RBP) && "Invalid Frame Register!") ? void (0) :
__assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21263, __extension__ __PRETTY_FUNCTION__));

21264

}

21265

#endif

21266

}

21267

21268

if (Reg)

21269

return Reg;

21270

21271

report_fatal_error("Invalid register name global variable");

21272

}

21273

21274

/// Returns a pointer-sized constant equal to two stack slots.
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  SDLoc Loc(Op);
  return DAG.getIntPtrConstant(2 * SlotSize, Loc);
}

21279

21280

unsigned X86TargetLowering::getExceptionPointerRegister(

21281

const Constant *PersonalityFn) const {

21282

if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)

21283

return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

21284

21285

return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;

21286

}

21287

21288

unsigned X86TargetLowering::getExceptionSelectorRegister(

21289

const Constant *PersonalityFn) const {

21290

// Funclet personalities don't use selectors (the runtime does the selection).

21291

assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))(static_cast <bool> (!isFuncletEHPersonality(classifyEHPersonality
(PersonalityFn))) ? void (0) : __assert_fail ("!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21291, __extension__ __PRETTY_FUNCTION__));

21292

return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

21293

}

21294

21295

bool X86TargetLowering::needsFixedCatchObjects() const {

21296

return Subtarget.isTargetWin64();

21297

}

21298

21299

/// Lowers an EH return: stores the handler (operand 2) at
/// frame register + slot size + offset (operand 1), copies that address into
/// RCX/ECX, and emits X86ISD::EH_RETURN on the resulting chain.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);

  // The register that carries the store address matches the pointer width.
  unsigned StoreAddrReg = X86::ECX;
  if (PtrVT == MVT::i64)
    StoreAddrReg = X86::RCX;

  // StoreAddr = Frame + SlotSize + Offset.
  SDValue SlotSize = DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl);
  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, SlotSize);
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, SlotAddr, Offset);

  // Write the handler, then publish the address in StoreAddrReg.
  SDValue StoreChain =
      DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  SDValue CopyChain =
      DAG.getCopyToReg(StoreChain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, CopyChain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

21324

21325

/// Emits an X86ISD::EH_SJLJ_SETJMP node producing {i32, chain} from the first
/// two operands of Op.
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit())
    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultTys, Op.getOperand(0),
                     Op.getOperand(1));
}

21342

21343

/// Emits an X86ISD::EH_SJLJ_LONGJMP node (chain result only) from the first
/// two operands of Op.
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Buffer = Op.getOperand(1);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Chain, Buffer);
}

21349

21350

/// Emits an X86ISD::EH_SJLJ_SETUP_DISPATCH node (chain result only) on the
/// chain in operand 0.
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Chain);
}

21356

21357

/// No adjustment is made: the result is operand 0 of Op, unchanged. DAG is
/// not used.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  SDValue Trampoline = Op.getOperand(0);
  return Trampoline;
}

21360

21361

/// Emits the stores that write a trampoline's machine code into the memory at
/// operand 1. The stub loads the 'nest' value into a register and jumps to
/// the nested function. All stores hang off the incoming chain and are joined
/// by a TokenFactor.
///
/// 64-bit layout (byte offsets): 0 movabsq-r11 opcode, 2 function pointer,
/// 10 movabsq-r10 opcode, 12 nest value, 20 jmpq opcode, 22 ModRM byte.
/// 32-bit layout: 0 mov-imm32 opcode, 1 nest value, 5 jmp opcode,
/// 6 displacement (function pointer minus trampoline + 10).
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  // Address of the trampoline byte at ByteOff, computed in type PtrTy.
  auto FieldAddr = [&](MVT PtrTy, unsigned ByteOff) -> SDValue {
    return DAG.getNode(ISD::ADD, dl, PtrTy, Trmp,
                       DAG.getConstant(ByteOff, dl, PtrTy));
  };

  if (Subtarget.is64Bit()) {
    // Large code-model.
    const unsigned char JMP64r = 0xFF;  // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    // Two-byte opcode words, stored as i16 constants.
    const unsigned MovR11Op = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    const unsigned MovR10Op = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    const unsigned JmpOp = (JMP64r << 8) | REX_WB;                // jmpq *...
    const unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6);     // ...r11

    SDValue OutChains[6];

    // Load the pointer to the nested function into R11.
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MovR11Op, dl, MVT::i16), Trmp,
                     MachinePointerInfo(TrmpAddr));

    SDValue Addr = FieldAddr(MVT::i64, 2);
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    Addr = FieldAddr(MVT::i64, 10);
    OutChains[2] =
        DAG.getStore(Root, dl, DAG.getConstant(MovR10Op, dl, MVT::i16), Addr,
                     MachinePointerInfo(TrmpAddr, 10));

    Addr = FieldAddr(MVT::i64, 12);
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    Addr = FieldAddr(MVT::i64, 20);
    OutChains[4] =
        DAG.getStore(Root, dl, DAG.getConstant(JmpOp, dl, MVT::i16), Addr,
                     MachinePointerInfo(TrmpAddr, 20));

    Addr = FieldAddr(MVT::i64, 22);
    OutChains[5] =
        DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr,
                     MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }

  // 32-bit: the register for 'nest' depends on the nested function's
  // calling convention.
  const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
  unsigned NestReg;

  switch (Func->getCallingConv()) {
  default:
    llvm_unreachable("Unsupported calling convention");
  case CallingConv::C:
  case CallingConv::X86_StdCall: {
    // Pass 'nest' parameter in ECX.
    // Must be kept in sync with X86CallingConv.td
    NestReg = X86::ECX;

    // Check that ECX wasn't needed by an 'inreg' parameter.
    const AttributeList &Attrs = Func->getAttributes();
    if (Attrs.isEmpty() || Func->isVarArg())
      break;

    FunctionType *FTy = Func->getFunctionType();
    auto &Layout = DAG.getDataLayout();
    unsigned InRegCount = 0;
    // Parameter attributes are looked up at index (parameter number + 1).
    for (unsigned ParamNo = 0, NumParams = FTy->getNumParams();
         ParamNo != NumParams; ++ParamNo) {
      if (!Attrs.hasAttribute(ParamNo + 1, Attribute::InReg))
        continue;
      // FIXME: should only count parameters that are lowered to integers.
      InRegCount +=
          (Layout.getTypeSizeInBits(FTy->getParamType(ParamNo)) + 31) / 32;
    }

    if (InRegCount > 2)
      report_fatal_error("Nest register in use - reduce number of inreg"
                         " parameters!");
    break;
  }
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::Fast:
    // Pass 'nest' parameter in EAX.
    // Must be kept in sync with X86CallingConv.td
    NestReg = X86::EAX;
    break;
  }

  SDValue OutChains[4];

  // Displacement stored after the jmp opcode: FPtr - (Trmp + 10).
  SDValue Addr = FieldAddr(MVT::i32, 10);
  SDValue Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

  // This is storing the opcode for MOV32ri.
  const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
  const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
  OutChains[0] =
      DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                   Trmp, MachinePointerInfo(TrmpAddr));

  Addr = FieldAddr(MVT::i32, 1);
  OutChains[1] =
      DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                   /* Alignment = */ 1);

  const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
  Addr = FieldAddr(MVT::i32, 5);
  OutChains[2] =
      DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
                   MachinePointerInfo(TrmpAddr, 5),
                   /* Alignment = */ 1);

  Addr = FieldAddr(MVT::i32, 6);
  OutChains[3] =
      DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                   /* Alignment = */ 1);

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}

21507

21508

/// Computes the FLT_ROUNDS value from the x87 control word, which is stored
/// to a temporary stack slot and read back as an i16.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  MachineFunction &MF = DAG.getMachineFunction();
  const unsigned StackAlignment =
      Subtarget.getFrameLowering()->getStackAlignment();

  // Save FP Control Word to stack slot
  const int SSFI =
      MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);
  SDValue StoreOps[] = {DAG.getEntryNode(), StackSlot};
  SDValue Chain =
      DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other),
                              StoreOps, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // (CWD & Mask) >> ShAmt, as i16 with an i8 shift amount.
  auto MaskAndShift = [&](unsigned Mask, unsigned ShAmt) -> SDValue {
    SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
                                 DAG.getConstant(Mask, DL, MVT::i16));
    return DAG.getNode(ISD::SRL, DL, MVT::i16, Masked,
                       DAG.getConstant(ShAmt, DL, MVT::i8));
  };

  // Transform as necessary: bit 11 moves to bit 0, bit 10 moves to bit 1.
  SDValue CWD1 = MaskAndShift(0x800, 11);
  SDValue CWD2 = MaskAndShift(0x400, 9);

  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2);
  SDValue PlusOne = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
                                DAG.getConstant(1, DL, MVT::i16));
  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, PlusOne,
                               DAG.getConstant(3, DL, MVT::i16));

  // Resize the i16 result to the requested type.
  const unsigned ResizeOpc =
      VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
  return DAG.getNode(ResizeOpc, DL, VT, RetVal);
}

21575

21576

// Split an unary integer op into 2 half sized ops.

21577

static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {

21578

MVT VT = Op.getSimpleValueType();

21579

unsigned NumElems = VT.getVectorNumElements();

21580

unsigned SizeInBits = VT.getSizeInBits();

21581

21582

// Extract the Lo/Hi vectors

21583

SDLoc dl(Op);

21584

SDValue Src = Op.getOperand(0);

21585

SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);

21586

SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

21587

21588

MVT EltVT = VT.getVectorElementType();

21589

MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);

21590

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

21591

DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),

21592

DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));

21593

}

21594

21595

// Decompose 256-bit ops into smaller 128-bit ops.

21596

static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {

21597

assert(Op.getSimpleValueType().is256BitVector() &&(static_cast <bool> (Op.getSimpleValueType().is256BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21599, __extension__ __PRETTY_FUNCTION__))

21598

Op.getSimpleValueType().isInteger() &&(static_cast <bool> (Op.getSimpleValueType().is256BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21599, __extension__ __PRETTY_FUNCTION__))

21599

"Only handle AVX 256-bit vector integer operation")(static_cast <bool> (Op.getSimpleValueType().is256BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21599, __extension__ __PRETTY_FUNCTION__));

21600

return LowerVectorIntUnary(Op, DAG);

21601

}

21602

21603

// Decompose 512-bit ops into smaller 256-bit ops.

21604

static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {

21605

assert(Op.getSimpleValueType().is512BitVector() &&(static_cast <bool> (Op.getSimpleValueType().is512BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21607, __extension__ __PRETTY_FUNCTION__))

21606

Op.getSimpleValueType().isInteger() &&(static_cast <bool> (Op.getSimpleValueType().is512BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21607, __extension__ __PRETTY_FUNCTION__))

21607

"Only handle AVX 512-bit vector integer operation")(static_cast <bool> (Op.getSimpleValueType().is512BitVector
() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? void (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21607, __extension__ __PRETTY_FUNCTION__));

21608

return LowerVectorIntUnary(Op, DAG);

21609

}

21610

21611

/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.

21612

21613

// i8/i16 vector implemented using dword LZCNT vector instruction

21614

// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,

21615

// split the vector, perform operation on it's Lo a Hi part and

21616

// concatenate the results.

21617

static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {

21618

assert(Op.getOpcode() == ISD::CTLZ)(static_cast <bool> (Op.getOpcode() == ISD::CTLZ) ? void
(0) : __assert_fail ("Op.getOpcode() == ISD::CTLZ", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21618, __extension__ __PRETTY_FUNCTION__));

21619

SDLoc dl(Op);

21620

MVT VT = Op.getSimpleValueType();

21621

MVT EltVT = VT.getVectorElementType();

21622

unsigned NumElems = VT.getVectorNumElements();

21623

21624

assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&(static_cast <bool> ((EltVT == MVT::i8 || EltVT == MVT::
i16) && "Unsupported element type") ? void (0) : __assert_fail
("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21625, __extension__ __PRETTY_FUNCTION__))

21625

"Unsupported element type")(static_cast <bool> ((EltVT == MVT::i8 || EltVT == MVT::
i16) && "Unsupported element type") ? void (0) : __assert_fail
("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21625, __extension__ __PRETTY_FUNCTION__));

21626

21627

// Split vector, it's Lo and Hi parts will be handled in next iteration.

21628

if (16 < NumElems)

21629

return LowerVectorIntUnary(Op, DAG);

21630

21631

MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);

21632

assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&(static_cast <bool> ((NewVT.is256BitVector() || NewVT.is512BitVector
()) && "Unsupported value type for operation") ? void
(0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21633, __extension__ __PRETTY_FUNCTION__))

21633

"Unsupported value type for operation")(static_cast <bool> ((NewVT.is256BitVector() || NewVT.is512BitVector
()) && "Unsupported value type for operation") ? void
(0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21633, __extension__ __PRETTY_FUNCTION__));

21634

21635

// Use native supported vector instruction vplzcntd.

21636

Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));

21637

SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);

21638

SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);

21639

SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

21640

21641

return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);

21642

}

21643

21644

// Lower CTLZ using a PSHUFB lookup table implementation.

21645

static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,

21646

const X86Subtarget &Subtarget,

21647

SelectionDAG &DAG) {

21648

MVT VT = Op.getSimpleValueType();

21649

int NumElts = VT.getVectorNumElements();

21650

int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);

21651

MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

21652

21653

// Per-nibble leading zero PSHUFB lookup table.

21654

const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,

21655

/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,

21656

/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,

21657

/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

21658

21659

SmallVector<SDValue, 64> LUTVec;

21660

for (int i = 0; i < NumBytes; ++i)

21661

LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));

21662

SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

21663

21664

// Begin by bitcasting the input to byte vector, then split those bytes

21665

// into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.

21666

// If the hi input nibble is zero then we add both results together, otherwise

21667

// we just take the hi result (by masking the lo result to zero before the

21668

// add).

21669

SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));

21670

SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

21671

21672

SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);

21673

SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);

21674

SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);

21675

SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);

21676

SDValue HiZ;

21677

if (CurrVT.is512BitVector()) {

21678

MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());

21679

HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);

21680

HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);

21681

} else {

21682

HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

21683

}

21684

21685

Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);

21686

Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);

21687

Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);

21688

SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

21689

21690

// Merge result back from vXi8 back to VT, working on the lo/hi halves

21691

// of the current vector width in the same way we did for the nibbles.

21692

// If the upper half of the input element is zero then add the halves'

21693

// leading zero counts together, otherwise just use the upper half's.

21694

// Double the width of the result until we are at target width.

21695

while (CurrVT != VT) {

21696

int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();

21697

int CurrNumElts = CurrVT.getVectorNumElements();

21698

MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);

21699

MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);

21700

SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

21701

21702

// Check if the upper half of the input element is zero.

21703

if (CurrVT.is512BitVector()) {

21704

MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());

21705

HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),

21706

DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);

21707

HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);

21708

} else {

21709

HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),

21710

DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);

21711

}

21712

HiZ = DAG.getBitcast(NextVT, HiZ);

21713

21714

// Move the upper/lower halves to the lower bits as we'll be extending to

21715

// NextVT. Mask the lower result to zero if HiZ is true and add the results

21716

// together.

21717

SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);

21718

SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);

21719

SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);

21720

R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);

21721

Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);

21722

CurrVT = NextVT;

21723

}

21724

21725

return Res;

21726

}

21727

21728

/// Pick a lowering strategy for a vector CTLZ.
///
/// With CDI the dedicated lowering is used. Otherwise 256-bit vectors
/// without Int256 support and 512-bit vectors without BWI support are split
/// into narrower operations, and everything else goes through the PSHUFB
/// lookup-table lowering.
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // 256-bit ops are decomposed into 128-bit ops.
  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
  if (Split256)
    return Lower256IntUnary(Op, DAG);

  // 512-bit ops are decomposed into 256-bit ops.
  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
  if (Split512)
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

21747

21748

/// Lower a count-leading-zeros node.
///
/// Vector types are forwarded to LowerVectorCTLZ. Scalars are lowered to a
/// BSR (performed on i32 for i8 inputs) whose result is XOR'ed with
/// NumBits - 1. When the opcode is ISD::CTLZ a CMOV on COND_E first replaces
/// the BSR result with 2 * NumBits - 1.
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  const unsigned NumBits = VT.getSizeInBits();
  const bool IsPlainCTLZ = Op.getOpcode() == ISD::CTLZ;
  const bool IsByte = VT == MVT::i8;

  SDValue Src = Op.getOperand(0);
  MVT ScanVT = VT;
  if (IsByte) {
    // There is no i8 bsr, so zero extend to i32 first.
    ScanVT = MVT::i32;
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, ScanVT, Src);
  }

  // bsr scans the bits in reverse and also sets EFLAGS (second result).
  SDVTList ScanVTs = DAG.getVTList(ScanVT, MVT::i32);
  SDValue Result = DAG.getNode(X86ISD::BSR, dl, ScanVTs, Src);

  if (IsPlainCTLZ) {
    // If src is zero (i.e. bsr sets ZF), the final result must be NumBits;
    // select 2 * NumBits - 1 here so the XOR below produces it.
    SDValue CMovOps[] = {
      Result,
      DAG.getConstant(NumBits + NumBits - 1, dl, ScanVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Result.getValue(1)
    };
    Result = DAG.getNode(X86ISD::CMOV, dl, ScanVT, CMovOps);
  }

  // Finally xor with NumBits - 1.
  SDValue FlipMask = DAG.getConstant(NumBits - 1, dl, ScanVT);
  Result = DAG.getNode(ISD::XOR, dl, ScanVT, Result, FlipMask);

  if (IsByte)
    Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Result);
  return Result;
}

21789

21790

/// Lower a count-trailing-zeros node.
///
/// Vectors are expressed through the lowest set bit, lsb(x) = x & -x:
///   cttz_undef(x) = (width - 1) - ctlz(lsb)
///   cttz(x)       = ctpop(lsb - 1)
/// Scalars (only ISD::CTTZ is expected) become a BSF followed by a CMOV on
/// COND_E that selects NumBits.
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  const unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue Src = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Src);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, Src, Negated);

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, LeadingZeros);
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    SDValue BelowLSB = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    return DAG.getNode(ISD::CTPOP, dl, VT, BelowLSB);
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // bsf scans the bits forward and also sets EFLAGS (second result).
  SDVTList ScanVTs = DAG.getVTList(VT, MVT::i32);
  SDValue Scan = DAG.getNode(X86ISD::BSF, dl, ScanVTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue CMovOps[] = {
    Scan,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Scan.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
}

21832

21833

/// Break a 256-bit integer operation into two new 128-bit ones and then

21834

/// concatenate the result back.

21835

static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {

21836

MVT VT = Op.getSimpleValueType();

21837

21838

assert(VT.is256BitVector() && VT.isInteger() &&(static_cast <bool> (VT.is256BitVector() && VT.
isInteger() && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is256BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21839, __extension__ __PRETTY_FUNCTION__))

21839

"Unsupported value type for operation")(static_cast <bool> (VT.is256BitVector() && VT.
isInteger() && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is256BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21839, __extension__ __PRETTY_FUNCTION__));

21840

21841

unsigned NumElems = VT.getVectorNumElements();

21842

SDLoc dl(Op);

21843

21844

// Extract the LHS vectors

21845

SDValue LHS = Op.getOperand(0);

21846

SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);

21847

SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

21848

21849

// Extract the RHS vectors

21850

SDValue RHS = Op.getOperand(1);

21851

SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);

21852

SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

21853

21854

MVT EltVT = VT.getVectorElementType();

21855

MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

21856

21857

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

21858

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),

21859

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));

21860

}

21861

21862

/// Break a 512-bit integer operation into two new 256-bit ones and then

21863

/// concatenate the result back.

21864

static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {

21865

MVT VT = Op.getSimpleValueType();

21866

21867

assert(VT.is512BitVector() && VT.isInteger() &&(static_cast <bool> (VT.is512BitVector() && VT.
isInteger() && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is512BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21868, __extension__ __PRETTY_FUNCTION__))

21868

"Unsupported value type for operation")(static_cast <bool> (VT.is512BitVector() && VT.
isInteger() && "Unsupported value type for operation"
) ? void (0) : __assert_fail ("VT.is512BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 21868, __extension__ __PRETTY_FUNCTION__));

21869

21870

unsigned NumElems = VT.getVectorNumElements();

21871

SDLoc dl(Op);

21872

21873

// Extract the LHS vectors

21874

SDValue LHS = Op.getOperand(0);

21875

SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);

21876

SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

21877

21878

// Extract the RHS vectors

21879

SDValue RHS = Op.getOperand(1);

21880

SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);

21881

SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

21882

21883

MVT EltVT = VT.getVectorElementType();

21884

MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

21885

21886

return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,

21887

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),

21888

DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));

21889

}

21890

21891

/// Lower an ADD/SUB node.
///
/// When the scalar type is i1 the node is replaced by an XOR of its two
/// operands. Every other type is forwarded to Lower256IntArith, which
/// asserts that it is a 256-bit integer vector and splits the operation.
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  const bool IsBoolean = VT.getScalarType() == MVT::i1;
  if (!IsBoolean)
    return Lower256IntArith(Op, DAG);

  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  return DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
}

21901

21902

/// Lower an ABS node.
///
/// i16/i32/i64 scalars become a flag-producing negation (0 - x) followed by
/// a CMOV on COND_GE between x and the negated value. Every other type is
/// forwarded to Lower256IntUnary, which asserts that it is a 256-bit integer
/// vector.
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  const bool IsCMovScalar =
      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
  if (!IsCMovScalar)
    return Lower256IntUnary(Op, DAG);

  // Since X86 does not have CMOV for 8-bit integer, we don't convert
  // 8-bit integer abs to NEG and CMOV.
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  SDVTList NegVTs = DAG.getVTList(VT, MVT::i32);
  SDValue ZeroVal = DAG.getConstant(0, DL, VT);
  SDValue Neg = DAG.getNode(X86ISD::SUB, DL, NegVTs, ZeroVal, Src);

  // The second result of the subtraction carries the flags read by the CMOV.
  SDValue CMovOps[] = {Src, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                       Neg.getValue(1)};
  return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOps);
}

21921

21922

/// Lower a min/max node by forwarding it to Lower256IntArith, which asserts
/// that the type is a 256-bit integer vector and splits the operation into
/// two 128-bit halves.
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  SDValue Split = Lower256IntArith(Op, DAG);
  return Split;
}

21928

21929

/// Lower a vector MUL node.
///
/// Strategies, in the order they are tried:
///  - i1 elements: emit an AND of the operands.
///  - 256-bit vectors without Int256 support: split into 128-bit halves.
///  - v16i8/v32i8/v64i8: sign extend to i16 elements, multiply, and narrow
///    the product back to i8 (splitting first where required).
///  - v4i32: two PMULUDQ (even and odd elements) merged by a shuffle.
///  - v2i64/v4i64/v8i64: a single PMULDQ when both operands have more than
///    32 sign bits, otherwise a combination of up to three PMULUDQ.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1) {
    SDValue BoolLHS = Op.getOperand(0);
    SDValue BoolRHS = Op.getOperand(1);
    return DAG.getNode(ISD::AND, dl, VT, BoolLHS, BoolRHS);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  const bool HasInt256 = Subtarget.hasInt256();
  if (VT.is256BitVector() && !HasInt256)
    return Lower256IntArith(Op, DAG);

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  const bool IsByteVector =
      VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8;
  if (IsByteVector) {
    if (HasInt256) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      SDValue WideLHS = DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, LHS);
      SDValue WideRHS = DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, RHS);
      SDValue WideMul = DAG.getNode(ISD::MUL, dl, ExVT, WideLHS, WideRHS);
      return DAG.getNode(ISD::TRUNCATE, dl, VT, WideMul);
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Sign extend the lo and hi halves of both operands to i16.
    SDValue ALo, BLo, AHi, BHi;
    if (Subtarget.hasSSE41()) {
      ALo = DAG.getSignExtendVectorInReg(LHS, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(RHS, dl, ExVT);

      // Move the upper eight bytes to the bottom before extending them.
      const int UpperToLower[] = {8,  9,  10, 11, 12, 13, 14, 15,
                                  -1, -1, -1, -1, -1, -1, -1, -1};
      AHi = DAG.getVectorShuffle(VT, dl, LHS, LHS, UpperToLower);
      BHi = DAG.getVectorShuffle(VT, dl, RHS, RHS, UpperToLower);
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
      // Place every source byte in the upper byte of an i16 lane and shift
      // it back down arithmetically by 8.
      const int LoUnpack[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                              -1, 4, -1, 5, -1, 6, -1, 7};
      ALo = DAG.getVectorShuffle(VT, dl, LHS, LHS, LoUnpack);
      BLo = DAG.getVectorShuffle(VT, dl, RHS, RHS, LoUnpack);
      ALo = DAG.getBitcast(ExVT, ALo);
      BLo = DAG.getBitcast(ExVT, BLo);
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));

      const int HiUnpack[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                              -1, 12, -1, 13, -1, 14, -1, 15};
      AHi = DAG.getVectorShuffle(VT, dl, LHS, LHS, HiUnpack);
      BHi = DAG.getVectorShuffle(VT, dl, RHS, RHS, HiUnpack);
      AHi = DAG.getBitcast(ExVT, AHi);
      BHi = DAG.getBitcast(ExVT, BHi);
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8bits of the lo/hi results and pack.
    SDValue ProdLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue ProdHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    SDValue ByteMask = DAG.getConstant(255, dl, ExVT);
    ProdLo = DAG.getNode(ISD::AND, dl, ExVT, ProdLo, ByteMask);
    ProdHi = DAG.getNode(ISD::AND, dl, ExVT, ProdHi, ByteMask);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, ProdLo, ProdHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmuldq is available!");

    // Bring the odd elements into the even positions.
    static const int OddToEven[] = { 1, -1, 3, -1 };
    SDValue OddLHS = DAG.getVectorShuffle(VT, dl, LHS, LHS, OddToEven);
    SDValue OddRHS = DAG.getVectorShuffle(VT, dl, RHS, RHS, OddToEven);

    // Multiply the even parts, then the odd parts.
    SDValue EvenProd = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, LHS, RHS);
    SDValue OddProd =
        DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, OddLHS, OddRHS);

    EvenProd = DAG.getBitcast(VT, EvenProd);
    OddProd = DAG.getBitcast(VT, OddProd);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int Interleave[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, EvenProd, OddProd, Interleave);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");

  // 32-bit vector types used for MULDQ/MULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    SDValue LHS32 = DAG.getBitcast(MulVT, LHS);
    SDValue RHS32 = DAG.getBitcast(MulVT, RHS);
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, LHS32, RHS32);
  }

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  const APInt Low32 = APInt::getLowBitsSet(64, 32);
  const bool ALoIsZero = DAG.MaskedValueIsZero(LHS, Low32);
  const bool BLoIsZero = DAG.MaskedValueIsZero(RHS, Low32);

  const APInt High32 = APInt::getHighBitsSet(64, 32);
  const bool AHiIsZero = DAG.MaskedValueIsZero(LHS, High32);
  const bool BHiIsZero = DAG.MaskedValueIsZero(RHS, High32);

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, LHS);
  SDValue Blo = DAG.getBitcast(MulVT, RHS);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero; a partial
  // product with a known-zero factor stays the zero vector.
  SDValue AloBlo = Zero;
  if (!(ALoIsZero || BLoIsZero))
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!(ALoIsZero || BHiIsZero)) {
    SDValue Bhi =
        getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, RHS, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!(AHiIsZero || BLoIsZero)) {
    SDValue Ahi =
        getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, LHS, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  // Sum the cross products, move them into the upper 32 bits and add the
  // low product.
  SDValue Cross = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  SDValue Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Cross, 32, DAG);
  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

22100

22101

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,

22102

SelectionDAG &DAG) {

22103

SDLoc dl(Op);

22104

MVT VT = Op.getSimpleValueType();

22105

22106

// Decompose 256-bit ops into smaller 128-bit ops.

22107

if (VT.is256BitVector() && !Subtarget.hasInt256())

22108

return Lower256IntArith(Op, DAG);

22109

22110

// Only i8 vectors should need custom lowering after this.

22111

assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||(static_cast <bool> ((VT == MVT::v16i8 || (VT == MVT::v32i8
&& Subtarget.hasInt256()) || (VT == MVT::v64i8 &&
Subtarget.hasBWI())) && "Unsupported vector type") ?
void (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22113, __extension__ __PRETTY_FUNCTION__))

22112

(VT == MVT::v64i8 && Subtarget.hasBWI())) &&(static_cast <bool> ((VT == MVT::v16i8 || (VT == MVT::v32i8
&& Subtarget.hasInt256()) || (VT == MVT::v64i8 &&
Subtarget.hasBWI())) && "Unsupported vector type") ?
void (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22113, __extension__ __PRETTY_FUNCTION__))

22113

"Unsupported vector type")(static_cast <bool> ((VT == MVT::v16i8 || (VT == MVT::v32i8
&& Subtarget.hasInt256()) || (VT == MVT::v64i8 &&
Subtarget.hasBWI())) && "Unsupported vector type") ?
void (0) : __assert_fail ("(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && \"Unsupported vector type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22113, __extension__ __PRETTY_FUNCTION__));

22114

22115

// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,

22116

// logical shift down the upper half and pack back to i8.

22117

SDValue A = Op.getOperand(0);

22118

SDValue B = Op.getOperand(1);

22119

22120

// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack

22121

// and then ashr/lshr the upper bits down to the lower bits before multiply.

22122

unsigned Opcode = Op.getOpcode();

22123

unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);

22124

unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

22125

22126

// For 512-bit vectors, split into 256-bit vectors to allow the

22127

// sign-extension to occur.

22128

if (VT == MVT::v64i8)

22129

return Lower512IntArith(Op, DAG);

22130

22131

// AVX2 implementations - extend xmm subvectors to ymm.

22132

if (Subtarget.hasInt256()) {

22133

unsigned NumElems = VT.getVectorNumElements();

22134

SDValue Lo = DAG.getIntPtrConstant(0, dl);

22135

SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

22136

22137

if (VT == MVT::v32i8) {

22138

if (Subtarget.hasBWI()) {

22139

SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);

22140

SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);

22141

SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);

22142

Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,

22143

DAG.getConstant(8, dl, MVT::v32i16));

22144

return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);

22145

}

22146

SDValue ALo = extract128BitVector(A, 0, DAG, dl);

22147

SDValue BLo = extract128BitVector(B, 0, DAG, dl);

22148

SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);

22149

SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);

22150

ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);

22151

BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);

22152

AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);

22153

BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);

22154

Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,

22155

DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),

22156

DAG.getConstant(8, dl, MVT::v16i16));

22157

Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,

22158

DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),

22159

DAG.getConstant(8, dl, MVT::v16i16));

22160

// The ymm variant of PACKUS treats the 128-bit lanes separately, so before

22161

// using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.

22162

const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,

22163

16, 17, 18, 19, 20, 21, 22, 23};

22164

const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,

22165

24, 25, 26, 27, 28, 29, 30, 31};

22166

return DAG.getNode(X86ISD::PACKUS, dl, VT,

22167

DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),

22168

DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));

22169

}

22170

22171

SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);

22172

SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);

22173

SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);

22174

Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,

22175

DAG.getConstant(8, dl, MVT::v16i16));

22176

// If we have BWI we can use truncate instruction.

22177

if (Subtarget.hasBWI())

22178

return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);

22179

Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);

22180

Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);

22181

return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

22182

}

22183

22184

22185

22186

MVT ExVT = MVT::v8i16;

22187

unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

22188

22189

// Extract the lo parts and zero/sign extend to i16.

22190

SDValue ALo, BLo;

22191

if (Subtarget.hasSSE41()) {

22192

ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);

22193

BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);

22194

} else {

22195

const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,

22196

-1, 4, -1, 5, -1, 6, -1, 7};

22197

ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);

22198

BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);

22199

ALo = DAG.getBitcast(ExVT, ALo);

22200

BLo = DAG.getBitcast(ExVT, BLo);

22201

ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));

22202

BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));

22203

}

22204

22205

// Extract the hi parts and zero/sign extend to i16.

22206

SDValue AHi, BHi;

22207

if (Subtarget.hasSSE41()) {

22208

const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,

22209

-1, -1, -1, -1, -1, -1, -1, -1};

22210

AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);

22211

BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);

22212

AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);

22213

BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);

22214

} else {

22215

const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,

22216

-1, 12, -1, 13, -1, 14, -1, 15};

22217

AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);

22218

BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);

22219

AHi = DAG.getBitcast(ExVT, AHi);

22220

BHi = DAG.getBitcast(ExVT, BHi);

22221

AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));

22222

BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));

22223

}

22224

22225

// Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and

22226

// pack back to v16i8.

22227

SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);

22228

SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);

22229

RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));

22230

RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));

22231

return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);

22232

}

22233

22234

/// Lower a 128-bit integer divide/remainder node to a runtime library call on
/// Win64 targets.
///
/// Each operand is stored to its own 16-byte aligned stack temporary and the
/// libcall is handed pointers to those slots. The call is declared as
/// returning v2i64; that value is bitcast back to the node's result type.
///
/// \param Op  SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node with a 128-bit integer
///            result and 128-bit integer operands (both asserted).
/// \param DAG The DAG in which the stores and the call are created.
/// \returns The libcall result bitcast to Op's value type.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  // Pick the runtime routine and its signedness from the opcode.
  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  case ISD::SDIV:
    LC = RTLIB::SDIV_I128;
    isSigned = true;
    break;
  case ISD::UDIV:
    LC = RTLIB::UDIV_I128;
    isSigned = false;
    break;
  case ISD::SREM:
    LC = RTLIB::SREM_I128;
    isSigned = true;
    break;
  case ISD::UREM:
    LC = RTLIB::UREM_I128;
    isSigned = false;
    break;
  case ISD::SDIVREM:
    LC = RTLIB::SDIVREM_I128;
    isSigned = true;
    break;
  case ISD::UDIVREM:
    LC = RTLIB::UDIVREM_I128;
    isSigned = false;
    break;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }

  SDLoc dl(Op);
  SDValue Chain = DAG.getEntryNode();

  // Spill every operand to the stack and pass its address instead of its
  // value. The stores are threaded onto the chain that feeds the call.
  TargetLowering::ArgListTy Args;
  for (unsigned ArgNo = 0, NumArgs = Op->getNumOperands(); ArgNo != NumArgs;
       ++ArgNo) {
    SDValue Arg = Op->getOperand(ArgNo);
    EVT ArgVT = Arg.getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");

    SDValue Slot = DAG.CreateStackTemporary(ArgVT, 16);
    Chain = DAG.getStore(Chain, dl, Arg, Slot, MachinePointerInfo(),
                         /* Alignment = */ 16);

    TargetLowering::ArgListEntry Entry;
    Entry.Node = Slot;
    Entry.Ty = PointerType::get(ArgVT.getTypeForEVT(*DAG.getContext()), 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // The call's return type is v2i64; it is bitcast to VT below.
  Type *RetTy = static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext());

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallResult.first);
}

22289

22290

/// Custom-lower a vector SMUL_LOHI/UMUL_LOHI node.
///
/// 256-bit vectors without Int256 support are split into two 128-bit halves
/// that are lowered independently and concatenated again. Otherwise the
/// operation is built from two PMULUDQ/PMULDQ multiplies (one over the even
/// lanes, one over the odd lanes moved into even position) whose results are
/// shuffled into separate low and high vectors.
///
/// \returns A merge-values node whose first result is the low part and whose
///          second result is the high part.
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT VT = LHS.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned SplitOpc = Op.getOpcode();
    unsigned HalfElts = VT.getVectorNumElements() / 2;
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfElts);

    SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
    SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
    SDValue LHSHi = extract128BitVector(LHS, HalfElts, DAG, dl);
    SDValue RHSHi = extract128BitVector(RHS, HalfElts, DAG, dl);

    SDValue LoHalf = DAG.getNode(SplitOpc, dl, DAG.getVTList(HalfVT, HalfVT),
                                 LHSLo, RHSLo);
    SDValue HiHalf = DAG.getNode(SplitOpc, dl, DAG.getVTList(HalfVT, HalfVT),
                                 LHSHi, RHSHi);

    // Result 0 of each half is its low product, result 1 its high product.
    SDValue Merged[] = {
        DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoHalf.getValue(0),
                    HiHalf.getValue(0)),
        DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoHalf.getValue(1),
                    HiHalf.getValue(1))};
    return DAG.getMergeValues(Merged, dl);
  }

  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));

  int NumElts = VT.getVectorNumElements();

  // PMULxD multiplies each even element (starting at 0) of the LHS with the
  // matching element of the RHS and produces a widened result, e.g.
  //   PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>  =>  <2 x i64> <ae|cg>
  //
  // To obtain every product we therefore need two PMULxD:
  //   1. one over the even elements, and
  //   2. one over the odd elements, which first have to be moved into even
  //      positions (i.e. all elements shifted one step to the left).
  const int OddToEven[] = {1, -1, 3,  -1, 5,  -1, 7,  -1,
                           9, -1, 11, -1, 13, -1, 15, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue OddLHS = DAG.getVectorShuffle(VT, dl, LHS, LHS,
                                        makeArrayRef(&OddToEven[0], NumElts));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue OddRHS = DAG.getVectorShuffle(VT, dl, RHS, RHS,
                                        makeArrayRef(&OddToEven[0], NumElts));

  // Emit the two multiplies. The signed PMULDQ form is used only for a signed
  // multiply on an SSE4.1 subtarget; everything else uses PMULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  bool UsePMULDQ = IsSigned && Subtarget.hasSSE41();
  unsigned MulOpc = UsePMULDQ ? X86ISD::PMULDQ : X86ISD::PMULUDQ;

  // => <2 x i64> <ae|cg>
  SDValue EvenProd = DAG.getBitcast(VT, DAG.getNode(MulOpc, dl, MulVT, LHS, RHS));
  // => <2 x i64> <bf|dh>
  SDValue OddProd =
      DAG.getBitcast(VT, DAG.getNode(MulOpc, dl, MulVT, OddLHS, OddRHS));

  // Shuffle the interleaved 32-bit halves back into the right order: even
  // result lanes come from EvenProd, odd result lanes from OddProd (shuffle
  // indices >= NumElts select the second shuffle operand).
  SmallVector<int, 16> HighMask(NumElts);
  SmallVector<int, 16> LowMask(NumElts);
  for (int Lane = 0; Lane != NumElts; ++Lane) {
    int LowIdx = (Lane / 2) * 2 + ((Lane % 2) * NumElts);
    LowMask[Lane] = LowIdx;
    HighMask[Lane] = LowIdx + 1;
  }

  SDValue Highs = DAG.getVectorShuffle(VT, dl, EvenProd, OddProd, HighMask);
  SDValue Lows = DAG.getVectorShuffle(VT, dl, EvenProd, OddProd, LowMask);

  // If we have a signed multiply but no PMULDQ, fix up the high parts of the
  // unsigned multiply.
  if (IsSigned && !UsePMULDQ) {
    SDValue SignShift = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue LHSSign = DAG.getNode(ISD::SRA, dl, VT, LHS, SignShift);
    SDValue LHSTerm = DAG.getNode(ISD::AND, dl, VT, LHSSign, RHS);
    SDValue RHSSign = DAG.getNode(ISD::SRA, dl, VT, RHS, SignShift);
    SDValue RHSTerm = DAG.getNode(ISD::AND, dl, VT, RHSSign, LHS);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, LHSTerm, RHSTerm);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is the low value, followed by the high
  // value.
  SDValue Results[] = {Lows, Highs};
  return DAG.getMergeValues(Results, dl);
}

22384

22385

// Return true if the required (according to Opcode) shift-imm form is natively

22386

// supported by the Subtarget

22387

static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,

22388

unsigned Opcode) {

22389

if (VT.getScalarSizeInBits() < 16)

22390

return false;

22391

22392

if (VT.is512BitVector() && Subtarget.hasAVX512() &&

22393

(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))

22394

return true;

22395

22396

bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||

22397

(VT.is256BitVector() && Subtarget.hasInt256());

22398

22399

bool AShift = LShift && (Subtarget.hasAVX512() ||

22400

(VT != MVT::v2i64 && VT != MVT::v4i64));

22401

return (Opcode == ISD::SRA) ? AShift : LShift;

22402

}

22403

22404

// The shift amount is a variable, but it is the same for all vector lanes.

22405

// These instructions are defined together with shift-immediate.

22406

static

22407

bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,

22408

unsigned Opcode) {

22409

return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);

22410

}

22411

22412

// Return true if the required (according to Opcode) variable-shift form is

22413

// natively supported by the Subtarget

22414

static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,

22415

unsigned Opcode) {

22416

22417

if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)

22418

return false;

22419

22420

// vXi16 supported only on AVX-512, BWI

22421

if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())

22422

return false;

22423

22424

if (Subtarget.hasAVX512())

22425

return true;

22426

22427

bool LShift = VT.is128BitVector() || VT.is256BitVector();

22428

bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;

22429

return (Opcode == ISD::SRA) ? AShift : LShift;

22430

}

22431

22432

/// Try to lower a vector shift whose amount is the same constant in every
/// lane.
///
/// Two shapes of shift amount are recognised:
///  * a BUILD_VECTOR that is a constant splat, and
///  * (without XOP, for v2i64/v4i64/v8i64) a bitcast BUILD_VECTOR, optionally
///    behind a splat shuffle and/or EXTRACT_SUBVECTOR, whose narrower constant
///    elements are reassembled into one 64-bit amount.
///
/// \returns The lowered node, or an empty SDValue if no pattern applied.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  const unsigned ShiftOpc = Op.getOpcode();

  // Matching X86 shift-by-immediate node; anything that is neither SHL nor
  // SRL is treated as an arithmetic right shift.
  unsigned X86Opc;
  if (ShiftOpc == ISD::SHL)
    X86Opc = X86ISD::VSHLI;
  else if (ShiftOpc == ISD::SRL)
    X86Opc = X86ISD::VSRLI;
  else
    X86Opc = X86ISD::VSRAI;

  // Emulate a 64-bit arithmetic right shift using 32-bit element operations.
  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }

    const bool Is128 = VT == MVT::v2i64;
    const bool Is256 = VT == MVT::v4i64;

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (Is128)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (Is256)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (Is128)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (Is256)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };

  // Optimize shl/srl/sra with constant shift amount.
  auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt);
  ConstantSDNode *ShiftConst = BVAmt ? BVAmt->getConstantSplatNode() : nullptr;
  if (ShiftConst) {
    uint64_t ShiftAmt = ShiftConst->getZExtValue();

    if (SupportedVectorShiftWithImm(VT, Subtarget, ShiftOpc))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    // i64 SRA needs to be performed as partial shifts.
    const bool PartialSRAType = (!Subtarget.hasXOP() && VT == MVT::v2i64) ||
                                (Subtarget.hasInt256() && VT == MVT::v4i64);
    if (PartialSRAType && ShiftOpc == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);

    const bool IsByteVector = VT == MVT::v16i8 ||
                              (Subtarget.hasInt256() && VT == MVT::v32i8) ||
                              VT == MVT::v64i8;
    if (IsByteVector) {
      unsigned NumElts = VT.getVectorNumElements();
      MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

      // Simple i8 add case
      if (ShiftOpc == ISD::SHL && ShiftAmt == 1)
        return DAG.getNode(ISD::ADD, dl, VT, R, R);

      // ashr(R, 7) === cmp_slt(R, 0)
      if (ShiftOpc == ISD::SRA && ShiftAmt == 7) {
        SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
        if (!VT.is512BitVector())
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);

        // 512-bit compares produce a mask that is sign-extended back to VT.
        assert(VT == MVT::v64i8 && "Unexpected element type!");
        SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
      }

      // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
      if (VT == MVT::v16i8 && Subtarget.hasXOP())
        return SDValue();

      switch (ShiftOpc) {
      case ISD::SHL: {
        // Make a large shift.
        SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
                                                 ShiftAmt, DAG);
        SHL = DAG.getBitcast(VT, SHL);
        // Zero out the rightmost bits.
        return DAG.getNode(ISD::AND, dl, VT, SHL,
                           DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
      }
      case ISD::SRL: {
        // Make a large shift.
        SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
                                                 ShiftAmt, DAG);
        SRL = DAG.getBitcast(VT, SRL);
        // Zero out the leftmost bits.
        return DAG.getNode(ISD::AND, dl, VT, SRL,
                           DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
      }
      case ISD::SRA: {
        // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
        SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

        SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
        Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
        Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
        return Res;
      }
      default:
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (Subtarget.hasXOP())
    return SDValue();
  const bool IsExpandedI64Type = VT == MVT::v2i64 ||
                                 (Subtarget.hasInt256() && VT == MVT::v4i64) ||
                                 (Subtarget.hasAVX512() && VT == MVT::v8i64);
  if (!IsExpandedI64Type)
    return SDValue();

  // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
  unsigned SubVectorScale = 1;
  if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    SubVectorScale =
        Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
    Amt = Amt.getOperand(0);
  }

  // Peek through any splat that was introduced for i64 shift vectorization.
  int SplatIndex = -1;
  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode());
  if (SVN && SVN->isSplat()) {
    SplatIndex = SVN->getSplatIndex();
    Amt = Amt.getOperand(0);
    assert(SplatIndex < (int)VT.getVectorNumElements() &&
           "Splat shuffle referencing second operand");
  }

  if (Amt.getOpcode() != ISD::BITCAST ||
      Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  Amt = Amt.getOperand(0);
  unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                   (SubVectorScale * VT.getVectorNumElements());
  unsigned RatioInLog2 = Log2_32_Ceil(Ratio);

  // OR together the `Ratio` constant operands starting at FirstOp into one
  // 64-bit amount. Returns false if any operand is not a ConstantSDNode.
  auto PackAmount = [&](unsigned FirstOp, uint64_t &Packed) -> bool {
    for (unsigned Piece = 0; Piece != Ratio; ++Piece) {
      auto *C = dyn_cast<ConstantSDNode>(Amt.getOperand(FirstOp + Piece));
      if (!C)
        return false;
      // 6 == Log2(64)
      Packed |= C->getZExtValue() << (Piece * (1 << (6 - RatioInLog2)));
    }
    return true;
  };

  // For a splat only the splatted element matters; otherwise start with
  // element 0 and verify the rest below.
  uint64_t ShiftAmt = 0;
  unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
  if (!PackAmount(BaseOp, ShiftAmt))
    return SDValue();

  // Check remaining shift amounts (if not a splat).
  if (SplatIndex < 0) {
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      uint64_t ShAmt = 0;
      if (!PackAmount(i, ShAmt))
        return SDValue();
      if (ShAmt != ShiftAmt)
        return SDValue();
    }
  }

  if (SupportedVectorShiftWithImm(VT, Subtarget, ShiftOpc))
    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

  if (ShiftOpc == ISD::SRA)
    return ArithmeticShiftRight64(ShiftAmt);

  return SDValue();
}

22620

22621

/// Try to lower a vector shift whose (non-immediate) amount is the same
/// scalar in every lane.
///
/// The amount is searched for in a splat BUILD_VECTOR or in a splat shuffle
/// (optionally behind EXTRACT_SUBVECTOR). As a second pattern, a v2i64 shift
/// whose amount is a bitcast BUILD_VECTOR with a repeating operand group is
/// mapped to the X86 variable-shift node.
///
/// \returns The lowered node, or an empty SDValue if no pattern applied.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  const unsigned ShiftOpc = Op.getOpcode();

  // X86 node kinds for this shift; opcodes other than SHL/SRL are handled as
  // arithmetic right shifts.
  unsigned X86OpcI, X86OpcV;
  switch (ShiftOpc) {
  case ISD::SHL:
    X86OpcI = X86ISD::VSHLI;
    X86OpcV = X86ISD::VSHL;
    break;
  case ISD::SRL:
    X86OpcI = X86ISD::VSRLI;
    X86OpcV = X86ISD::VSRL;
    break;
  default:
    X86OpcI = X86ISD::VSRAI;
    X86OpcV = X86ISD::VSRA;
    break;
  }

  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, ShiftOpc)) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (auto *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // If this build_vector is a splat, use the splat value as the base
      // amount; an undef splat value is discarded.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      auto *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);

        switch (InVec.getOpcode()) {
        case ISD::BUILD_VECTOR:
          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
          break;
        case ISD::INSERT_VECTOR_ELT:
          // Use the inserted scalar if it was inserted at the splatted index.
          if (auto *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          break;
        default:
          break;
        }

        // Avoid introducing an extract element from a shuffle.
        if (!BaseShAmt)
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      // Zero-extend the scalar amount: element types strictly between i32
      // and i64 go to i64, element types narrower than i32 go to i32.
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  const bool IsExpandedV2I64 =
      VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR;
  if (!IsExpandedV2I64)
    return SDValue();

  Amt = Amt.getOperand(0);
  unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                   VT.getVectorNumElements();

  // Every later group of `Ratio` operands must repeat the first group.
  for (unsigned Group = Ratio; Group != Amt.getNumOperands(); Group += Ratio)
    for (unsigned Piece = 0; Piece != Ratio; ++Piece)
      if (Amt.getOperand(Piece) != Amt.getOperand(Group + Piece))
        return SDValue();

  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, ShiftOpc))
    return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));

  return SDValue();
}

22702

22703

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,

22704

SelectionDAG &DAG) {

22705

MVT VT = Op.getSimpleValueType();

22706

SDLoc dl(Op);

22707

SDValue R = Op.getOperand(0);

22708

SDValue Amt = Op.getOperand(1);

22709

bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

22710

22711

assert(VT.isVector() && "Custom lowering only for vector shifts!")(static_cast <bool> (VT.isVector() && "Custom lowering only for vector shifts!"
) ? void (0) : __assert_fail ("VT.isVector() && \"Custom lowering only for vector shifts!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22711, __extension__ __PRETTY_FUNCTION__));

22712

assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!")(static_cast <bool> (Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"
) ? void (0) : __assert_fail ("Subtarget.hasSSE2() && \"Only custom lower when we have SSE2!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22712, __extension__ __PRETTY_FUNCTION__));

22713

22714

if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))

22715

return V;

22716

22717

if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))

22718

return V;

22719

22720

if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))

22721

return Op;

22722

22723

// XOP has 128-bit variable logical/arithmetic shifts.

22724

// +ve/-ve Amt = shift left/right.

22725

if (Subtarget.hasXOP() &&

22726

(VT == MVT::v2i64 || VT == MVT::v4i32 ||

22727

VT == MVT::v8i16 || VT == MVT::v16i8)) {

22728

if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {

22729

SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

22730

Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);

22731

}

22732

if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)

22733

return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);

22734

if (Op.getOpcode() == ISD::SRA)

22735

return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);

22736

}

22737

22738

// 2i64 vector logical shifts can efficiently avoid scalarization - do the

22739

// shifts per-lane and then shuffle the partial results back together.

22740

if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {

22741

// Splat the shift amounts so the scalar shifts above will catch it.

22742

SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});

22743

SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});

22744

SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);

22745

SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);

22746

return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});

22747

}

22748

22749

// i64 vector arithmetic shift can be emulated with the transform:

22750

// M = lshr(SIGN_MASK, Amt)

22751

// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)

22752

if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&

22753

Op.getOpcode() == ISD::SRA) {

22754

SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);

22755

SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);

22756

R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

22757

R = DAG.getNode(ISD::XOR, dl, VT, R, M);

22758

R = DAG.getNode(ISD::SUB, dl, VT, R, M);

22759

return R;

22760

}

22761

22762

// If possible, lower this packed shift into a vector multiply instead of

22763

// expanding it into a sequence of scalar shifts.

22764

// Do this only if the vector shift count is a constant build_vector.

22765

if (ConstantAmt && Op.getOpcode() == ISD::SHL &&

22766

(VT == MVT::v8i16 || VT == MVT::v4i32 ||

22767

(Subtarget.hasInt256() && VT == MVT::v16i16))) {

22768

SmallVector<SDValue, 8> Elts;

22769

MVT SVT = VT.getVectorElementType();

22770

unsigned SVTBits = SVT.getSizeInBits();

22771

APInt One(SVTBits, 1);

22772

unsigned NumElems = VT.getVectorNumElements();

22773

22774

for (unsigned i=0; i !=NumElems; ++i) {

22775

SDValue Op = Amt->getOperand(i);

22776

if (Op->isUndef()) {

22777

Elts.push_back(Op);

22778

continue;

22779

}

22780

22781

ConstantSDNode *ND = cast<ConstantSDNode>(Op);

22782

APInt C(SVTBits, ND->getAPIntValue().getZExtValue());

22783

uint64_t ShAmt = C.getZExtValue();

22784

if (ShAmt >= SVTBits) {

22785

Elts.push_back(DAG.getUNDEF(SVT));

22786

continue;

22787

}

22788

Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));

22789

}

22790

SDValue BV = DAG.getBuildVector(VT, dl, Elts);

22791

return DAG.getNode(ISD::MUL, dl, VT, R, BV);

22792

}

22793

22794

// Lower SHL with variable shift amount.

22795

if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {

22796

Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

22797

22798

Op = DAG.getNode(ISD::ADD, dl, VT, Op,

22799

DAG.getConstant(0x3f800000U, dl, VT));

22800

Op = DAG.getBitcast(MVT::v4f32, Op);

22801

Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);

22802

return DAG.getNode(ISD::MUL, dl, VT, Op, R);

22803

}

22804

22805

// If possible, lower this shift as a sequence of two shifts by

22806

// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.

22807

// Example:

22808

// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))

22809

22810

// Could be rewritten as:

22811

// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))

22812

22813

// The advantage is that the two shifts from the example would be

22814

// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing

22815

// the vector shift into four scalar shifts plus four pairs of vector

22816

// insert/extract.

22817

if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {

22818

bool UseMOVSD = false;

22819

bool CanBeSimplified;

22820

// The splat value for the first packed shift (the 'X' from the example).

22821

SDValue Amt1 = Amt->getOperand(0);

22822

// The splat value for the second packed shift (the 'Y' from the example).

22823

SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

22824

22825

// See if it is possible to replace this node with a sequence of

22826

// two shifts followed by a MOVSS/MOVSD/PBLEND.

22827

if (VT == MVT::v4i32) {

22828

// Check if it is legal to use a MOVSS.

22829

CanBeSimplified = Amt2 == Amt->getOperand(2) &&

22830

Amt2 == Amt->getOperand(3);

22831

if (!CanBeSimplified) {

22832

// Otherwise, check if we can still simplify this node using a MOVSD.

22833

CanBeSimplified = Amt1 == Amt->getOperand(1) &&

22834

Amt->getOperand(2) == Amt->getOperand(3);

22835

UseMOVSD = true;

22836

Amt2 = Amt->getOperand(2);

22837

}

22838

} else {

22839

// Do similar checks for the case where the machine value type

22840

// is MVT::v8i16.

22841

CanBeSimplified = Amt1 == Amt->getOperand(1);

22842

for (unsigned i=3; i != 8 && CanBeSimplified; ++i)

22843

CanBeSimplified = Amt2 == Amt->getOperand(i);

22844

22845

if (!CanBeSimplified) {

22846

UseMOVSD = true;

22847

CanBeSimplified = true;

22848

Amt2 = Amt->getOperand(4);

22849

for (unsigned i=0; i != 4 && CanBeSimplified; ++i)

22850

CanBeSimplified = Amt1 == Amt->getOperand(i);

22851

for (unsigned j=4; j != 8 && CanBeSimplified; ++j)

22852

CanBeSimplified = Amt2 == Amt->getOperand(j);

22853

}

22854

}

22855

22856

if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&

22857

isa<ConstantSDNode>(Amt2)) {

22858

// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.

22859

SDValue Splat1 =

22860

DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);

22861

SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);

22862

SDValue Splat2 =

22863

DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);

22864

SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);

22865

SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);

22866

SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);

22867

if (UseMOVSD)

22868

return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,

22869

BitCast2, {0, 1, 6, 7}));

22870

return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,

22871

BitCast2, {0, 5, 6, 7}));

22872

}

22873

}

22874

22875

// v4i32 Non Uniform Shifts.

22876

// If the shift amount is constant we can shift each lane using the SSE2

22877

// immediate shifts, else we need to zero-extend each lane to the lower i64

22878

// and shift using the SSE2 variable shifts.

22879

// The separate results can then be blended together.

22880

if (VT == MVT::v4i32) {

22881

unsigned Opc = Op.getOpcode();

22882

SDValue Amt0, Amt1, Amt2, Amt3;

22883

if (ConstantAmt) {

22884

Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});

22885

Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});

22886

Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});

22887

Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});

22888

} else {

22889

// ISD::SHL is handled above but we include it here for completeness.

22890

switch (Opc) {

22891

default:

22892

llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 22892);

22893

case ISD::SHL:

22894

Opc = X86ISD::VSHL;

22895

break;

22896

case ISD::SRL:

22897

Opc = X86ISD::VSRL;

22898

break;

22899

case ISD::SRA:

22900

Opc = X86ISD::VSRA;

22901

break;

22902

}

22903

// The SSE2 shifts use the lower i64 as the same shift amount for

22904

// all lanes and the upper i64 is ignored. These shuffle masks

22905

// optimally zero-extend each lanes on SSE2/SSE41/AVX targets.

22906

SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);

22907

Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});

22908

Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});

22909

Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});

22910

Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});

22911

}

22912

22913

SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);

22914

SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);

22915

SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);

22916

SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);

22917

SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});

22918

SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});

22919

return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});

22920

}

22921

22922

// It's worth extending once and using the vXi16/vXi32 shifts for smaller

22923

// types, but without AVX512 the extra overheads to get from vXi8 to vXi32

22924

// make the existing SSE solution better.

22925

if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||

22926

(Subtarget.hasAVX512() && VT == MVT::v16i16) ||

22927

(Subtarget.hasAVX512() && VT == MVT::v16i8) ||

22928

(Subtarget.hasBWI() && VT == MVT::v32i8)) {

22929

MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);

22930

MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());

22931

unsigned ExtOpc =

22932

Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

22933

R = DAG.getNode(ExtOpc, dl, ExtVT, R);

22934

Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);

22935

return DAG.getNode(ISD::TRUNCATE, dl, VT,

22936

DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));

22937

}

22938

22939

if (VT == MVT::v16i8 ||

22940

(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||

22941

(VT == MVT::v64i8 && Subtarget.hasBWI())) {

22942

MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

22943

unsigned ShiftOpcode = Op->getOpcode();

22944

22945

auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {

22946

if (VT.is512BitVector()) {

22947

// On AVX512BW targets we make use of the fact that VSELECT lowers

22948

// to a masked blend which selects bytes based just on the sign bit

22949

// extracted to a mask.

22950

MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

22951

V0 = DAG.getBitcast(VT, V0);

22952

V1 = DAG.getBitcast(VT, V1);

22953

Sel = DAG.getBitcast(VT, Sel);

22954

Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);

22955

return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));

22956

} else if (Subtarget.hasSSE41()) {

22957

// On SSE41 targets we make use of the fact that VSELECT lowers

22958

// to PBLENDVB which selects bytes based just on the sign bit.

22959

V0 = DAG.getBitcast(VT, V0);

22960

V1 = DAG.getBitcast(VT, V1);

22961

Sel = DAG.getBitcast(VT, Sel);

22962

return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));

22963

}

22964

// On pre-SSE41 targets we test for the sign bit by comparing to

22965

// zero - a negative value will set all bits of the lanes to true

22966

// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.

22967

SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);

22968

SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);

22969

return DAG.getSelect(dl, SelVT, C, V0, V1);

22970

};

22971

22972

// Turn 'a' into a mask suitable for VSELECT: a = a << 5;

22973

// We can safely do this using i16 shifts as we're only interested in

22974

// the 3 lower bits of each byte.

22975

Amt = DAG.getBitcast(ExtVT, Amt);

22976

Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));

22977

Amt = DAG.getBitcast(VT, Amt);

22978

22979

if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {

22980

// r = VSELECT(r, shift(r, 4), a);

22981

SDValue M =

22982

DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));

22983

R = SignBitSelect(VT, Amt, M, R);

22984

22985

// a += a

22986

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

22987

22988

// r = VSELECT(r, shift(r, 2), a);

22989

M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));

22990

R = SignBitSelect(VT, Amt, M, R);

22991

22992

// a += a

22993

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

22994

22995

// return VSELECT(r, shift(r, 1), a);

22996

M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));

22997

R = SignBitSelect(VT, Amt, M, R);

22998

return R;

22999

}

23000

23001

if (Op->getOpcode() == ISD::SRA) {

23002

// For SRA we need to unpack each byte to the higher byte of a i16 vector

23003

// so we can correctly sign extend. We don't care what happens to the

23004

// lower byte.

23005

SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);

23006

SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);

23007

SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);

23008

SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);

23009

ALo = DAG.getBitcast(ExtVT, ALo);

23010

AHi = DAG.getBitcast(ExtVT, AHi);

23011

RLo = DAG.getBitcast(ExtVT, RLo);

23012

RHi = DAG.getBitcast(ExtVT, RHi);

23013

23014

// r = VSELECT(r, shift(r, 4), a);

23015

SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,

23016

DAG.getConstant(4, dl, ExtVT));

23017

SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,

23018

DAG.getConstant(4, dl, ExtVT));

23019

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

23020

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

23021

23022

// a += a

23023

ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);

23024

AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

23025

23026

// r = VSELECT(r, shift(r, 2), a);

23027

MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,

23028

DAG.getConstant(2, dl, ExtVT));

23029

MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,

23030

DAG.getConstant(2, dl, ExtVT));

23031

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

23032

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

23033

23034

// a += a

23035

ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);

23036

AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

23037

23038

// r = VSELECT(r, shift(r, 1), a);

23039

MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,

23040

DAG.getConstant(1, dl, ExtVT));

23041

MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,

23042

DAG.getConstant(1, dl, ExtVT));

23043

RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);

23044

RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

23045

23046

// Logical shift the result back to the lower byte, leaving a zero upper

23047

// byte

23048

// meaning that we can safely pack with PACKUSWB.

23049

RLo =

23050

DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));

23051

RHi =

23052

DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));

23053

return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);

23054

}

23055

}

23056

23057

if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {

23058

MVT ExtVT = MVT::v8i32;

23059

SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);

23060

SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);

23061

SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);

23062

SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);

23063

SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);

23064

ALo = DAG.getBitcast(ExtVT, ALo);

23065

AHi = DAG.getBitcast(ExtVT, AHi);

23066

RLo = DAG.getBitcast(ExtVT, RLo);

23067

RHi = DAG.getBitcast(ExtVT, RHi);

23068

SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);

23069

SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);

23070

Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));

23071

Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));

23072

return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

23073

}

23074

23075

if (VT == MVT::v8i16) {

23076

unsigned ShiftOpcode = Op->getOpcode();

23077

23078

// If we have a constant shift amount, the non-SSE41 path is best as

23079

// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.

23080

bool UseSSE41 = Subtarget.hasSSE41() &&

23081

!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

23082

23083

auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {

23084

// On SSE41 targets we make use of the fact that VSELECT lowers

23085

// to PBLENDVB which selects bytes based just on the sign bit.

23086

if (UseSSE41) {

23087

MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);

23088

V0 = DAG.getBitcast(ExtVT, V0);

23089

V1 = DAG.getBitcast(ExtVT, V1);

23090

Sel = DAG.getBitcast(ExtVT, Sel);

23091

return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));

23092

}

23093

// On pre-SSE41 targets we splat the sign bit - a negative value will

23094

// set all bits of the lanes to true and VSELECT uses that in

23095

// its OR(AND(V0,C),AND(V1,~C)) lowering.

23096

SDValue C =

23097

DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));

23098

return DAG.getSelect(dl, VT, C, V0, V1);

23099

};

23100

23101

// Turn 'a' into a mask suitable for VSELECT: a = a << 12;

23102

if (UseSSE41) {

23103

// On SSE41 targets we need to replicate the shift mask in both

23104

// bytes for PBLENDVB.

23105

Amt = DAG.getNode(

23106

ISD::OR, dl, VT,

23107

DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),

23108

DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));

23109

} else {

23110

Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));

23111

}

23112

23113

// r = VSELECT(r, shift(r, 8), a);

23114

SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));

23115

R = SignBitSelect(Amt, M, R);

23116

23117

// a += a

23118

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

23119

23120

// r = VSELECT(r, shift(r, 4), a);

23121

M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));

23122

R = SignBitSelect(Amt, M, R);

23123

23124

// a += a

23125

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

23126

23127

// r = VSELECT(r, shift(r, 2), a);

23128

M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));

23129

R = SignBitSelect(Amt, M, R);

23130

23131

// a += a

23132

Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

23133

23134

// return VSELECT(r, shift(r, 1), a);

23135

M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));

23136

R = SignBitSelect(Amt, M, R);

23137

return R;

23138

}

23139

23140

// Decompose 256-bit shifts into smaller 128-bit shifts.

23141

if (VT.is256BitVector())

23142

return Lower256IntArith(Op, DAG);

23143

23144

return SDValue();

23145

}

23146

23147

/// Custom-lower a vector rotate node.
///
/// \param Op        The rotate node; operand 0 is the value being rotated and
///                  operand 1 is the rotate amount.
/// \param Subtarget Used to select between the AVX512 and XOP strategies.
/// \param DAG       The DAG in which replacement nodes are created.
/// \returns With AVX512: an X86ISD::VROTLI/VROTRI node when the amount is a
///          uniform constant with no undef elements, otherwise \p Op
///          unchanged. Without AVX512 (XOP is asserted, ROTL only): the
///          result of Lower256IntArith for 256-bit types, an X86ISD::VROTLI
///          node for a constant-splat build vector amount, otherwise \p Op
///          unchanged.
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (Subtarget.hasAVX512()) {
    // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
      // Capture EltBits by reference: the predicate only reads element 0, so
      // copying the whole vector of APInts into the closure is wasted work.
      if (!UndefElts && llvm::all_of(EltBits, [&EltBits](const APInt &V) {
            return EltBits[0] == V;
          })) {
        // Named RotOpc (not Op) so the SDValue parameter Op, which is
        // returned below, is not shadowed.
        unsigned RotOpc =
            (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
        // Reduce the splat amount modulo the element width before encoding
        // it as an i8 immediate.
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
        return DAG.getNode(RotOpc, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Else, fall-back on VPROLV/VPRORV.
    return Op;
  }

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return Op;
}

23201

23202

/// Lower an "add/sub/mul with overflow" node into the corresponding X86
/// arithmetic node plus a SETCC that reads the flag result of that node.
/// The two values are combined with ISD::MERGE_VALUES. The "brcond" lowering
/// looks for this combination and may remove the SETCC when it has a single
/// use.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDLoc DL(Op);

  // Shared tail: read the condition from result number FlagsResNo of Sum,
  // narrow it to i1 when the node's second result is i1, then merge.
  auto FinishWithSetCC = [&](SDValue Sum, unsigned FlagsResNo,
                             X86::CondCode CC) {
    SDValue SetCC = getSETCC(CC, SDValue(Sum.getNode(), FlagsResNo), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  };

  unsigned BaseOp = 0;
  X86::CondCode Cond;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    BaseOp = isOneConstant(RHS) ? X86ISD::INC : X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    BaseOp = isOneConstant(RHS) ? X86ISD::DEC : X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: {
    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) != MVT::i8) {
      // X86ISD::UMUL is built with three results; the flags are result #2.
      SDVTList WideVTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                       MVT::i32);
      SDValue Product = DAG.getNode(X86ISD::UMUL, DL, WideVTs, LHS, RHS);
      return FinishWithSetCC(Product, 2, X86::COND_O);
    }
    // The i8 case uses the common two-result path below.
    BaseOp = X86ISD::UMUL8;
    Cond = X86::COND_O;
    break;
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
  return FinishWithSetCC(Sum, 1, Cond);
}

23279

23280

/// Returns true if the operand type is exactly twice the native width, and

23281

/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.

23282

/// Used to know whether to use cmpxchg8/16b when expanding atomic operations

23283

/// (otherwise we leave them alone to become __sync_fetch_and_... calls).

23284

bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {

23285

unsigned OpWidth = MemType->getPrimitiveSizeInBits();

23286

23287

if (OpWidth == 64)

23288

return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b

23289

else if (OpWidth == 128)

23290

return Subtarget.hasCmpxchg16b();

23291

else

23292

return false;

23293

}

23294

23295

/// An atomic store is expanded in IR exactly when needsCmpXchgNb() accepts
/// the type of the value being stored.
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  Type *StoredTy = SI->getValueOperand()->getType();
  return needsCmpXchgNb(StoredTy);
}

23298

23299

// Note: this turns large loads into lock cmpxchg8b/16b.

23300

// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.

23301

TargetLowering::AtomicExpansionKind

23302

X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {

23303

auto PTy = cast<PointerType>(LI->getPointerOperandType());

23304

return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg

23305

: AtomicExpansionKind::None;

23306

}

23307

23308

/// Choose how an atomicrmw instruction is expanded at the IR level: either
/// not at all (AtomicExpansionKind::None) or into a cmpxchg loop
/// (AtomicExpansionKind::CmpXChg).
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *MemType = AI->getType();
  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    if (needsCmpXchgNb(MemType))
      return AtomicExpansionKind::CmpXChg;
    return AtomicExpansionKind::None;
  }

  switch (AI->getOperation()) {
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;

  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    if (AI->use_empty())
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;

  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86. We
    // must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;

  default:
    llvm_unreachable("Unknown atomic operation");
  }
}

23346

23347

/// Replace the atomicrmw \p AI with a call to the x86 SSE2 mfence intrinsic
/// followed by an atomic load of the same pointer.
///
/// \returns the new load, after all uses of \p AI have been redirected to it
///          and \p AI has been erased; or nullptr (leaving \p AI untouched)
///          when the access is wider than the native width, the sync scope
///          is single-thread, or the subtarget has no mfence.
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // AcquireRelease orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  // The alignment argument of CreateAlignedLoad is expressed in bytes, so
  // convert the access width from bits; passing the bit width would claim an
  // alignment eight times larger than the access size.
  LoadInst *Loaded =
      Builder.CreateAlignedLoad(Ptr, MemType->getPrimitiveSizeInBits() / 8);
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

23405

23406

/// Lower an atomic fence node. Operand 0 is the chain, operand 1 the atomic
/// ordering and operand 2 the synchronization scope (both read as constants).
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  const bool NeedsInstruction =
      FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System;

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  if (!NeedsInstruction)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  if (Subtarget.hasMFence())
    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

  // No mfence available: emit an X86::OR32mrLocked machine node whose memory
  // operand is [ESP + 0] and whose other operand is the constant zero.
  SDValue Chain = Op.getOperand(0);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
  SDValue Base = DAG.getRegister(X86::ESP, MVT::i32);
  SDValue Scale = DAG.getTargetConstant(1, dl, MVT::i8);
  SDValue Index = DAG.getRegister(0, MVT::i32);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Zero, Chain};
  SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}

23439

23440

/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 of \p Op is copied into the accumulator register that matches
/// the value type (AL, AX, EAX or RAX). Operands 1 and 3 are forwarded to the
/// LCMPXCHG_DAG node together with the access size in bytes.
///
/// The three results of \p Op are replaced directly: result 0 by the value
/// copied back out of the accumulator, result 1 by a COND_E setcc on EFLAGS,
/// and result 2 by the chain of the EFLAGS copy. Because the uses are already
/// rewritten, an empty SDValue is returned.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Pick the accumulator register and the operand size for this value type.
  unsigned AccReg = 0;
  unsigned NumBytes = 0;
  switch (T.SimpleTy) {
  case MVT::i8:
    AccReg = X86::AL;
    NumBytes = 1;
    break;
  case MVT::i16:
    AccReg = X86::AX;
    NumBytes = 2;
    break;
  case MVT::i32:
    AccReg = X86::EAX;
    NumBytes = 4;
    break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    AccReg = X86::RAX;
    NumBytes = 8;
    break;
  default:
    llvm_unreachable("Invalid value type!");
  }

  // Load the accumulator; the copy yields a chain (0) and a glue (1) result.
  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
                                    Op.getOperand(2), SDValue());
  SDValue SizeOp = DAG.getTargetConstant(NumBytes, DL, MVT::i8);
  SDValue Ops[] = {CopyIn.getValue(0), Op.getOperand(1), Op.getOperand(3),
                   SizeOp, CopyIn.getValue(1)};
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue CmpXchg =
      DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO);

  // Read back the accumulator and then EFLAGS, threading chain and glue.
  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
                                       CmpXchg.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, CopyOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), CopyOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

23479

23480

/// Custom lowering for a bitcast node.
///
/// Two groups of sources are handled:
///  - v2i32, v4i16, v8i8 or i64 sources (SSE2 is asserted). Only a cast to
///    f64 is lowered here: the source is split into scalar elements, the same
///    number of undef elements is appended, the build vector is bitcast to
///    v2f64 and element 0 is extracted. Any other destination returns an
///    empty SDValue so the conversion gets expanded.
///  - Every other source (asserted to be a 64-bit subtarget with MMX but
///    without SSE2). \p Op is returned unchanged for i64 <=> vector and
///    vector <=> vector casts; anything else returns an empty SDValue.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  const bool IsSSE2Source = SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
                            SrcVT == MVT::v8i8 || SrcVT == MVT::i64;

  if (!IsSSE2Source) {
    assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
           Subtarget.hasMMX() && "Unexpected custom BITCAST");
    assert((DstVT == MVT::i64 ||
            (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
           "Unexpected custom BITCAST");

    // i64 <=> MMX conversions are Legal.
    // MMX <=> MMX conversions are Legal.
    const bool IsLegal = (SrcVT==MVT::i64 && DstVT.isVector()) ||
                         (DstVT==MVT::i64 && SrcVT.isVector()) ||
                         (SrcVT.isVector() && DstVT.isVector());
    // All other conversions need to be expanded.
    return IsLegal ? Op : SDValue();
  }

  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
  // This conversion needs to be expanded.
  if (DstVT != MVT::f64)
    return SDValue();

  SDValue Src = Op->getOperand(0);
  SmallVector<SDValue, 16> Elts;
  SDLoc dl(Op);
  unsigned NumElts;
  MVT SVT;
  if (SrcVT.isVector()) {
    NumElts = SrcVT.getVectorNumElements();
    SVT = SrcVT.getVectorElementType();

    // Widen the vector in input in the case of MVT::v2i32.
    // Example: from MVT::v2i32 to MVT::v4i32.
    for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
      SDValue IdxOp = DAG.getIntPtrConstant(Idx, dl);
      Elts.push_back(
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src, IdxOp));
    }
  } else {
    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
           "Unexpected source type in LowerBITCAST");
    // Split the i64 into its two i32 parts (element 0, then element 1).
    for (unsigned Part = 0; Part != 2; ++Part) {
      SDValue PartOp = DAG.getIntPtrConstant(Part, dl);
      Elts.push_back(
          DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, PartOp));
    }
    NumElts = 2;
    SVT = MVT::i32;
  }

  // Explicitly mark the extra elements as Undef.
  Elts.append(NumElts, DAG.getUNDEF(SVT));

  EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
  SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
  SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                     DAG.getIntPtrConstant(0, dl));
}

23542

23543

/// Compute the horizontal sum of bytes in V for the elements of VT.

23544

///

23545

/// Requires V to be a byte vector and VT to be an integer vector type with

23546

/// wider elements than V's type. The width of the elements of VT determines

23547

/// how many bytes of V are summed horizontally to produce each element of the

23548

/// result.

23549

static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,

23550

const X86Subtarget &Subtarget,

23551

SelectionDAG &DAG) {

23552

SDLoc DL(V);

23553

MVT ByteVecVT = V.getSimpleValueType();

23554

MVT EltVT = VT.getVectorElementType();

23555

assert(ByteVecVT.getVectorElementType() == MVT::i8 &&(static_cast <bool> (ByteVecVT.getVectorElementType() ==
MVT::i8 && "Expected value to have byte element type."
) ? void (0) : __assert_fail ("ByteVecVT.getVectorElementType() == MVT::i8 && \"Expected value to have byte element type.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23556, __extension__ __PRETTY_FUNCTION__))

23556

"Expected value to have byte element type.")(static_cast <bool> (ByteVecVT.getVectorElementType() ==
MVT::i8 && "Expected value to have byte element type."
) ? void (0) : __assert_fail ("ByteVecVT.getVectorElementType() == MVT::i8 && \"Expected value to have byte element type.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23556, __extension__ __PRETTY_FUNCTION__));

23557

assert(EltVT != MVT::i8 &&(static_cast <bool> (EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"
) ? void (0) : __assert_fail ("EltVT != MVT::i8 && \"Horizontal byte sum only makes sense for wider elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23558, __extension__ __PRETTY_FUNCTION__))

23558

"Horizontal byte sum only makes sense for wider elements!")(static_cast <bool> (EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"
) ? void (0) : __assert_fail ("EltVT != MVT::i8 && \"Horizontal byte sum only makes sense for wider elements!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23558, __extension__ __PRETTY_FUNCTION__));

23559

unsigned VecSize = VT.getSizeInBits();

23560

assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!")(static_cast <bool> (ByteVecVT.getSizeInBits() == VecSize
&& "Cannot change vector size!") ? void (0) : __assert_fail
("ByteVecVT.getSizeInBits() == VecSize && \"Cannot change vector size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23560, __extension__ __PRETTY_FUNCTION__));

23561

23562

// PSADBW instruction horizontally add all bytes and leave the result in i64

23563

// chunks, thus directly computes the pop count for v2i64 and v4i64.

23564

if (EltVT == MVT::i64) {

23565

SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);

23566

MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);

23567

V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);

23568

return DAG.getBitcast(VT, V);

23569

}

23570

23571

if (EltVT == MVT::i32) {

23572

// We unpack the low half and high half into i32s interleaved with zeros so

23573

// that we can use PSADBW to horizontally sum them. The most useful part of

23574

// this is that it lines up the results of two PSADBW instructions to be

23575

// two v2i64 vectors which concatenated are the 4 population counts. We can

23576

// then use PACKUSWB to shrink and concatenate them into a v4i32 again.

23577

SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);

23578

SDValue V32 = DAG.getBitcast(VT, V);

23579

SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);

23580

SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

23581

23582

// Do the horizontal sums into two v2i64s.

23583

Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);

23584

MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);

23585

Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,

23586

DAG.getBitcast(ByteVecVT, Low), Zeros);

23587

High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,

23588

DAG.getBitcast(ByteVecVT, High), Zeros);

23589

23590

// Merge them together.

23591

MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);

23592

V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,

23593

DAG.getBitcast(ShortVecVT, Low),

23594

DAG.getBitcast(ShortVecVT, High));

23595

23596

return DAG.getBitcast(VT, V);

23597

}

23598

23599

// The only element type left is i16.

23600

assert(EltVT == MVT::i16 && "Unknown how to handle type")(static_cast <bool> (EltVT == MVT::i16 && "Unknown how to handle type"
) ? void (0) : __assert_fail ("EltVT == MVT::i16 && \"Unknown how to handle type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23600, __extension__ __PRETTY_FUNCTION__));

23601

23602

// To obtain pop count for each i16 element starting from the pop count for

23603

// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s

23604

// right by 8. It is important to shift as i16s as i8 vector shift isn't

23605

// directly supported.

23606

SDValue ShifterV = DAG.getConstant(8, DL, VT);

23607

SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);

23608

V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),

23609

DAG.getBitcast(ByteVecVT, V));

23610

return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);

23611

}

23612

23613

/// Lower a vector population count with a PSHUFB-based in-register lookup
/// table.
///
/// \p Op is the vector whose bits are counted. For i8 elements the per-byte
/// counts are returned directly; for wider elements the per-byte counts are
/// combined with LowerHorizontalByteSum.
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // Each nibble of an input byte is used as an index into a 16-entry table
  // of pre-computed pop counts that lives in a vector register. The input is
  // split into (1) the high nibbles shifted down and (2) the low nibbles with
  // the high ones masked off. PSHUFB looks both up in the table and the two
  // results are added, which yields the pop count of every input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.

  // Pop count of every 4-bit value, indexed by that value.
  const int NibblePopCnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                1, 2, 2, 3, 2, 3, 3, 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);

  // Repeat the 16-entry table across the whole byte vector.
  SmallVector<SDValue, 64> TableElts;
  for (int Idx = 0; Idx < NumByteElts; ++Idx) {
    SDValue Entry = DAG.getConstant(NibblePopCnt[Idx % 16], DL, MVT::i8);
    TableElts.push_back(Entry);
  }
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, TableElts);
  SDValue LowNibbleMask = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue ShiftAmt = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, ShiftAmt);

  // Low nibbles
  SDValue LowNibbles =
      DAG.getNode(ISD::AND, DL, ByteVecVT, In, LowNibbleMask);

  // The nibble vectors act as the shuffle masks that select entries of the
  // table. Adding the two lookups gives the final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue BytePopCnt =
      DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  return EltVT == MVT::i8
             ? BytePopCnt
             : LowerHorizontalByteSum(BytePopCnt, VT, Subtarget, DAG);
}

23669

23670

/// Lower a 128-bit vector population count with shifts, masks and adds.
///
/// \p Op is the vector whose bits are counted. For i8 elements the per-byte
/// counts are returned directly; for wider elements the per-byte counts are
/// combined with LowerHorizontalByteSum.
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  // Logical right shift of X by Amt, performed in SrlVT and cast back to VT.
  auto ShiftRight = [&](SDValue X, int Amt) {
    SDValue Wide = DAG.getBitcast(SrlVT, X);
    MVT WideVT = Wide.getSimpleValueType();
    SDValue AmtV = DAG.getConstant(Amt, DL, WideVT);
    SDValue Shifted = DAG.getNode(ISD::SRL, DL, WideVT, Wide, AmtV);
    return DAG.getBitcast(VT, Shifted);
  };
  // AND X with an element-wide constant made of Byte repeated in every byte.
  auto MaskBytes = [&](SDValue X, unsigned Byte) {
    MVT XVT = X.getSimpleValueType();
    SDValue MaskV =
        DAG.getConstant(APInt::getSplat(Len, APInt(8, Byte)), DL, XVT);
    return DAG.getNode(ISD::AND, DL, XVT, X, MaskV);
  };

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue OddBits = MaskBytes(ShiftRight(V, 1), 0x55);
  V = DAG.getNode(ISD::SUB, DL, VT, V, OddBits);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue LowPairs = MaskBytes(V, 0x33);
  SDValue HighPairs = MaskBytes(ShiftRight(V, 2), 0x33);
  V = DAG.getNode(ISD::ADD, DL, VT, LowPairs, HighPairs);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  SDValue HighNibbles = ShiftRight(V, 4);
  SDValue NibbleSum = DAG.getNode(ISD::ADD, DL, VT, V, HighNibbles);
  V = MaskBytes(NibbleSum, 0x0F);

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // counts.
  if (EltVT == MVT::i8)
    return V;

  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, VecSize / 8);
  return LowerHorizontalByteSum(DAG.getBitcast(ByteVecVT, V), VT, Subtarget,
                                DAG);
}

23734

23735

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in

23736

// updated cost models in X86TTIImpl::getIntrinsicInstrCost.

23737

static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,

23738

SelectionDAG &DAG) {

23739

MVT VT = Op.getSimpleValueType();

23740

assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&(static_cast <bool> ((VT.is512BitVector() || VT.is256BitVector
() || VT.is128BitVector()) && "Unknown CTPOP type to handle"
) ? void (0) : __assert_fail ("(VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && \"Unknown CTPOP type to handle\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23741, __extension__ __PRETTY_FUNCTION__))

23741

"Unknown CTPOP type to handle")(static_cast <bool> ((VT.is512BitVector() || VT.is256BitVector
() || VT.is128BitVector()) && "Unknown CTPOP type to handle"
) ? void (0) : __assert_fail ("(VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && \"Unknown CTPOP type to handle\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23741, __extension__ __PRETTY_FUNCTION__));

23742

SDLoc DL(Op.getNode());

23743

SDValue Op0 = Op.getOperand(0);

23744

23745

// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.

23746

if (Subtarget.hasVPOPCNTDQ()) {

23747

if (VT == MVT::v8i16) {

23748

Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);

23749

Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);

23750

return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);

23751

}

23752

if (VT == MVT::v16i8 || VT == MVT::v16i16) {

23753

Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);

23754

Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);

23755

return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);

23756

}

23757

}

23758

23759

if (!Subtarget.hasSSSE3()) {

23760

// We can't use the fast LUT approach, so fall back on vectorized bitmath.

23761

assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!")(static_cast <bool> (VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"
) ? void (0) : __assert_fail ("VT.is128BitVector() && \"Only 128-bit vectors supported in SSE!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23761, __extension__ __PRETTY_FUNCTION__));

23762

return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);

23763

}

23764

23765

// Decompose 256-bit ops into smaller 128-bit ops.

23766

if (VT.is256BitVector() && !Subtarget.hasInt256())

23767

return Lower256IntUnary(Op, DAG);

23768

23769

// Decompose 512-bit ops into smaller 256-bit ops.

23770

if (VT.is512BitVector() && !Subtarget.hasBWI())

23771

return Lower512IntUnary(Op, DAG);

23772

23773

return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);

23774

}

23775

23776

/// Entry point for custom lowering of a population count node.
///
/// Only vector types are expected here (asserted); the work is delegated to
/// LowerVectorCTPOP.
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  // Scalar population counts are not custom lowered by this function.
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");

  SDValue Lowered = LowerVectorCTPOP(Op, Subtarget, DAG);
  return Lowered;
}

23782

23783

/// Lower a bit-reverse node using X86ISD::VPPERM.
///
/// Scalars are moved into a 128-bit vector, reversed there and extracted
/// again. 256-bit vectors are split with Lower256IntUnary. 128-bit vectors
/// are handled with a single VPPERM whose mask both reverses the bits of each
/// byte and reverses the byte order inside each element.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, its still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Vec);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Reversed,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // Its best to shuffle using the second operand as this will implicitly allow
  // memory folding for multiple vectors.
  const int ReverseBitsOp = 2 << 5;
  SmallVector<SDValue, 16> MaskElts;
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    // Bytes 16..31 select from the second source; walk the bytes of this
    // element from the last one down to the first one.
    const int EltBase = 16 + (Elt * ScalarSizeInBytes);
    for (int Byte = ScalarSizeInBytes; Byte-- > 0;) {
      int PermuteByte = (EltBase + Byte) | ReverseBitsOp;
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Bytes = DAG.getBitcast(MVT::v16i8, In);
  SDValue Undef = DAG.getUNDEF(MVT::v16i8);
  SDValue Permuted =
      DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, Undef, Bytes, Mask);
  return DAG.getBitcast(VT, Permuted);
}

23827

23828

/// Custom lowering for a bit-reverse node.
///
/// With XOP (and a type that is not 512 bits wide) the work is delegated to
/// LowerBITREVERSE_XOP. Otherwise SSSE3 and a byte-element vector are
/// asserted, 256-bit vectors without Int256 are split, and the remaining
/// cases use two PSHUFB table lookups, one per nibble.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue HiNibbles =
      DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  // Bit-reversal of every 4-bit value, indexed by that value. The table for
  // the low nibbles holds these values moved into the high nibble; the table
  // for the high nibbles holds them as they are.
  const int ReversedNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                  0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};

  SmallVector<SDValue, 16> LoTableElts, HiTableElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    const int Rev = ReversedNibble[Idx % 16];
    LoTableElts.push_back(DAG.getConstant(Rev << 4, DL, MVT::i8));
    HiTableElts.push_back(DAG.getConstant(Rev, DL, MVT::i8));
  }

  SDValue LoTable = DAG.getBuildVector(VT, DL, LoTableElts);
  SDValue HiTable = DAG.getBuildVector(VT, DL, HiTableElts);
  SDValue FromLo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoTable, LoNibbles);
  SDValue FromHi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiTable, HiNibbles);
  return DAG.getNode(ISD::OR, DL, VT, FromLo, FromHi);
}

23878

23879

/// Turn an atomic load-op node into the matching X86ISD LOCK-style node.
///
/// ATOMIC_LOAD_ADD/SUB/OR/XOR/AND map to X86ISD::LADD/LSUB/LOR/LXOR/LAND.
/// The new node has value types (MVT::i32, MVT::Other), reuses the memory
/// operand of \p N and takes operands 0..2 of \p N.
///
/// When \p AllowIncDec is set, operand 2 is a constant, and inc/dec are not
/// slow on the subtarget (or the function is optimized for size), adding 1 /
/// subtracting -1 becomes X86ISD::LINC and subtracting 1 / adding -1 becomes
/// X86ISD::LDEC; those nodes take only operands 0 and 1 of \p N.
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool AllowIncDec = true) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD: NewOpc = X86ISD::LADD; break;
  case ISD::ATOMIC_LOAD_SUB: NewOpc = X86ISD::LSUB; break;
  case ISD::ATOMIC_LOAD_OR:  NewOpc = X86ISD::LOR;  break;
  case ISD::ATOMIC_LOAD_XOR: NewOpc = X86ISD::LXOR; break;
  case ISD::ATOMIC_LOAD_AND: NewOpc = X86ISD::LAND; break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  // Build the operand-less form (LINC / LDEC) of the locked operation.
  auto EmitIncDec = [&](unsigned IncDecOpc) {
    return DAG.getMemIntrinsicNode(IncDecOpc, SDLoc(N),
                                   DAG.getVTList(MVT::i32, MVT::Other),
                                   {N->getOperand(0), N->getOperand(1)},
                                   /*MemVT=*/N->getSimpleValueType(0), MMO);
  };

  auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2));
  // Convert to inc/dec if they aren't slow or we are optimizing for size.
  const bool TryIncDec =
      C && AllowIncDec &&
      (!Subtarget.slowIncDec() ||
       DAG.getMachineFunction().getFunction()->optForSize());
  if (TryIncDec) {
    const bool IsAdd = NewOpc == X86ISD::LADD;
    const bool IsSub = NewOpc == X86ISD::LSUB;
    if ((IsAdd && C->isOne()) || (IsSub && C->isAllOnesValue()))
      return EmitIncDec(X86ISD::LINC);
    if ((IsSub && C->isOne()) || (IsAdd && C->isAllOnesValue()))
      return EmitIncDec(X86ISD::LDEC);
  }

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

23929

23930

/// Lower atomic_load_ops into LOCK-prefixed operations.

23931

static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,

23932

const X86Subtarget &Subtarget) {

23933

SDValue Chain = N->getOperand(0);

23934

SDValue LHS = N->getOperand(1);

23935

SDValue RHS = N->getOperand(2);

23936

unsigned Opc = N->getOpcode();

23937

MVT VT = N->getSimpleValueType(0);

23938

SDLoc DL(N);

23939

23940

// We can lower atomic_load_add into LXADD. However, any other atomicrmw op

23941

// can only be lowered when the result is unused. They should have already

23942

// been transformed into a cmpxchg loop in AtomicExpand.

23943

if (N->hasAnyUseOfValue(0)) {

23944

// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to

23945

// select LXADD if LOCK_SUB can't be selected.

23946

if (Opc == ISD::ATOMIC_LOAD_SUB) {

23947

AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());

23948

RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);

23949

return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,

23950

RHS, AN->getMemOperand());

23951

}

23952

assert(Opc == ISD::ATOMIC_LOAD_ADD &&(static_cast <bool> (Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!"
) ? void (0) : __assert_fail ("Opc == ISD::ATOMIC_LOAD_ADD && \"Used AtomicRMW ops other than Add should have been expanded!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23953, __extension__ __PRETTY_FUNCTION__))

23953

"Used AtomicRMW ops other than Add should have been expanded!")(static_cast <bool> (Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!"
) ? void (0) : __assert_fail ("Opc == ISD::ATOMIC_LOAD_ADD && \"Used AtomicRMW ops other than Add should have been expanded!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23953, __extension__ __PRETTY_FUNCTION__));

23954

return N;

23955

}

23956

23957

SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);

23958

// RAUW the chain, but don't worry about the result, as it's unused.

23959

assert(!N->hasAnyUseOfValue(0))(static_cast <bool> (!N->hasAnyUseOfValue(0)) ? void
(0) : __assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 23959, __extension__ __PRETTY_FUNCTION__));

23960

DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));

23961

return SDValue();

23962

}

23963

23964

/// Lower an atomic store.
///
/// A sequentially-consistent store, or a store whose memory type is not
/// legal, is turned into an ISD::ATOMIC_SWAP and the swap's chain result is
/// returned. Every other atomic store is returned unchanged.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  auto *Store = cast<AtomicSDNode>(Op.getNode());
  const EVT MemVT = Store->getMemoryVT();

  const bool IsSeqCst =
      Store->getOrdering() == AtomicOrdering::SequentiallyConsistent;

  // Other atomic stores have a simple pattern.
  if (!IsSeqCst && DAG.getTargetLoweringInfo().isTypeLegal(MemVT))
    return Op;

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  const SDLoc DL(Op);
  SDValue Xchg = DAG.getAtomic(ISD::ATOMIC_SWAP, DL, MemVT,
                               Store->getOperand(0), Store->getOperand(1),
                               Store->getOperand(2), Store->getMemOperand());
  return Xchg.getValue(1);
}

23987

23988

/// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
///
/// Returns an empty SDValue when the result type is not legal yet, leaving
/// the node for the legalizer to expand. Otherwise returns a MERGE_VALUES of
/// the arithmetic result and the carry-out, using the value list of the
/// original node.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  const MVT ResVT = Node->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResVT))
    return SDValue();

  const SDVTList ArithVTs = DAG.getVTList(ResVT, MVT::i32);
  const SDLoc DL(Node);

  // Set the carry flag: add all-ones to the incoming carry value and take the
  // second result of that X86ISD::ADD as the carry input below.
  SDValue CarryIn = Op.getOperand(2);
  const EVT CarryVT = CarryIn.getValueType();
  const APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  SDValue AllOnesC = DAG.getConstant(AllOnes, DL, CarryVT);
  SDValue CarrySetter = DAG.getNode(
      X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn, AllOnesC);

  unsigned ArithOpc = X86ISD::SBB;
  if (Op.getOpcode() == ISD::ADDCARRY)
    ArithOpc = X86ISD::ADC;

  SDValue Result = DAG.getNode(ArithOpc, DL, ArithVTs, Op.getOperand(0),
                               Op.getOperand(1), CarrySetter.getValue(1));

  // Materialize the carry-out from the second result of the ADC/SBB, and
  // narrow it when the original node produced an i1 carry.
  SDValue CarryOut = getSETCC(X86::COND_B, Result.getValue(1), DL, DAG);
  if (Node->getValueType(1) == MVT::i1)
    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
                     CarryOut);
}

24016

24017

/// Lower a combined sin/cos node to a call to __sincos_stret /
/// __sincosf_stret.
///
/// Asserts that the target is 64-bit Darwin. For f64 the call result is
/// returned directly; for other argument types the two results are extracted
/// from elements 0 and 1 of the returned vector and merged.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  const SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);
  const EVT InputVT = Input.getValueType();
  Type *InputTy = InputVT.getTypeForEVT(*DAG.getContext());

  // The single call argument, passed without sign/zero extension.
  TargetLowering::ArgListEntry ArgEntry;
  ArgEntry.Node = Input;
  ArgEntry.Ty = InputTy;
  ArgEntry.IsSExt = false;
  ArgEntry.IsZExt = false;

  TargetLowering::ArgListTy CallArgs;
  CallArgs.push_back(ArgEntry);

  const bool IsDouble = InputVT == MVT::f64;

  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *SymbolName = "__sincosf_stret";
  if (IsDouble)
    SymbolName = "__sincos_stret";

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(SymbolName, TLI.getPointerTy(DAG.getDataLayout()));

  // f64: a two-member struct of the argument type; otherwise a 4-element
  // vector of the argument type.
  Type *ResultTy;
  if (IsDouble)
    ResultTy = StructType::get(InputTy, InputTy);
  else
    ResultTy = VectorType::get(InputTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, ResultTy, Callee, std::move(CallArgs));

  std::pair<SDValue, SDValue> Lowered = TLI.LowerCallTo(CLI);
  SDValue CallValue = Lowered.first;

  // Returned in xmm0 and xmm1.
  if (IsDouble)
    return CallValue;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue Sin = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InputVT, CallValue,
                            DAG.getIntPtrConstant(0, DL));
  SDValue Cos = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InputVT, CallValue,
                            DAG.getIntPtrConstant(1, DL));
  const SDVTList PairVTs = DAG.getVTList(InputVT, InputVT);
  return DAG.getNode(ISD::MERGE_VALUES, DL, PairVTs, Sin, Cos);
}

24069

24070

/// Widen a vector input to a vector of NVT. The

24071

/// input vector must have the same element type as NVT.

24072

static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,

24073

bool FillWithZeroes = false) {

24074

// Check if InOp already has the right width.

24075

MVT InVT = InOp.getSimpleValueType();

24076

if (InVT == NVT)

24077

return InOp;

24078

24079

if (InOp.isUndef())

24080

return DAG.getUNDEF(NVT);

24081

24082

assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&(static_cast <bool> (InVT.getVectorElementType() == NVT
.getVectorElementType() && "input and widen element type must match"
) ? void (0) : __assert_fail ("InVT.getVectorElementType() == NVT.getVectorElementType() && \"input and widen element type must match\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24083, __extension__ __PRETTY_FUNCTION__))

24083

"input and widen element type must match")(static_cast <bool> (InVT.getVectorElementType() == NVT
.getVectorElementType() && "input and widen element type must match"
) ? void (0) : __assert_fail ("InVT.getVectorElementType() == NVT.getVectorElementType() && \"input and widen element type must match\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24083, __extension__ __PRETTY_FUNCTION__));

24084

24085

unsigned InNumElts = InVT.getVectorNumElements();

24086

unsigned WidenNumElts = NVT.getVectorNumElements();

24087

assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&(static_cast <bool> (WidenNumElts > InNumElts &&
WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"
) ? void (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24088, __extension__ __PRETTY_FUNCTION__))

24088

"Unexpected request for vector widening")(static_cast <bool> (WidenNumElts > InNumElts &&
WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"
) ? void (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24088, __extension__ __PRETTY_FUNCTION__));

24089

24090

SDLoc dl(InOp);

24091

if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&

24092

InOp.getNumOperands() == 2) {

24093

SDValue N1 = InOp.getOperand(1);

24094

if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||

24095

N1.isUndef()) {

24096

InOp = InOp.getOperand(0);

24097

InVT = InOp.getSimpleValueType();

24098

InNumElts = InVT.getVectorNumElements();

24099

}

24100

}

24101

if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||

24102

ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {

24103

SmallVector<SDValue, 16> Ops;

24104

for (unsigned i = 0; i < InNumElts; ++i)

24105

Ops.push_back(InOp.getOperand(i));

24106

24107

EVT EltVT = InOp.getOperand(0).getValueType();

24108

24109

SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :

24110

DAG.getUNDEF(EltVT);

24111

for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)

24112

Ops.push_back(FillVal);

24113

return DAG.getBuildVector(NVT, dl, Ops);

24114

}

24115

SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :

24116

DAG.getUNDEF(NVT);

24117

return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,

24118

InOp, DAG.getIntPtrConstant(0, dl));

24119

}

24120

24121

/// Lower a masked scatter to an X86MaskedScatterSDNode.
///
/// Two adjustments may be applied to the data, index and mask before the
/// target node is built:
///  1. If the memory element type is narrower than the data element type
///     (asserted to be v2i32 memory with v2i64 data), the data is shuffled
///     into a v4i32 and index/mask are widened to four elements.
///  2. Without VLX, if neither data nor index is a 512-bit vector, a v8i32
///     index is sign-extended to v8i64; otherwise index, mask and data are
///     all widened to eight elements.
/// The mask is then truncated to a vector of i1. All uses of \p Op are
/// replaced by result 1 of the new node, which is also returned.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  auto *Scatter = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Data = Scatter->getValue();
  MVT DataVT = Data.getSimpleValueType();
  assert(DataVT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  const SDLoc DL(Op);

  SDValue Idx = Scatter->getIndex();
  SDValue Msk = Scatter->getMask();
  const MVT StoredVT = Scatter->getMemoryVT().getSimpleVT();
  const MVT OrigIdxVT = Idx.getSimpleValueType();
  const MVT OrigMaskVT = Msk.getSimpleValueType();

  if (StoredVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle.
    assert((StoredVT == MVT::v2i32 && DataVT == MVT::v2i64) &&
           "Unexpected memory type");
    int LowHalves[] = {0, 2, -1, -1};
    SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, Data);
    SDValue UndefV4I32 = DAG.getUNDEF(MVT::v4i32);
    Data = DAG.getVectorShuffle(MVT::v4i32, DL, AsV4I32, UndefV4I32, LowHalves);

    // Now we have 4 elements instead of 2.
    // Expand the index.
    const MVT Idx4VT = MVT::getVectorVT(OrigIdxVT.getScalarType(), 4);
    Idx = ExtendToType(Idx, Idx4VT, DAG);

    // Expand the mask with zeroes
    // Mask may be <2 x i64> or <2 x i1> at this moment
    assert((OrigMaskVT == MVT::v2i1 || OrigMaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    const MVT Mask4VT = MVT::getVectorVT(OrigMaskVT.getScalarType(), 4);
    Msk = ExtendToType(Msk, Mask4VT, DAG, true);
    DataVT = MVT::v4i32;
  }

  unsigned EltCount = DataVT.getVectorNumElements();

  // AVX512F supports only 512-bit vectors: either the data or the index has
  // to be 512 bits wide.
  const bool MustWiden = !Subtarget.hasVLX() && !DataVT.is512BitVector() &&
                         !Idx.getSimpleValueType().is512BitVector();
  if (MustWiden) {
    if (OrigIdxVT == MVT::v8i32) {
      // Both index and data are 256-bit but the vector already contains
      // 8 elements: just sign-extend the index.
      Idx = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i64, Idx);
    } else {
      // The minimal number of elts in scatter is 8
      EltCount = 8;

      // Index
      const MVT WideIdxVT =
          MVT::getVectorVT(OrigIdxVT.getScalarType(), EltCount);
      // Use original index here, do not modify the index twice
      Idx = ExtendToType(Scatter->getIndex(), WideIdxVT, DAG);
      if (OrigIdxVT.getScalarType() == MVT::i32)
        Idx = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i64, Idx);

      // Mask
      // At this point we have promoted mask operand
      assert(OrigMaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      const MVT WideMaskVT =
          MVT::getVectorVT(OrigMaskVT.getScalarType(), EltCount);
      // Use the original mask here, do not modify the mask twice
      Msk = ExtendToType(Scatter->getMask(), WideMaskVT, DAG, true);

      // The value that should be stored
      const MVT WideDataVT = MVT::getVectorVT(DataVT.getScalarType(), EltCount);
      Data = ExtendToType(Data, WideDataVT, DAG);
    }
  }

  // If the mask is "wide" at this point - truncate it to i1 vector
  const MVT I1MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
  Msk = DAG.getNode(ISD::TRUNCATE, DL, I1MaskVT, Msk);

  // The mask is killed by scatter, add it to the values
  const SDVTList ResultVTs = DAG.getVTList(I1MaskVT, MVT::Other);
  SDValue ScatterOps[] = {Scatter->getChain(), Data, Msk,
                          Scatter->getBasePtr(), Idx};
  SDValue Lowered = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      ResultVTs, ScatterOps, DL, Scatter->getMemoryVT(),
      Scatter->getMemOperand());

  // Result 1 of the new node stands in for the original scatter.
  SDValue NewChain(Lowered.getNode(), 1);
  DAG.ReplaceAllUsesWith(Op, NewChain);
  return SDValue(Lowered.getNode(), 1);
}

24207

24208

/// Lower a masked load.
///
/// Non-expanding loads of at most four elements are returned unchanged.
/// Everything else is asserted to be an AVX-512 target without VLX and a
/// non-512-bit type: the pass-through value and the mask are widened to 512
/// bits, a wide masked load is emitted, and the original-width result is
/// extracted from index 0. Returns the extracted value merged with the new
/// load's chain.
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  auto *Load = cast<MaskedLoadSDNode>(Op.getNode());
  const MVT ResVT = Op.getSimpleValueType();
  const MVT EltVT = ResVT.getScalarType();
  SDValue LoadMask = Load->getMask();
  const SDLoc DL(Op);

  assert((!Load->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!Load->isExpandingLoad() || EltVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. These types for exp-loads are handled here.
  if (!Load->isExpandingLoad() && ResVT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
         !ResVT.is512BitVector() && "Cannot lower masked load op.");

  assert((EltVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() && (EltVT == MVT::i8 || EltVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit
  const unsigned WideNumElts = 512 / ResVT.getScalarSizeInBits();
  const MVT WideResVT = MVT::getVectorVT(EltVT, WideNumElts);
  SDValue PassThru = ExtendToType(Load->getSrc0(), WideResVT, DAG);

  // Mask element has to be i1.
  const MVT MaskEltVT = LoadMask.getSimpleValueType().getScalarType();
  assert((MaskEltVT == MVT::i1 || ResVT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  // Widen the mask with zeroes, then narrow its elements to i1 if they are
  // not i1 already.
  const MVT WideMaskVT = MVT::getVectorVT(MaskEltVT, WideNumElts);
  LoadMask = ExtendToType(LoadMask, WideMaskVT, DAG, true);
  if (MaskEltVT != MVT::i1) {
    const MVT WideI1VT = MVT::getVectorVT(MVT::i1, WideNumElts);
    LoadMask = DAG.getNode(ISD::TRUNCATE, DL, WideI1VT, LoadMask);
  }

  SDValue WideLoad = DAG.getMaskedLoad(
      WideResVT, DL, Load->getChain(), Load->getBasePtr(), LoadMask, PassThru,
      Load->getMemoryVT(), Load->getMemOperand(), Load->getExtensionType(),
      Load->isExpandingLoad());

  // Take the original-width vector back out of the wide result.
  SDValue Extract =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, WideLoad.getValue(0),
                  DAG.getIntPtrConstant(0, DL));
  SDValue Results[] = {Extract, WideLoad.getValue(1)};
  return DAG.getMergeValues(Results, DL);
}

24266

24267

/// Lower a masked store.
///
/// Non-compressing stores of at most four elements are returned unchanged.
/// Everything else is asserted to be an AVX-512 target without VLX and a
/// non-512-bit type: the stored value and the mask are widened to 512 bits
/// (mask padded with zeroes, then truncated to i1 elements if needed) and a
/// wide masked store is emitted with the original memory type, memory
/// operand, truncating and compressing flags.
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  // These assertions guard compressing stores; their messages used to be
  // copied verbatim from the expanding-load path.
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  // Data is padded with undef; the mask is padded with zeroes.
  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

24315

24316

/// Custom-lower a masked gather (ISD::MGATHER) into an X86MaskedGatherSDNode.
///
/// Returns an empty SDValue when the index type is v2i32. On AVX-512 targets
/// without VLX, when neither the data nor the index is a 512-bit vector, the
/// operands are first brought up to a shape with 8 elements and the original
/// result type is extracted from the wider gather afterwards. In every other
/// case the gather is emitted with the original result and mask types.
///
/// The returned value merges the gathered data with the output chain (result
/// number 2 of the new node).
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = Gather->getIndex();
  SDValue Mask = Gather->getMask();
  SDValue PassThru = Gather->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  const unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // A v2i32 index means we are being called by type legalization.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  // Creates the target gather node from the *current* Index/Mask/PassThru
  // values, producing (ResultVT, ResultMaskVT, chain).
  auto BuildGather = [&](MVT ResultVT, MVT ResultMaskVT) -> SDValue {
    SDValue GatherOps[] = {Gather->getChain(), PassThru, Mask,
                           Gather->getBasePtr(), Index};
    return DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(ResultVT, ResultMaskVT, MVT::Other), GatherOps, DL,
        Gather->getMemoryVT(), Gather->getMemOperand());
  };

  // Emits the gather with the original result/mask types and pairs the data
  // result with the output chain.
  auto GatherAtOriginalWidth = [&]() -> SDValue {
    SDValue NewGather = BuildGather(VT, MaskVT);
    return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, DL);
  };

  // AVX512F without VLX supports only 512-bit vectors: either the data or the
  // index has to be 512 bits wide.
  const bool NeedsWidening = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
                             !VT.is512BitVector() &&
                             !IndexVT.is512BitVector();
  if (!NeedsWidening)
    return GatherAtOriginalWidth();

  // Both index and data are narrower than 512 bits but the vector already
  // holds 8 elements: sign-extending the index to v8i64 is enough.
  if (NumElts == 8) {
    Index = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i64, Index);
    return GatherAtOriginalWidth();
  }

  // Otherwise widen every operand to the minimal gather element count.
  const unsigned WideNumElts = 8;

  // Index: widen, then sign-extend 32-bit indices to v8i64.
  MVT WideIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), WideNumElts);
  Index = ExtendToType(Index, WideIndexVT, DAG);
  if (IndexVT.getScalarType() == MVT::i32)
    Index = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i64, Index);

  // Mask: widen in its current element type, then truncate to i1 elements.
  MVT WideMaskBitVT = MVT::getVectorVT(MVT::i1, WideNumElts);
  MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), WideNumElts);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  Mask = DAG.getNode(ISD::TRUNCATE, DL, WideMaskBitVT, Mask);

  // Pass-through value.
  MVT WideVT = MVT::getVectorVT(VT.getScalarType(), WideNumElts);
  PassThru = ExtendToType(PassThru, WideVT, DAG);

  SDValue WideGather = BuildGather(WideVT, WideMaskBitVT);

  // Hand back only the originally requested low part plus the chain.
  SDValue LowPart =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideGather.getValue(0),
                  DAG.getIntPtrConstant(0, DL));
  SDValue MergedResults[] = {LowPart, WideGather.getValue(2)};
  return DAG.getMergeValues(MergedResults, DL);
}

24388

24389

/// Lower a GC_TRANSITION_START node to a literal X86::NOOP machine node.
///
/// The NOOP receives operand 0 of \p Op and, when \p Op has a glued node,
/// its last operand as well; it produces (MVT::Other, MVT::Glue).
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  // Forward the trailing operand only when a glued node is present.
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Use the one location object for the new node instead of declaring it and
  // then constructing a second, identical temporary.
  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

  return NOOP;
}

24409

24410

/// Lower a GC_TRANSITION_END node to a literal X86::NOOP machine node.
///
/// The NOOP receives operand 0 of \p Op and, when \p Op has a glued node,
/// its last operand as well; it produces (MVT::Other, MVT::Glue).
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  // Forward the trailing operand only when a glued node is present.
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Use the one location object for the new node instead of declaring it and
  // then constructing a second, identical temporary.
  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

  return NOOP;
}

24430

24431

/// Provide custom lowering hooks for some operations.

24432

SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {

24433

switch (Op.getOpcode()) {

24434

default: llvm_unreachable("Should not custom lower this!")::llvm::llvm_unreachable_internal("Should not custom lower this!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24434);

24435

case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);

24436

case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:

24437

return LowerCMP_SWAP(Op, Subtarget, DAG);

24438

case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);

24439

case ISD::ATOMIC_LOAD_ADD:

24440

case ISD::ATOMIC_LOAD_SUB:

24441

case ISD::ATOMIC_LOAD_OR:

24442

case ISD::ATOMIC_LOAD_XOR:

24443

case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);

24444

case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);

24445

case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);

24446

case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);

24447

case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);

24448

case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);

24449

case ISD::VSELECT: return LowerVSELECT(Op, DAG);

24450

case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);

24451

case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);

24452

case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);

24453

case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);

24454

case ISD::ConstantPool: return LowerConstantPool(Op, DAG);

24455

case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);

24456

case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);

24457

case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);

24458

case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);

24459

case ISD::SHL_PARTS:

24460

case ISD::SRA_PARTS:

24461

case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);

24462

case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);

24463

case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);

24464

case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);

24465

case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);

24466

case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);

24467

case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);

24468

case ISD::ZERO_EXTEND_VECTOR_INREG:

24469

case ISD::SIGN_EXTEND_VECTOR_INREG:

24470

return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);

24471

case ISD::FP_TO_SINT:

24472

case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);

24473

case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);

24474

case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);

24475

case ISD::FABS:

24476

case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);

24477

case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);

24478

case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);

24479

case ISD::SETCC: return LowerSETCC(Op, DAG);

24480

case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);

24481

case ISD::SELECT: return LowerSELECT(Op, DAG);

24482

case ISD::BRCOND: return LowerBRCOND(Op, DAG);

24483

case ISD::JumpTable: return LowerJumpTable(Op, DAG);

24484

case ISD::VASTART: return LowerVASTART(Op, DAG);

24485

case ISD::VAARG: return LowerVAARG(Op, DAG);

24486

case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);

24487

case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);

24488

case ISD::INTRINSIC_VOID:

24489

case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);

24490

case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);

24491

case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);

24492

case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);

24493

case ISD::FRAME_TO_ARGS_OFFSET:

24494

return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);

24495

case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);

24496

case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);

24497

case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);

24498

case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);

24499

case ISD::EH_SJLJ_SETUP_DISPATCH:

24500

return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);

24501

case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);

24502

case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);

24503

case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);

24504

case ISD::CTLZ:

24505

case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);

24506

case ISD::CTTZ:

24507

case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);

24508

case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);

24509

case ISD::MULHS:

24510

case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);

24511

case ISD::UMUL_LOHI:

24512

case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);

24513

case ISD::ROTL:

24514

case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);

24515

case ISD::SRA:

24516

case ISD::SRL:

24517

case ISD::SHL: return LowerShift(Op, Subtarget, DAG);

24518

case ISD::SADDO:

24519

case ISD::UADDO:

24520

case ISD::SSUBO:

24521

case ISD::USUBO:

24522

case ISD::SMULO:

24523

case ISD::UMULO: return LowerXALUO(Op, DAG);

24524

case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);

24525

case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);

24526

case ISD::ADDCARRY:

24527

case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);

24528

case ISD::ADD:

24529

case ISD::SUB: return LowerADD_SUB(Op, DAG);

24530

case ISD::SMAX:

24531

case ISD::SMIN:

24532

case ISD::UMAX:

24533

case ISD::UMIN: return LowerMINMAX(Op, DAG);

24534

case ISD::ABS: return LowerABS(Op, DAG);

24535

case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);

24536

case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);

24537

case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);

24538

case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);

24539

case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);

24540

case ISD::GC_TRANSITION_START:

24541

return LowerGC_TRANSITION_START(Op, DAG);

24542

case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);

24543

case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);

24544

}

24545

}

24546

24547

/// Places new result values for the node in Results (their number

24548

/// and types must exactly match those of the original return values of

24549

/// the node), or leaves Results empty, which indicates that the node is not

24550

/// to be custom lowered after all.

24551

void X86TargetLowering::LowerOperationWrapper(SDNode *N,

24552

SmallVectorImpl<SDValue> &Results,

24553

SelectionDAG &DAG) const {

24554

SDValue Res = LowerOperation(SDValue(N, 0), DAG);

24555

24556

if (!Res.getNode())

24557

return;

24558

24559

assert((N->getNumValues() <= Res->getNumValues()) &&(static_cast <bool> ((N->getNumValues() <= Res->
getNumValues()) && "Lowering returned the wrong number of results!"
) ? void (0) : __assert_fail ("(N->getNumValues() <= Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24560, __extension__ __PRETTY_FUNCTION__))

24560

"Lowering returned the wrong number of results!")(static_cast <bool> ((N->getNumValues() <= Res->
getNumValues()) && "Lowering returned the wrong number of results!"
) ? void (0) : __assert_fail ("(N->getNumValues() <= Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24560, __extension__ __PRETTY_FUNCTION__));

24561

24562

// Places new result values base on N result number.

24563

// In some cases (LowerSINT_TO_FP for example) Res has more result values

24564

// than original node, chain should be dropped(last value).

24565

for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)

24566

Results.push_back(Res.getValue(I));

24567

}

24568

24569

/// Replace a node with an illegal result type with a new node built out of

24570

/// custom code.

24571

void X86TargetLowering::ReplaceNodeResults(SDNode *N,

24572

SmallVectorImpl<SDValue>&Results,

24573

SelectionDAG &DAG) const {

24574

SDLoc dl(N);

24575

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

24576

switch (N->getOpcode()) {

24577

default:

24578

llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24578);

24579

case X86ISD::AVG: {

24580

// Legalize types for X86ISD::AVG by expanding vectors.

24581

24582

24583

auto InVT = N->getValueType(0);

24584

auto InVTSize = InVT.getSizeInBits();

24585

const unsigned RegSize =

24586

(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;

24587

assert((Subtarget.hasBWI() || RegSize < 512) &&(static_cast <bool> ((Subtarget.hasBWI() || RegSize <
512) && "512-bit vector requires AVX512BW") ? void (
0) : __assert_fail ("(Subtarget.hasBWI() || RegSize < 512) && \"512-bit vector requires AVX512BW\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24588, __extension__ __PRETTY_FUNCTION__))

24588

"512-bit vector requires AVX512BW")(static_cast <bool> ((Subtarget.hasBWI() || RegSize <
512) && "512-bit vector requires AVX512BW") ? void (
0) : __assert_fail ("(Subtarget.hasBWI() || RegSize < 512) && \"512-bit vector requires AVX512BW\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24588, __extension__ __PRETTY_FUNCTION__));

24589

assert((Subtarget.hasAVX2() || RegSize < 256) &&(static_cast <bool> ((Subtarget.hasAVX2() || RegSize <
256) && "256-bit vector requires AVX2") ? void (0) :
__assert_fail ("(Subtarget.hasAVX2() || RegSize < 256) && \"256-bit vector requires AVX2\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24590, __extension__ __PRETTY_FUNCTION__))

24590

"256-bit vector requires AVX2")(static_cast <bool> ((Subtarget.hasAVX2() || RegSize <
256) && "256-bit vector requires AVX2") ? void (0) :
__assert_fail ("(Subtarget.hasAVX2() || RegSize < 256) && \"256-bit vector requires AVX2\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24590, __extension__ __PRETTY_FUNCTION__));

24591

24592

auto ElemVT = InVT.getVectorElementType();

24593

auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,

24594

RegSize / ElemVT.getSizeInBits());

24595

assert(RegSize % InVT.getSizeInBits() == 0)(static_cast <bool> (RegSize % InVT.getSizeInBits() == 0
) ? void (0) : __assert_fail ("RegSize % InVT.getSizeInBits() == 0"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24595, __extension__ __PRETTY_FUNCTION__));

24596

unsigned NumConcat = RegSize / InVT.getSizeInBits();

24597

24598

SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));

24599

Ops[0] = N->getOperand(0);

24600

SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

24601

Ops[0] = N->getOperand(1);

24602

SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

24603

24604

SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);

24605

if (!ExperimentalVectorWideningLegalization)

24606

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,

24607

DAG.getIntPtrConstant(0, dl));

24608

Results.push_back(Res);

24609

return;

24610

}

24611

// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.

24612

case X86ISD::FMINC:

24613

case X86ISD::FMIN:

24614

case X86ISD::FMAXC:

24615

case X86ISD::FMAX: {

24616

EVT VT = N->getValueType(0);

24617

assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.")(static_cast <bool> (VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."
) ? void (0) : __assert_fail ("VT == MVT::v2f32 && \"Unexpected type (!= v2f32) on FMIN/FMAX.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24617, __extension__ __PRETTY_FUNCTION__));

24618

SDValue UNDEF = DAG.getUNDEF(VT);

24619

SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,

24620

N->getOperand(0), UNDEF);

24621

SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,

24622

N->getOperand(1), UNDEF);

24623

Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));

24624

return;

24625

}

24626

case ISD::SDIV:

24627

case ISD::UDIV:

24628

case ISD::SREM:

24629

case ISD::UREM:

24630

case ISD::SDIVREM:

24631

case ISD::UDIVREM: {

24632

SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);

24633

Results.push_back(V);

24634

return;

24635

}

24636

case ISD::FP_TO_SINT:

24637

case ISD::FP_TO_UINT: {

24638

bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

24639

24640

if (N->getValueType(0) == MVT::v2i32) {

24641

assert((IsSigned || Subtarget.hasAVX512()) &&(static_cast <bool> ((IsSigned || Subtarget.hasAVX512()
) && "Can only handle signed conversion without AVX512"
) ? void (0) : __assert_fail ("(IsSigned || Subtarget.hasAVX512()) && \"Can only handle signed conversion without AVX512\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24642, __extension__ __PRETTY_FUNCTION__))

24642

"Can only handle signed conversion without AVX512")(static_cast <bool> ((IsSigned || Subtarget.hasAVX512()
) && "Can only handle signed conversion without AVX512"
) ? void (0) : __assert_fail ("(IsSigned || Subtarget.hasAVX512()) && \"Can only handle signed conversion without AVX512\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24642, __extension__ __PRETTY_FUNCTION__));

24643

24644

SDValue Src = N->getOperand(0);

24645

if (Src.getValueType() == MVT::v2f64) {

24646

SDValue Idx = DAG.getIntPtrConstant(0, dl);

24647

SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI

24648

: X86ISD::CVTTP2UI,

24649

dl, MVT::v4i32, Src);

24650

if (!ExperimentalVectorWideningLegalization)

24651

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);

24652

Results.push_back(Res);

24653

return;

24654

}

24655

if (Src.getValueType() == MVT::v2f32) {

24656

SDValue Idx = DAG.getIntPtrConstant(0, dl);

24657

SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,

24658

DAG.getUNDEF(MVT::v2f32));

24659

Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT

24660

: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);

24661

if (!ExperimentalVectorWideningLegalization)

24662

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);

24663

Results.push_back(Res);

24664

return;

24665

}

24666

24667

// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,

24668

// so early out here.

24669

return;

24670

}

24671

24672

std::pair<SDValue,SDValue> Vals =

24673

FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);

24674

SDValue FIST = Vals.first, StackSlot = Vals.second;

24675

if (FIST.getNode()) {

24676

EVT VT = N->getValueType(0);

24677

// Return a load from the stack slot.

24678

if (StackSlot.getNode())

24679

Results.push_back(

24680

DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));

24681

else

24682

Results.push_back(FIST);

24683

}

24684

return;

24685

}

24686

case ISD::SINT_TO_FP: {

24687

24688

SDValue Src = N->getOperand(0);

24689

if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)

24690

return;

24691

Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));

24692

return;

24693

}

24694

case ISD::UINT_TO_FP: {

24695

24696

EVT VT = N->getValueType(0);

24697

if (VT != MVT::v2f32)

24698

return;

24699

SDValue Src = N->getOperand(0);

24700

EVT SrcVT = Src.getValueType();

24701

if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {

24702

Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));

24703

return;

24704

}

24705

if (SrcVT != MVT::v2i32)

24706

return;

24707

SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);

24708

SDValue VBias =

24709

DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);

24710

SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,

24711

DAG.getBitcast(MVT::v2i64, VBias));

24712

Or = DAG.getBitcast(MVT::v2f64, Or);

24713

// TODO: Are there any fast-math-flags to propagate here?

24714

SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);

24715

Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));

24716

return;

24717

}

24718

case ISD::FP_ROUND: {

24719

if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))

24720

return;

24721

SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));

24722

Results.push_back(V);

24723

return;

24724

}

24725

case ISD::FP_EXTEND: {

24726

// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.

24727

// No other ValueType for FP_EXTEND should reach this point.

24728

assert(N->getValueType(0) == MVT::v2f32 &&(static_cast <bool> (N->getValueType(0) == MVT::v2f32
&& "Do not know how to legalize this Node") ? void (
0) : __assert_fail ("N->getValueType(0) == MVT::v2f32 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24729, __extension__ __PRETTY_FUNCTION__))

24729

"Do not know how to legalize this Node")(static_cast <bool> (N->getValueType(0) == MVT::v2f32
&& "Do not know how to legalize this Node") ? void (
0) : __assert_fail ("N->getValueType(0) == MVT::v2f32 && \"Do not know how to legalize this Node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24729, __extension__ __PRETTY_FUNCTION__));

24730

return;

24731

}

24732

case ISD::INTRINSIC_W_CHAIN: {

24733

unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

24734

switch (IntNo) {

24735

default : llvm_unreachable("Do not know how to custom type "::llvm::llvm_unreachable_internal("Do not know how to custom type "
"legalize this intrinsic operation!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24736)

24736

"legalize this intrinsic operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type "
"legalize this intrinsic operation!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24736);

24737

case Intrinsic::x86_rdtsc:

24738

return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,

24739

Results);

24740

case Intrinsic::x86_rdtscp:

24741

return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,

24742

Results);

24743

case Intrinsic::x86_rdpmc:

24744

return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

24745

24746

case Intrinsic::x86_xgetbv:

24747

return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);

24748

}

24749

}

24750

case ISD::INTRINSIC_WO_CHAIN: {

24751

if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))

24752

Results.push_back(V);

24753

return;

24754

}

24755

case ISD::READCYCLECOUNTER: {

24756

return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,

24757

Results);

24758

}

24759

case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {

24760

EVT T = N->getValueType(0);

24761

assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair")(static_cast <bool> ((T == MVT::i64 || T == MVT::i128) &&
"can only expand cmpxchg pair") ? void (0) : __assert_fail (
"(T == MVT::i64 || T == MVT::i128) && \"can only expand cmpxchg pair\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24761, __extension__ __PRETTY_FUNCTION__));

24762

bool Regs64bit = T == MVT::i128;

24763

MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;

24764

SDValue cpInL, cpInH;

24765

cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),

24766

DAG.getConstant(0, dl, HalfT));

24767

cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),

24768

DAG.getConstant(1, dl, HalfT));

24769

cpInL = DAG.getCopyToReg(N->getOperand(0), dl,

24770

Regs64bit ? X86::RAX : X86::EAX,

24771

cpInL, SDValue());

24772

cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,

24773

Regs64bit ? X86::RDX : X86::EDX,

24774

cpInH, cpInL.getValue(1));

24775

SDValue swapInL, swapInH;

24776

swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),

24777

DAG.getConstant(0, dl, HalfT));

24778

swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),

24779

DAG.getConstant(1, dl, HalfT));

24780

swapInH =

24781

DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,

24782

swapInH, cpInH.getValue(1));

24783

// If the current function needs the base pointer, RBX,

24784

// we shouldn't use cmpxchg directly.

24785

// Indeed the lowering of that instruction will clobber

24786

// that register and since RBX will be a reserved register

24787

// the register allocator will not make sure its value will

24788

// be properly saved and restored around this live-range.

24789

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

24790

SDValue Result;

24791

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);

24792

unsigned BasePtr = TRI->getBaseRegister();

24793

MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();

24794

if (TRI->hasBasePointer(DAG.getMachineFunction()) &&

24795

(BasePtr == X86::RBX || BasePtr == X86::EBX)) {

24796

// ISel prefers the LCMPXCHG64 variant.

24797

// If that assert breaks, that means it is not the case anymore,

24798

// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,

24799

// not just EBX. This is a matter of accepting i64 input for that

24800

// pseudo, and restoring into the register of the right wide

24801

// in expand pseudo. Everything else should just work.

24802

assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&(static_cast <bool> (((Regs64bit == (BasePtr == X86::RBX
)) || BasePtr == X86::EBX) && "Saving only half of the RBX"
) ? void (0) : __assert_fail ("((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && \"Saving only half of the RBX\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24803, __extension__ __PRETTY_FUNCTION__))

24803

"Saving only half of the RBX")(static_cast <bool> (((Regs64bit == (BasePtr == X86::RBX
)) || BasePtr == X86::EBX) && "Saving only half of the RBX"
) ? void (0) : __assert_fail ("((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && \"Saving only half of the RBX\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24803, __extension__ __PRETTY_FUNCTION__));

24804

unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG

24805

: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;

24806

SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,

24807

Regs64bit ? X86::RBX : X86::EBX,

24808

HalfT, swapInH.getValue(1));

24809

SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,

24810

RBXSave,

24811

/*Glue*/ RBXSave.getValue(2)};

24812

Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);

24813

} else {

24814

unsigned Opcode =

24815

Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;

24816

swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,

24817

Regs64bit ? X86::RBX : X86::EBX, swapInL,

24818

swapInH.getValue(1));

24819

SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),

24820

swapInL.getValue(1)};

24821

Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);

24822

}

24823

SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,

24824

Regs64bit ? X86::RAX : X86::EAX,

24825

HalfT, Result.getValue(1));

24826

SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,

24827

Regs64bit ? X86::RDX : X86::EDX,

24828

HalfT, cpOutL.getValue(2));

24829

SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

24830

24831

SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,

24832

MVT::i32, cpOutH.getValue(2));

24833

SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);

24834

Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

24835

24836

Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));

24837

Results.push_back(Success);

24838

Results.push_back(EFLAGS.getValue(1));

24839

return;

24840

}

24841

case ISD::ATOMIC_SWAP:

24842

case ISD::ATOMIC_LOAD_ADD:

24843

case ISD::ATOMIC_LOAD_SUB:

24844

case ISD::ATOMIC_LOAD_AND:

24845

case ISD::ATOMIC_LOAD_OR:

24846

case ISD::ATOMIC_LOAD_XOR:

24847

case ISD::ATOMIC_LOAD_NAND:

24848

case ISD::ATOMIC_LOAD_MIN:

24849

case ISD::ATOMIC_LOAD_MAX:

24850

case ISD::ATOMIC_LOAD_UMIN:

24851

case ISD::ATOMIC_LOAD_UMAX:

24852

case ISD::ATOMIC_LOAD: {

24853

// Delegate to generic TypeLegalization. Situations we can really handle

24854

// should have already been dealt with by AtomicExpandPass.cpp.

24855

break;

24856

}

24857

case ISD::BITCAST: {

24858

24859

EVT DstVT = N->getValueType(0);

24860

EVT SrcVT = N->getOperand(0)->getValueType(0);

24861

24862

if (SrcVT != MVT::f64 ||

24863

(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))

24864

return;

24865

24866

unsigned NumElts = DstVT.getVectorNumElements();

24867

EVT SVT = DstVT.getVectorElementType();

24868

EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);

24869

SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,

24870

MVT::v2f64, N->getOperand(0));

24871

SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

24872

24873

if (ExperimentalVectorWideningLegalization) {

24874

// If we are legalizing vectors by widening, we already have the desired

24875

// legal vector type, just return it.

24876

Results.push_back(ToVecInt);

24877

return;

24878

}

24879

24880

SmallVector<SDValue, 8> Elts;

24881

for (unsigned i = 0, e = NumElts; i != e; ++i)

24882

Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,

24883

ToVecInt, DAG.getIntPtrConstant(i, dl)));

24884

24885

Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));

24886

return;

24887

}

24888

case ISD::MGATHER: {

24889

EVT VT = N->getValueType(0);

24890

if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {

24891

auto *Gather = cast<MaskedGatherSDNode>(N);

24892

SDValue Index = Gather->getIndex();

24893

if (Index.getValueType() != MVT::v2i64)

24894

return;

24895

SDValue Mask = Gather->getMask();

24896

assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type")(static_cast <bool> (Mask.getValueType() == MVT::v2i1 &&
"Unexpected mask type") ? void (0) : __assert_fail ("Mask.getValueType() == MVT::v2i1 && \"Unexpected mask type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 24896, __extension__ __PRETTY_FUNCTION__));

24897

SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,

24898

Gather->getValue(),

24899

DAG.getUNDEF(MVT::v2f32));

24900

if (!Subtarget.hasVLX()) {

24901

// We need to widen the mask, but the instruction will only use 2

24902

// of its elements. So we can use undef.

24903

Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,

24904

DAG.getUNDEF(MVT::v2i1));

24905

Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);

24906

}

24907

SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),

24908

Index };

24909

SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(

24910

DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,

24911

Gather->getMemoryVT(), Gather->getMemOperand());

24912

Results.push_back(Res);

24913

Results.push_back(Res.getValue(2));

24914

return;

24915

}

24916

if (VT == MVT::v2i32) {

24917

auto *Gather = cast<MaskedGatherSDNode>(N);

24918

SDValue Index = Gather->getIndex();

24919

SDValue Mask = Gather->getMask();

24920

24921

SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,

24922

Gather->getValue(),

24923

DAG.getUNDEF(MVT::v2i32));

24924

// If the index is v2i64 we can use it directly.

24925

if (Index.getValueType() == MVT::v2i64 &&

24926

(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {

24927

if (!Subtarget.hasVLX()) {

24928

// We need to widen the mask, but the instruction will only use 2

24929

// of its elements. So we can use undef.

24930

Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,

24931

DAG.getUNDEF(MVT::v2i1));

24932

Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);

24933

}

24934

SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),

24935

Index };

24936

SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(

24937

DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,

24938

Gather->getMemoryVT(), Gather->getMemOperand());

24939

SDValue Chain = Res.getValue(2);

24940

if (!ExperimentalVectorWideningLegalization)

24941

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,

24942

DAG.getIntPtrConstant(0, dl));

24943

Results.push_back(Res);

24944

Results.push_back(Chain);

24945

return;

24946

}

24947

EVT IndexVT = Index.getValueType();

24948

EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),

24949

IndexVT.getScalarType(), 4);

24950

// Otherwise we need to custom widen everything to avoid promotion.

24951

Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,

24952

DAG.getUNDEF(IndexVT));

24953

Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,

24954

DAG.getConstant(0, dl, MVT::v2i1));

24955

SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),

24956

Index };

24957

SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),

24958

Gather->getMemoryVT(), dl, Ops,

24959

Gather->getMemOperand());

24960

SDValue Chain = Res.getValue(1);

24961

if (!ExperimentalVectorWideningLegalization)

24962

Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,

24963

DAG.getIntPtrConstant(0, dl));

24964

Results.push_back(Res);

24965

Results.push_back(Chain);

24966

return;

24967

}

24968

break;

24969

}

24970

}

24971

}

24972

24973

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {

24974

switch ((X86ISD::NodeType)Opcode) {

24975

case X86ISD::FIRST_NUMBER: break;

24976

case X86ISD::BSF: return "X86ISD::BSF";

24977

case X86ISD::BSR: return "X86ISD::BSR";

24978

case X86ISD::SHLD: return "X86ISD::SHLD";

24979

case X86ISD::SHRD: return "X86ISD::SHRD";

24980

case X86ISD::FAND: return "X86ISD::FAND";

24981

case X86ISD::FANDN: return "X86ISD::FANDN";

24982

case X86ISD::FOR: return "X86ISD::FOR";

24983

case X86ISD::FXOR: return "X86ISD::FXOR";

24984

case X86ISD::FILD: return "X86ISD::FILD";

24985

case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";

24986

case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";

24987

case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";

24988

case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";

24989

case X86ISD::FLD: return "X86ISD::FLD";

24990

case X86ISD::FST: return "X86ISD::FST";

24991

case X86ISD::CALL: return "X86ISD::CALL";

24992

case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";

24993

case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";

24994

case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";

24995

case X86ISD::BT: return "X86ISD::BT";

24996

case X86ISD::CMP: return "X86ISD::CMP";

24997

case X86ISD::COMI: return "X86ISD::COMI";

24998

case X86ISD::UCOMI: return "X86ISD::UCOMI";

24999

case X86ISD::CMPM: return "X86ISD::CMPM";

25000

case X86ISD::CMPMU: return "X86ISD::CMPMU";

25001

case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";

25002

case X86ISD::SETCC: return "X86ISD::SETCC";

25003

case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";

25004

case X86ISD::FSETCC: return "X86ISD::FSETCC";

25005

case X86ISD::FSETCCM: return "X86ISD::FSETCCM";

25006

case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";

25007

case X86ISD::CMOV: return "X86ISD::CMOV";

25008

case X86ISD::BRCOND: return "X86ISD::BRCOND";

25009

case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";

25010

case X86ISD::IRET: return "X86ISD::IRET";

25011

case X86ISD::REP_STOS: return "X86ISD::REP_STOS";

25012

case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";

25013

case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";

25014

case X86ISD::Wrapper: return "X86ISD::Wrapper";

25015

case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";

25016

case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";

25017

case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";

25018

case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";

25019

case X86ISD::PEXTRB: return "X86ISD::PEXTRB";

25020

case X86ISD::PEXTRW: return "X86ISD::PEXTRW";

25021

case X86ISD::INSERTPS: return "X86ISD::INSERTPS";

25022

case X86ISD::PINSRB: return "X86ISD::PINSRB";

25023

case X86ISD::PINSRW: return "X86ISD::PINSRW";

25024

case X86ISD::PSHUFB: return "X86ISD::PSHUFB";

25025

case X86ISD::ANDNP: return "X86ISD::ANDNP";

25026

case X86ISD::BLENDI: return "X86ISD::BLENDI";

25027

case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";

25028

case X86ISD::ADDUS: return "X86ISD::ADDUS";

25029

case X86ISD::SUBUS: return "X86ISD::SUBUS";

25030

case X86ISD::HADD: return "X86ISD::HADD";

25031

case X86ISD::HSUB: return "X86ISD::HSUB";

25032

case X86ISD::FHADD: return "X86ISD::FHADD";

25033

case X86ISD::FHSUB: return "X86ISD::FHSUB";

25034

case X86ISD::CONFLICT: return "X86ISD::CONFLICT";

25035

case X86ISD::FMAX: return "X86ISD::FMAX";

25036

case X86ISD::FMAXS: return "X86ISD::FMAXS";

25037

case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";

25038

case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";

25039

case X86ISD::FMIN: return "X86ISD::FMIN";

25040

case X86ISD::FMINS: return "X86ISD::FMINS";

25041

case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";

25042

case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";

25043

case X86ISD::FMAXC: return "X86ISD::FMAXC";

25044

case X86ISD::FMINC: return "X86ISD::FMINC";

25045

case X86ISD::FRSQRT: return "X86ISD::FRSQRT";

25046

case X86ISD::FRCP: return "X86ISD::FRCP";

25047

case X86ISD::EXTRQI: return "X86ISD::EXTRQI";

25048

case X86ISD::INSERTQI: return "X86ISD::INSERTQI";

25049

case X86ISD::TLSADDR: return "X86ISD::TLSADDR";

25050

case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";

25051

case X86ISD::TLSCALL: return "X86ISD::TLSCALL";

25052

case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";

25053

case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";

25054

case X86ISD::EH_SJLJ_SETUP_DISPATCH:

25055

return "X86ISD::EH_SJLJ_SETUP_DISPATCH";

25056

case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";

25057

case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";

25058

case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";

25059

case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";

25060

case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";

25061

case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";

25062

case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";

25063

case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:

25064

return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";

25065

case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:

25066

return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";

25067

case X86ISD::LADD: return "X86ISD::LADD";

25068

case X86ISD::LSUB: return "X86ISD::LSUB";

25069

case X86ISD::LOR: return "X86ISD::LOR";

25070

case X86ISD::LXOR: return "X86ISD::LXOR";

25071

case X86ISD::LAND: return "X86ISD::LAND";

25072

case X86ISD::LINC: return "X86ISD::LINC";

25073

case X86ISD::LDEC: return "X86ISD::LDEC";

25074

case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";

25075

case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";

25076

case X86ISD::VZEXT: return "X86ISD::VZEXT";

25077

case X86ISD::VSEXT: return "X86ISD::VSEXT";

25078

case X86ISD::VTRUNC: return "X86ISD::VTRUNC";

25079

case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";

25080

case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";

25081

case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";

25082

case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";

25083

case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";

25084

case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";

25085

case X86ISD::VFPEXT: return "X86ISD::VFPEXT";

25086

case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";

25087

case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";

25088

case X86ISD::VFPROUND: return "X86ISD::VFPROUND";

25089

case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";

25090

case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";

25091

case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";

25092

case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";

25093

case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";

25094

case X86ISD::VSHL: return "X86ISD::VSHL";

25095

case X86ISD::VSRL: return "X86ISD::VSRL";

25096

case X86ISD::VSRA: return "X86ISD::VSRA";

25097

case X86ISD::VSHLI: return "X86ISD::VSHLI";

25098

case X86ISD::VSRLI: return "X86ISD::VSRLI";

25099

case X86ISD::VSRAI: return "X86ISD::VSRAI";

25100

case X86ISD::VSRAV: return "X86ISD::VSRAV";

25101

case X86ISD::VROTLI: return "X86ISD::VROTLI";

25102

case X86ISD::VROTRI: return "X86ISD::VROTRI";

25103

case X86ISD::VPPERM: return "X86ISD::VPPERM";

25104

case X86ISD::CMPP: return "X86ISD::CMPP";

25105

case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";

25106

case X86ISD::PCMPGT: return "X86ISD::PCMPGT";

25107

case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";

25108

case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";

25109

case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";

25110

case X86ISD::ADD: return "X86ISD::ADD";

25111

case X86ISD::SUB: return "X86ISD::SUB";

25112

case X86ISD::ADC: return "X86ISD::ADC";

25113

case X86ISD::SBB: return "X86ISD::SBB";

25114

case X86ISD::SMUL: return "X86ISD::SMUL";

25115

case X86ISD::UMUL: return "X86ISD::UMUL";

25116

case X86ISD::SMUL8: return "X86ISD::SMUL8";

25117

case X86ISD::UMUL8: return "X86ISD::UMUL8";

25118

case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";

25119

case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";

25120

case X86ISD::INC: return "X86ISD::INC";

25121

case X86ISD::DEC: return "X86ISD::DEC";

25122

case X86ISD::OR: return "X86ISD::OR";

25123

case X86ISD::XOR: return "X86ISD::XOR";

25124

case X86ISD::AND: return "X86ISD::AND";

25125

case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";

25126

case X86ISD::MOVMSK: return "X86ISD::MOVMSK";

25127

case X86ISD::PTEST: return "X86ISD::PTEST";

25128

case X86ISD::TESTP: return "X86ISD::TESTP";

25129

case X86ISD::TESTM: return "X86ISD::TESTM";

25130

case X86ISD::TESTNM: return "X86ISD::TESTNM";

25131

case X86ISD::KORTEST: return "X86ISD::KORTEST";

25132

case X86ISD::KTEST: return "X86ISD::KTEST";

25133

case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";

25134

case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";

25135

case X86ISD::PACKSS: return "X86ISD::PACKSS";

25136

case X86ISD::PACKUS: return "X86ISD::PACKUS";

25137

case X86ISD::PALIGNR: return "X86ISD::PALIGNR";

25138

case X86ISD::VALIGN: return "X86ISD::VALIGN";

25139

case X86ISD::VSHLD: return "X86ISD::VSHLD";

25140

case X86ISD::VSHRD: return "X86ISD::VSHRD";

25141

case X86ISD::VSHLDV: return "X86ISD::VSHLDV";

25142

case X86ISD::VSHRDV: return "X86ISD::VSHRDV";

25143

case X86ISD::PSHUFD: return "X86ISD::PSHUFD";

25144

case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";

25145

case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";

25146

case X86ISD::SHUFP: return "X86ISD::SHUFP";

25147

case X86ISD::SHUF128: return "X86ISD::SHUF128";

25148

case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";

25149

case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";

25150

case X86ISD::MOVLPS: return "X86ISD::MOVLPS";

25151

case X86ISD::MOVLPD: return "X86ISD::MOVLPD";

25152

case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";

25153

case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";

25154

case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";

25155

case X86ISD::MOVSD: return "X86ISD::MOVSD";

25156

case X86ISD::MOVSS: return "X86ISD::MOVSS";

25157

case X86ISD::UNPCKL: return "X86ISD::UNPCKL";

25158

case X86ISD::UNPCKH: return "X86ISD::UNPCKH";

25159

case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";

25160

case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";

25161

case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";

25162

case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";

25163

case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";

25164

case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";

25165

case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";

25166

case X86ISD::VPERMV: return "X86ISD::VPERMV";

25167

case X86ISD::VPERMV3: return "X86ISD::VPERMV3";

25168

case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";

25169

case X86ISD::VPERMI: return "X86ISD::VPERMI";

25170

case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";

25171

case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";

25172

case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";

25173

case X86ISD::VRANGE: return "X86ISD::VRANGE";

25174

case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";

25175

case X86ISD::VRANGES: return "X86ISD::VRANGES";

25176

case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";

25177

case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";

25178

case X86ISD::PMULDQ: return "X86ISD::PMULDQ";

25179

case X86ISD::PSADBW: return "X86ISD::PSADBW";

25180

case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";

25181

case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";

25182

case X86ISD::VAARG_64: return "X86ISD::VAARG_64";

25183

case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";

25184

case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";

25185

case X86ISD::MFENCE: return "X86ISD::MFENCE";

25186

case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";

25187

case X86ISD::SAHF: return "X86ISD::SAHF";

25188

case X86ISD::RDRAND: return "X86ISD::RDRAND";

25189

case X86ISD::RDSEED: return "X86ISD::RDSEED";

25190

case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";

25191

case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";

25192

case X86ISD::VPSHA: return "X86ISD::VPSHA";

25193

case X86ISD::VPSHL: return "X86ISD::VPSHL";

25194

case X86ISD::VPCOM: return "X86ISD::VPCOM";

25195

case X86ISD::VPCOMU: return "X86ISD::VPCOMU";

25196

case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";

25197

case X86ISD::FMSUB: return "X86ISD::FMSUB";

25198

case X86ISD::FNMADD: return "X86ISD::FNMADD";

25199

case X86ISD::FNMSUB: return "X86ISD::FNMSUB";

25200

case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";

25201

case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";

25202

case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";

25203

case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";

25204

case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";

25205

case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";

25206

case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";

25207

case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";

25208

case X86ISD::FMADDS1: return "X86ISD::FMADDS1";

25209

case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";

25210

case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";

25211

case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";

25212

case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";

25213

case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";

25214

case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";

25215

case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";

25216

case X86ISD::FMADDS3: return "X86ISD::FMADDS3";

25217

case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";

25218

case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";

25219

case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";

25220

case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";

25221

case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";

25222

case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";

25223

case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";

25224

case X86ISD::FMADD4S: return "X86ISD::FMADD4S";

25225

case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";

25226

case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";

25227

case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";

25228

case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";

25229

case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";

25230

case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";

25231

case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";

25232

case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";

25233

case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";

25234

case X86ISD::VREDUCE: return "X86ISD::VREDUCE";

25235

case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";

25236

case X86ISD::VREDUCES: return "X86ISD::VREDUCES";

25237

case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";

25238

case X86ISD::VGETMANT: return "X86ISD::VGETMANT";

25239

case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";

25240

case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";

25241

case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";

25242

case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";

25243

case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";

25244

case X86ISD::XTEST: return "X86ISD::XTEST";

25245

case X86ISD::COMPRESS: return "X86ISD::COMPRESS";

25246

case X86ISD::EXPAND: return "X86ISD::EXPAND";

25247

case X86ISD::SELECT: return "X86ISD::SELECT";

25248

case X86ISD::SELECTS: return "X86ISD::SELECTS";

25249

case X86ISD::ADDSUB: return "X86ISD::ADDSUB";

25250

case X86ISD::RCP14: return "X86ISD::RCP14";

25251

case X86ISD::RCP14S: return "X86ISD::RCP14S";

25252

case X86ISD::RCP28: return "X86ISD::RCP28";

25253

case X86ISD::RCP28S: return "X86ISD::RCP28S";

25254

case X86ISD::EXP2: return "X86ISD::EXP2";

25255

case X86ISD::RSQRT14: return "X86ISD::RSQRT14";

25256

case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";

25257

case X86ISD::RSQRT28: return "X86ISD::RSQRT28";

25258

case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";

25259

case X86ISD::FADD_RND: return "X86ISD::FADD_RND";

25260

case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";

25261

case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";

25262

case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";

25263

case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";

25264

case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";

25265

case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";

25266

case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";

25267

case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";

25268

case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";

25269

case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";

25270

case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";

25271

case X86ISD::SCALEF: return "X86ISD::SCALEF";

25272

case X86ISD::SCALEFS: return "X86ISD::SCALEFS";

25273

case X86ISD::ADDS: return "X86ISD::ADDS";

25274

case X86ISD::SUBS: return "X86ISD::SUBS";

25275

case X86ISD::AVG: return "X86ISD::AVG";

25276

case X86ISD::MULHRS: return "X86ISD::MULHRS";

25277

case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";

25278

case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";

25279

case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";

25280

case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";

25281

case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";

25282

case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";

25283

case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";

25284

case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";

25285

case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";

25286

case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";

25287

case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";

25288

case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";

25289

case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";

25290

case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";

25291

case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";

25292

case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";

25293

case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";

25294

case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";

25295

case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";

25296

case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";

25297

case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";

25298

case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";

25299

case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";

25300

case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";

25301

case X86ISD::LWPINS: return "X86ISD::LWPINS";

25302

case X86ISD::MGATHER: return "X86ISD::MGATHER";

25303

case X86ISD::MSCATTER: return "X86ISD::MSCATTER";

25304

case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";

25305

case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";

25306

case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";

25307

case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";

25308

case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";

25309

case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";

25310

case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";

25311

case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";

25312

}

25313

return nullptr;

25314

}

25315

25316

/// Return true if the addressing mode represented by AM is legal for this

25317

/// target, for a load/store of the specified type.

25318

bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,

25319

const AddrMode &AM, Type *Ty,

25320

unsigned AS,

25321

Instruction *I) const {

25322

// X86 supports extremely general addressing modes.

25323

CodeModel::Model M = getTargetMachine().getCodeModel();

25324

25325

// X86 allows a sign-extended 32-bit immediate field as a displacement.

25326

if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))

25327

return false;

25328

25329

if (AM.BaseGV) {

25330

unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

25331

25332

// If a reference to this global requires an extra load, we can't fold it.

25333

if (isGlobalStubReference(GVFlags))

25334

return false;

25335

25336

// If BaseGV requires a register for the PIC base, we cannot also have a

25337

// BaseReg specified.

25338

if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))

25339

return false;

25340

25341

// If lower 4G is not available, then we must use rip-relative addressing.

25342

if ((M != CodeModel::Small || isPositionIndependent()) &&

25343

Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))

25344

return false;

25345

}

25346

25347

switch (AM.Scale) {

25348

case 0:

25349

case 1:

25350

case 2:

25351

case 4:

25352

case 8:

25353

// These scales always work.

25354

break;

25355

case 3:

25356

case 5:

25357

case 9:

25358

// These scales are formed with basereg+scalereg. Only accept if there is

25359

// no basereg yet.

25360

if (AM.HasBaseReg)

25361

return false;

25362

break;

25363

default: // Other stuff never works.

25364

return false;

25365

}

25366

25367

return true;

25368

}

25369

25370

/// Report whether shifting a vector of type \p Ty by a scalar amount is
/// significantly cheaper than shifting it by a fully general vector.
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  const unsigned EltBits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, and the scalar-amount versions are not
  // particularly cheaper than the others.
  if (EltBits == 8)
    return false;

  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
  // shifts just as cheap as scalar ones.
  const bool HasCheapVariableShift =
      Subtarget.hasAVX2() && (EltBits == 32 || EltBits == 64);

  // In every remaining case the scalar-amount form is the cheaper one.
  return !HasCheapVariableShift;
}

25387

25388

/// Report whether truncating from \p Ty1 to \p Ty2 is free: both must be
/// integer types and \p Ty1 must be strictly wider than \p Ty2.
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return DstBits < SrcBits;
}

25395

25396

/// Report whether a truncation from \p Ty1 to \p Ty2 may be looked through
/// when forming a tail call.
///
/// Returns false unless both types are integers and \p Ty1 is a legal type.
/// In asserts builds, \p Ty1 is additionally required to be at most 64 bits
/// wide.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

25409

25410

bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {

25411

return isInt<32>(Imm);

25412

}

25413

25414

bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {

25415

// Can also use sub to handle negated immediates.

25416

return isInt<32>(Imm);

25417

}

25418

25419

/// Report whether truncating from \p VT1 to \p VT2 is free: both must be
/// integer value types and \p VT1 must be strictly larger than \p VT2.
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  const bool BothIntegers = VT1.isInteger() && VT2.isInteger();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = VT1.getSizeInBits();
  const unsigned DstBits = VT2.getSizeInBits();
  return DstBits < SrcBits;
}

25426

25427

/// Report whether zero-extending from \p Ty1 to \p Ty2 is free. This holds
/// only for i32 -> i64 on a 64-bit subtarget.
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy(32) || !Ty2->isIntegerTy(64))
    return false;

  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Subtarget.is64Bit();
}

25431

25432

/// Report whether zero-extending from \p VT1 to \p VT2 is free. This holds
/// only for i32 -> i64 on a 64-bit subtarget.
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (!(VT1 == MVT::i32 && VT2 == MVT::i64))
    return false;

  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Subtarget.is64Bit();
}

25436

25437

/// Report whether zero-extending \p Val to \p VT2 is free.
///
/// This is true when the type-only overload already says so, or when \p Val
/// is a load of a simple i8/i16/i32 value being extended to a simple integer
/// type.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  const EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  // Beyond the type-only case, only loads can be extended for free.
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  const bool SrcIsSimpleInt = VT1.isSimple() && VT1.isInteger();
  if (!SrcIsSimpleInt || !VT2.isSimple() || !VT2.isInteger())
    return false;

  // X86 has 8, 16, and 32-bit zero-extending loads.
  const auto SrcTy = VT1.getSimpleVT().SimpleTy;
  return SrcTy == MVT::i8 || SrcTy == MVT::i16 || SrcTy == MVT::i32;
}

25460

25461

/// Always answers true; the argument is ignored.
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
  return true;
}

25462

25463

bool

25464

X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {

25465

if (!Subtarget.hasAnyFMA())

25466

return false;

25467

25468

VT = VT.getScalarType();

25469

25470

if (!VT.isSimple())

25471

return false;

25472

25473

switch (VT.getSimpleVT().SimpleTy) {

25474

case MVT::f32:

25475

case MVT::f64:

25476

return true;

25477

default:

25478

break;

25479

}

25480

25481

return false;

25482

}

25483

25484

/// Report whether narrowing an operation from \p VT1 to \p VT2 is worthwhile.
/// Every combination is accepted except i32 -> i16.
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return VT1 != MVT::i32 || VT2 != MVT::i16;
}

25488

25489

/// Targets can use this to indicate that they only support *some*

25490

/// VECTOR_SHUFFLE operations, those with specific masks.

25491

/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values

25492

/// are assumed to be legal.

25493

bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {

25494

if (!VT.isSimple())

25495

return false;

25496

25497

// Not for i1 vectors

25498

if (VT.getSimpleVT().getScalarType() == MVT::i1)

25499

return false;

25500

25501

// Very little shuffling can be done for 64-bit vectors right now.

25502

if (VT.getSimpleVT().getSizeInBits() == 64)

25503

return false;

25504

25505

// We only care that the types being shuffled are legal. The lowering can

25506

// handle any possible shuffle mask that results.

25507

return isTypeLegal(VT.getSimpleVT());

25508

}

25509

25510

bool

25511

X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,

25512

EVT VT) const {

25513

// Just delegate to the generic legality, clear masks aren't special.

25514

return isShuffleMaskLegal(Mask, VT);

25515

}

25516

25517

//===----------------------------------------------------------------------===//

25518

// X86 Scheduler Hooks

25519

//===----------------------------------------------------------------------===//

25520

25521

/// Utility function to emit xbegin specifying the start of an RTM region.

25522

static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  // Expands `v = xbegin()` into the following control flow:
  //
  //   thisMBB:  xbegin fallMBB            (falls through to mainMBB)
  //   mainMBB:  s0 = -1; jmp sinkMBB
  //   fallMBB:  eax = # XABORT_DEF; s1 = eax
  //   sinkMBB:  v = phi(s0/mainMBB, s1/fallMBB)
  //
  // MI is erased, and the returned block (sinkMBB) receives every instruction
  // that followed MI together with MBB's original successor edges.
  DebugLoc Loc = MI.getDebugLoc();
  const BasicBlock *IRBlock = MBB->getBasicBlock();
  MachineFunction *Fn = MBB->getParent();
  MachineFunction::iterator InsertPt = std::next(MBB->getIterator());

  // Create the three new blocks and place them right after the current one.
  MachineBasicBlock *StartBB = MBB;
  MachineBasicBlock *MainBB = Fn->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *FallBB = Fn->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(IRBlock);
  Fn->insert(InsertPt, MainBB);
  Fn->insert(InsertPt, FallBB);
  Fn->insert(InsertPt, SinkBB);

  // Move everything after MI, plus the successor edges, over to SinkBB.
  SinkBB->splice(SinkBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  SinkBB->transferSuccessorsAndUpdatePHIs(MBB);

  // One fresh virtual register per incoming PHI value, in the same register
  // class as the pseudo's result.
  MachineRegisterInfo &RegInfo = Fn->getRegInfo();
  const unsigned ResultReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *ResultRC = RegInfo.getRegClass(ResultReg);
  const unsigned MainReg = RegInfo.createVirtualRegister(ResultRC);
  const unsigned FallReg = RegInfo.createVirtualRegister(ResultRC);

  // StartBB: xbegin FallBB; fall through to MainBB, abort goes to FallBB.
  BuildMI(StartBB, Loc, TII->get(X86::XBEGIN_4)).addMBB(FallBB);
  StartBB->addSuccessor(MainBB);
  StartBB->addSuccessor(FallBB);

  // MainBB: MainReg := -1, then jump to SinkBB.
  BuildMI(MainBB, Loc, TII->get(X86::MOV32ri), MainReg).addImm(-1);
  BuildMI(MainBB, Loc, TII->get(X86::JMP_1)).addMBB(SinkBB);
  MainBB->addSuccessor(SinkBB);

  // FallBB: XABORT_DEF is a pseudo modelling the hardware's definition of EAX
  // on abort; FallReg := EAX.
  BuildMI(FallBB, Loc, TII->get(X86::XABORT_DEF));
  BuildMI(FallBB, Loc, TII->get(TargetOpcode::COPY), FallReg)
      .addReg(X86::EAX);
  FallBB->addSuccessor(SinkBB);

  // SinkBB: ResultReg := phi(MainReg/MainBB, FallReg/FallBB).
  BuildMI(*SinkBB, SinkBB->begin(), Loc, TII->get(X86::PHI), ResultReg)
      .addReg(MainReg)
      .addMBB(MainBB)
      .addReg(FallReg)
      .addMBB(FallBB);

  MI.eraseFromParent();
  return SinkBB;
}

25596

25597

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8

25598

// or XMM0_V32I8 in AVX all of this code can be replaced with that

25599

// in the .td file.

25600

static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  // Replace a PCMPxSTRM pseudo with the real instruction followed by a copy
  // of XMM0 into the pseudo's first operand register.
  unsigned RealOpc;
  switch (MI.getOpcode()) {
  case X86::PCMPISTRM128REG:
    RealOpc = X86::PCMPISTRM128rr;
    break;
  case X86::VPCMPISTRM128REG:
    RealOpc = X86::VPCMPISTRM128rr;
    break;
  case X86::PCMPISTRM128MEM:
    RealOpc = X86::PCMPISTRM128rm;
    break;
  case X86::VPCMPISTRM128MEM:
    RealOpc = X86::VPCMPISTRM128rm;
    break;
  case X86::PCMPESTRM128REG:
    RealOpc = X86::PCMPESTRM128rr;
    break;
  case X86::VPCMPESTRM128REG:
    RealOpc = X86::VPCMPESTRM128rr;
    break;
  case X86::PCMPESTRM128MEM:
    RealOpc = X86::PCMPESTRM128rm;
    break;
  case X86::VPCMPESTRM128MEM:
    RealOpc = X86::VPCMPESTRM128rm;
    break;
  default:
    llvm_unreachable("illegal opcode!");
  }

  DebugLoc Loc = MI.getDebugLoc();
  MachineInstrBuilder NewMI = BuildMI(*BB, MI, Loc, TII->get(RealOpc));

  // Forward every operand after operand 0, leaving out implicit registers.
  for (unsigned Idx = 1, End = MI.getNumOperands(); Idx < End; ++Idx) {
    MachineOperand &MO = MI.getOperand(Idx);
    if (MO.isReg() && MO.isImplicit())
      continue;
    NewMI.add(MO);
  }

  // Carry the memory operand across when there is exactly one.
  if (MI.hasOneMemOperand())
    NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY),
          MI.getOperand(0).getReg())
      .addReg(X86::XMM0);

  MI.eraseFromParent();
  return BB;
}

25633

25634

// FIXME: Custom handling because TableGen doesn't support multiple implicit

25635

// defs in an instruction pattern

25636

static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  // Replace a PCMPxSTRI pseudo with the real instruction followed by a copy
  // of ECX into the pseudo's first operand register.
  unsigned RealOpc;
  switch (MI.getOpcode()) {
  case X86::PCMPISTRIREG:
    RealOpc = X86::PCMPISTRIrr;
    break;
  case X86::VPCMPISTRIREG:
    RealOpc = X86::VPCMPISTRIrr;
    break;
  case X86::PCMPISTRIMEM:
    RealOpc = X86::PCMPISTRIrm;
    break;
  case X86::VPCMPISTRIMEM:
    RealOpc = X86::VPCMPISTRIrm;
    break;
  case X86::PCMPESTRIREG:
    RealOpc = X86::PCMPESTRIrr;
    break;
  case X86::VPCMPESTRIREG:
    RealOpc = X86::VPCMPESTRIrr;
    break;
  case X86::PCMPESTRIMEM:
    RealOpc = X86::PCMPESTRIrm;
    break;
  case X86::VPCMPESTRIMEM:
    RealOpc = X86::VPCMPESTRIrm;
    break;
  default:
    llvm_unreachable("illegal opcode!");
  }

  DebugLoc Loc = MI.getDebugLoc();
  MachineInstrBuilder NewMI = BuildMI(*BB, MI, Loc, TII->get(RealOpc));

  // Skip operand 0 (the result) and forward the rest, leaving out implicit
  // registers.
  for (unsigned Idx = 1, End = MI.getNumOperands(); Idx < End; ++Idx) {
    MachineOperand &MO = MI.getOperand(Idx);
    if (MO.isReg() && MO.isImplicit())
      continue;
    NewMI.add(MO);
  }

  // Carry the memory operand across when there is exactly one.
  if (MI.hasOneMemOperand())
    NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY),
          MI.getOperand(0).getReg())
      .addReg(X86::ECX);

  MI.eraseFromParent();
  return BB;
}

25669

25670

static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  // Expands the pseudo into: EAX := operand 0, ECX := 0, EDX := 0, WRPKRU.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc Loc = MI.getDebugLoc();
  const unsigned ValueReg = MI.getOperand(0).getReg();

  // The input value goes into EAX.
  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), X86::EAX)
      .addReg(ValueReg);

  // ECX and EDX are both zeroed.
  BuildMI(*BB, MI, Loc, TII->get(X86::MOV32r0), X86::ECX);
  BuildMI(*BB, MI, Loc, TII->get(X86::MOV32r0), X86::EDX);

  // Finally the WRPKRU instruction itself.
  BuildMI(*BB, MI, Loc, TII->get(X86::WRPKRUr));

  // The pseudo has been fully replaced.
  MI.eraseFromParent();
  return BB;
}

25690

25691

static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  // Expands the pseudo into: ECX := 0, RDPKRU, operand 0 := EAX.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc Loc = MI.getDebugLoc();

  // ECX is zeroed before the instruction.
  BuildMI(*BB, MI, Loc, TII->get(X86::MOV32r0), X86::ECX);

  // The RDPKRU instruction, then a copy of EAX into the pseudo's result.
  BuildMI(*BB, MI, Loc, TII->get(X86::RDPKRUr));
  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY),
          MI.getOperand(0).getReg())
      .addReg(X86::EAX);

  // The pseudo has been fully replaced.
  MI.eraseFromParent();
  return BB;
}

25707

25708

static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget,
                                      unsigned Opc) {
  // Materialise the address in RAX/EAX and the two value operands in ECX and
  // EDX, then emit Opc, which takes no explicit operands.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc Loc = MI.getDebugLoc();

  // Pick the LEA flavour and address register matching the pointer width.
  const bool Is64Bit = Subtarget.is64Bit();
  const unsigned LeaOpc = Is64Bit ? X86::LEA64r : X86::LEA32r;
  const unsigned AddrReg = Is64Bit ? X86::RAX : X86::EAX;

  // The first AddrNumOperands operands of the pseudo form the address.
  MachineInstrBuilder Lea = BuildMI(*BB, MI, Loc, TII->get(LeaOpc), AddrReg);
  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
    Lea.add(MI.getOperand(Idx));

  // The two operands right after the address go to ECX and EDX respectively.
  const unsigned FirstValueIdx = X86::AddrNumOperands;
  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), X86::ECX)
      .addReg(MI.getOperand(FirstValueIdx).getReg());
  BuildMI(*BB, MI, Loc, TII->get(TargetOpcode::COPY), X86::EDX)
      .addReg(MI.getOperand(FirstValueIdx + 1).getReg());

  BuildMI(*BB, MI, Loc, TII->get(Opc));

  // The pseudo has been fully replaced.
  MI.eraseFromParent();
  return BB;
}

25732

25733

static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  // Materialise the address in RAX/EAX, then emit CLZERO, which takes no
  // explicit operands.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc Loc = MI->getDebugLoc();

  // Pick the LEA flavour and address register matching the pointer width.
  const bool Is64Bit = Subtarget.is64Bit();
  const unsigned LeaOpc = Is64Bit ? X86::LEA64r : X86::LEA32r;
  const unsigned AddrReg = Is64Bit ? X86::RAX : X86::EAX;

  // The first AddrNumOperands operands of the pseudo form the address.
  MachineInstrBuilder Lea = BuildMI(*BB, MI, Loc, TII->get(LeaOpc), AddrReg);
  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
    Lea.add(MI->getOperand(Idx));

  BuildMI(*BB, MI, Loc, TII->get(X86::CLZEROr));

  // The pseudo has been fully replaced.
  MI->eraseFromParent();
  return BB;
}

25750

25751

25752

25753

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  // Expands the X86-64 va_arg pseudo-instruction. Its operands are:
  //   0   : Output  - destination address (reg)
  //   1-5 : Input   - va_list address (addr, i64mem)
  //   6   : ArgSize - size (in bytes) of the vararg type
  //   7   : ArgMode - 0 = overflow only, 1 = use gp_offset, 2 = use fp_offset
  //   8   : Align   - alignment of the type
  //   9   : EFLAGS (implicit-def)
  //
  // Returns the block in which code following the pseudo continues.
  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  const unsigned DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  const unsigned ArgSize = MI.getOperand(6).getImm();
  const unsigned ArgMode = MI.getOperand(7).getImm();
  const unsigned Align = MI.getOperand(8).getImm();

  // The single memory operand is attached to every va_list access below.
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MemRefsBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MemRefsEnd = MI.memoperands_end();

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRC = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRC = getRegClassFor(MVT::i32);
  DebugLoc DL = MI.getDebugLoc();

  // Layout assumed for the va_list (size 24, alignment 8):
  //   struct va_list {
  //     i32 gp_offset          // displacement 0
  //     i32 fp_offset          // displacement 4
  //     i64 overflow_area      // displacement 8  (address)
  //     i64 reg_save_area      // displacement 16 (address)
  //   }
  const unsigned NumIntRegs = 6;
  const unsigned NumXMMRegs = 8;
  const bool UseGPOffset = ArgMode == 1;
  const bool UseFPOffset = ArgMode == 2;
  const unsigned MaxOffset =
      NumIntRegs * 8 + (UseFPOffset ? NumXMMRegs * 16 : 0);

  // ArgSize rounded up to a multiple of 8.
  const unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  const bool NeedsAlign = Align > 8;

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;   // Argument address computed in offsetMBB.
  unsigned OverflowDestReg = 0; // Argument address computed in overflowMBB.
  unsigned OffsetReg = 0;       // Loaded gp_offset / fp_offset value.

  if (!UseGPOffset && !UseFPOffset) {
    // Overflow-only mode: no branch is needed, so everything is emitted
    // straight into the current block and the result lands in DestReg.
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // Test whether gp_offset (or fp_offset) is still below the bound. If so
    // the argument comes from reg_save_area (offsetMBB), otherwise from
    // overflow_area (overflowMBB):
    //
    //          thisMBB
    //          |     .
    //   offsetMBB   overflowMBB
    //          |     .
    //           endMBB

    // Inputs of the PHI that endMBB will receive.
    OffsetDestReg = MRI.createVirtualRegister(AddrRC);
    OverflowDestReg = MRI.createVirtualRegister(AddrRC);

    const BasicBlock *IRBlock = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(IRBlock);
    offsetMBB = MF->CreateMachineBasicBlock(IRBlock);
    endMBB = MF->CreateMachineBasicBlock(IRBlock);

    // Lay the new blocks out directly after the current one.
    MachineFunction::iterator InsertPt = std::next(MBB->getIterator());
    MF->insert(InsertPt, offsetMBB);
    MF->insert(InsertPt, overflowMBB);
    MF->insert(InsertPt, endMBB);

    // Everything after the pseudo, and MBB's successor edges, move to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Wire up the diamond.
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the current offset (fp_offset at +4, gp_offset at +0).
    OffsetReg = MRI.createVirtualRegister(OffsetRC);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MemRefsBegin, MemRefsEnd);

    // Is there enough room left to pull this argument?
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // offset >= max branches to overflowMBB; otherwise fall through to
    // offsetMBB.
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // offsetMBB: fetch the argument from the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Load the reg_save_area address.
    const unsigned RegSaveReg = MRI.createVirtualRegister(AddrRC);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MemRefsBegin, MemRefsEnd);

    // Zero-extend the 32-bit offset to 64 bits.
    const unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRC);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Final address = offset + reg_save_area.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Advance the offset past this argument (16 for fp, 8 for gp).
    const unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRC);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Write the advanced offset back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MemRefsBegin, MemRefsEnd);

    // Continue at endMBB.
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  // overflowMBB: fetch the argument from the overflow area.

  // Load the overflow_area address.
  const unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRC);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MemRefsBegin, MemRefsEnd);

  if (NeedsAlign) {
    // aligned_addr = (addr + (align - 1)) & ~(align - 1)
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    const unsigned TmpReg = MRI.createVirtualRegister(AddrRC);

    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~static_cast<uint64_t>(Align - 1));
  } else {
    // No extra alignment required: the loaded address is the result.
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Next overflow address = this argument's address + its 8-byte-rounded
  // size, which keeps the overflow address 8-byte aligned.
  const unsigned NextAddrReg = MRI.createVirtualRegister(AddrRC);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Write the new overflow address back into the va_list.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MemRefsBegin, MemRefsEnd);

  // When a branch was emitted, merge both candidate addresses with a PHI at
  // the top of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg)
        .addMBB(offsetMBB)
        .addReg(OverflowDestReg)
        .addMBB(overflowMBB);
  }

  // The pseudo has been fully replaced.
  MI.eraseFromParent();

  return endMBB;
}

26005

26006

MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.
  //
  // Pseudo operands, as read below:
  //   0    : register holding the count (tested against zero)
  //   1    : frame index of the register save area (immediate)
  //   2    : byte offset of the first XMM slot within that area (immediate)
  //   3..  : the XMM argument registers to store
  //   last : expected to be EFLAGS when it is a register operand
  //
  // Returns the block in which code following the pseudo continues.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;

  // In the XMM save block, save all the XMM argument registers, i.e. operands
  // [3, NumOperands - 1). The bound is checked with '<' rather than '!=' so
  // that an instruction with only three operands (which the assert above
  // tolerates, giving e == 2) stores nothing instead of indexing past the
  // operand list.
  for (int i = 3, e = MI.getNumOperands() - 1; i < e; ++i) {
    // Each register gets its own 16-byte, 16-byte-aligned slot.
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}

26080

26081

// The EFLAGS operand of SelectItr might be missing a kill marker

26082

// because there were multiple uses of EFLAGS, and ISel didn't know

26083

// which to mark. Figure out whether SelectItr should have had a

26084

// kill marker, and set it if it should. Returns the correct kill

26085

// marker value.

26086

static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,

26087

MachineBasicBlock* BB,

26088

const TargetRegisterInfo* TRI) {

26089

// Scan forward through BB for a use/def of EFLAGS.

26090

MachineBasicBlock::iterator miI(std::next(SelectItr));

26091

for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {

26092

const MachineInstr& mi = *miI;

26093

if (mi.readsRegister(X86::EFLAGS))

26094

return false;

26095

if (mi.definesRegister(X86::EFLAGS))

26096

break; // Should have kill-flag - update below.

26097

}

26098

26099

// If we hit the end of the block, check whether EFLAGS is live into a

26100

// successor.

26101

if (miI == BB->end()) {

26102

for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),

26103

sEnd = BB->succ_end();

26104

sItr != sEnd; ++sItr) {

26105

MachineBasicBlock* succ = *sItr;

26106

if (succ->isLiveIn(X86::EFLAGS))

26107

return false;

26108

}

26109

}

26110

26111

// We found a def, or hit the end of the basic block and EFLAGS wasn't live

26112

// out. SelectMI should have a kill flag on EFLAGS.

26113

SelectItr->addRegisterKilled(X86::EFLAGS, TRI);

26114

return true;

26115

}

26116

26117

// Return true if it is OK for this CMOV pseudo-opcode to be cascaded

26118

// together with other CMOV pseudo-opcodes into a single basic-block with

26119

// conditional jump around it.

26120

static bool isCMOVPseudo(MachineInstr &MI) {
  // True exactly for the CMOV pseudo-opcodes listed below; any other opcode
  // yields false.
  const unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    return false;

  // Scalar floating-point and general-purpose registers.
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  // x87 register-stack classes.
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  // Vector value types.
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  // i1 vector types.
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;
  }
}

26149

26150

// Helper function, which inserts PHI functions into SinkMBB:

26151

// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],

26152

// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs

26153

// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for

26154

// the last PHI function inserted.

26155

static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock *SinkMBB) {
  // For every CMOV in [MIItBegin, MIItEnd) a PHI
  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, TrueMBB ]
  // is inserted at the start of SinkMBB. The builder of the last PHI created
  // is returned.
  MachineFunction *MF = TrueMBB->getParent();
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  DebugLoc DL = MIItBegin->getDebugLoc();

  // Condition of the first CMOV, and its opposite.
  const X86::CondCode FirstCC =
      X86::CondCode(MIItBegin->getOperand(3).getImm());
  const X86::CondCode InverseCC = X86::GetOppositeBranchCondition(FirstCC);

  MachineBasicBlock::iterator InsertPt = SinkMBB->begin();

  // A later CMOV may use the result of an earlier one, but a PHI has to name
  // the value that flows in along each edge. So the PHIs are built front to
  // back while remembering, per PHI destination, the pair of registers that
  // fed it.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> PHIInputs;
  MachineInstrBuilder LastPHI;

  for (MachineBasicBlock::iterator It = MIItBegin; It != MIItEnd; ++It) {
    const unsigned DestReg = It->getOperand(0).getReg();
    unsigned FalseSideReg = It->getOperand(1).getReg();
    unsigned TrueSideReg = It->getOperand(2).getReg();

    // A CMOV on the opposite condition needs its inputs swapped for the PHI.
    if (It->getOperand(3).getImm() == InverseCC)
      std::swap(FalseSideReg, TrueSideReg);

    // Replace references to earlier PHI results with that PHI's own input on
    // the matching edge.
    auto FalseHit = PHIInputs.find(FalseSideReg);
    if (FalseHit != PHIInputs.end())
      FalseSideReg = FalseHit->second.first;

    auto TrueHit = PHIInputs.find(TrueSideReg);
    if (TrueHit != PHIInputs.end())
      TrueSideReg = TrueHit->second.second;

    LastPHI = BuildMI(*SinkMBB, InsertPt, DL, TII->get(X86::PHI), DestReg)
                  .addReg(FalseSideReg)
                  .addMBB(FalseMBB)
                  .addReg(TrueSideReg)
                  .addMBB(TrueMBB);

    // Record this PHI's inputs for the CMOVs that follow.
    PHIInputs[DestReg] = std::make_pair(FalseSideReg, TrueSideReg);
  }

  return LastPHI;
}

26206

26207

// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).

26208

MachineBasicBlock *

26209

X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,

26210

MachineInstr &SecondCascadedCMOV,

26211

MachineBasicBlock *ThisMBB) const {

26212

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

26213

DebugLoc DL = FirstCMOV.getDebugLoc();

26214

26215

// We lower cascaded CMOVs such as

26216

26217

// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)

26218

26219

// to two successive branches.

26220

26221

// Without this, we would add a PHI between the two jumps, which ends up

26222

// creating a few copies all around. For instance, for

26223

26224

// (sitofp (zext (fcmp une)))

26225

26226

// we would generate:

26227

26228

// ucomiss %xmm1, %xmm0

26229

// movss <1.0f>, %xmm0

26230

// movaps %xmm0, %xmm1

26231

// jne .LBB5_2

26232

// xorps %xmm1, %xmm1

26233

// .LBB5_2:

26234

// jp .LBB5_4

26235

// movaps %xmm1, %xmm0

26236

// .LBB5_4:

26237

// retq

26238

26239

// because this custom-inserter would have generated:

26240

26241

// A

26242

// | \

26243

// | B

26244

// | /

26245

// C

26246

// | \

26247

// | D

26248

// | /

26249

// E

26250

26251

// A: X = ...; Y = ...

26252

// B: empty

26253

// C: Z = PHI [X, A], [Y, B]

26254

// D: empty

26255

// E: PHI [X, C], [Z, D]

26256

26257

// If we lower both CMOVs in a single step, we can instead generate:

26258

26259

// A

26260

// | \

26261

// | C

26262

// | /|

26263

// |/ |

26264

// | |

26265

// | D

26266

// | /

26267

// E

26268

26269

// A: X = ...; Y = ...

26270

// D: empty

26271

// E: PHI [X, A], [X, C], [Y, D]

26272

26273

// Which, in our sitofp/fcmp example, gives us something like:

26274

26275

// ucomiss %xmm1, %xmm0

26276

// movss <1.0f>, %xmm0

26277

// jne .LBB5_4

26278

// jp .LBB5_4

26279

// xorps %xmm0, %xmm0

26280

// .LBB5_4:

26281

// retq

26282

26283

26284

// We lower cascaded CMOV into two successive branches to the same block.

26285

// EFLAGS is used by both, so mark it as live in the second.

26286

const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();

26287

MachineFunction *F = ThisMBB->getParent();

26288

MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);

26289

MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);

26290

MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

26291

26292

MachineFunction::iterator It = ++ThisMBB->getIterator();

26293

F->insert(It, FirstInsertedMBB);

26294

F->insert(It, SecondInsertedMBB);

26295

F->insert(It, SinkMBB);

26296

26297

// For a cascaded CMOV, we lower it to two successive branches to

26298

// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in

26299

// the FirstInsertedMBB.

26300

FirstInsertedMBB->addLiveIn(X86::EFLAGS);

26301

26302

// If the EFLAGS register isn't dead in the terminator, then claim that it's

26303

// live into the sink and copy blocks.

26304

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

26305

if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&

26306

!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {

26307

SecondInsertedMBB->addLiveIn(X86::EFLAGS);

26308

SinkMBB->addLiveIn(X86::EFLAGS);

26309

}

26310

26311

// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.

26312

SinkMBB->splice(SinkMBB->begin(), ThisMBB,

26313

std::next(MachineBasicBlock::iterator(FirstCMOV)),

26314

ThisMBB->end());

26315

SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

26316

26317

// Fallthrough block for ThisMBB.

26318

ThisMBB->addSuccessor(FirstInsertedMBB);

26319

// The true block target of the first branch is always SinkMBB.

26320

ThisMBB->addSuccessor(SinkMBB);

26321

// Fallthrough block for FirstInsertedMBB.

26322

FirstInsertedMBB->addSuccessor(SecondInsertedMBB);

26323

// The true block for the branch of FirstInsertedMBB.

26324

FirstInsertedMBB->addSuccessor(SinkMBB);

26325

// This is fallthrough.

26326

SecondInsertedMBB->addSuccessor(SinkMBB);

26327

26328

// Create the conditional branch instructions.

26329

X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());

26330

unsigned Opc = X86::GetCondBranchFromCond(FirstCC);

26331

BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

26332

26333

X86::CondCode SecondCC =

26334

X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());

26335

unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);

26336

BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);

26337

26338

// SinkMBB:

26339

// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]

26340

unsigned DestReg = FirstCMOV.getOperand(0).getReg();

26341

unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();

26342

unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();

26343

MachineInstrBuilder MIB =

26344

BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)

26345

.addReg(Op1Reg)

26346

.addMBB(SecondInsertedMBB)

26347

.addReg(Op2Reg)

26348

.addMBB(ThisMBB);

26349

26350

// The second SecondInsertedMBB provides the same incoming value as the

26351

// FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).

26352

MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);

26353

// Copy the PHI result to the register defined by the second CMOV.

26354

BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,

26355

TII->get(TargetOpcode::COPY),

26356

SecondCascadedCMOV.getOperand(0).getReg())

26357

.addReg(FirstCMOV.getOperand(0).getReg());

26358

26359

// Now remove the CMOVs.

26360

FirstCMOV.eraseFromParent();

26361

SecondCascadedCMOV.eraseFromParent();

26362

26363

return SinkMBB;

26364

}

26365

26366

/// Expand a pseudo-CMOV (SELECT_CC-style) instruction into explicit control
/// flow: a conditional branch in ThisMBB, an empty fallthrough block, and a
/// join block whose PHI(s) pick between the true and false inputs.
///
/// Returns the join block, which takes over every instruction that followed
/// the last handled CMOV in ThisMBB and ThisMBB's original successor edges.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // "Inserting" a SELECT_CC really means inserting the diamond control-flow
  // pattern. The incoming instruction carries the destination vreg, the
  // condition to branch on and the true/false values to choose between.
  //
  //   ThisMBB:
  //   ...
  //    TrueVal = ...
  //    cmpTY ccX, r1, r2
  //    bCC copy1MBB
  //    fallthrough --> FalseMBB
  //
  // Every pseudo-CMOV goes through here. Normally one block is inserted and a
  // PHI at the join point selects between the CMOV's operands. Two situations
  // with several CMOVs in a row get special treatment.
  //
  // Case 1:
  // Consecutive CMOVs that all test the same condition (or its exact
  // opposite) share a single inserted block, with one PHI per CMOV at the
  // join point. The subtle part is a sequence such as
  //
  //   t2 = CMOV cond1 t1, f1
  //   t3 = CMOV cond1 t2, f2
  //
  // A PHI operand must not refer to the result of an earlier PHI in the same
  // block, so the naive
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t2(BB1), f2(BB2)
  //
  // is wrong: t2 is not defined in BB1. On the path through BB1, t2 is just a
  // copy of t1, so the operands are renamed to give
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t1(BB1), f2(BB2)
  //
  // Case 2:
  // CMOV ((CMOV F, T, cc1), T, cc2) is recognised here and handed over to
  // EmitLoweredCascadedSelect.

  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));
  MachineInstr *LastCMOV = &MI;

  // Look for case 1 first, since of the two multi-CMOV lowerings it removes
  // the most jumps: extend the run over every directly following pseudo-CMOV
  // whose condition is CC or OppCC.
  if (isCMOVPseudo(MI)) {
    for (; NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt); ++NextMIIt) {
      const int64_t NextCC = NextMIIt->getOperand(3).getImm();
      if (NextCC != CC && NextCC != OppCC)
        break;
      LastCMOV = &*NextMIIt;
    }
  }

  // Case 2 is only considered when case 1 found nothing, i.e. LastCMOV still
  // points at MI.
  const bool FoundSameCondRun = LastCMOV != &MI;
  if (!FoundSameCondRun && NextMIIt != ThisMBB->end() &&
      NextMIIt->getOpcode() == MI.getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
      NextMIIt->getOperand(1).isKill())
    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);

  MachineFunction *MF = ThisMBB->getParent();
  const BasicBlock *IRBlock = ThisMBB->getBasicBlock();
  MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(IRBlock);

  // Place the new blocks, in order, directly after ThisMBB.
  MachineFunction::iterator InsertPos = std::next(ThisMBB->getIterator());
  MF->insert(InsertPos, FalseMBB);
  MF->insert(InsertPos, SinkMBB);

  // Unless EFLAGS is killed by the last CMOV (or the helper manages to mark a
  // kill further down), it is live into both new blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const bool EFLAGSDead = LastCMOV->killsRegister(X86::EFLAGS) ||
                          checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI);
  if (!EFLAGSDead) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Everything after the last CMOV, plus ThisMBB's successor edges, moves to
  // the sink block.
  MachineBasicBlock::iterator AfterLastCMOV =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  SinkMBB->splice(SinkMBB->begin(), ThisMBB, AfterLastCMOV, ThisMBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

  // Wire up the diamond.
  ThisMBB->addSuccessor(FalseMBB); // Fallthrough of ThisMBB.
  ThisMBB->addSuccessor(SinkMBB);  // Taken edge of the branch.
  FalseMBB->addSuccessor(SinkMBB); // Fallthrough of FalseMBB.

  // Emit the conditional branch at the end of ThisMBB.
  BuildMI(ThisMBB, DL, TII->get(X86::GetCondBranchFromCond(CC)))
      .addMBB(SinkMBB);

  //  SinkMBB:
  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
  //  ...
  // The range end is taken only now, after the splice and the branch
  // insertion, so it names the instruction that currently follows the last
  // CMOV inside ThisMBB.
  MachineBasicBlock::iterator CMOVsBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator CMOVsEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  createPHIsForCMOVsInSinkBB(CMOVsBegin, CMOVsEnd, ThisMBB, FalseMBB, SinkMBB);

  // The CMOV(s) are fully replaced by the PHIs; remove them.
  ThisMBB->erase(CMOVsBegin, CMOVsEnd);

  return SinkMBB;
}

26497

26498

/// Expand a RELEASE_FADD32mr / RELEASE_FADD64mr pseudo.
///
/// The pseudo stands for the atomic floating-point update pattern
///   a.store(reg OP a.load(acquire), release)
/// and is rewritten as
///   OPss (%gpr), %xmm
///   movss %xmm, (%gpr)
/// or the sd equivalents for the 64-bit form. Returns BB unchanged apart from
/// the replaced instruction.
MachineBasicBlock *
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  // Select the arithmetic (load-folding) opcode and the store opcode.
  unsigned ArithOpc, StoreOpc;
  switch (MI.getOpcode()) {
  case X86::RELEASE_FADD32mr:
    ArithOpc = X86::ADDSSrm;
    StoreOpc = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    ArithOpc = X86::ADDSDrm;
    StoreOpc = X86::MOVSDmr;
    break;
  default:
    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  }

  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  // The value operand directly follows the address operands of the pseudo.
  const unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
  // The arithmetic result gets a fresh vreg of the same class as the source.
  const unsigned ResultReg =
      MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

  // ResultReg = SrcReg OP [addr]
  MachineInstrBuilder Arith =
      BuildMI(*BB, MI, DL, TII->get(ArithOpc), ResultReg).addReg(SrcReg);
  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
    MachineOperand &AddrOp = MI.getOperand(Idx);
    // The same address operands are reused by the store below, so any kill
    // flag on a register operand has to go.
    if (AddrOp.isReg())
      AddrOp.setIsKill(false);
    Arith.add(AddrOp);
  }

  // [addr] = ResultReg; this is the last use of the temporary.
  MachineInstrBuilder Store = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx)
    Store.add(MI.getOperand(Idx));
  Store.addReg(ResultReg, RegState::Kill);

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

26544

26545

/// Expand a segmented-stack alloca pseudo into a stack-limit check followed by
/// either a plain stack-pointer bump or a call to
/// __morestack_allocate_stack_space.
///
/// MI's operand 0 is the result register and operand 1 the requested size.
/// Returns the continuation block, which takes over every instruction that
/// followed MI in BB and BB's original successor edges.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const BasicBlock *IRBlock = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget.is64Bit();
  const bool IsLP64 = Subtarget.isTarget64BitLP64();

  // Segment register and displacement of the memory word the prospective
  // stack pointer is compared against.
  // NOTE(review): presumably the stacklet limit slot in the thread control
  // block - confirm against the split-stack runtime ABI.
  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  unsigned TlsOffset = 0x30;
  if (IsLP64)
    TlsOffset = 0x70;
  else if (Is64Bit)
    TlsOffset = 0x40;

  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(IRBlock);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *PtrRC =
      getRegClassFor(getPointerTy(MF->getDataLayout()));

  // Pointer-sized temporaries; they are created in this fixed order.
  const unsigned mallocPtrVReg = MRI.createVirtualRegister(PtrRC);
  const unsigned bumpSPPtrVReg = MRI.createVirtualRegister(PtrRC);
  const unsigned tmpSPVReg = MRI.createVirtualRegister(PtrRC);
  const unsigned SPLimitVReg = MRI.createVirtualRegister(PtrRC);
  const unsigned sizeVReg = MI.getOperand(1).getReg();
  const unsigned physSPReg =
      (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

  // Layout after BB: bumpMBB, mallocMBB, continueMBB.
  MachineFunction::iterator InsertPos = std::next(BB->getIterator());
  MF->insert(InsertPos, bumpMBB);
  MF->insert(InsertPos, mallocMBB);
  MF->insert(InsertPos, continueMBB);

  // The tail of BB and its successor edges move to continueMBB.
  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // In BB: compute SP - size and compare it with the limit word; branch to
  // mallocMBB if the limit is greater, otherwise fall through to bumpMBB.
  const unsigned SubOpc = IsLP64 ? X86::SUB64rr : X86::SUB32rr;
  const unsigned CmpOpc = IsLP64 ? X86::CMP64mr : X86::CMP32mr;
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(SubOpc), SPLimitVReg)
      .addReg(tmpSPVReg)
      .addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(CmpOpc))
      .addReg(0)
      .addImm(1)
      .addReg(0)
      .addImm(TlsOffset)
      .addReg(TlsReg)
      .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

  // bumpMBB: the current stacklet is big enough, so the lowered stack pointer
  // is both the new SP and the allocation result.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // mallocMBB: call the libgcc routine that allocates more space from the
  // heap. The size is passed in RDI/EDI on 64-bit targets and pushed on the
  // stack on 32-bit targets.
  const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
        .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
        .addReg(X86::RDI, RegState::Implicit)
        .addReg(X86::RAX, RegState::ImplicitDefine);
  } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
        .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
        .addReg(X86::EDI, RegState::Implicit)
        .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    // Reserve 12 bytes before pushing the argument; together with the push
    // this is what the ADD of 16 below releases again.
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg)
        .addReg(physSPReg)
        .addImm(12);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
        .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
        .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg)
        .addReg(physSPReg)
        .addImm(16);

  // The call's result register holds the allocated pointer.
  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
      .addReg(IsLP64 ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Set up the CFG correctly.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // Merge the two candidate pointers into the pseudo's result register.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);

  // The pseudo has been fully expanded; delete it.
  MI.eraseFromParent();

  return continueMBB;
}

26678

26679

/// Custom insertion for a catchret instruction.
///
/// On anything but 32-bit targets nothing is changed. On 32-bit targets a new
/// block holding EH_RESTORE followed by a jump to the original destination is
/// placed after BB, and the catchret is retargeted to it. Always returns BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  MachineBasicBlock *OrigTargetMBB = MI.getOperand(0).getMBB();
  DebugLoc DL = MI.getDebugLoc();

  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
         "SEH does not use catchret!");

  // Only 32-bit EH needs to worry about manually restoring stack pointers.
  const bool NeedsRestoreBlock = Subtarget.is32Bit();
  if (!NeedsRestoreBlock)
    return BB;

  // C++ EH creates a new target block to hold the restore code, and wires up
  // the new block to the return destination with a normal JMP_4.
  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  assert(BB->succ_size() == 1);
  MF->insert(std::next(BB->getIterator()), RestoreMBB);

  // BB's single successor edge now leaves from RestoreMBB, and BB branches to
  // RestoreMBB instead.
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(RestoreMBB);
  MI.getOperand(0).setMBB(RestoreMBB);

  // RestoreMBB: EH_RESTORE, then jump to the original destination. Both are
  // inserted in front of the (end) position captured here, so they keep this
  // order.
  MachineBasicBlock::iterator InsertPt = RestoreMBB->begin();
  BuildMI(*RestoreMBB, InsertPt, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, InsertPt, DL, TII.get(X86::JMP_4))
      .addMBB(OrigTargetMBB);
  return BB;
}

26710

26711

/// Custom insertion for a catchpad pseudo: the pseudo is always removed, and
/// for 32-bit SEH an EH_RESTORE is emitted in its place. Returns BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const Constant *Personality = MF->getFunction()->getPersonalityFn();
  const bool UsesSEH =
      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

  // Only 32-bit SEH requires special handling for catchpad.
  if (UsesSEH && Subtarget.is32Bit()) {
    DebugLoc DL = MI.getDebugLoc();
    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
    // Inserted directly in front of the pseudo that is erased below.
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }

  MI.eraseFromParent();
  return BB;
}

26726

26727

/// Bracket a TLSADDR instruction with call-frame setup/destroy markers.
///
/// The resulting sequence is
///   adjust_stackdown -> TLSADDR -> adjust_stackup.
/// TLSADDR is lowered into calls inside MC, so without the two markers
/// shrink-wrapping may push the prologue/epilogue past them. MI itself is
/// left in place. Returns BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction &MF = *BB->getParent();
  const MachineBasicBlock::iterator TLSAddrIt = MachineBasicBlock::iterator(MI);

  // CALLSEQ_START, with all three immediates zero, goes right before MI.
  MachineInstr *FrameSetup = BuildMI(MF, DL, TII.get(TII.getCallFrameSetupOpcode()))
                                 .addImm(0)
                                 .addImm(0)
                                 .addImm(0);
  BB->insert(TLSAddrIt, FrameSetup);

  // CALLSEQ_END, with both immediates zero, goes right after MI. MI is not
  // erased because the original instruction has to stay around.
  MachineInstr *FrameDestroy =
      BuildMI(MF, DL, TII.get(TII.getCallFrameDestroyOpcode()))
          .addImm(0)
          .addImm(0);
  BB->insertAfter(TLSAddrIt, FrameDestroy);

  return BB;
}

26755

26756

/// Expand a Darwin TLS call pseudo.
///
/// The value loaded from the relocation (MI's operand 3, a global) is placed
/// in RDI (x86-64) or EAX and called through indirectly; the result is then
/// in the normal return register. The pseudo is erased. Returns BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
      Subtarget.is64Bit()
          ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
          : Subtarget.getRegisterInfo()->getCallPreservedMask(*F,
                                                              CallingConv::C);

  // Emits, in front of MI:
  //   PtrReg = LoadOpc [BaseReg + 1*noreg + global]
  //   CallOpc [PtrReg]      ; implicitly defines RetReg, clobbers per RegMask
  auto EmitLoadAndCall = [&](unsigned LoadOpc, unsigned CallOpc,
                             unsigned PtrReg, unsigned RetReg,
                             unsigned BaseReg) {
    BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
        .addReg(BaseReg)
        .addImm(0)
        .addReg(0)
        .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                          MI.getOperand(3).getTargetFlags())
        .addReg(0);
    MachineInstrBuilder Call = BuildMI(*BB, MI, DL, TII->get(CallOpc));
    addDirectMem(Call, PtrReg);
    Call.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);
  };

  if (Subtarget.is64Bit()) {
    // 64-bit: RIP-relative load into RDI, result in RAX.
    EmitLoadAndCall(X86::MOV64rm, X86::CALL64m, X86::RDI, X86::RAX, X86::RIP);
  } else if (!isPositionIndependent()) {
    // 32-bit, non-PIC: no base register.
    EmitLoadAndCall(X86::MOV32rm, X86::CALL32m, X86::EAX, X86::EAX, 0);
  } else {
    // 32-bit, PIC: address relative to the function's global base register.
    EmitLoadAndCall(X86::MOV32rm, X86::CALL32m, X86::EAX, X86::EAX,
                    TII->getGlobalBaseReg(F));
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

26818

26819

MachineBasicBlock *

26820

X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,

26821

MachineBasicBlock *MBB) const {

26822

DebugLoc DL = MI.getDebugLoc();

26823

MachineFunction *MF = MBB->getParent();

26824

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

26825

const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

26826

MachineRegisterInfo &MRI = MF->getRegInfo();

26827

26828

const BasicBlock *BB = MBB->getBasicBlock();

26829

MachineFunction::iterator I = ++MBB->getIterator();

26830

26831

// Memory Reference

26832

MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();

26833

MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

26834

26835

unsigned DstReg;

26836

unsigned MemOpndSlot = 0;

26837

26838

unsigned CurOp = 0;

26839

26840

DstReg = MI.getOperand(CurOp++).getReg();

26841

const TargetRegisterClass *RC = MRI.getRegClass(DstReg);

26842

assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!")(static_cast <bool> (TRI->isTypeLegalForClass(*RC, MVT
::i32) && "Invalid destination!") ? void (0) : __assert_fail
("TRI->isTypeLegalForClass(*RC, MVT::i32) && \"Invalid destination!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 26842, __extension__ __PRETTY_FUNCTION__));

26843

(void)TRI;

26844

unsigned mainDstReg = MRI.createVirtualRegister(RC);

26845

unsigned restoreDstReg = MRI.createVirtualRegister(RC);

26846

26847

MemOpndSlot = CurOp;

26848

26849

MVT PVT = getPointerTy(MF->getDataLayout());

26850

assert((PVT == MVT::i64 || PVT == MVT::i32) &&(static_cast <bool> ((PVT == MVT::i64 || PVT == MVT::i32
) && "Invalid Pointer Size!") ? void (0) : __assert_fail
("(PVT == MVT::i64 || PVT == MVT::i32) && \"Invalid Pointer Size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 26851, __extension__ __PRETTY_FUNCTION__))

26851

"Invalid Pointer Size!")(static_cast <bool> ((PVT == MVT::i64 || PVT == MVT::i32
) && "Invalid Pointer Size!") ? void (0) : __assert_fail
("(PVT == MVT::i64 || PVT == MVT::i32) && \"Invalid Pointer Size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 26851, __extension__ __PRETTY_FUNCTION__));

26852

26853

// For v = setjmp(buf), we generate

26854

26855

// thisMBB:

26856

// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB

26857

// SjLjSetup restoreMBB

26858

26859

// mainMBB:

26860

// v_main = 0

26861

26862

// sinkMBB:

26863

// v = phi(main, restore)

26864

26865

// restoreMBB:

26866

// if base pointer being used, load it from frame

26867

// v_restore = 1

26868

26869

MachineBasicBlock *thisMBB = MBB;

26870

MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);

26871

MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);

26872

MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);

26873

MF->insert(I, mainMBB);

26874

MF->insert(I, sinkMBB);

26875

MF->push_back(restoreMBB);

26876

restoreMBB->setHasAddressTaken();

26877

26878

MachineInstrBuilder MIB;

26879

26880

// Transfer the remainder of BB and its successor edges to sinkMBB.

26881

sinkMBB->splice(sinkMBB->begin(), MBB,

26882

std::next(MachineBasicBlock::iterator(MI)), MBB->end());

26883

sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

26884

26885

// thisMBB:

26886

unsigned PtrStoreOpc = 0;

26887

unsigned LabelReg = 0;

26888

const int64_t LabelOffset = 1 * PVT.getStoreSize();

26889

bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&

26890

!isPositionIndependent();

26891

26892

// Prepare IP either in reg or imm.

26893

if (!UseImmLabel) {

26894

PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

26895

const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

26896

LabelReg = MRI.createVirtualRegister(PtrRC);

26897

if (Subtarget.is64Bit()) {

26898

MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)

26899

.addReg(X86::RIP)

26900

.addImm(0)

26901

.addReg(0)

26902

.addMBB(restoreMBB)

26903

.addReg(0);

26904

} else {

26905

const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);

26906

MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)

26907

.addReg(XII->getGlobalBaseReg(MF))

26908

.addImm(0)

26909

.addReg(0)

26910

.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())

26911

.addReg(0);

26912

}

26913

} else

26914

PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;

26915

// Store IP

26916

MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));

26917

for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {

26918

if (i == X86::AddrDisp)

26919

MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);

26920

else

26921

MIB.add(MI.getOperand(MemOpndSlot + i));

26922

}

26923

if (!UseImmLabel)

26924

MIB.addReg(LabelReg);

26925

else

26926

MIB.addMBB(restoreMBB);

26927

MIB.setMemRefs(MMOBegin, MMOEnd);

26928

// Setup

26929

MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))

26930

.addMBB(restoreMBB);

26931

26932

const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

26933

MIB.addRegMask(RegInfo->getNoPreservedMask());

26934

thisMBB->addSuccessor(mainMBB);

26935

thisMBB->addSuccessor(restoreMBB);

26936

26937

// mainMBB:

26938

// EAX = 0

26939

BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);

26940

mainMBB->addSuccessor(sinkMBB);

26941

26942

// sinkMBB:

26943

BuildMI(*sinkMBB, sinkMBB->begin(), DL,

26944

TII->get(X86::PHI), DstReg)

26945

.addReg(mainDstReg).addMBB(mainMBB)

26946

.addReg(restoreDstReg).addMBB(restoreMBB);

26947

26948

// restoreMBB:

26949

if (RegInfo->hasBasePointer(*MF)) {

26950

const bool Uses64BitFramePtr =

26951

Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();

26952

X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();

26953

X86FI->setRestoreBasePointer(MF);

26954

unsigned FramePtr = RegInfo->getFrameRegister(*MF);

26955

unsigned BasePtr = RegInfo->getBaseRegister();

26956

unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;

26957

addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),

26958

FramePtr, true, X86FI->getRestoreBasePointerOffset())

26959

.setMIFlag(MachineInstr::FrameSetup);

26960

}

26961

BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);

26962

BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);

26963

restoreMBB->addSuccessor(sinkMBB);

26964

26965

MI.eraseFromParent();

26966

return sinkMBB;

26967

}

26968

26969

/// Expand an EH_SjLj_LongJmp pseudo instruction in place.
///
/// The memory operands of \p MI (operands 0 .. X86::AddrNumOperands-1)
/// address a buffer of pointer-sized slots. Immediately before \p MI this
/// emits three pointer-sized loads followed by an indirect jump:
///   - slot 0 (displacement unchanged)         -> frame pointer (RBP/EBP)
///   - slot 1 (displacement + 1 * store size)  -> fresh virtual register
///   - slot 2 (displacement + 2 * store size)  -> stack pointer
///   - jump through the virtual register loaded from slot 1
/// Every load inherits the memory references of \p MI, and \p MI is erased
/// afterwards.
///
/// \param MI  The long-jump pseudo instruction to expand.
/// \param MBB The block containing \p MI.
/// \returns \p MBB; no new basic blocks are created.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  MVT PVT = getPointerTy(MF->getDataLayout());
  // Every selection below only tests "PVT == MVT::i64", so any pointer type
  // other than i64/i32 would silently be lowered with the 32-bit opcodes and
  // registers. SetupEntryBlockForSjLj checks the same invariant.
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;

  // Offsets of the resume-address slot and the stack-pointer slot relative to
  // the address described by MI's memory operands.
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Jump
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  // The pseudo instruction has been fully replaced.
  MI.eraseFromParent();
  return MBB;
}

27030

27031

void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,

27032

MachineBasicBlock *MBB,

27033

MachineBasicBlock *DispatchBB,

27034

int FI) const {

27035

DebugLoc DL = MI.getDebugLoc();

27036

MachineFunction *MF = MBB->getParent();

27037

MachineRegisterInfo *MRI = &MF->getRegInfo();

27038

const X86InstrInfo *TII = Subtarget.getInstrInfo();

27039

27040

MVT PVT = getPointerTy(MF->getDataLayout());

27041

assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!")(static_cast <bool> ((PVT == MVT::i64 || PVT == MVT::i32
) && "Invalid Pointer Size!") ? void (0) : __assert_fail
("(PVT == MVT::i64 || PVT == MVT::i32) && \"Invalid Pointer Size!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27041, __extension__ __PRETTY_FUNCTION__));

27042

27043

unsigned Op = 0;

27044

unsigned VR = 0;

27045

27046

bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&

27047

!isPositionIndependent();

27048

27049

if (UseImmLabel) {

27050

Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;

27051

} else {

27052

const TargetRegisterClass *TRC =

27053

(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;

27054

VR = MRI->createVirtualRegister(TRC);

27055

Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

27056

27057

if (Subtarget.is64Bit())

27058

BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)

27059

.addReg(X86::RIP)

27060

.addImm(1)

27061

.addReg(0)

27062

.addMBB(DispatchBB)

27063

.addReg(0);

27064

else

27065

BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)

27066

.addReg(0) /* TII->getGlobalBaseReg(MF) */

27067

.addImm(1)

27068

.addReg(0)

27069

.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())

27070

.addReg(0);

27071

}

27072

27073

MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));

27074

addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);

27075

if (UseImmLabel)

27076

MIB.addMBB(DispatchBB);

27077

else

27078

MIB.addReg(VR);

27079

}

27080

27081

/// Build the SjLj exception dispatch code for the function containing \p BB.
///
/// Steps, in order:
///   1. Collect every EH pad whose leading EH_LABEL symbol has call-site
///      numbers registered on the MachineFunction, and order the pads by
///      call-site number (starting at 1) into a jump-table list.
///   2. Append three new blocks to the function: a dispatch block (marked as
///      an EH pad), a continuation block holding the indirect jump, and a
///      trap block.
///   3. Via SetupEntryBlockForSjLj, store the dispatch block's address into
///      the function-context frame object before \p MI.
///   4. In the dispatch block, load a 32-bit index from the function context,
///      branch to the trap block if it is >= the number of table entries, and
///      otherwise jump through the table from the continuation block.
///   5. Re-point every predecessor of a former landing pad at the dispatch
///      block, and add dead implicit defs of all callee-saved registers that
///      are not already operands to the last call in each such predecessor.
///   6. Clear the EH-pad flag on the former landing pads and erase \p MI.
///
/// \returns \p BB.
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  DebugLoc Loc = MI.getDebugLoc();
  MachineFunction *Func = BB->getParent();
  MachineRegisterInfo &VRegs = Func->getRegInfo();
  const X86InstrInfo *InstrInfo = Subtarget.getInstrInfo();
  const int FI = Func->getFrameInfo().getFunctionContextIndex();

  // Map each call-site number to the landing pads associated with it.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> PadsByCallSite;
  unsigned HighestCallSite = 0;
  for (MachineBasicBlock &Pad : *Func) {
    if (!Pad.isEHPad())
      continue;

    // The pad's symbol is the operand of its first non-debug instruction.
    MCSymbol *PadSym = nullptr;
    for (const auto &MI : Pad) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      PadSym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!Func->hasCallSiteLandingPad(PadSym))
      continue;

    for (unsigned CallSite : Func->getCallSiteLandingPad(PadSym)) {
      PadsByCallSite[CallSite].push_back(&Pad);
      HighestCallSite = std::max(HighestCallSite, CallSite);
    }
  }

  // Flatten the map into the ordered jump-table list and remember every
  // predecessor of a landing pad (the blocks that contain the invokes).
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(PadsByCallSite.size());

  for (unsigned CallSite = 1; CallSite <= HighestCallSite; ++CallSite) {
    for (MachineBasicBlock *Pad : PadsByCallSite[CallSite]) {
      LPadList.push_back(Pad);
      InvokeBBs.insert(Pad->pred_begin(), Pad->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = Func->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad(true);

  MachineBasicBlock *TrapBB = Func->CreateMachineBasicBlock();
  BuildMI(TrapBB, Loc, InstrInfo->get(X86::TRAP));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = Func->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert MBBs.
  Func->push_back(DispatchBB);
  Func->push_back(DispContBB);
  Func->push_back(TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

  // Create the jump table and associated information
  const unsigned JTE = getJumpTableEncoding();
  MachineJumpTableInfo *JumpTables = Func->getOrCreateJumpTableInfo(JTE);
  const unsigned TableIdx = JumpTables->createJumpTableIndex(LPadList);

  const X86RegisterInfo &TRI = InstrInfo->getRegisterInfo();
  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered.
  if (!TRI.hasBasePointer(*Func)) {
    BuildMI(DispatchBB, Loc, InstrInfo->get(X86::NOOP))
        .addRegMask(TRI.getNoPreservedMask());
  } else {
    // Reload the base pointer from its save slot; the reload carries the mask.
    const bool FPIs64Bit =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = Func->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(Func);

    const unsigned FrameReg = TRI.getFrameRegister(*Func);
    const unsigned BaseReg = TRI.getBaseRegister();
    const unsigned ReloadOpc = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(DispatchBB, Loc, InstrInfo->get(ReloadOpc), BaseReg),
                 FrameReg, true, X86FI->getRestoreBasePointerOffset())
        .addRegMask(TRI.getNoPreservedMask());
  }

  // IReg is used as an index in a memory operand and therefore can't be SP
  const unsigned IReg = VRegs.createVirtualRegister(&X86::GR32_NOSPRegClass);
  addFrameReference(
      BuildMI(DispatchBB, Loc, InstrInfo->get(X86::MOV32rm), IReg), FI,
      Subtarget.is64Bit() ? 8 : 4);
  BuildMI(DispatchBB, Loc, InstrInfo->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, Loc, InstrInfo->get(X86::JAE_1)).addMBB(TrapBB);

  if (!Subtarget.is64Bit()) {
    // jmpl *.LJTI0_0(,IReg,4)
    BuildMI(DispContBB, Loc, InstrInfo->get(X86::JMP32m))
        .addReg(0)
        .addImm(4)
        .addReg(IReg)
        .addJumpTableIndex(TableIdx)
        .addReg(0);
  } else {
    const unsigned BReg = VRegs.createVirtualRegister(&X86::GR64RegClass);
    const unsigned IReg64 =
        VRegs.createVirtualRegister(&X86::GR64_NOSPRegClass);

    // leaq .LJTI0_0(%rip), BReg
    BuildMI(DispContBB, Loc, InstrInfo->get(X86::LEA64r), BReg)
        .addReg(X86::RIP)
        .addImm(1)
        .addReg(0)
        .addJumpTableIndex(TableIdx)
        .addReg(0);
    // movzx IReg64, IReg
    BuildMI(DispContBB, Loc, InstrInfo->get(TargetOpcode::SUBREG_TO_REG),
            IReg64)
        .addImm(0)
        .addReg(IReg)
        .addImm(X86::sub_32bit);

    if (JTE == MachineJumpTableInfo::EK_BlockAddress) {
      // jmpq *(BReg,IReg64,8)
      BuildMI(DispContBB, Loc, InstrInfo->get(X86::JMP64m))
          .addReg(BReg)
          .addImm(8)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
    } else if (JTE == MachineJumpTableInfo::EK_LabelDifference32) {
      const unsigned OReg = VRegs.createVirtualRegister(&X86::GR32RegClass);
      const unsigned OReg64 = VRegs.createVirtualRegister(&X86::GR64RegClass);
      const unsigned TReg = VRegs.createVirtualRegister(&X86::GR64RegClass);

      // movl (BReg,IReg64,4), OReg
      BuildMI(DispContBB, Loc, InstrInfo->get(X86::MOV32rm), OReg)
          .addReg(BReg)
          .addImm(4)
          .addReg(IReg64)
          .addImm(0)
          .addReg(0);
      // movsx OReg64, OReg
      BuildMI(DispContBB, Loc, InstrInfo->get(X86::MOVSX64rr32), OReg64)
          .addReg(OReg);
      // addq BReg, OReg64, TReg
      BuildMI(DispContBB, Loc, InstrInfo->get(X86::ADD64rr), TReg)
          .addReg(OReg64)
          .addReg(BReg);
      // jmpq *TReg
      BuildMI(DispContBB, Loc, InstrInfo->get(X86::JMP64r)).addReg(TReg);
    } else {
      llvm_unreachable("Unexpected jump table encoding");
    }
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 8> AlreadyAdded;
  for (MachineBasicBlock *Pad : LPadList) {
    if (!AlreadyAdded.insert(Pad).second)
      continue; // Each distinct pad becomes a successor exactly once.
    DispContBB->addSuccessor(Pad);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> FormerLPads;
  const MCPhysReg *SavedRegs = Func->getRegInfo().getCalleeSavedRegs();
  for (MachineBasicBlock *InvokeBB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> SuccSnapshot(InvokeBB->succ_rbegin(),
                                                     InvokeBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (MachineBasicBlock *Succ : SuccSnapshot) {
      if (!Succ->isEHPad())
        continue;
      InvokeBB->removeSuccessor(Succ);
      FormerLPads.push_back(Succ);
    }

    InvokeBB->addSuccessor(DispatchBB);

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &Inst : reverse(*InvokeBB)) {
      if (Inst.isCall()) {
        // Registers that already appear as operands of the call.
        DenseMap<unsigned, bool> AlreadyOperand;
        for (auto &Operand : Inst.operands()) {
          if (Operand.isReg())
            AlreadyOperand[Operand.getReg()] = true;
        }

        // SavedRegs is a zero-terminated list.
        MachineInstrBuilder CallMIB(*Func, &Inst);
        for (const MCPhysReg *CSR = SavedRegs; *CSR; ++CSR) {
          const unsigned Reg = *CSR;
          if (!AlreadyOperand.lookup(Reg))
            CallMIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
        }

        // Only the last call in the block is annotated.
        break;
      }
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (MachineBasicBlock *Pad : FormerLPads)
    Pad->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
  return BB;
}

27309

27310

/// Expand a pseudo instruction that requested custom insertion.
///
/// Dispatches on the opcode of \p MI. Most opcodes are forwarded to a
/// dedicated helper and that helper's result is returned. A few are expanded
/// inline (RDFLAGS/WRFLAGS, the FPxx_TO_INTyy_IN_MEM family, LCMPXCHG8B and
/// the *_SAVE_?BX compare-exchange pseudos). Some are left untouched, in
/// which case \p BB is returned unchanged.
///
/// \param MI The pseudo instruction to expand.
/// \param BB The block containing \p MI.
/// \returns \p BB, or whatever block the invoked helper returns.
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TAILJMPd64:
  case X86::TAILJMPr64:
  case X86::TAILJMPm64:
  case X86::TAILJMPr64_REX:
  case X86::TAILJMPm64_REX:
    llvm_unreachable("TAILJMP64 would not be touched here.");
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
  case X86::TLS_addr32:
  case X86::TLS_addr64:
  case X86::TLS_base_addr32:
  case X86::TLS_base_addr64:
    return EmitLoweredTLSAddr(MI, BB);
  case X86::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);
  case X86::CATCHPAD:
    return EmitLoweredCatchPad(MI, BB);
  case X86::SEG_ALLOCA_32:
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_FR128:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return EmitLoweredSelect(MI, BB);

  case X86::RDFLAGS32:
  case X86::RDFLAGS64: {
    // Read the flags by pushing them and popping into the result register.
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
    // Permit reads of the FLAGS register without it being defined.
    // This intrinsic exists to read external processor state in flags, such as
    // the trap flag, interrupt flag, and direction flag, none of which are
    // modeled by the backend.
    Push->getOperand(2).setIsUndef();
    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::WRFLAGS32:
  case X86::WRFLAGS64: {
    // Write the flags by pushing the source register and popping into FLAGS.
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
    BuildMI(*BB, MI, DL, TII->get(PopF));

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::RELEASE_FADD32mr:
  case X86::RELEASE_FADD64mr:
    return EmitLoweredAtomicFP(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    // Emit the integer store to the address taken from MI's memory operands;
    // the value register follows those operands.
    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
  // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

  // String/text processing lowering.
  case X86::PCMPISTRIREG:
  case X86::VPCMPISTRIREG:
  case X86::PCMPISTRIMEM:
  case X86::VPCMPISTRIMEM:
  case X86::PCMPESTRIREG:
  case X86::VPCMPESTRIREG:
  case X86::PCMPESTRIMEM:
  case X86::VPCMPESTRIMEM:
    // Same feature requirement as the PCMPxSTRM pseudos handled just above.
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

  // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
  case X86::MONITORX:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

  // Cache line zero
  case X86::CLZERO:
    return emitClzero(&MI, BB, Subtarget);

  // PKU feature
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  // xbegin
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

  case X86::VAARG_64:
    return EmitVAARG64WithCustomInserter(MI, BB);

  case X86::EH_SjLj_SetJmp32:
  case X86::EH_SjLj_SetJmp64:
    return emitEHSjLjSetJmp(MI, BB);

  case X86::EH_SjLj_LongJmp32:
  case X86::EH_SjLj_LongJmp64:
    return emitEHSjLjLongJmp(MI, BB);

  case X86::Int_eh_sjlj_setup_dispatch:
    return EmitSjLjDispatchBlock(MI, BB);

  case TargetOpcode::STATEPOINT:
    // As an implementation detail, STATEPOINT shares the STACKMAP format at
    // this point in the process.  We diverge later.
    return emitPatchPoint(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // Do nothing here, handle in xray instrumentation pass.
    return BB;

  case X86::LCMPXCHG8B: {
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
    // requires a memory operand. If it happens that current architecture is
    // i686 and for current function we need a base pointer
    // - which is ESI for i686 - register allocator would not be able to
    // allocate registers for an address in form of X(%reg, %reg, Y)
    // - there never would be enough unreserved registers during regalloc
    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
    // We are giving a hand to register allocator by precomputing the address in
    // a new vreg using LEA.

    // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;

    // Even though this code does not necessarily needs the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, there are
    // some changes happened in the compiler base pointer handling, which most
    // probably have to be addressed somehow here.
    assert(TRI->getBaseRegister() == X86::ESI &&
           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
           "base pointer in mind");

    MachineRegisterInfo &MRI = MF->getRegInfo();
    MVT SPTy = getPointerTy(MF->getDataLayout());
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;

    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
    // four operand definitions that are E[ABCD] registers. We skip them and
    // then insert the LEA.
    MachineBasicBlock::iterator MBBI(MI);
    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

    // Make the compare-exchange address its memory through the new register.
    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    // Ensure EBX/RBX is recorded as live-in to the block holding the pseudo.
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
    if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}

27597

27598

//===----------------------------------------------------------------------===//

27599

// X86 Optimization Hooks

27600

//===----------------------------------------------------------------------===//

27601

27602

void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,

27603

KnownBits &Known,

27604

const APInt &DemandedElts,

27605

const SelectionDAG &DAG,

27606

unsigned Depth) const {

27607

unsigned BitWidth = Known.getBitWidth();

27608

unsigned Opc = Op.getOpcode();

27609

EVT VT = Op.getValueType();

27610

assert((Opc >= ISD::BUILTIN_OP_END ||(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__))

27611

Opc == ISD::INTRINSIC_WO_CHAIN ||(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__))

27612

Opc == ISD::INTRINSIC_W_CHAIN ||(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__))

27613

Opc == ISD::INTRINSIC_VOID) &&(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__))

27614

"Should use MaskedValueIsZero if you don't know whether Op"(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__))

27615

" is a target node!")(static_cast <bool> ((Opc >= ISD::BUILTIN_OP_END || Opc
== ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!") ? void (0) : __assert_fail ("(Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && \"Should use MaskedValueIsZero if you don't know whether Op\" \" is a target node!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27615, __extension__ __PRETTY_FUNCTION__));

27616

27617

Known.resetAll();

27618

switch (Opc) {

27619

default: break;

27620

case X86ISD::SETCC:

27621

Known.Zero.setBitsFrom(1);

27622

break;

27623

case X86ISD::MOVMSK: {

27624

unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();

27625

Known.Zero.setBitsFrom(NumLoBits);

27626

break;

27627

}

27628

case X86ISD::PEXTRB:

27629

case X86ISD::PEXTRW: {

27630

SDValue Src = Op.getOperand(0);

27631

EVT SrcVT = Src.getValueType();

27632

APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),

27633

Op.getConstantOperandVal(1));

27634

DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);

27635

Known = Known.zextOrTrunc(BitWidth);

27636

Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());

27637

break;

27638

}

27639

case X86ISD::VSHLI:

27640

case X86ISD::VSRLI: {

27641

if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {

27642

if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {

27643

Known.setAllZero();

27644

break;

27645

}

27646

27647

DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);

27648

unsigned ShAmt = ShiftImm->getZExtValue();

27649

if (Opc == X86ISD::VSHLI) {

27650

Known.Zero <<= ShAmt;

27651

Known.One <<= ShAmt;

27652

// Low bits are known zero.

27653

Known.Zero.setLowBits(ShAmt);

27654

} else {

27655

Known.Zero.lshrInPlace(ShAmt);

27656

Known.One.lshrInPlace(ShAmt);

27657

// High bits are known zero.

27658

Known.Zero.setHighBits(ShAmt);

27659

}

27660

}

27661

break;

27662

}

27663

case X86ISD::VZEXT: {

27664

// TODO: Add DemandedElts support.

27665

SDValue N0 = Op.getOperand(0);

27666

unsigned NumElts = VT.getVectorNumElements();

27667

27668

EVT SrcVT = N0.getValueType();

27669

unsigned InNumElts = SrcVT.getVectorNumElements();

27670

unsigned InBitWidth = SrcVT.getScalarSizeInBits();

27671

assert(InNumElts >= NumElts && "Illegal VZEXT input")(static_cast <bool> (InNumElts >= NumElts &&
"Illegal VZEXT input") ? void (0) : __assert_fail ("InNumElts >= NumElts && \"Illegal VZEXT input\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27671, __extension__ __PRETTY_FUNCTION__));

27672

27673

Known = KnownBits(InBitWidth);

27674

APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);

27675

DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);

27676

Known = Known.zext(BitWidth);

27677

Known.Zero.setBitsFrom(InBitWidth);

27678

break;

27679

}

27680

case X86ISD::CMOV: {

27681

DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);

27682

// If we don't know any bits, early out.

27683

if (Known.isUnknown())

27684

break;

27685

KnownBits Known2;

27686

DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);

27687

27688

// Only known if known in both the LHS and RHS.

27689

Known.One &= Known2.One;

27690

Known.Zero &= Known2.Zero;

27691

break;

27692

}

27693

case X86ISD::UDIVREM8_ZEXT_HREG:

27694

// TODO: Support more than just the zero extended bits?

27695

if (Op.getResNo() != 1)

27696

break;

27697

// The remainder is zero extended.

27698

Known.Zero.setBitsFrom(8);

27699

break;

27700

}

27701

}

27702

27703

unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(

27704

SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,

27705

unsigned Depth) const {

27706

unsigned VTBits = Op.getScalarValueSizeInBits();

27707

unsigned Opcode = Op.getOpcode();

27708

switch (Opcode) {

27709

case X86ISD::SETCC_CARRY:

27710

// SETCC_CARRY sets the dest to ~0 for true or 0 for false.

27711

return VTBits;

27712

27713

case X86ISD::VSEXT: {

27714

// TODO: Add DemandedElts support.

27715

SDValue Src = Op.getOperand(0);

27716

unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);

27717

Tmp += VTBits - Src.getScalarValueSizeInBits();

27718

return Tmp;

27719

}

27720

27721

case X86ISD::VTRUNC: {

27722

// TODO: Add DemandedElts support.

27723

SDValue Src = Op.getOperand(0);

27724

unsigned NumSrcBits = Src.getScalarValueSizeInBits();

27725

assert(VTBits < NumSrcBits && "Illegal truncation input type")(static_cast <bool> (VTBits < NumSrcBits && "Illegal truncation input type"
) ? void (0) : __assert_fail ("VTBits < NumSrcBits && \"Illegal truncation input type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27725, __extension__ __PRETTY_FUNCTION__));

27726

unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);

27727

if (Tmp > (NumSrcBits - VTBits))

27728

return Tmp - (NumSrcBits - VTBits);

27729

return 1;

27730

}

27731

27732

case X86ISD::PACKSS: {

27733

// PACKSS is just a truncation if the sign bits extend to the packed size.

27734

// TODO: Add DemandedElts support.

27735

unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();

27736

unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);

27737

unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);

27738

unsigned Tmp = std::min(Tmp0, Tmp1);

27739

if (Tmp > (SrcBits - VTBits))

27740

return Tmp - (SrcBits - VTBits);

27741

return 1;

27742

}

27743

27744

case X86ISD::VSHLI: {

27745

SDValue Src = Op.getOperand(0);

27746

APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();

27747

if (ShiftVal.uge(VTBits))

27748

return VTBits; // Shifted all bits out --> zero.

27749

unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);

27750

if (ShiftVal.uge(Tmp))

27751

return 1; // Shifted all sign bits out --> unknown.

27752

return Tmp - ShiftVal.getZExtValue();

27753

}

27754

27755

case X86ISD::VSRAI: {

27756

SDValue Src = Op.getOperand(0);

27757

APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();

27758

if (ShiftVal.uge(VTBits - 1))

27759

return VTBits; // Sign splat.

27760

unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);

27761

ShiftVal += Tmp;

27762

return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();

27763

}

27764

27765

case X86ISD::PCMPGT:

27766

case X86ISD::PCMPEQ:

27767

case X86ISD::CMPP:

27768

case X86ISD::VPCOM:

27769

case X86ISD::VPCOMU:

27770

// Vector compares return zero/all-bits result values.

27771

return VTBits;

27772

27773

case X86ISD::CMOV: {

27774

unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);

27775

if (Tmp0 == 1) return 1; // Early out.

27776

unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);

27777

return std::min(Tmp0, Tmp1);

27778

}

27779

case X86ISD::SDIVREM8_SEXT_HREG:

27780

// TODO: Support more than just the sign extended bits?

27781

if (Op.getResNo() != 1)

27782

break;

27783

// The remainder is sign extended.

27784

return VTBits - 7;

27785

}

27786

27787

// Fallback case.

27788

return 1;

27789

}

27790

27791

/// Strip an X86ISD::Wrapper / X86ISD::WrapperRIP node and return the value it
/// wraps (operand 0); any other node is returned unchanged.
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  switch (N->getOpcode()) {
  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    return N->getOperand(0);
  default:
    return N;
  }
}

27796

27797

/// Returns true (and the GlobalValue and the offset) if the node is a

27798

/// GlobalAddress + offset.

27799

bool X86TargetLowering::isGAPlusOffset(SDNode *N,

27800

const GlobalValue* &GA,

27801

int64_t &Offset) const {

27802

if (N->getOpcode() == X86ISD::Wrapper) {

27803

if (isa<GlobalAddressSDNode>(N->getOperand(0))) {

27804

GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();

27805

Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();

27806

return true;

27807

}

27808

}

27809

return TargetLowering::isGAPlusOffset(N, GA, Offset);

27810

}

27811

27812

// Attempt to match a combined shuffle mask against supported unary shuffle

27813

// instructions.

27814

// TODO: Investigate sharing more of this with shuffle lowering.

27815

static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,

27816

bool AllowFloatDomain, bool AllowIntDomain,

27817

SDValue &V1, SDLoc &DL, SelectionDAG &DAG,

27818

const X86Subtarget &Subtarget,

27819

unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {

27820

unsigned NumMaskElts = Mask.size();

27821

unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

27822

27823

// Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.

27824

// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).

27825

if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||

27826

(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {

27827

unsigned MaxScale = 64 / MaskEltSize;

27828

for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {

27829

bool Match = true;

27830

unsigned NumDstElts = NumMaskElts / Scale;

27831

for (unsigned i = 0; i != NumDstElts && Match; ++i) {

27832

Match &= isUndefOrEqual(Mask[i * Scale], (int)i);

27833

Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);

27834

}

27835

if (Match) {

27836

unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);

27837

MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :

27838

MVT::getIntegerVT(MaskEltSize);

27839

SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

27840

27841

if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {

27842

V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

27843

Shuffle = unsigned(X86ISD::VZEXT);

27844

} else

27845

Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);

27846

27847

DstVT = MVT::getIntegerVT(Scale * MaskEltSize);

27848

DstVT = MVT::getVectorVT(DstVT, NumDstElts);

27849

return true;

27850

}

27851

}

27852

}

27853

27854

// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).

27855

if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&

27856

isUndefOrEqual(Mask[0], 0) &&

27857

isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {

27858

Shuffle = X86ISD::VZEXT_MOVL;

27859

SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;

27860

return true;

27861

}

27862

27863

// Check if we have SSE3 which will let us use MOVDDUP etc. The

27864

// instructions are no slower than UNPCKLPD but has the option to

27865

// fold the input operand into even an unaligned memory load.

27866

if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {

27867

if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {

27868

Shuffle = X86ISD::MOVDDUP;

27869

SrcVT = DstVT = MVT::v2f64;

27870

return true;

27871

}

27872

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {

27873

Shuffle = X86ISD::MOVSLDUP;

27874

SrcVT = DstVT = MVT::v4f32;

27875

return true;

27876

}

27877

if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {

27878

Shuffle = X86ISD::MOVSHDUP;

27879

SrcVT = DstVT = MVT::v4f32;

27880

return true;

27881

}

27882

}

27883

27884

if (MaskVT.is256BitVector() && AllowFloatDomain) {

27885

assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles")(static_cast <bool> (Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"
) ? void (0) : __assert_fail ("Subtarget.hasAVX() && \"AVX required for 256-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27885, __extension__ __PRETTY_FUNCTION__));

27886

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {

27887

Shuffle = X86ISD::MOVDDUP;

27888

SrcVT = DstVT = MVT::v4f64;

27889

return true;

27890

}

27891

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {

27892

Shuffle = X86ISD::MOVSLDUP;

27893

SrcVT = DstVT = MVT::v8f32;

27894

return true;

27895

}

27896

if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {

27897

Shuffle = X86ISD::MOVSHDUP;

27898

SrcVT = DstVT = MVT::v8f32;

27899

return true;

27900

}

27901

}

27902

27903

if (MaskVT.is512BitVector() && AllowFloatDomain) {

27904

assert(Subtarget.hasAVX512() &&(static_cast <bool> (Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"AVX512 required for 512-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27905, __extension__ __PRETTY_FUNCTION__))

27905

"AVX512 required for 512-bit vector shuffles")(static_cast <bool> (Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"
) ? void (0) : __assert_fail ("Subtarget.hasAVX512() && \"AVX512 required for 512-bit vector shuffles\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27905, __extension__ __PRETTY_FUNCTION__));

27906

if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {

27907

Shuffle = X86ISD::MOVDDUP;

27908

SrcVT = DstVT = MVT::v8f64;

27909

return true;

27910

}

27911

if (isTargetShuffleEquivalent(

27912

Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {

27913

Shuffle = X86ISD::MOVSLDUP;

27914

SrcVT = DstVT = MVT::v16f32;

27915

return true;

27916

}

27917

if (isTargetShuffleEquivalent(

27918

Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {

27919

Shuffle = X86ISD::MOVSHDUP;

27920

SrcVT = DstVT = MVT::v16f32;

27921

return true;

27922

}

27923

}

27924

27925

// Attempt to match against broadcast-from-vector.

27926

if (Subtarget.hasAVX2()) {

27927

SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);

27928

if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {

27929

SrcVT = DstVT = MaskVT;

27930

Shuffle = X86ISD::VBROADCAST;

27931

return true;

27932

}

27933

}

27934

27935

return false;

27936

}

27937

27938

// Attempt to match a combined shuffle mask against supported unary immediate

27939

// permute instructions.

27940

// TODO: Investigate sharing more of this with shuffle lowering.

27941

static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,

27942

const APInt &Zeroable,

27943

bool AllowFloatDomain,

27944

bool AllowIntDomain,

27945

const X86Subtarget &Subtarget,

27946

unsigned &Shuffle, MVT &ShuffleVT,

27947

unsigned &PermuteImm) {

27948

unsigned NumMaskElts = Mask.size();

27949

unsigned InputSizeInBits = MaskVT.getSizeInBits();

27950

unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;

27951

MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

27952

27953

bool ContainsZeros =

27954

llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

27955

27956

// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.

27957

if (!ContainsZeros && MaskScalarSizeInBits == 64) {

27958

// Check for lane crossing permutes.

27959

if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {

27960

// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).

27961

if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {

27962

Shuffle = X86ISD::VPERMI;

27963

ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);

27964

PermuteImm = getV4X86ShuffleImm(Mask);

27965

return true;

27966

}

27967

if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {

27968

SmallVector<int, 4> RepeatedMask;

27969

if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {

27970

Shuffle = X86ISD::VPERMI;

27971

ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);

27972

PermuteImm = getV4X86ShuffleImm(RepeatedMask);

27973

return true;

27974

}

27975

}

27976

} else if (AllowFloatDomain && Subtarget.hasAVX()) {

27977

// VPERMILPD can permute with a non-repeating shuffle.

27978

Shuffle = X86ISD::VPERMILPI;

27979

ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());

27980

PermuteImm = 0;

27981

for (int i = 0, e = Mask.size(); i != e; ++i) {

27982

int M = Mask[i];

27983

if (M == SM_SentinelUndef)

27984

continue;

27985

assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index")(static_cast <bool> (((M / 2) == (i / 2)) && "Out of range shuffle mask index"
) ? void (0) : __assert_fail ("((M / 2) == (i / 2)) && \"Out of range shuffle mask index\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27985, __extension__ __PRETTY_FUNCTION__));

27986

PermuteImm |= (M & 1) << i;

27987

}

27988

return true;

27989

}

27990

}

27991

27992

// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.

27993

// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we

27994

// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).

27995

if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&

27996

!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {

27997

SmallVector<int, 4> RepeatedMask;

27998

if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {

27999

// Narrow the repeated mask to create 32-bit element permutes.

28000

SmallVector<int, 4> WordMask = RepeatedMask;

28001

if (MaskScalarSizeInBits == 64)

28002

scaleShuffleMask<int>(2, RepeatedMask, WordMask);

28003

28004

Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);

28005

ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);

28006

ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);

28007

PermuteImm = getV4X86ShuffleImm(WordMask);

28008

return true;

28009

}

28010

}

28011

28012

// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.

28013

if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {

28014

SmallVector<int, 4> RepeatedMask;

28015

if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {

28016

ArrayRef<int> LoMask(Mask.data() + 0, 4);

28017

ArrayRef<int> HiMask(Mask.data() + 4, 4);

28018

28019

// PSHUFLW: permute lower 4 elements only.

28020

if (isUndefOrInRange(LoMask, 0, 4) &&

28021

isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {

28022

Shuffle = X86ISD::PSHUFLW;

28023

ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);

28024

PermuteImm = getV4X86ShuffleImm(LoMask);

28025

return true;

28026

}

28027

28028

// PSHUFHW: permute upper 4 elements only.

28029

if (isUndefOrInRange(HiMask, 4, 8) &&

28030

isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {

28031

// Offset the HiMask so that we can create the shuffle immediate.

28032

int OffsetHiMask[4];

28033

for (int i = 0; i != 4; ++i)

28034

OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

28035

28036

Shuffle = X86ISD::PSHUFHW;

28037

ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);

28038

PermuteImm = getV4X86ShuffleImm(OffsetHiMask);

28039

return true;

28040

}

28041

}

28042

}

28043

28044

// Attempt to match against byte/bit shifts.

28045

// FIXME: Add 512-bit support.

28046

if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

28047

(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {

28048

int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,

28049

MaskScalarSizeInBits, Mask,

28050

0, Zeroable, Subtarget);

28051

if (0 < ShiftAmt) {

28052

PermuteImm = (unsigned)ShiftAmt;

28053

return true;

28054

}

28055

}

28056

28057

return false;

28058

}

28059

28060

// Attempt to match a combined unary shuffle mask against supported binary

28061

// shuffle instructions.

28062

// TODO: Investigate sharing more of this with shuffle lowering.

28063

static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,

28064

bool AllowFloatDomain, bool AllowIntDomain,

28065

SDValue &V1, SDValue &V2, SDLoc &DL,

28066

SelectionDAG &DAG,

28067

const X86Subtarget &Subtarget,

28068

unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,

28069

bool IsUnary) {

28070

unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

28071

28072

if (MaskVT.is128BitVector()) {

28073

if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {

28074

V2 = V1;

28075

Shuffle = X86ISD::MOVLHPS;

28076

SrcVT = DstVT = MVT::v4f32;

28077

return true;

28078

}

28079

if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {

28080

V2 = V1;

28081

Shuffle = X86ISD::MOVHLPS;

28082

SrcVT = DstVT = MVT::v4f32;

28083

return true;

28084

}

28085

if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&

28086

(AllowFloatDomain || !Subtarget.hasSSE41())) {

28087

std::swap(V1, V2);

28088

Shuffle = X86ISD::MOVSD;

28089

SrcVT = DstVT = MaskVT;

28090

return true;

28091

}

28092

if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&

28093

(AllowFloatDomain || !Subtarget.hasSSE41())) {

28094

Shuffle = X86ISD::MOVSS;

28095

SrcVT = DstVT = MaskVT;

28096

return true;

28097

}

28098

}

28099

28100

// Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.

28101

// TODO add support for 256/512-bit types.

28102

if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {

28103

if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,

28104

Subtarget)) {

28105

DstVT = MaskVT;

28106

return true;

28107

}

28108

}

28109

28110

// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.

28111

if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||

28112

(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

28113

(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||

28114

(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||

28115

(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {

28116

if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,

28117

DAG, Subtarget)) {

28118

SrcVT = DstVT = MaskVT;

28119

if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())

28120

SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);

28121

return true;

28122

}

28123

}

28124

28125

return false;

28126

}

28127

28128

static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,

28129

const APInt &Zeroable,

28130

bool AllowFloatDomain,

28131

bool AllowIntDomain,

28132

SDValue &V1, SDValue &V2, SDLoc &DL,

28133

SelectionDAG &DAG,

28134

const X86Subtarget &Subtarget,

28135

unsigned &Shuffle, MVT &ShuffleVT,

28136

unsigned &PermuteImm) {

28137

unsigned NumMaskElts = Mask.size();

28138

unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

28139

28140

// Attempt to match against PALIGNR byte rotate.

28141

if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||

28142

(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {

28143

int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);

28144

if (0 < ByteRotation) {

28145

Shuffle = X86ISD::PALIGNR;

28146

ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);

28147

PermuteImm = ByteRotation;

28148

return true;

28149

}

28150

}

28151

28152

// Attempt to combine to X86ISD::BLENDI.

28153

if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||

28154

(Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||

28155

(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {

28156

uint64_t BlendMask = 0;

28157

bool ForceV1Zero = false, ForceV2Zero = false;

28158

SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());

28159

if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,

28160

BlendMask)) {

28161

if (MaskVT == MVT::v16i16) {

28162

// We can only use v16i16 PBLENDW if the lanes are repeated.

28163

SmallVector<int, 8> RepeatedMask;

28164

if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,

28165

RepeatedMask)) {

28166

assert(RepeatedMask.size() == 8 &&(static_cast <bool> (RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!") ? void (0) : __assert_fail
("RepeatedMask.size() == 8 && \"Repeated mask size doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28167, __extension__ __PRETTY_FUNCTION__))

28167

"Repeated mask size doesn't match!")(static_cast <bool> (RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!") ? void (0) : __assert_fail
("RepeatedMask.size() == 8 && \"Repeated mask size doesn't match!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28167, __extension__ __PRETTY_FUNCTION__));

28168

PermuteImm = 0;

28169

for (int i = 0; i < 8; ++i)

28170

if (RepeatedMask[i] >= 8)

28171

PermuteImm |= 1 << i;

28172

V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;

28173

V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;

28174

Shuffle = X86ISD::BLENDI;

28175

ShuffleVT = MaskVT;

28176

return true;

28177

}

28178

} else {

28179

// Determine a type compatible with X86ISD::BLENDI.

28180

ShuffleVT = MaskVT;

28181

if (Subtarget.hasAVX2()) {

28182

if (ShuffleVT == MVT::v4i64)

28183

ShuffleVT = MVT::v8i32;

28184

else if (ShuffleVT == MVT::v2i64)

28185

ShuffleVT = MVT::v4i32;

28186

} else {

28187

if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)

28188

ShuffleVT = MVT::v8i16;

28189

else if (ShuffleVT == MVT::v4i64)

28190

ShuffleVT = MVT::v4f64;

28191

else if (ShuffleVT == MVT::v8i32)

28192

ShuffleVT = MVT::v8f32;

28193

}

28194

28195

if (!ShuffleVT.isFloatingPoint()) {

28196

int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();

28197

BlendMask =

28198

scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);

28199

ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);

28200

ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);

28201

}

28202

28203

V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;

28204

V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;

28205

PermuteImm = (unsigned)BlendMask;

28206

Shuffle = X86ISD::BLENDI;

28207

return true;

28208

}

28209

}

28210

}

28211

28212

// Attempt to combine to INSERTPS.

28213

if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&

28214

MaskVT.is128BitVector()) {

28215

if (Zeroable.getBoolValue() &&

28216

matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {

28217

Shuffle = X86ISD::INSERTPS;

28218

ShuffleVT = MVT::v4f32;

28219

return true;

28220

}

28221

}

28222

28223

// Attempt to combine to SHUFPD.

28224

if (AllowFloatDomain && EltSizeInBits == 64 &&

28225

((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||

28226

(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||

28227

(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {

28228

if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {

28229

Shuffle = X86ISD::SHUFP;

28230

ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);

28231

return true;

28232

}

28233

}

28234

28235

// Attempt to combine to SHUFPS.

28236

if (AllowFloatDomain && EltSizeInBits == 32 &&

28237

((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||

28238

(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||

28239

(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {

28240

SmallVector<int, 4> RepeatedMask;

28241

if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {

28242

// Match each half of the repeated mask, to determine if its just

28243

// referencing one of the vectors, is zeroable or entirely undef.

28244

auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {

28245

int M0 = RepeatedMask[Offset];

28246

int M1 = RepeatedMask[Offset + 1];

28247

28248

if (isUndefInRange(RepeatedMask, Offset, 2)) {

28249

return DAG.getUNDEF(MaskVT);

28250

} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {

28251

S0 = (SM_SentinelUndef == M0 ? -1 : 0);

28252

S1 = (SM_SentinelUndef == M1 ? -1 : 1);

28253

return getZeroVector(MaskVT, Subtarget, DAG, DL);

28254

} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {

28255

S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);

28256

S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);

28257

return V1;

28258

} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {

28259

S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);

28260

S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);

28261

return V2;

28262

}

28263

28264

return SDValue();

28265

};

28266

28267

int ShufMask[4] = {-1, -1, -1, -1};

28268

SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);

28269

SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

28270

28271

if (Lo && Hi) {

28272

V1 = Lo;

28273

V2 = Hi;

28274

Shuffle = X86ISD::SHUFP;

28275

ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);

28276

PermuteImm = getV4X86ShuffleImm(ShufMask);

28277

return true;

28278

}

28279

}

28280

}

28281

28282

return false;

28283

}

28284

28285

/// \brief Combine an arbitrary chain of shuffles into a single instruction if

28286

/// possible.

28287

///

28288

/// This is the leaf of the recursive combine below. When we have found some

28289

/// chain of single-use x86 shuffle instructions and accumulated the combined

28290

/// shuffle mask represented by them, this will try to pattern match that mask

28291

/// into either a single instruction if there is a special purpose instruction

28292

/// for this operation, or into a PSHUFB instruction which is a fully general

28293

/// instruction but should only be used to replace chains over a certain depth.

28294

static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

28295

ArrayRef<int> BaseMask, int Depth,

28296

bool HasVariableMask, SelectionDAG &DAG,

28297

TargetLowering::DAGCombinerInfo &DCI,

28298

const X86Subtarget &Subtarget) {

28299

assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!")(static_cast <bool> (!BaseMask.empty() && "Cannot combine an empty shuffle mask!"
) ? void (0) : __assert_fail ("!BaseMask.empty() && \"Cannot combine an empty shuffle mask!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28299, __extension__ __PRETTY_FUNCTION__));

28300

assert((Inputs.size() == 1 || Inputs.size() == 2) &&(static_cast <bool> ((Inputs.size() == 1 || Inputs.size
() == 2) && "Unexpected number of shuffle inputs!") ?
void (0) : __assert_fail ("(Inputs.size() == 1 || Inputs.size() == 2) && \"Unexpected number of shuffle inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28301, __extension__ __PRETTY_FUNCTION__))

28301

"Unexpected number of shuffle inputs!")(static_cast <bool> ((Inputs.size() == 1 || Inputs.size
() == 2) && "Unexpected number of shuffle inputs!") ?
void (0) : __assert_fail ("(Inputs.size() == 1 || Inputs.size() == 2) && \"Unexpected number of shuffle inputs!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28301, __extension__ __PRETTY_FUNCTION__));

28302

28303

// Find the inputs that enter the chain. Note that multiple uses are OK

28304

// here, we're not going to remove the operands we find.

28305

bool UnaryShuffle = (Inputs.size() == 1);

28306

SDValue V1 = peekThroughBitcasts(Inputs[0]);

28307

SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())

28308

: peekThroughBitcasts(Inputs[1]));

28309

28310

MVT VT1 = V1.getSimpleValueType();

28311

MVT VT2 = V2.getSimpleValueType();

28312

MVT RootVT = Root.getSimpleValueType();

28313

assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&(static_cast <bool> (VT1.getSizeInBits() == RootVT.getSizeInBits
() && VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch") ? void (0) : __assert_fail ("VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && \"Vector size mismatch\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28315, __extension__ __PRETTY_FUNCTION__))

28314

VT2.getSizeInBits() == RootVT.getSizeInBits() &&(static_cast <bool> (VT1.getSizeInBits() == RootVT.getSizeInBits
() && VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch") ? void (0) : __assert_fail ("VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && \"Vector size mismatch\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28315, __extension__ __PRETTY_FUNCTION__))

28315

"Vector size mismatch")(static_cast <bool> (VT1.getSizeInBits() == RootVT.getSizeInBits
() && VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch") ? void (0) : __assert_fail ("VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && \"Vector size mismatch\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28315, __extension__ __PRETTY_FUNCTION__));

28316

28317

SDLoc DL(Root);

28318

SDValue Res;

28319

28320

unsigned NumBaseMaskElts = BaseMask.size();

28321

if (NumBaseMaskElts == 1) {

28322

assert(BaseMask[0] == 0 && "Invalid shuffle index found!")(static_cast <bool> (BaseMask[0] == 0 && "Invalid shuffle index found!"
) ? void (0) : __assert_fail ("BaseMask[0] == 0 && \"Invalid shuffle index found!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28322, __extension__ __PRETTY_FUNCTION__));

28323

return DAG.getBitcast(RootVT, V1);

28324

}

28325

28326

unsigned RootSizeInBits = RootVT.getSizeInBits();

28327

unsigned NumRootElts = RootVT.getVectorNumElements();

28328

unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;

28329

bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||

28330

(RootVT.is256BitVector() && !Subtarget.hasAVX2());

28331

28332

// Don't combine if we are a AVX512/EVEX target and the mask element size

28333

// is different from the root element size - this would prevent writemasks

28334

// from being reused.

28335

// TODO - this currently prevents all lane shuffles from occurring.

28336

// TODO - check for writemasks usage instead of always preventing combining.

28337

// TODO - attempt to narrow Mask back to writemask size.

28338

bool IsEVEXShuffle =

28339

RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);

28340

if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))

28341

return SDValue();

28342

28343

// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

28344

28345

// Handle 128-bit lane shuffles of 256-bit vectors.

28346

// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless

28347

// we need to use the zeroing feature.

28348

// TODO - this should support binary shuffles.

28349

if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&

28350

!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&

28351

!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {

28352

if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)

28353

return SDValue(); // Nothing to do!

28354

MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);

28355

unsigned PermMask = 0;

28356

PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);

28357

PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

28358

28359

Res = DAG.getBitcast(ShuffleVT, V1);

28360

DCI.AddToWorklist(Res.getNode());

28361

Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,

28362

DAG.getUNDEF(ShuffleVT),

28363

DAG.getConstant(PermMask, DL, MVT::i8));

28364

DCI.AddToWorklist(Res.getNode());

28365

return DAG.getBitcast(RootVT, Res);

28366

}

28367

28368

// For masks that have been widened to 128-bit elements or more,

28369

// narrow back down to 64-bit elements.

28370

SmallVector<int, 64> Mask;

28371

if (BaseMaskEltSizeInBits > 64) {

28372

assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size")(static_cast <bool> ((BaseMaskEltSizeInBits % 64) == 0 &&
"Illegal mask size") ? void (0) : __assert_fail ("(BaseMaskEltSizeInBits % 64) == 0 && \"Illegal mask size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28372, __extension__ __PRETTY_FUNCTION__));

28373

int MaskScale = BaseMaskEltSizeInBits / 64;

28374

scaleShuffleMask<int>(MaskScale, BaseMask, Mask);

28375

} else {

28376

Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());

28377

}

28378

28379

unsigned NumMaskElts = Mask.size();

28380

unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

28381

28382

// Determine the effective mask value type.

28383

FloatDomain &= (32 <= MaskEltSizeInBits);

28384

MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)

28385

: MVT::getIntegerVT(MaskEltSizeInBits);

28386

MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

28387

28388

// Only allow legal mask types.

28389

if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))

28390

return SDValue();

28391

28392

// Attempt to match the mask against known shuffle patterns.

28393

MVT ShuffleSrcVT, ShuffleVT;

28394

unsigned Shuffle, PermuteImm;

28395

28396

// Which shuffle domains are permitted?

28397

// Permit domain crossing at higher combine depths.

28398

bool AllowFloatDomain = FloatDomain || (Depth > 3);

28399

bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&

28400

(!MaskVT.is256BitVector() || Subtarget.hasAVX2());

28401

28402

// Determine zeroable mask elements.

28403

APInt Zeroable(NumMaskElts, 0);

28404

for (unsigned i = 0; i != NumMaskElts; ++i)

28405

if (isUndefOrZero(Mask[i]))

28406

Zeroable.setBit(i);

28407

28408

if (UnaryShuffle) {

28409

// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load

28410

// directly if we don't shuffle the lower element and we shuffle the upper

28411

// (zero) elements within themselves.

28412

if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&

28413

(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {

28414

unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;

28415

ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);

28416

if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&

28417

isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {

28418

return DAG.getBitcast(RootVT, V1);

28419

}

28420

}

28421

28422

if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,

28423

V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,

28424

ShuffleVT)) {

28425

if (Depth == 1 && Root.getOpcode() == Shuffle)

28426

return SDValue(); // Nothing to do!

28427

if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))

28428

return SDValue(); // AVX512 Writemask clash.

28429

Res = DAG.getBitcast(ShuffleSrcVT, V1);

28430

DCI.AddToWorklist(Res.getNode());

28431

Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);

28432

DCI.AddToWorklist(Res.getNode());

28433

return DAG.getBitcast(RootVT, Res);

28434

}

28435

28436

if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,

28437

AllowIntDomain, Subtarget, Shuffle,

28438

ShuffleVT, PermuteImm)) {

28439

if (Depth == 1 && Root.getOpcode() == Shuffle)

28440

return SDValue(); // Nothing to do!

28441

if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))

28442

return SDValue(); // AVX512 Writemask clash.

28443

Res = DAG.getBitcast(ShuffleVT, V1);

28444

DCI.AddToWorklist(Res.getNode());

28445

Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,

28446

DAG.getConstant(PermuteImm, DL, MVT::i8));

28447

DCI.AddToWorklist(Res.getNode());

28448

return DAG.getBitcast(RootVT, Res);

28449

}

28450

}

28451

28452

if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,

28453

V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,

28454

ShuffleVT, UnaryShuffle)) {

28455

if (Depth == 1 && Root.getOpcode() == Shuffle)

28456

return SDValue(); // Nothing to do!

28457

if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))

28458

return SDValue(); // AVX512 Writemask clash.

28459

V1 = DAG.getBitcast(ShuffleSrcVT, V1);

28460

DCI.AddToWorklist(V1.getNode());

28461

V2 = DAG.getBitcast(ShuffleSrcVT, V2);

28462

DCI.AddToWorklist(V2.getNode());

28463

Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);

28464

DCI.AddToWorklist(Res.getNode());

28465

return DAG.getBitcast(RootVT, Res);

28466

}

28467

28468

if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,

28469

AllowIntDomain, V1, V2, DL, DAG,

28470

Subtarget, Shuffle, ShuffleVT,

28471

PermuteImm)) {

28472

if (Depth == 1 && Root.getOpcode() == Shuffle)

28473

return SDValue(); // Nothing to do!

28474

if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))

28475

return SDValue(); // AVX512 Writemask clash.

28476

V1 = DAG.getBitcast(ShuffleVT, V1);

28477

DCI.AddToWorklist(V1.getNode());

28478

V2 = DAG.getBitcast(ShuffleVT, V2);

28479

DCI.AddToWorklist(V2.getNode());

28480

Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,

28481

DAG.getConstant(PermuteImm, DL, MVT::i8));

28482

DCI.AddToWorklist(Res.getNode());

28483

return DAG.getBitcast(RootVT, Res);

28484

}

28485

28486

// Typically from here on, we need an integer version of MaskVT.

28487

MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);

28488

IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

28489

28490

// Annoyingly, SSE4A instructions don't map into the above match helpers.

28491

if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {

28492

uint64_t BitLen, BitIdx;

28493

if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,

28494

Zeroable)) {

28495

if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)

28496

return SDValue(); // Nothing to do!

28497

V1 = DAG.getBitcast(IntMaskVT, V1);

28498

DCI.AddToWorklist(V1.getNode());

28499

Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,

28500

DAG.getConstant(BitLen, DL, MVT::i8),

28501

DAG.getConstant(BitIdx, DL, MVT::i8));

28502

DCI.AddToWorklist(Res.getNode());

28503

return DAG.getBitcast(RootVT, Res);

28504

}

28505

28506

if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {

28507

if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)

28508

return SDValue(); // Nothing to do!

28509

V1 = DAG.getBitcast(IntMaskVT, V1);

28510

DCI.AddToWorklist(V1.getNode());

28511

V2 = DAG.getBitcast(IntMaskVT, V2);

28512

DCI.AddToWorklist(V2.getNode());

28513

Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,

28514

DAG.getConstant(BitLen, DL, MVT::i8),

28515

DAG.getConstant(BitIdx, DL, MVT::i8));

28516

DCI.AddToWorklist(Res.getNode());

28517

return DAG.getBitcast(RootVT, Res);

28518

}

28519

}

28520

28521

// Don't try to re-form single instruction chains under any circumstances now

28522

// that we've done encoding canonicalization for them.

28523

if (Depth < 2)

28524

return SDValue();

28525

28526

// Depth threshold above which we can efficiently use variable mask shuffles.

28527

// TODO This should probably be target specific.

28528

bool AllowVariableMask = (Depth >= 3) || HasVariableMask;

28529

28530

bool MaskContainsZeros =

28531

any_of(Mask, [](int M) { return M == SM_SentinelZero; });

28532

28533

if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {

28534

// If we have a single input lane-crossing shuffle then lower to VPERMV.

28535

if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&

28536

((Subtarget.hasAVX2() &&

28537

(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||

28538

(Subtarget.hasAVX512() &&

28539

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

28540

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||

28541

(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||

28542

(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||

28543

(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||

28544

(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {

28545

SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);

28546

DCI.AddToWorklist(VPermMask.getNode());

28547

Res = DAG.getBitcast(MaskVT, V1);

28548

DCI.AddToWorklist(Res.getNode());

28549

Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);

28550

DCI.AddToWorklist(Res.getNode());

28551

return DAG.getBitcast(RootVT, Res);

28552

}

28553

28554

// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero

28555

// vector as the second source.

28556

if (UnaryShuffle && AllowVariableMask &&

28557

((Subtarget.hasAVX512() &&

28558

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

28559

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||

28560

(Subtarget.hasVLX() &&

28561

(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||

28562

MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||

28563

(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||

28564

(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||

28565

(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||

28566

(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {

28567

// Adjust shuffle mask - replace SM_SentinelZero with second source index.

28568

for (unsigned i = 0; i != NumMaskElts; ++i)

28569

if (Mask[i] == SM_SentinelZero)

28570

Mask[i] = NumMaskElts + i;

28571

28572

SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);

28573

DCI.AddToWorklist(VPermMask.getNode());

28574

Res = DAG.getBitcast(MaskVT, V1);

28575

DCI.AddToWorklist(Res.getNode());

28576

SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);

28577

DCI.AddToWorklist(Zero.getNode());

28578

Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);

28579

DCI.AddToWorklist(Res.getNode());

28580

return DAG.getBitcast(RootVT, Res);

28581

}

28582

28583

// If we have a dual input lane-crossing shuffle then lower to VPERMV3.

28584

if (AllowVariableMask && !MaskContainsZeros &&

28585

((Subtarget.hasAVX512() &&

28586

(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||

28587

MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||

28588

(Subtarget.hasVLX() &&

28589

(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||

28590

MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||

28591

(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||

28592

(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||

28593

(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||

28594

(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {

28595

SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);

28596

DCI.AddToWorklist(VPermMask.getNode());

28597

V1 = DAG.getBitcast(MaskVT, V1);

28598

DCI.AddToWorklist(V1.getNode());

28599

V2 = DAG.getBitcast(MaskVT, V2);

28600

DCI.AddToWorklist(V2.getNode());

28601

Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);

28602

DCI.AddToWorklist(Res.getNode());

28603

return DAG.getBitcast(RootVT, Res);

28604

}

28605

return SDValue();

28606

}

28607

28608

// See if we can combine a single input shuffle with zeros to a bit-mask,

28609

// which is much simpler than any shuffle.

28610

if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&

28611

isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&

28612

DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {

28613

APInt Zero = APInt::getNullValue(MaskEltSizeInBits);

28614

APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);

28615

APInt UndefElts(NumMaskElts, 0);

28616

SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);

28617

for (unsigned i = 0; i != NumMaskElts; ++i) {

28618

int M = Mask[i];

28619

if (M == SM_SentinelUndef) {

28620

UndefElts.setBit(i);

28621

continue;

28622

}

28623

if (M == SM_SentinelZero)

28624

continue;

28625

EltBits[i] = AllOnes;

28626

}

28627

SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);

28628

DCI.AddToWorklist(BitMask.getNode());

28629

Res = DAG.getBitcast(MaskVT, V1);

28630

DCI.AddToWorklist(Res.getNode());

28631

unsigned AndOpcode =

28632

FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);

28633

Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);

28634

DCI.AddToWorklist(Res.getNode());

28635

return DAG.getBitcast(RootVT, Res);

28636

}

28637

28638

// If we have a single input shuffle with different shuffle patterns in the

28639

// the 128-bit lanes use the variable mask to VPERMILPS.

28640

// TODO Combine other mask types at higher depths.

28641

if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&

28642

((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||

28643

(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {

28644

SmallVector<SDValue, 16> VPermIdx;

28645

for (int M : Mask) {

28646

SDValue Idx =

28647

M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);

28648

VPermIdx.push_back(Idx);

28649

}

28650

SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);

28651

DCI.AddToWorklist(VPermMask.getNode());

28652

Res = DAG.getBitcast(MaskVT, V1);

28653

DCI.AddToWorklist(Res.getNode());

28654

Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);

28655

DCI.AddToWorklist(Res.getNode());

28656

return DAG.getBitcast(RootVT, Res);

28657

}

28658

28659

// With XOP, binary shuffles of 128/256-bit floating point vectors can combine

28660

// to VPERMIL2PD/VPERMIL2PS.

28661

if (AllowVariableMask && Subtarget.hasXOP() &&

28662

(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||

28663

MaskVT == MVT::v8f32)) {

28664

// VPERMIL2 Operation.

28665

// Bits[3] - Match Bit.

28666

// Bits[2:1] - (Per Lane) PD Shuffle Mask.

28667

// Bits[2:0] - (Per Lane) PS Shuffle Mask.

28668

unsigned NumLanes = MaskVT.getSizeInBits() / 128;

28669

unsigned NumEltsPerLane = NumMaskElts / NumLanes;

28670

SmallVector<int, 8> VPerm2Idx;

28671

unsigned M2ZImm = 0;

28672

for (int M : Mask) {

28673

if (M == SM_SentinelUndef) {

28674

VPerm2Idx.push_back(-1);

28675

continue;

28676

}

28677

if (M == SM_SentinelZero) {

28678

M2ZImm = 2;

28679

VPerm2Idx.push_back(8);

28680

continue;

28681

}

28682

int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);

28683

Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);

28684

VPerm2Idx.push_back(Index);

28685

}

28686

V1 = DAG.getBitcast(MaskVT, V1);

28687

DCI.AddToWorklist(V1.getNode());

28688

V2 = DAG.getBitcast(MaskVT, V2);

28689

DCI.AddToWorklist(V2.getNode());

28690

SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);

28691

DCI.AddToWorklist(VPerm2MaskOp.getNode());

28692

Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,

28693

DAG.getConstant(M2ZImm, DL, MVT::i8));

28694

DCI.AddToWorklist(Res.getNode());

28695

return DAG.getBitcast(RootVT, Res);

28696

}

28697

28698

// If we have 3 or more shuffle instructions or a chain involving a variable

28699

// mask, we can replace them with a single PSHUFB instruction profitably.

28700

// Intel's manuals suggest only using PSHUFB if doing so replacing 5

28701

// instructions, but in practice PSHUFB tends to be *very* fast so we're

28702

// more aggressive.

28703

if (UnaryShuffle && AllowVariableMask &&

28704

((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||

28705

(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||

28706

(RootVT.is512BitVector() && Subtarget.hasBWI()))) {

28707

SmallVector<SDValue, 16> PSHUFBMask;

28708

int NumBytes = RootVT.getSizeInBits() / 8;

28709

int Ratio = NumBytes / NumMaskElts;

28710

for (int i = 0; i < NumBytes; ++i) {

28711

int M = Mask[i / Ratio];

28712

if (M == SM_SentinelUndef) {

28713

PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));

28714

continue;

28715

}

28716

if (M == SM_SentinelZero) {

28717

PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));

28718

continue;

28719

}

28720

M = Ratio * M + i % Ratio;

28721

assert((M / 16) == (i / 16) && "Lane crossing detected")(static_cast <bool> ((M / 16) == (i / 16) && "Lane crossing detected"
) ? void (0) : __assert_fail ("(M / 16) == (i / 16) && \"Lane crossing detected\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28721, __extension__ __PRETTY_FUNCTION__));

28722

PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));

28723

}

28724

MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);

28725

Res = DAG.getBitcast(ByteVT, V1);

28726

DCI.AddToWorklist(Res.getNode());

28727

SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);

28728

DCI.AddToWorklist(PSHUFBMaskOp.getNode());

28729

Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);

28730

DCI.AddToWorklist(Res.getNode());

28731

return DAG.getBitcast(RootVT, Res);

28732

}

28733

28734

// With XOP, if we have a 128-bit binary input shuffle we can always combine

28735

// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never

28736

// slower than PSHUFB on targets that support both.

28737

if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {

28738

// VPPERM Mask Operation

28739

// Bits[4:0] - Byte Index (0 - 31)

28740

// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)

28741

SmallVector<SDValue, 16> VPPERMMask;

28742

int NumBytes = 16;

28743

int Ratio = NumBytes / NumMaskElts;

28744

for (int i = 0; i < NumBytes; ++i) {

28745

int M = Mask[i / Ratio];

28746

if (M == SM_SentinelUndef) {

28747

VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));

28748

continue;

28749

}

28750

if (M == SM_SentinelZero) {

28751

VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));

28752

continue;

28753

}

28754

M = Ratio * M + i % Ratio;

28755

VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));

28756

}

28757

MVT ByteVT = MVT::v16i8;

28758

V1 = DAG.getBitcast(ByteVT, V1);

28759

DCI.AddToWorklist(V1.getNode());

28760

V2 = DAG.getBitcast(ByteVT, V2);

28761

DCI.AddToWorklist(V2.getNode());

28762

SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);

28763

DCI.AddToWorklist(VPPERMMaskOp.getNode());

28764

Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);

28765

DCI.AddToWorklist(Res.getNode());

28766

return DAG.getBitcast(RootVT, Res);

28767

}

28768

28769

// Failed to find any combines.

28770

return SDValue();

28771

}

28772

28773

// Attempt to constant fold all of the constant source ops.

28774

// Returns true if the entire shuffle is folded to a constant.

28775

// TODO: Extend this to merge multiple constant Ops and update the mask.

28776

static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,

28777

ArrayRef<int> Mask, SDValue Root,

28778

bool HasVariableMask,

28779

SelectionDAG &DAG,

28780

TargetLowering::DAGCombinerInfo &DCI,

28781

const X86Subtarget &Subtarget) {

28782

MVT VT = Root.getSimpleValueType();

28783

28784

unsigned SizeInBits = VT.getSizeInBits();

28785

unsigned NumMaskElts = Mask.size();

28786

unsigned MaskSizeInBits = SizeInBits / NumMaskElts;

28787

unsigned NumOps = Ops.size();

28788

28789

// Extract constant bits from each source op.

28790

bool OneUseConstantOp = false;

28791

SmallVector<APInt, 16> UndefEltsOps(NumOps);

28792

SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);

28793

for (unsigned i = 0; i != NumOps; ++i) {

28794

SDValue SrcOp = Ops[i];

28795

OneUseConstantOp |= SrcOp.hasOneUse();

28796

if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],

28797

RawBitsOps[i]))

28798

return SDValue();

28799

}

28800

28801

// Only fold if at least one of the constants is only used once or

28802

// the combined shuffle has included a variable mask shuffle, this

28803

// is to avoid constant pool bloat.

28804

if (!OneUseConstantOp && !HasVariableMask)

28805

return SDValue();

28806

28807

// Shuffle the constant bits according to the mask.

28808

APInt UndefElts(NumMaskElts, 0);

28809

APInt ZeroElts(NumMaskElts, 0);

28810

APInt ConstantElts(NumMaskElts, 0);

28811

SmallVector<APInt, 8> ConstantBitData(NumMaskElts,

28812

APInt::getNullValue(MaskSizeInBits));

28813

for (unsigned i = 0; i != NumMaskElts; ++i) {

28814

int M = Mask[i];

28815

if (M == SM_SentinelUndef) {

28816

UndefElts.setBit(i);

28817

continue;

28818

} else if (M == SM_SentinelZero) {

28819

ZeroElts.setBit(i);

28820

continue;

28821

}

28822

assert(0 <= M && M < (int)(NumMaskElts * NumOps))(static_cast <bool> (0 <= M && M < (int)(
NumMaskElts * NumOps)) ? void (0) : __assert_fail ("0 <= M && M < (int)(NumMaskElts * NumOps)"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28822, __extension__ __PRETTY_FUNCTION__));

28823

28824

unsigned SrcOpIdx = (unsigned)M / NumMaskElts;

28825

unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

28826

28827

auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];

28828

if (SrcUndefElts[SrcMaskIdx]) {

28829

UndefElts.setBit(i);

28830

continue;

28831

}

28832

28833

auto &SrcEltBits = RawBitsOps[SrcOpIdx];

28834

APInt &Bits = SrcEltBits[SrcMaskIdx];

28835

if (!Bits) {

28836

ZeroElts.setBit(i);

28837

continue;

28838

}

28839

28840

ConstantElts.setBit(i);

28841

ConstantBitData[i] = Bits;

28842

}

28843

assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue())(static_cast <bool> ((UndefElts | ZeroElts | ConstantElts
).isAllOnesValue()) ? void (0) : __assert_fail ("(UndefElts | ZeroElts | ConstantElts).isAllOnesValue()"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28843, __extension__ __PRETTY_FUNCTION__));

28844

28845

// Create the constant data.

28846

MVT MaskSVT;

28847

if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))

28848

MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);

28849

else

28850

MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

28851

28852

MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

28853

28854

SDLoc DL(Root);

28855

SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);

28856

DCI.AddToWorklist(CstOp.getNode());

28857

return DAG.getBitcast(VT, CstOp);

28858

}

28859

28860

/// \brief Fully generic combining of x86 shuffle instructions.

28861

///

28862

/// This should be the last combine run over the x86 shuffle instructions. Once

28863

/// they have been fully optimized, this will recursively consider all chains

28864

/// of single-use shuffle instructions, build a generic model of the cumulative

28865

/// shuffle operation, and check for simpler instructions which implement this

28866

/// operation. We use this primarily for two purposes:

28867

///

28868

/// 1) Collapse generic shuffles to specialized single instructions when

28869

/// equivalent. In most cases, this is just an encoding size win, but

28870

/// sometimes we will collapse multiple generic shuffles into a single

28871

/// special-purpose shuffle.

28872

/// 2) Look for sequences of shuffle instructions with 3 or more total

28873

/// instructions, and replace them with the slightly more expensive SSSE3

28874

/// PSHUFB instruction if available. We do this as the last combining step

28875

/// to ensure we avoid using PSHUFB if we can implement the shuffle with

28876

/// a suitable short sequence of other instructions. The PSHUFB will either

28877

/// use a register or have to read from memory and so is slightly (but only

28878

/// slightly) more expensive than the other shuffle instructions.

28879

///

28880

/// Because this is inherently a quadratic operation (for each shuffle in

28881

/// a chain, we recurse up the chain), the depth is limited to 8 instructions.

28882

/// This should never be an issue in practice as the shuffle lowering doesn't

28883

/// produce sequences of more than 8 instructions.

28884

///

28885

/// FIXME: We will currently miss some cases where the redundant shuffling

28886

/// would simplify under the threshold for PSHUFB formation because of

28887

/// combine-ordering. To fix this, we should do the redundant instruction

28888

/// combining in this recursive walk.

28889

static SDValue combineX86ShufflesRecursively(

28890

ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,

28891

ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,

28892

bool HasVariableMask, SelectionDAG &DAG,

28893

TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {

28894

// Bound the depth of our recursive combine because this is ultimately

28895

// quadratic in nature.

28896

if (Depth > 8)

28897

return SDValue();

28898

28899

// Directly rip through bitcasts to find the underlying operand.

28900

SDValue Op = SrcOps[SrcOpIndex];

28901

Op = peekThroughOneUseBitcasts(Op);

28902

28903

MVT VT = Op.getSimpleValueType();

28904

if (!VT.isVector())

28905

return SDValue(); // Bail if we hit a non-vector.

28906

28907

assert(Root.getSimpleValueType().isVector() &&(static_cast <bool> (Root.getSimpleValueType().isVector
() && "Shuffles operate on vector types!") ? void (0)
: __assert_fail ("Root.getSimpleValueType().isVector() && \"Shuffles operate on vector types!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28908, __extension__ __PRETTY_FUNCTION__))

28908

"Shuffles operate on vector types!")(static_cast <bool> (Root.getSimpleValueType().isVector
() && "Shuffles operate on vector types!") ? void (0)
: __assert_fail ("Root.getSimpleValueType().isVector() && \"Shuffles operate on vector types!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28908, __extension__ __PRETTY_FUNCTION__));

28909

assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&(static_cast <bool> (VT.getSizeInBits() == Root.getSimpleValueType
().getSizeInBits() && "Can only combine shuffles of the same vector register size."
) ? void (0) : __assert_fail ("VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && \"Can only combine shuffles of the same vector register size.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28910, __extension__ __PRETTY_FUNCTION__))

28910

"Can only combine shuffles of the same vector register size.")(static_cast <bool> (VT.getSizeInBits() == Root.getSimpleValueType
().getSizeInBits() && "Can only combine shuffles of the same vector register size."
) ? void (0) : __assert_fail ("VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && \"Can only combine shuffles of the same vector register size.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28910, __extension__ __PRETTY_FUNCTION__));

28911

28912

// Extract target shuffle mask and resolve sentinels and inputs.

28913

SmallVector<int, 64> OpMask;

28914

SmallVector<SDValue, 2> OpInputs;

28915

if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))

28916

return SDValue();

28917

28918

assert(OpInputs.size() <= 2 && "Too many shuffle inputs")(static_cast <bool> (OpInputs.size() <= 2 &&
"Too many shuffle inputs") ? void (0) : __assert_fail ("OpInputs.size() <= 2 && \"Too many shuffle inputs\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28918, __extension__ __PRETTY_FUNCTION__));

28919

SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());

28920

SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

28921

28922

// Add the inputs to the Ops list, avoiding duplicates.

28923

SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

28924

28925

int InputIdx0 = -1, InputIdx1 = -1;

28926

for (int i = 0, e = Ops.size(); i < e; ++i) {

28927

SDValue BC = peekThroughBitcasts(Ops[i]);

28928

if (Input0 && BC == peekThroughBitcasts(Input0))

28929

InputIdx0 = i;

28930

if (Input1 && BC == peekThroughBitcasts(Input1))

28931

InputIdx1 = i;

28932

}

28933

28934

if (Input0 && InputIdx0 < 0) {

28935

InputIdx0 = SrcOpIndex;

28936

Ops[SrcOpIndex] = Input0;

28937

}

28938

if (Input1 && InputIdx1 < 0) {

28939

InputIdx1 = Ops.size();

28940

Ops.push_back(Input1);

28941

}

28942

28943

assert(((RootMask.size() > OpMask.size() &&(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__))

28944

RootMask.size() % OpMask.size() == 0) ||(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__))

28945

(OpMask.size() > RootMask.size() &&(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__))

28946

OpMask.size() % RootMask.size() == 0) ||(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__))

28947

OpMask.size() == RootMask.size()) &&(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__))

28948

"The smaller number of elements must divide the larger.")(static_cast <bool> (((RootMask.size() > OpMask.size
() && RootMask.size() % OpMask.size() == 0) || (OpMask
.size() > RootMask.size() && OpMask.size() % RootMask
.size() == 0) || OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.") ? void
(0) : __assert_fail ("((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && \"The smaller number of elements must divide the larger.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28948, __extension__ __PRETTY_FUNCTION__));

28949

28950

// This function can be performance-critical, so we rely on the power-of-2

28951

// knowledge that we have about the mask sizes to replace div/rem ops with

28952

// bit-masks and shifts.

28953

assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes")(static_cast <bool> (isPowerOf2_32(RootMask.size()) &&
"Non-power-of-2 shuffle mask sizes") ? void (0) : __assert_fail
("isPowerOf2_32(RootMask.size()) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28953, __extension__ __PRETTY_FUNCTION__));

28954

assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes")(static_cast <bool> (isPowerOf2_32(OpMask.size()) &&
"Non-power-of-2 shuffle mask sizes") ? void (0) : __assert_fail
("isPowerOf2_32(OpMask.size()) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28954, __extension__ __PRETTY_FUNCTION__));

28955

unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());

28956

unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

28957

28958

unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());

28959

unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);

28960

unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);

28961

assert((RootRatio == 1 || OpRatio == 1) &&(static_cast <bool> ((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!") ? void
(0) : __assert_fail ("(RootRatio == 1 || OpRatio == 1) && \"Must not have a ratio for both incoming and op masks!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28962, __extension__ __PRETTY_FUNCTION__))

28962

"Must not have a ratio for both incoming and op masks!")(static_cast <bool> ((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!") ? void
(0) : __assert_fail ("(RootRatio == 1 || OpRatio == 1) && \"Must not have a ratio for both incoming and op masks!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28962, __extension__ __PRETTY_FUNCTION__));

28963

28964

assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes")(static_cast <bool> (isPowerOf2_32(MaskWidth) &&
"Non-power-of-2 shuffle mask sizes") ? void (0) : __assert_fail
("isPowerOf2_32(MaskWidth) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28964, __extension__ __PRETTY_FUNCTION__));

28965

assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes")(static_cast <bool> (isPowerOf2_32(RootRatio) &&
"Non-power-of-2 shuffle mask sizes") ? void (0) : __assert_fail
("isPowerOf2_32(RootRatio) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28965, __extension__ __PRETTY_FUNCTION__));

28966

assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes")(static_cast <bool> (isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"
) ? void (0) : __assert_fail ("isPowerOf2_32(OpRatio) && \"Non-power-of-2 shuffle mask sizes\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 28966, __extension__ __PRETTY_FUNCTION__));

28967

unsigned RootRatioLog2 = countTrailingZeros(RootRatio);

28968

unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

28969

28970

SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

28971

28972

// Merge this shuffle operation's mask into our accumulated mask. Note that

28973

// this shuffle's mask will be the first applied to the input, followed by the

28974

// root mask to get us all the way to the root value arrangement. The reason

28975

// for this order is that we are recursing up the operation chain.

28976

for (unsigned i = 0; i < MaskWidth; ++i) {

28977

unsigned RootIdx = i >> RootRatioLog2;

28978

if (RootMask[RootIdx] < 0) {

28979

// This is a zero or undef lane, we're done.

28980

Mask[i] = RootMask[RootIdx];

28981

continue;

28982

}

28983

28984

unsigned RootMaskedIdx =

28985

RootRatio == 1

28986

? RootMask[RootIdx]

28987

: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

28988

28989

// Just insert the scaled root mask value if it references an input other

28990

// than the SrcOp we're currently inserting.

28991

if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||

28992

(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {

28993

Mask[i] = RootMaskedIdx;

28994

continue;

28995

}

28996

28997

RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);

28998

unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;

28999

if (OpMask[OpIdx] < 0) {

29000

// The incoming lanes are zero or undef, it doesn't matter which ones we

29001

// are using.

29002

Mask[i] = OpMask[OpIdx];

29003

continue;

29004

}

29005

29006

// Ok, we have non-zero lanes, map them through to one of the Op's inputs.

29007

unsigned OpMaskedIdx =

29008

OpRatio == 1

29009

? OpMask[OpIdx]

29010

: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

29011

29012

OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);

29013

if (OpMask[OpIdx] < (int)OpMask.size()) {

29014

assert(0 <= InputIdx0 && "Unknown target shuffle input")(static_cast <bool> (0 <= InputIdx0 && "Unknown target shuffle input"
) ? void (0) : __assert_fail ("0 <= InputIdx0 && \"Unknown target shuffle input\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29014, __extension__ __PRETTY_FUNCTION__));

29015

OpMaskedIdx += InputIdx0 * MaskWidth;

29016

} else {

29017

assert(0 <= InputIdx1 && "Unknown target shuffle input")(static_cast <bool> (0 <= InputIdx1 && "Unknown target shuffle input"
) ? void (0) : __assert_fail ("0 <= InputIdx1 && \"Unknown target shuffle input\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29017, __extension__ __PRETTY_FUNCTION__));

29018

OpMaskedIdx += InputIdx1 * MaskWidth;

29019

}

29020

29021

Mask[i] = OpMaskedIdx;

29022

}

29023

29024

// Handle the all undef/zero cases early.

29025

if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))

29026

return DAG.getUNDEF(Root.getValueType());

29027

29028

// TODO - should we handle the mixed zero/undef case as well? Just returning

29029

// a zero mask will lose information on undef elements possibly reducing

29030

// future combine possibilities.

29031

if (all_of(Mask, [](int Idx) { return Idx < 0; }))

29032

return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,

29033

SDLoc(Root));

29034

29035

// Remove unused shuffle source ops.

29036

resolveTargetShuffleInputsAndMask(Ops, Mask);

29037

assert(!Ops.empty() && "Shuffle with no inputs detected")(static_cast <bool> (!Ops.empty() && "Shuffle with no inputs detected"
) ? void (0) : __assert_fail ("!Ops.empty() && \"Shuffle with no inputs detected\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29037, __extension__ __PRETTY_FUNCTION__));

29038

29039

HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

29040

29041

// Update the list of shuffle nodes that have been combined so far.

29042

SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),

29043

SrcNodes.end());

29044

CombinedNodes.push_back(Op.getNode());

29045

29046

// See if we can recurse into each shuffle source op (if it's a target

29047

// shuffle). The source op should only be combined if it either has a

29048

// single use (i.e. current Op) or all its users have already been combined.

29049

for (int i = 0, e = Ops.size(); i < e; ++i)

29050

if (Ops[i].getNode()->hasOneUse() ||

29051

SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))

29052

if (SDValue Res = combineX86ShufflesRecursively(

29053

Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,

29054

DAG, DCI, Subtarget))

29055

return Res;

29056

29057

// Attempt to constant fold all of the constant source ops.

29058

if (SDValue Cst = combineX86ShufflesConstants(

29059

Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))

29060

return Cst;

29061

29062

// We can only combine unary and binary shuffle mask cases.

29063

if (Ops.size() > 2)

29064

return SDValue();

29065

29066

// Minor canonicalization of the accumulated shuffle mask to make it easier

29067

// to match below. All this does is detect masks with sequential pairs of

29068

// elements, and shrink them to the half-width mask. It does this in a loop

29069

// so it will reduce the size of the mask to the minimal width mask which

29070

// performs an equivalent shuffle.

29071

SmallVector<int, 64> WidenedMask;

29072

while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {

29073

Mask = std::move(WidenedMask);

29074

}

29075

29076

// Canonicalization of binary shuffle masks to improve pattern matching by

29077

// commuting the inputs.

29078

if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {

29079

ShuffleVectorSDNode::commuteMask(Mask);

29080

std::swap(Ops[0], Ops[1]);

29081

}

29082

29083

// Finally, try to combine into a single shuffle instruction.

29084

return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,

29085

DCI, Subtarget);

29086

}

29087

29088

/// \brief Get the PSHUF-style mask from PSHUF node.

29089

///

29090

/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4

29091

/// PSHUF-style masks that can be reused with such instructions.

29092

static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {

29093

MVT VT = N.getSimpleValueType();

29094

SmallVector<int, 4> Mask;

29095

SmallVector<SDValue, 2> Ops;

29096

bool IsUnary;

29097

bool HaveMask =

29098

getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);

29099

(void)HaveMask;

29100

assert(HaveMask)(static_cast <bool> (HaveMask) ? void (0) : __assert_fail
("HaveMask", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29100, __extension__ __PRETTY_FUNCTION__));

29101

29102

// If we have more than 128-bits, only the low 128-bits of shuffle mask

29103

// matter. Check that the upper masks are repeats and remove them.

29104

if (VT.getSizeInBits() > 128) {

29105

int LaneElts = 128 / VT.getScalarSizeInBits();

29106

#ifndef NDEBUG

29107

for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)

29108

for (int j = 0; j < LaneElts; ++j)

29109

assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&(static_cast <bool> (Mask[j] == Mask[i * LaneElts + j] -
(LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"
) ? void (0) : __assert_fail ("Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && \"Mask doesn't repeat in high 128-bit lanes!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29110, __extension__ __PRETTY_FUNCTION__))

29110

"Mask doesn't repeat in high 128-bit lanes!")(static_cast <bool> (Mask[j] == Mask[i * LaneElts + j] -
(LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"
) ? void (0) : __assert_fail ("Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && \"Mask doesn't repeat in high 128-bit lanes!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29110, __extension__ __PRETTY_FUNCTION__));

29111

#endif

29112

Mask.resize(LaneElts);

29113

}

29114

29115

switch (N.getOpcode()) {

29116

case X86ISD::PSHUFD:

29117

return Mask;

29118

case X86ISD::PSHUFLW:

29119

Mask.resize(4);

29120

return Mask;

29121

case X86ISD::PSHUFHW:

29122

Mask.erase(Mask.begin(), Mask.begin() + 4);

29123

for (int &M : Mask)

29124

M -= 4;

29125

return Mask;

29126

default:

29127

llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29127);

29128

}

29129

}

29130

29131

/// \brief Search for a combinable shuffle across a chain ending in pshufd.

29132

///

29133

/// We walk up the chain and look for a combinable shuffle, skipping over

29134

/// shuffles that we could hoist this shuffle's transformation past without

29135

/// altering anything.

29136

static SDValue

29137

combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,

29138

SelectionDAG &DAG) {

29139

assert(N.getOpcode() == X86ISD::PSHUFD &&(static_cast <bool> (N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!"
) ? void (0) : __assert_fail ("N.getOpcode() == X86ISD::PSHUFD && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29140, __extension__ __PRETTY_FUNCTION__))

29140

"Called with something other than an x86 128-bit half shuffle!")(static_cast <bool> (N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!"
) ? void (0) : __assert_fail ("N.getOpcode() == X86ISD::PSHUFD && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29140, __extension__ __PRETTY_FUNCTION__));

29141

SDLoc DL(N);

29142

29143

// Walk up a single-use chain looking for a combinable shuffle. Keep a stack

29144

// of the shuffles in the chain so that we can form a fresh chain to replace

29145

// this one.

29146

SmallVector<SDValue, 8> Chain;

29147

SDValue V = N.getOperand(0);

29148

for (; V.hasOneUse(); V = V.getOperand(0)) {

29149

switch (V.getOpcode()) {

29150

default:

29151

return SDValue(); // Nothing combined!

29152

29153

case ISD::BITCAST:

29154

// Skip bitcasts as we always know the type for the target specific

29155

// instructions.

29156

continue;

29157

29158

case X86ISD::PSHUFD:

29159

// Found another dword shuffle.

29160

break;

29161

29162

case X86ISD::PSHUFLW:

29163

// Check that the low words (being shuffled) are the identity in the

29164

// dword shuffle, and the high words are self-contained.

29165

if (Mask[0] != 0 || Mask[1] != 1 ||

29166

!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))

29167

return SDValue();

29168

29169

Chain.push_back(V);

29170

continue;

29171

29172

case X86ISD::PSHUFHW:

29173

// Check that the high words (being shuffled) are the identity in the

29174

// dword shuffle, and the low words are self-contained.

29175

if (Mask[2] != 2 || Mask[3] != 3 ||

29176

!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))

29177

return SDValue();

29178

29179

Chain.push_back(V);

29180

continue;

29181

29182

case X86ISD::UNPCKL:

29183

case X86ISD::UNPCKH:

29184

// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword

29185

// shuffle into a preceding word shuffle.

29186

if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&

29187

V.getSimpleValueType().getVectorElementType() != MVT::i16)

29188

return SDValue();

29189

29190

// Search for a half-shuffle which we can combine with.

29191

unsigned CombineOp =

29192

V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;

29193

if (V.getOperand(0) != V.getOperand(1) ||

29194

!V->isOnlyUserOf(V.getOperand(0).getNode()))

29195

return SDValue();

29196

Chain.push_back(V);

29197

V = V.getOperand(0);

29198

do {

29199

switch (V.getOpcode()) {

29200

default:

29201

return SDValue(); // Nothing to combine.

29202

29203

case X86ISD::PSHUFLW:

29204

case X86ISD::PSHUFHW:

29205

if (V.getOpcode() == CombineOp)

29206

break;

29207

29208

Chain.push_back(V);

29209

29210

LLVM_FALLTHROUGH[[clang::fallthrough]];

29211

case ISD::BITCAST:

29212

V = V.getOperand(0);

29213

continue;

29214

}

29215

break;

29216

} while (V.hasOneUse());

29217

break;

29218

}

29219

// Break out of the loop if we break out of the switch.

29220

break;

29221

}

29222

29223

if (!V.hasOneUse())

29224

// We fell out of the loop without finding a viable combining instruction.

29225

return SDValue();

29226

29227

// Merge this node's mask and our incoming mask.

29228

SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);

29229

for (int &M : Mask)

29230

M = VMask[M];

29231

V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),

29232

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

29233

29234

// Rebuild the chain around this new shuffle.

29235

while (!Chain.empty()) {

29236

SDValue W = Chain.pop_back_val();

29237

29238

if (V.getValueType() != W.getOperand(0).getValueType())

29239

V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

29240

29241

switch (W.getOpcode()) {

29242

default:

29243

llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29243);

29244

29245

case X86ISD::UNPCKL:

29246

case X86ISD::UNPCKH:

29247

V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);

29248

break;

29249

29250

case X86ISD::PSHUFD:

29251

case X86ISD::PSHUFLW:

29252

case X86ISD::PSHUFHW:

29253

V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));

29254

break;

29255

}

29256

}

29257

if (V.getValueType() != N.getValueType())

29258

V = DAG.getBitcast(N.getValueType(), V);

29259

29260

// Return the new chain to replace N.

29261

return V;

29262

}

29263

29264

/// \brief Search for a combinable shuffle across a chain ending in pshuflw or

29265

/// pshufhw.

29266

///

29267

/// We walk up the chain, skipping shuffles of the other half and looking

29268

/// through shuffles which switch halves trying to find a shuffle of the same

29269

/// pair of dwords.

29270

static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,

29271

SelectionDAG &DAG,

29272

TargetLowering::DAGCombinerInfo &DCI) {

29273

assert((static_cast <bool> ((N.getOpcode() == X86ISD::PSHUFLW ||
N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"
) ? void (0) : __assert_fail ("(N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29275, __extension__ __PRETTY_FUNCTION__))

29274

(N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&(static_cast <bool> ((N.getOpcode() == X86ISD::PSHUFLW ||
N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"
) ? void (0) : __assert_fail ("(N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29275, __extension__ __PRETTY_FUNCTION__))

29275

"Called with something other than an x86 128-bit half shuffle!")(static_cast <bool> ((N.getOpcode() == X86ISD::PSHUFLW ||
N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"
) ? void (0) : __assert_fail ("(N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && \"Called with something other than an x86 128-bit half shuffle!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29275, __extension__ __PRETTY_FUNCTION__));

29276

SDLoc DL(N);

29277

unsigned CombineOpcode = N.getOpcode();

29278

29279

// Walk up a single-use chain looking for a combinable shuffle.

29280

SDValue V = N.getOperand(0);

29281

for (; V.hasOneUse(); V = V.getOperand(0)) {

29282

switch (V.getOpcode()) {

29283

default:

29284

return false; // Nothing combined!

29285

29286

case ISD::BITCAST:

29287

// Skip bitcasts as we always know the type for the target specific

29288

// instructions.

29289

continue;

29290

29291

case X86ISD::PSHUFLW:

29292

case X86ISD::PSHUFHW:

29293

if (V.getOpcode() == CombineOpcode)

29294

break;

29295

29296

// Other-half shuffles are no-ops.

29297

continue;

29298

}

29299

// Break out of the loop if we break out of the switch.

29300

break;

29301

}

29302

29303

if (!V.hasOneUse())

29304

// We fell out of the loop without finding a viable combining instruction.

29305

return false;

29306

29307

// Combine away the bottom node as its shuffle will be accumulated into

29308

// a preceding shuffle.

29309

DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

29310

29311

// Record the old value.

29312

SDValue Old = V;

29313

29314

// Merge this node's mask and our incoming mask (adjusted to account for all

29315

// the pshufd instructions encountered).

29316

SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);

29317

for (int &M : Mask)

29318

M = VMask[M];

29319

V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),

29320

getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

29321

29322

// Check that the shuffles didn't cancel each other out. If not, we need to

29323

// combine to the new one.

29324

if (Old != V)

29325

// Replace the combinable shuffle with the combined one, updating all users

29326

// so that we re-evaluate the chain here.

29327

DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

29328

29329

return true;

29330

}

29331

29332

/// \brief Try to combine x86 target specific shuffles.

29333

static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,

29334

TargetLowering::DAGCombinerInfo &DCI,

29335

const X86Subtarget &Subtarget) {

29336

SDLoc DL(N);

29337

MVT VT = N.getSimpleValueType();

29338

SmallVector<int, 4> Mask;

29339

unsigned Opcode = N.getOpcode();

29340

29341

// Combine binary shuffle of 2 similar 'Horizontal' instructions into a

29342

// single instruction.

29343

if (VT.getScalarSizeInBits() == 64 &&

29344

(Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||

29345

Opcode == X86ISD::UNPCKL)) {

29346

auto BC0 = peekThroughBitcasts(N.getOperand(0));

29347

auto BC1 = peekThroughBitcasts(N.getOperand(1));

29348

EVT VT0 = BC0.getValueType();

29349

EVT VT1 = BC1.getValueType();

29350

unsigned Opcode0 = BC0.getOpcode();

29351

unsigned Opcode1 = BC1.getOpcode();

29352

if (Opcode0 == Opcode1 && VT0 == VT1 &&

29353

(Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||

29354

Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||

29355

Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {

29356

SDValue Lo, Hi;

29357

if (Opcode == X86ISD::MOVSD) {

29358

Lo = BC1.getOperand(0);

29359

Hi = BC0.getOperand(1);

29360

} else {

29361

Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);

29362

Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);

29363

}

29364

SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);

29365

DCI.AddToWorklist(Horiz.getNode());

29366

return DAG.getBitcast(VT, Horiz);

29367

}

29368

}

29369

29370

switch (Opcode) {

29371

case X86ISD::PSHUFD:

29372

case X86ISD::PSHUFLW:

29373

case X86ISD::PSHUFHW:

29374

Mask = getPSHUFShuffleMask(N);

29375

assert(Mask.size() == 4)(static_cast <bool> (Mask.size() == 4) ? void (0) : __assert_fail
("Mask.size() == 4", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29375, __extension__ __PRETTY_FUNCTION__));

29376

break;

29377

case X86ISD::UNPCKL: {

29378

// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in

29379

// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE

29380

// moves upper half elements into the lower half part. For example:

29381

29382

// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,

29383

// undef:v16i8

29384

// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2

29385

29386

// will be combined to:

29387

29388

// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

29389

29390

// This is only for 128-bit vectors. From SSE4.1 onward this combine may not

29391

// happen due to advanced instructions.

29392

if (!VT.is128BitVector())

29393

return SDValue();

29394

29395

auto Op0 = N.getOperand(0);

29396

auto Op1 = N.getOperand(1);

29397

if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {

29398

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

29399

29400

unsigned NumElts = VT.getVectorNumElements();

29401

SmallVector<int, 8> ExpectedMask(NumElts, -1);

29402

std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,

29403

NumElts / 2);

29404

29405

auto ShufOp = Op1.getOperand(0);

29406

if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))

29407

return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);

29408

}

29409

return SDValue();

29410

}

29411

case X86ISD::BLENDI: {

29412

SDValue V0 = N->getOperand(0);

29413

SDValue V1 = N->getOperand(1);

29414

assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&(static_cast <bool> (VT == V0.getSimpleValueType() &&
VT == V1.getSimpleValueType() && "Unexpected input vector types"
) ? void (0) : __assert_fail ("VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && \"Unexpected input vector types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29415, __extension__ __PRETTY_FUNCTION__))

29415

"Unexpected input vector types")(static_cast <bool> (VT == V0.getSimpleValueType() &&
VT == V1.getSimpleValueType() && "Unexpected input vector types"
) ? void (0) : __assert_fail ("VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && \"Unexpected input vector types\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29415, __extension__ __PRETTY_FUNCTION__));

29416

29417

// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector

29418

// operands and changing the mask to 1. This saves us a bunch of

29419

// pattern-matching possibilities related to scalar math ops in SSE/AVX.

29420

// x86InstrInfo knows how to commute this back after instruction selection

29421

// if it would help register allocation.

29422

29423

// TODO: If optimizing for size or a processor that doesn't suffer from

29424

// partial register update stalls, this should be transformed into a MOVSD

29425

// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

29426

29427

if (VT == MVT::v2f64)

29428

if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))

29429

if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {

29430

SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);

29431

return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);

29432

}

29433

29434

return SDValue();

29435

}

29436

case X86ISD::MOVSD:

29437

case X86ISD::MOVSS: {

29438

SDValue V0 = peekThroughBitcasts(N->getOperand(0));

29439

SDValue V1 = peekThroughBitcasts(N->getOperand(1));

29440

bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());

29441

bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());

29442

if (isZero0 && isZero1)

29443

return SDValue();

29444

29445

// We often lower to MOVSD/MOVSS from integer as well as native float

29446

// types; remove unnecessary domain-crossing bitcasts if we can to make it

29447

// easier to combine shuffles later on. We've already accounted for the

29448

// domain switching cost when we decided to lower with it.

29449

bool isFloat = VT.isFloatingPoint();

29450

bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();

29451

bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();

29452

if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {

29453

MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)

29454

: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);

29455

V0 = DAG.getBitcast(NewVT, V0);

29456

V1 = DAG.getBitcast(NewVT, V1);

29457

return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));

29458

}

29459

29460

return SDValue();

29461

}

29462

case X86ISD::INSERTPS: {

29463

assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32")(static_cast <bool> (VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"
) ? void (0) : __assert_fail ("VT == MVT::v4f32 && \"INSERTPS ValueType must be MVT::v4f32\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29463, __extension__ __PRETTY_FUNCTION__));

29464

SDValue Op0 = N.getOperand(0);

29465

SDValue Op1 = N.getOperand(1);

29466

SDValue Op2 = N.getOperand(2);

29467

unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();

29468

unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;

29469

unsigned DstIdx = (InsertPSMask >> 4) & 0x3;

29470

unsigned ZeroMask = InsertPSMask & 0xF;

29471

29472

// If we zero out all elements from Op0 then we don't need to reference it.

29473

if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())

29474

return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,

29475

DAG.getConstant(InsertPSMask, DL, MVT::i8));

29476

29477

// If we zero out the element from Op1 then we don't need to reference it.

29478

if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())

29479

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),

29480

DAG.getConstant(InsertPSMask, DL, MVT::i8));

29481

29482

// Attempt to merge insertps Op1 with an inner target shuffle node.

29483

SmallVector<int, 8> TargetMask1;

29484

SmallVector<SDValue, 2> Ops1;

29485

if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {

29486

int M = TargetMask1[SrcIdx];

29487

if (isUndefOrZero(M)) {

29488

// Zero/UNDEF insertion - zero out element and remove dependency.

29489

InsertPSMask |= (1u << DstIdx);

29490

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),

29491

DAG.getConstant(InsertPSMask, DL, MVT::i8));

29492

}

29493

// Update insertps mask srcidx and reference the source input directly.

29494

assert(0 <= M && M < 8 && "Shuffle index out of range")(static_cast <bool> (0 <= M && M < 8 &&
"Shuffle index out of range") ? void (0) : __assert_fail ("0 <= M && M < 8 && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29494, __extension__ __PRETTY_FUNCTION__));

29495

InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);

29496

Op1 = Ops1[M < 4 ? 0 : 1];

29497

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,

29498

DAG.getConstant(InsertPSMask, DL, MVT::i8));

29499

}

29500

29501

// Attempt to merge insertps Op0 with an inner target shuffle node.

29502

SmallVector<int, 8> TargetMask0;

29503

SmallVector<SDValue, 2> Ops0;

29504

if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))

29505

return SDValue();

29506

29507

bool Updated = false;

29508

bool UseInput00 = false;

29509

bool UseInput01 = false;

29510

for (int i = 0; i != 4; ++i) {

29511

int M = TargetMask0[i];

29512

if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {

29513

// No change if element is already zero or the inserted element.

29514

continue;

29515

} else if (isUndefOrZero(M)) {

29516

// If the target mask is undef/zero then we must zero the element.

29517

InsertPSMask |= (1u << i);

29518

Updated = true;

29519

continue;

29520

}

29521

29522

// The input vector element must be inline.

29523

if (M != i && M != (i + 4))

29524

return SDValue();

29525

29526

// Determine which inputs of the target shuffle we're using.

29527

UseInput00 |= (0 <= M && M < 4);

29528

UseInput01 |= (4 <= M);

29529

}

29530

29531

// If we're not using both inputs of the target shuffle then use the

29532

// referenced input directly.

29533

if (UseInput00 && !UseInput01) {

29534

Updated = true;

29535

Op0 = Ops0[0];

29536

} else if (!UseInput00 && UseInput01) {

29537

Updated = true;

29538

Op0 = Ops0[1];

29539

}

29540

29541

if (Updated)

29542

return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,

29543

DAG.getConstant(InsertPSMask, DL, MVT::i8));

29544

29545

return SDValue();

29546

}

29547

default:

29548

return SDValue();

29549

}

29550

29551

// Nuke no-op shuffles that show up after combining.

29552

if (isNoopShuffleMask(Mask))

29553

return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

29554

29555

// Look for simplifications involving one or two shuffle instructions.

29556

SDValue V = N.getOperand(0);

29557

switch (N.getOpcode()) {

29558

default:

29559

break;

29560

case X86ISD::PSHUFLW:

29561

case X86ISD::PSHUFHW:

29562

assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!")(static_cast <bool> (VT.getVectorElementType() == MVT::
i16 && "Bad word shuffle type!") ? void (0) : __assert_fail
("VT.getVectorElementType() == MVT::i16 && \"Bad word shuffle type!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29562, __extension__ __PRETTY_FUNCTION__));

29563

29564

if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))

29565

return SDValue(); // We combined away this shuffle, so we're done.

29566

29567

// See if this reduces to a PSHUFD which is no more expensive and can

29568

// combine with more operations. Note that it has to at least flip the

29569

// dwords as otherwise it would have been removed as a no-op.

29570

if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {

29571

int DMask[] = {0, 1, 2, 3};

29572

int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;

29573

DMask[DOffset + 0] = DOffset + 1;

29574

DMask[DOffset + 1] = DOffset + 0;

29575

MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

29576

V = DAG.getBitcast(DVT, V);

29577

DCI.AddToWorklist(V.getNode());

29578

V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,

29579

getV4X86ShuffleImm8ForMask(DMask, DL, DAG));

29580

DCI.AddToWorklist(V.getNode());

29581

return DAG.getBitcast(VT, V);

29582

}

29583

29584

// Look for shuffle patterns which can be implemented as a single unpack.

29585

// FIXME: This doesn't handle the location of the PSHUFD generically, and

29586

// only works when we have a PSHUFD followed by two half-shuffles.

29587

if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&

29588

(V.getOpcode() == X86ISD::PSHUFLW ||

29589

V.getOpcode() == X86ISD::PSHUFHW) &&

29590

V.getOpcode() != N.getOpcode() &&

29591

V.hasOneUse()) {

29592

SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));

29593

if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {

29594

SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);

29595

SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);

29596

int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;

29597

int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;

29598

int WordMask[8];

29599

for (int i = 0; i < 4; ++i) {

29600

WordMask[i + NOffset] = Mask[i] + NOffset;

29601

WordMask[i + VOffset] = VMask[i] + VOffset;

29602

}

29603

// Map the word mask through the DWord mask.

29604

int MappedMask[8];

29605

for (int i = 0; i < 8; ++i)

29606

MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;

29607

if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||

29608

makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {

29609

// We can replace all three shuffles with an unpack.

29610

V = DAG.getBitcast(VT, D.getOperand(0));

29611

DCI.AddToWorklist(V.getNode());

29612

return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL

29613

: X86ISD::UNPCKH,

29614

DL, VT, V, V);

29615

}

29616

}

29617

}

29618

29619

break;

29620

29621

case X86ISD::PSHUFD:

29622

if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))

29623

return NewN;

29624

29625

break;

29626

}

29627

29628

return SDValue();

29629

}

29630

29631

/// Returns true iff the shuffle node \p N can be replaced with ADDSUB

29632

/// operation. If true is returned then the operands of ADDSUB operation

29633

/// are written to the parameters \p Opnd0 and \p Opnd1.

29634

///

29635

/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes

29636

/// so it is easier to generically match. We also insert dummy vector shuffle

29637

/// nodes for the operands which explicitly discard the lanes which are unused

29638

/// by this operation to try to flow through the rest of the combiner

29639

/// the fact that they're unused.

29640

static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,

29641

SDValue &Opnd0, SDValue &Opnd1) {

29642

29643

EVT VT = N->getValueType(0);

29644

if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&

29645

(!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&

29646

(!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))

29647

return false;

29648

29649

// We only handle target-independent shuffles.

29650

// FIXME: It would be easy and harmless to use the target shuffle mask

29651

// extraction tool to support more.

29652

if (N->getOpcode() != ISD::VECTOR_SHUFFLE)

29653

return false;

29654

29655

ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();

29656

SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

29657

29658

SDValue V1 = N->getOperand(0);

29659

SDValue V2 = N->getOperand(1);

29660

29661

// We require the first shuffle operand to be the FSUB node, and the second to

29662

// be the FADD node.

29663

if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {

29664

ShuffleVectorSDNode::commuteMask(Mask);

29665

std::swap(V1, V2);

29666

} else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)

29667

return false;

29668

29669

// If there are other uses of these operations we can't fold them.

29670

if (!V1->hasOneUse() || !V2->hasOneUse())

29671

return false;

29672

29673

// Ensure that both operations have the same operands. Note that we can

29674

// commute the FADD operands.

29675

SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);

29676

if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&

29677

(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))

29678

return false;

29679

29680

// We're looking for blends between FADD and FSUB nodes. We insist on these

29681

// nodes being lined up in a specific expected pattern.

29682

if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||

29683

isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||

29684

isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||

29685

isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,

29686

8, 25, 10, 27, 12, 29, 14, 31})))

29687

return false;

29688

29689

Opnd0 = LHS;

29690

Opnd1 = RHS;

29691

return true;

29692

}

29693

29694

/// \brief Try to combine a shuffle into a target-specific add-sub or

29695

/// mul-add-sub node.

29696

static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,

29697

const X86Subtarget &Subtarget,

29698

SelectionDAG &DAG) {

29699

SDValue Opnd0, Opnd1;

29700

if (!isAddSub(N, Subtarget, Opnd0, Opnd1))

29701

return SDValue();

29702

29703

EVT VT = N->getValueType(0);

29704

SDLoc DL(N);

29705

29706

// Try to generate X86ISD::FMADDSUB node here.

29707

SDValue Opnd2;

29708

if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))

29709

return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

29710

29711

// Do not generate X86ISD::ADDSUB node for 512-bit types even though

29712

// the ADDSUB idiom has been successfully recognized. There are no known

29713

// X86 targets with 512-bit ADDSUB instructions!

29714

if (VT.is512BitVector())

29715

return SDValue();

29716

29717

return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);

29718

}

29719

29720

// We are looking for a shuffle where both sources are concatenated with undef

29721

// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so

29722

// if we can express this as a single-source shuffle, that's preferable.

29723

static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,

29724

const X86Subtarget &Subtarget) {

29725

if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))

29726

return SDValue();

29727

29728

EVT VT = N->getValueType(0);

29729

29730

// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.

29731

if (!VT.is128BitVector() && !VT.is256BitVector())

29732

return SDValue();

29733

29734

if (VT.getVectorElementType() != MVT::i32 &&

29735

VT.getVectorElementType() != MVT::i64 &&

29736

VT.getVectorElementType() != MVT::f32 &&

29737

VT.getVectorElementType() != MVT::f64)

29738

return SDValue();

29739

29740

SDValue N0 = N->getOperand(0);

29741

SDValue N1 = N->getOperand(1);

29742

29743

// Check that both sources are concats with undef.

29744

if (N0.getOpcode() != ISD::CONCAT_VECTORS ||

29745

N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||

29746

N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||

29747

!N1.getOperand(1).isUndef())

29748

return SDValue();

29749

29750

// Construct the new shuffle mask. Elements from the first source retain their

29751

// index, but elements from the second source no longer need to skip an undef.

29752

SmallVector<int, 8> Mask;

29753

int NumElts = VT.getVectorNumElements();

29754

29755

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

29756

for (int Elt : SVOp->getMask())

29757

Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

29758

29759

SDLoc DL(N);

29760

SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),

29761

N1.getOperand(0));

29762

return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);

29763

}

29764

29765

/// Eliminate a redundant shuffle of a horizontal math op.

29766

static SDValue foldShuffleOfHorizOp(SDNode *N) {

29767

if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())

29768

return SDValue();

29769

29770

SDValue HOp = N->getOperand(0);

29771

if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&

29772

HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)

29773

return SDValue();

29774

29775

// 128-bit horizontal math instructions are defined to operate on adjacent

29776

// lanes of each operand as:

29777

// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]

29778

// ...similarly for v2f64 and v8i16.

29779

// TODO: 256-bit is not the same because...x86.

29780

if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)

29781

return SDValue();

29782

29783

// When the operands of a horizontal math op are identical, the low half of

29784

// the result is the same as the high half. If the shuffle is also replicating

29785

// low and high halves, we don't need the shuffle.

29786

// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X

29787

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();

29788

// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,

29789

// but this should be tied to whatever horizontal op matching and shuffle

29790

// canonicalization are producing.

29791

if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||

29792

isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||

29793

isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))

29794

return HOp;

29795

29796

return SDValue();

29797

}

29798

29799

/// Combine entry point for shuffle nodes. Tries, in order: ADDSUB/FMADDSUB
/// and horizontal-op folds (legal types only), pushing a shuffle through a
/// bitcast of a binary op, turning a shuffle of consecutive loads into one
/// load, the AVX2 concat-with-undef fold, and finally the target shuffle
/// combines. Returns an empty SDValue when nothing is handed back.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT)) {
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
      return HAddSub;
  }

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  const bool IsUnaryShuffleOfOneUseBitcast =
      !DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse();
  if (IsUnaryShuffleOfOneUseBitcast) {
    SDValue CastOp = N->getOperand(0);
    SDValue UndefOp = N->getOperand(1);

    SDValue BinOp = CastOp.getOperand(0);
    EVT BinOpVT = BinOp.getValueType();
    unsigned BinOpc = BinOp.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BinOp.hasOneUse() && BinOpVT.isVector() &&
        BinOpVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(BinOpc, VT)) {
      bool CanFold;
      switch (BinOpc) {
      case ISD::ADD:
      case ISD::SUB:
      case ISD::MUL:
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        break;
      case ISD::FADD:
      case ISD::FSUB:
      case ISD::FMUL:
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();
        break;
      default:
        CanFold = false;
        break;
      }

      // The mask must be <0, 2, 4, ...> in its first half and undef in its
      // second half. Stop checking at the first mismatch.
      const unsigned HalfElts = BinOpVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0; CanFold && i != NumElts; ++i) {
        const int M = SVOp->getMaskElt(i);
        CanFold = (i < HalfElts) ? (M == (int)(i * 2)) : (M < 0);
      }

      if (CanFold) {
        SDValue NewLHS = DAG.getBitcast(VT, BinOp.getOperand(0));
        SDValue NewRHS = DAG.getBitcast(VT, BinOp.getOperand(1));
        SDValue NewBinOp =
            DAG.getNode(BinOp.getOpcode(), DL, VT, NewLHS, NewRHS);
        return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefOp,
                                    SVOp->getMask());
      }
    }
  }

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    SDValue Elt = getShuffleScalarElt(N, i, DAG, 0);
    if (!Elt) {
      // One unknown element is enough to give up on the whole vector.
      Elts.clear();
      break;
    }
    Elts.push_back(Elt);
  }

  if (Elts.size() == VT.getVectorNumElements()) {
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, true))
      return LD;
  }

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (!isTargetShuffle(N->getOpcode()))
    return SDValue();

  SDValue Op(N, 0);
  if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
    return Shuffle;

  // Try recursively combining arbitrary sequences of x86 shuffle
  // instructions into higher-order shuffles. We do this after combining
  // specific PSHUF instruction sequences into their minimal form so that we
  // can evaluate how many specialized shuffle instructions are involved in
  // a particular chain.
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    // The replacement is installed here, so nothing is returned to the caller.
    DCI.CombineTo(N, Res);
    return SDValue();
  }

  return SDValue();
}

29922

29923

/// Check if a vector extract from a target-specific shuffle of a load can be

29924

/// folded into a single element load.

29925

/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but

29926

/// shuffles have been custom lowered so we need to handle those here.

29927

static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,

29928

TargetLowering::DAGCombinerInfo &DCI) {

29929

if (DCI.isBeforeLegalizeOps())

29930

return SDValue();

29931

29932

SDValue InVec = N->getOperand(0);

29933

SDValue EltNo = N->getOperand(1);

29934

EVT EltVT = N->getValueType(0);

29935

29936

if (!isa<ConstantSDNode>(EltNo))

29937

return SDValue();

29938

29939

EVT OriginalVT = InVec.getValueType();

29940

29941

// Peek through bitcasts, don't duplicate a load with other uses.

29942

InVec = peekThroughOneUseBitcasts(InVec);

29943

29944

EVT CurrentVT = InVec.getValueType();

29945

if (!CurrentVT.isVector() ||

29946

CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())

29947

return SDValue();

29948

29949

if (!isTargetShuffle(InVec.getOpcode()))

29950

return SDValue();

29951

29952

// Don't duplicate a load with other uses.

29953

if (!InVec.hasOneUse())

29954

return SDValue();

29955

29956

SmallVector<int, 16> ShuffleMask;

29957

SmallVector<SDValue, 2> ShuffleOps;

29958

bool UnaryShuffle;

29959

if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,

29960

ShuffleOps, ShuffleMask, UnaryShuffle))

29961

return SDValue();

29962

29963

// Select the input vector, guarding against out of range extract vector.

29964

unsigned NumElems = CurrentVT.getVectorNumElements();

29965

int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

29966

int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

29967

29968

if (Idx == SM_SentinelZero)

29969

return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)

29970

: DAG.getConstantFP(+0.0, SDLoc(N), EltVT);

29971

if (Idx == SM_SentinelUndef)

29972

return DAG.getUNDEF(EltVT);

29973

29974

assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range")(static_cast <bool> (0 <= Idx && Idx < (int
)(2 * NumElems) && "Shuffle index out of range") ? void
(0) : __assert_fail ("0 <= Idx && Idx < (int)(2 * NumElems) && \"Shuffle index out of range\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29974, __extension__ __PRETTY_FUNCTION__));

29975

SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]

29976

: ShuffleOps[1];

29977

29978

// If inputs to shuffle are the same for both ops, then allow 2 uses

29979

unsigned AllowedUses =

29980

(ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

29981

29982

if (LdNode.getOpcode() == ISD::BITCAST) {

29983

// Don't duplicate a load with other uses.

29984

if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))

29985

return SDValue();

29986

29987

AllowedUses = 1; // only allow 1 load use if we have a bitcast

29988

LdNode = LdNode.getOperand(0);

29989

}

29990

29991

if (!ISD::isNormalLoad(LdNode.getNode()))

29992

return SDValue();

29993

29994

LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

29995

29996

if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())

29997

return SDValue();

29998

29999

// If there's a bitcast before the shuffle, check if the load type and

30000

// alignment is valid.

30001

unsigned Align = LN0->getAlignment();

30002

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

30003

unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(

30004

EltVT.getTypeForEVT(*DAG.getContext()));

30005

30006

if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))

30007

return SDValue();

30008

30009

// All checks match so transform back to vector_shuffle so that DAG combiner

30010

// can finish the job

30011

SDLoc dl(N);

30012

30013

// Create shuffle node taking into account the case that its a unary shuffle

30014

SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];

30015

Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,

30016

ShuffleMask);

30017

Shuffle = DAG.getBitcast(OriginalVT, Shuffle);

30018

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,

30019

EltNo);

30020

}

30021

30022

// Try to match patterns such as

30023

// (i16 bitcast (v16i1 x))

30024

// ->

30025

// (i16 movmsk (16i8 sext (v16i1 x)))

30026

// before the illegal vector is scalarized on subtargets that don't have legal

30027

// vxi1 types.

30028

/// Try to lower a bitcast between an integer scalar and a vXi1 mask vector
/// without going through scalarization of the mask.
///
/// Two independent patterns are handled:
///  * With AVX512: (vNi1 bitcast (or (shl X, N/2), Y)), where Y has no bits
///    set in its upper half, is rewritten as a CONCAT_VECTORS of the two
///    half-width masks.
///  * With SSE2 but without AVX512: (iN bitcast (vNi1 X)) is rewritten as a
///    MOVMSK of X extended to a wider vector type.
///
/// \param DAG       The DAG in which replacement nodes are created.
/// \param BitCast   The bitcast value being combined.
/// \param Subtarget Used to query the available vector features.
/// \returns The replacement value, or an empty SDValue if nothing matched.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
                                  const X86Subtarget &Subtarget) {
  EVT VT = BitCast.getValueType();
  SDValue N0 = BitCast.getOperand(0);
  EVT VecVT = N0->getValueType(0);

  // VT must be a simple type before its SimpleTy may be inspected below;
  // extended vector types simply fall through to the checks that follow.
  if (VT.isVector() && VT.isSimple() && VecVT.isScalarInteger() &&
      Subtarget.hasAVX512() && N0->getOpcode() == ISD::OR) {
    SDValue Op0 = N0->getOperand(0);
    SDValue Op1 = N0->getOperand(1);
    MVT TrunckVT;
    MVT BitcastVT;
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v16i1:
      TrunckVT = MVT::i8;
      BitcastVT = MVT::v8i1;
      break;
    case MVT::v32i1:
      TrunckVT = MVT::i16;
      BitcastVT = MVT::v16i1;
      break;
    case MVT::v64i1:
      TrunckVT = MVT::i32;
      BitcastVT = MVT::v32i1;
      break;
    }

    // One OR operand must be a left shift (supplying the upper half) and the
    // other a zero-extend or AND (supplying the lower half).
    bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
    bool isArg0UndefLeft =
        Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
    bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
    bool isArg1UndefLeft =
        Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
    SDValue OpLeft;
    SDValue OpRight;
    if (isArg0UndefRight && isArg1UndefLeft) {
      OpLeft = Op0;
      OpRight = Op1;
    } else if (isArg1UndefRight && isArg0UndefLeft) {
      OpLeft = Op1;
      OpRight = Op0;
    } else
      return SDValue();

    // The rewrite is only sound when the OR really is the concatenation of
    // two half-width values: the shift must be by exactly half the scalar
    // width, and the other operand must not contribute any upper-half bits.
    unsigned NumBits = VecVT.getSizeInBits();
    unsigned HalfBits = NumBits / 2;
    auto *ShAmt = dyn_cast<ConstantSDNode>(OpLeft->getOperand(1));
    if (!ShAmt || ShAmt->getAPIntValue() != HalfBits)
      return SDValue();
    if (!DAG.MaskedValueIsZero(OpRight,
                               APInt::getHighBitsSet(NumBits, HalfBits)))
      return SDValue();

    SDLoc DL(BitCast);
    SDValue HiSrc = OpLeft->getOperand(0);
    SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, HiSrc);
    SDValue HiMask = DAG.getBitcast(BitcastVT, HiTrunc);
    SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
    SDValue LoMask = DAG.getBitcast(BitcastVT, LoTrunc);
    // Higher-numbered mask elements correspond to more significant scalar
    // bits (the pre-AVX2 MOVMSK split below relies on the same convention),
    // and the first CONCAT_VECTORS operand supplies the lowest-numbered
    // elements, so the unshifted half goes first.
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoMask, HiMask);
  }

  if (!VT.isScalarInteger() || !VecVT.isSimple())
    return SDValue();

  // With AVX512 vxi1 types are legal and we prefer using k-regs.
  // MOVMSK is supported in SSE2 or later.
  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
    return SDValue();

  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
  // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
  // v8i16 and v16i16.
  // For these two cases, we can shuffle the upper element bytes to a
  // consecutive sequence at the start of the vector and treat the results as
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
  // for v16i16 this is not the case, because the shuffle is expensive, so we
  // avoid sign-extending to this type entirely.
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
  MVT SExtVT;
  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  switch (VecVT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v2i1:
    SExtVT = MVT::v2i64;
    FPCastVT = MVT::v2f64;
    break;
  case MVT::v4i1:
    SExtVT = MVT::v4i32;
    FPCastVT = MVT::v4f32;
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
    // sign-extend to a 256-bit operation to avoid truncation.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        N0->getOperand(0)->getValueType(0).is256BitVector()) {
      SExtVT = MVT::v4i64;
      FPCastVT = MVT::v4f64;
    }
    break;
  case MVT::v8i1:
    SExtVT = MVT::v8i16;
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
    // sign-extend to a 256-bit operation to match the compare.
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
    // 256-bit because the shuffle is cheaper than sign extending the result of
    // the compare.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        (N0->getOperand(0)->getValueType(0).is256BitVector() ||
         N0->getOperand(0)->getValueType(0).is512BitVector())) {
      SExtVT = MVT::v8i32;
      FPCastVT = MVT::v8f32;
    }
    break;
  case MVT::v16i1:
    SExtVT = MVT::v16i8;
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
    // it is not profitable to sign-extend to 256-bit because this will
    // require an extra cross-lane shuffle which is more expensive than
    // truncating the result of the compare to 128-bits.
    break;
  case MVT::v32i1:
    SExtVT = MVT::v32i8;
    break;
  }

  SDLoc DL(BitCast);
  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

  if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
    // Handle pre-AVX2 cases by splitting to two v16i1's.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
    SDValue Lo = extract128BitVector(V, 0, DAG, DL);
    SDValue Hi = extract128BitVector(V, 16, DAG, DL);
    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
    // Elements 16..31 provide bits 16..31 of the combined mask.
    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
                     DAG.getConstant(16, DL, ShiftTy));
    V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
    return DAG.getZExtOrTrunc(V, DL, VT);
  }

  if (SExtVT == MVT::v8i16) {
    assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
    // Pack the eight i16 lanes down to bytes so the v16i8 MOVMSK can be used.
    V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
                    DAG.getUNDEF(MVT::v8i16));
  } else
    assert(SExtVT.getScalarType() != MVT::i16 &&
           "Vectors of i16 must be packed");
  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
    V = DAG.getBitcast(FPCastVT, V);
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getZExtOrTrunc(V, DL, VT);
}

30174

30175

/// DAG combine for a bitcast node.
///
/// In order, this tries:
///  1. before legalization, the vXi1 <-> scalar rewrite done by
///     combineBitcastvxi1;
///  2. three x86mmx-producing patterns (build_vector with a zero upper
///     element, element/subvector extraction at index 0, and FP_TO_SINT);
///  3. turning a bitcast of an integer AND/OR/XOR with one bitcasted
///     f32/f64 operand into the matching X86ISD::FAND/FOR/FXOR node.
///
/// \returns The replacement value, or an empty SDValue if nothing matched.
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  // Try to match patterns such as
  // (i16 bitcast (v16i1 x))
  // ->
  // (i16 movmsk (16i8 sext (v16i1 x)))
  // before the setcc result is scalarized on subtargets that don't have legal
  // vxi1 types.
  if (DCI.isBeforeLegalize()) {
    if (SDValue Folded = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
      return Folded;
  }

  // Since MMX types are special and don't usually play with other vector
  // types, it's better to handle them early to be sure we emit efficient code
  // by avoiding store-load conversions.
  if (DstVT == MVT::x86mmx) {
    const unsigned SrcOpc = Src.getOpcode();

    // Bitcast of a v2i32 build_vector whose second element is zero: move the
    // i32 straight into the low word of the MMX register.
    if (SrcOpc == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 &&
        isNullConstant(Src.getOperand(1))) {
      SDValue LowElt = Src->getOperand(0);
      if (LowElt.getValueType() == MVT::i32)
        return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(LowElt), DstVT, LowElt);
    }

    // Bitcast of an element or subvector extracted at index 0 of a 128-bit
    // vector.
    const bool IsExtract = SrcOpc == ISD::EXTRACT_VECTOR_ELT ||
                           SrcOpc == ISD::EXTRACT_SUBVECTOR;
    if (IsExtract && isNullConstant(Src.getOperand(1))) {
      SDValue Vec = Src->getOperand(0);
      if (Vec.getValueType().is128BitVector())
        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(Vec), DstVT,
                           DAG.getBitcast(MVT::v2i64, Vec));
    }

    // Bitcast of a v2i32 FP_TO_SINT: widen to v4i32 with an undef upper half
    // and take the low 64 bits.
    if (SrcVT == MVT::v2i32 && SrcOpc == ISD::FP_TO_SINT) {
      SDLoc DL(Src);
      SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
                                 DAG.getUNDEF(MVT::v2i32));
      return DAG.getNode(X86ISD::MOVDQ2Q, DL, DstVT,
                         DAG.getBitcast(MVT::v2i64, Wide));
    }
  }

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  const unsigned LogicOpc = Src.getOpcode();
  if (LogicOpc == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (LogicOpc == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (LogicOpc == ISD::XOR)
    FPOpcode = X86ISD::FXOR;
  else
    return SDValue();

  const bool IsF32WithSSE1 = Subtarget.hasSSE1() && DstVT == MVT::f32;
  const bool IsF64WithSSE2 = Subtarget.hasSSE2() && DstVT == MVT::f64;
  if (!IsF32WithSSE1 && !IsF64WithSSE2)
    return SDValue();

  SDValue LHS = Src.getOperand(0);
  SDValue RHS = Src.getOperand(1);
  SDLoc LogicDL(Src);

  // Both folds below require the integer logic op to have a single use.
  if (!Src.hasOneUse())
    return SDValue();

  // If CastSide is a single-use bitcast of a non-constant value that already
  // has the destination type, build logic'(that value, bitcast(OtherSide)).
  auto FoldThroughCast = [&](SDValue CastSide, SDValue OtherSide) -> SDValue {
    if (CastSide.getOpcode() != ISD::BITCAST || !CastSide.hasOneUse())
      return SDValue();
    SDValue FPVal = CastSide.getOperand(0);
    if (FPVal.getValueType() != DstVT || isa<ConstantSDNode>(FPVal))
      return SDValue();
    SDValue CastedOther = DAG.getBitcast(DstVT, OtherSide);
    return DAG.getNode(FPOpcode, LogicDL, DstVT, FPVal, CastedOther);
  };

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (SDValue Res = FoldThroughCast(LHS, RHS))
    return Res;

  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (SDValue Res = FoldThroughCast(RHS, LHS))
    return Res;

  return SDValue();
}

30262

30263

// Match a binop + shuffle pyramid that represents a horizontal reduction over

30264

// the elements of a vector.

30265

// Returns the vector that is being reduced on, or SDValue() if a reduction

30266

// was not matched.

30267

static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,

30268

ArrayRef<ISD::NodeType> CandidateBinOps) {

30269

// The pattern must end in an extract from index 0.

30270

if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||

30271

!isNullConstant(Extract->getOperand(1)))

30272

return SDValue();

30273

30274

SDValue Op = Extract->getOperand(0);

30275

unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

30276

30277

// Match against one of the candidate binary ops.

30278

if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {

30279

return Op.getOpcode() == BinOp;

30280

}))

30281

return SDValue();

30282

30283

// At each stage, we're looking for something that looks like:

30284

// %s = shufflevector <8 x i32> %op, <8 x i32> undef,

30285

// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,

30286

// i32 undef, i32 undef, i32 undef, i32 undef>

30287

// %a = binop <8 x i32> %op, %s

30288

// Where the mask changes according to the stage. E.g. for a 3-stage pyramid,

30289

// we expect something like:

30290

// <4,5,6,7,u,u,u,u>

30291

// <2,3,u,u,u,u,u,u>

30292

// <1,u,u,u,u,u,u,u>

30293

unsigned CandidateBinOp = Op.getOpcode();

30294

for (unsigned i = 0; i < Stages; ++i) {

30295

if (Op.getOpcode() != CandidateBinOp)

30296

return SDValue();

30297

30298

ShuffleVectorSDNode *Shuffle =

30299

dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());

30300

if (Shuffle) {

30301

Op = Op.getOperand(1);

30302

} else {

30303

Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());

30304

Op = Op.getOperand(0);

30305

}

30306

30307

// The first operand of the shuffle should be the same as the other operand

30308

// of the binop.

30309

if (!Shuffle || Shuffle->getOperand(0) != Op)

30310

return SDValue();

30311

30312

// Verify the shuffle has the expected (at this stage of the pyramid) mask.

30313

for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)

30314

if (Shuffle->getMaskElt(Index) != MaskEnd + Index)

30315

return SDValue();

30316

}

30317

30318

BinOp = CandidateBinOp;

30319

return Op;

30320

}

30321

30322

// Given a select, detect the following pattern:

30323

// 1: %2 = zext <N x i8> %0 to <N x i32>

30324

// 2: %3 = zext <N x i8> %1 to <N x i32>

30325

// 3: %4 = sub nsw <N x i32> %2, %3

30326

// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]

30327

// 5: %6 = sub nsw <N x i32> zeroinitializer, %4

30328

// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6

30329

// This is useful as it is the input into a SAD pattern.

30330

static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,

30331

SDValue &Op1) {

30332

// Check the condition of the select instruction is greater-than.

30333

SDValue SetCC = Select->getOperand(0);

30334

if (SetCC.getOpcode() != ISD::SETCC)

30335

return false;

30336

ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();

30337

if (CC != ISD::SETGT && CC != ISD::SETLT)

30338

return false;

30339

30340

SDValue SelectOp1 = Select->getOperand(1);

30341

SDValue SelectOp2 = Select->getOperand(2);

30342

30343

// The following instructions assume SelectOp1 is the subtraction operand

30344

// and SelectOp2 is the negation operand.

30345

// In the case of SETLT this is the other way around.

30346

if (CC == ISD::SETLT)

30347

std::swap(SelectOp1, SelectOp2);

30348

30349

// The second operand of the select should be the negation of the first

30350

// operand, which is implemented as 0 - SelectOp1.

30351

if (!(SelectOp2.getOpcode() == ISD::SUB &&

30352

ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&

30353

SelectOp2.getOperand(1) == SelectOp1))

30354

return false;

30355

30356

// The first operand of SetCC is the first operand of the select, which is the

30357

// difference between the two input vectors.

30358

if (SetCC.getOperand(0) != SelectOp1)

30359

return false;

30360

30361

// In SetLT case, The second operand of the comparison can be either 1 or 0.

30362

APInt SplatVal;

30363

if ((CC == ISD::SETLT) &&

30364

!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&

30365

SplatVal.isOneValue()) ||

30366

(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))

30367

return false;

30368

30369

// In SetGT case, The second operand of the comparison can be either -1 or 0.

30370

if ((CC == ISD::SETGT) &&

30371

!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||

30372

ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))

30373

return false;

30374

30375

// The first operand of the select is the difference between the two input

30376

// vectors.

30377

if (SelectOp1.getOpcode() != ISD::SUB)

30378

return false;

30379

30380

Op0 = SelectOp1.getOperand(0);

30381

Op1 = SelectOp1.getOperand(1);

30382

30383

// Check if the operands of the sub are zero-extended from vectors of i8.

30384

if (Op0.getOpcode() != ISD::ZERO_EXTEND ||

30385

Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||

30386

Op1.getOpcode() != ISD::ZERO_EXTEND ||

30387

Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)

30388

return false;

30389

30390

return true;

30391

}

30392

30393

// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs

30394

// to these zexts.

30395

static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,

30396

const SDValue &Zext1, const SDLoc &DL) {

30397

30398

// Find the appropriate width for the PSADBW.

30399

EVT InVT = Zext0.getOperand(0).getValueType();

30400

unsigned RegSize = std::max(128u, InVT.getSizeInBits());

30401

30402

// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we

30403

// fill in the missing vector elements with 0.

30404

unsigned NumConcat = RegSize / InVT.getSizeInBits();

30405

SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));

30406

Ops[0] = Zext0.getOperand(0);

30407

MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

30408

SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

30409

Ops[0] = Zext1.getOperand(0);

30410

SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

30411

30412

// Actually build the SAD

30413

MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);

30414

return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);

30415

}

30416

30417

// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.

30418

static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,

30419

const X86Subtarget &Subtarget) {

30420

// Bail without SSE41.

30421

if (!Subtarget.hasSSE41())

30422

return SDValue();

30423

30424

EVT ExtractVT = Extract->getValueType(0);

30425

if (ExtractVT != MVT::i16)

30426

return SDValue();

30427

30428

// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.

30429

unsigned BinOp;

30430

SDValue Src = matchBinOpReduction(

30431

Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});

30432

if (!Src)

30433

return SDValue();

30434

30435

EVT SrcVT = Src.getValueType();

30436

EVT SrcSVT = SrcVT.getScalarType();

30437

if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)

30438

return SDValue();

30439

30440

SDLoc DL(Extract);

30441

SDValue MinPos = Src;

30442

30443

// First, reduce the source down to 128-bit, applying BinOp to lo/hi.

30444

while (SrcVT.getSizeInBits() > 128) {

30445

unsigned NumElts = SrcVT.getVectorNumElements();

30446

unsigned NumSubElts = NumElts / 2;

30447

SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);

30448

unsigned SubSizeInBits = SrcVT.getSizeInBits();

30449

SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);

30450

SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);

30451

MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);

30452

}

30453

assert(SrcVT == MVT::v8i16 && "Unexpected value type")(static_cast <bool> (SrcVT == MVT::v8i16 && "Unexpected value type"
) ? void (0) : __assert_fail ("SrcVT == MVT::v8i16 && \"Unexpected value type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 30453, __extension__ __PRETTY_FUNCTION__));

30454

30455

// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask

30456

// to flip the value accordingly.

30457

SDValue Mask;

30458

if (BinOp == ISD::SMAX)

30459

Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);

30460

else if (BinOp == ISD::SMIN)

30461

Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);

30462

else if (BinOp == ISD::UMAX)

30463

Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);

30464

30465

if (Mask)

30466

MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

30467

30468

MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);

30469

30470

if (Mask)

30471

MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

30472

30473

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,

30474

DAG.getIntPtrConstant(0, DL));

30475

}

30476

30477

// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.

30478

static SDValue combineHorizontalPredicateResult(SDNode *Extract,

30479

SelectionDAG &DAG,

30480

const X86Subtarget &Subtarget) {

30481

// Bail without SSE2 or with AVX512VL (which uses predicate registers).

30482

if (!Subtarget.hasSSE2() || Subtarget.hasVLX())

30483

return SDValue();

30484

30485

EVT ExtractVT = Extract->getValueType(0);

30486

unsigned BitWidth = ExtractVT.getSizeInBits();

30487

if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&

30488

ExtractVT != MVT::i8)

30489

return SDValue();

30490

30491

// Check for OR(any_of) and AND(all_of) horizontal reduction patterns.

30492

unsigned BinOp = 0;

30493

SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});

30494

if (!Match)

30495

return SDValue();

30496

30497

// EXTRACT_VECTOR_ELT can require implicit extension of the vector element

30498

// which we can't support here for now.

30499

if (Match.getScalarValueSizeInBits() != BitWidth)

30500

return SDValue();

30501

30502

// We require AVX2 for PMOVMSKB for v16i16/v32i8;

30503

unsigned MatchSizeInBits = Match.getValueSizeInBits();

30504

if (!(MatchSizeInBits == 128 ||

30505

(MatchSizeInBits == 256 &&

30506

((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))

30507

return SDValue();

30508

30509

// Don't bother performing this for 2-element vectors.

30510

if (Match.getValueType().getVectorNumElements() <= 2)

30511

return SDValue();

30512

30513

// Check that we are extracting a reduction of all sign bits.

30514

if (DAG.ComputeNumSignBits(Match) != BitWidth)

30515

return SDValue();

30516

30517

// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.

30518

MVT MaskVT;

30519

if (64 == BitWidth || 32 == BitWidth)

30520

MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),

30521

MatchSizeInBits / BitWidth);

30522

else

30523

MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

30524

30525

APInt CompareBits;

30526

ISD::CondCode CondCode;

30527

if (BinOp == ISD::OR) {

30528

// any_of -> MOVMSK != 0

30529

CompareBits = APInt::getNullValue(32);

30530

CondCode = ISD::CondCode::SETNE;

30531

} else {

30532

// all_of -> MOVMSK == ((1 << NumElts) - 1)

30533

CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());

30534

CondCode = ISD::CondCode::SETEQ;

30535

}

30536

30537

// Perform the select as i32/i64 and then truncate to avoid partial register

30538

// stalls.

30539

unsigned ResWidth = std::max(BitWidth, 32u);

30540

EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);

30541

SDLoc DL(Extract);

30542

SDValue Zero = DAG.getConstant(0, DL, ResVT);

30543

SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);

30544

SDValue Res = DAG.getBitcast(MaskVT, Match);

30545

Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);

30546

Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),

30547

Ones, Zero, CondCode);

30548

return DAG.getSExtOrTrunc(Res, DL, ExtractVT);

30549

}

30550

30551

/// Recognize an add reduction over a zero-extended absolute-difference
/// select and replace it with a PSADBW followed, for wide inputs, by a short
/// shuffle+add reduction of the PSADBW result.
///
/// \param Extract    Candidate root of the add reduction.
/// \param DAG        The DAG in which the new nodes are created.
/// \param Subtarget  Used to query SSE2/AVX2/BWI availability.
/// \returns The replacement extract, or an empty SDValue if nothing matched.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Verify the type we're extracting from is any integer type above i16.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!VT.isSimple() || VT.getVectorElementType().getSizeInBits() <= 16)
    return SDValue();

  const unsigned RegSize =
      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Match shuffle + add pyramid.
  unsigned BinOp = 0;
  SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root) {
    const unsigned RootOpc = Root.getOpcode();
    if (RootOpc == ISD::SIGN_EXTEND || RootOpc == ISD::ZERO_EXTEND ||
        RootOpc == ISD::ANY_EXTEND)
      Root = Root.getOperand(0);
  }

  // If there was a match, we want Root to be a select that is the root of an
  // abs-diff pattern.
  if (!Root || Root.getOpcode() != ISD::VSELECT)
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  const unsigned Stages = Log2_32(VT.getVectorNumElements());
  MVT SadVT = SAD.getSimpleValueType();
  if (Stages > 3) {
    const unsigned SadElems = SadVT.getVectorNumElements();

    // Each step adds lanes [Half, 2 * Half) onto lanes [0, Half); Half starts
    // at 2^(Stages - 4) and is halved until a single lane remains.
    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
      SmallVector<int, 16> ShufMask(SadElems, -1);
      for (unsigned Lane = 0; Lane != Half; ++Lane)
        ShufMask[Lane] = Half + Lane;

      SDValue Shifted =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), ShufMask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shifted);
    }
  }

  // Return the lowest TypeSizeInBits bits: reinterpret the SAD vector with
  // the extract's element type and extract at the original index.
  MVT Type = Extract->getSimpleValueType(0);
  const unsigned TypeSizeInBits = Type.getSizeInBits();
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SAD = DAG.getBitcast(ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                     Extract->getOperand(1));
}

30631

30632

// Attempt to peek through a target shuffle and extract the scalar from the

30633

// source.

30634

static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,

30635

TargetLowering::DAGCombinerInfo &DCI,

30636

const X86Subtarget &Subtarget) {

30637

if (DCI.isBeforeLegalizeOps())

30638

return SDValue();

30639

30640

SDValue Src = N->getOperand(0);

30641

SDValue Idx = N->getOperand(1);

30642

30643

EVT VT = N->getValueType(0);

30644

EVT SrcVT = Src.getValueType();

30645

EVT SrcSVT = SrcVT.getVectorElementType();

30646

unsigned NumSrcElts = SrcVT.getVectorNumElements();

30647

30648

// Don't attempt this for boolean mask vectors or unknown extraction indices.

30649

if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))

30650

return SDValue();

30651

30652

// Resolve the target shuffle inputs and mask.

30653

SmallVector<int, 16> Mask;

30654

SmallVector<SDValue, 2> Ops;

30655

if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))

30656

return SDValue();

30657

30658

// Attempt to narrow/widen the shuffle mask to the correct size.

30659

if (Mask.size() != NumSrcElts) {

30660

if ((NumSrcElts % Mask.size()) == 0) {

30661

SmallVector<int, 16> ScaledMask;

30662

int Scale = NumSrcElts / Mask.size();

30663

scaleShuffleMask<int>(Scale, Mask, ScaledMask);

30664

Mask = std::move(ScaledMask);

30665

} else if ((Mask.size() % NumSrcElts) == 0) {

30666

SmallVector<int, 16> WidenedMask;

30667

while (Mask.size() > NumSrcElts &&

30668

canWidenShuffleElements(Mask, WidenedMask))

30669

Mask = std::move(WidenedMask);

30670

// TODO - investigate support for wider shuffle masks with known upper

30671

// undef/zero elements for implicit zero-extension.

30672

}

30673

}

30674

30675

// Check if narrowing/widening failed.

30676

if (Mask.size() != NumSrcElts)

30677

return SDValue();

30678

30679

int SrcIdx = Mask[N->getConstantOperandVal(1)];

30680

SDLoc dl(N);

30681

30682

// If the shuffle source element is undef/zero then we can just accept it.

30683

if (SrcIdx == SM_SentinelUndef)

30684

return DAG.getUNDEF(VT);

30685

30686

if (SrcIdx == SM_SentinelZero)

30687

return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)

30688

: DAG.getConstant(0, dl, VT);

30689

30690

SDValue SrcOp = Ops[SrcIdx / Mask.size()];

30691

SrcOp = DAG.getBitcast(SrcVT, SrcOp);

30692

SrcIdx = SrcIdx % Mask.size();

30693

30694

// We can only extract other elements from 128-bit vectors and in certain

30695

// circumstances, depending on SSE-level.

30696

// TODO: Investigate using extract_subvector for larger vectors.

30697

// TODO: Investigate float/double extraction if it will be just stored.

30698

if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&

30699

((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {

30700

assert(SrcSVT == VT && "Unexpected extraction type")(static_cast <bool> (SrcSVT == VT && "Unexpected extraction type"
) ? void (0) : __assert_fail ("SrcSVT == VT && \"Unexpected extraction type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 30700, __extension__ __PRETTY_FUNCTION__));

30701

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,

30702

DAG.getIntPtrConstant(SrcIdx, dl));

30703

}

30704

30705

if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||

30706

(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {

30707

assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&(static_cast <bool> (VT.getSizeInBits() >= SrcSVT.getSizeInBits
() && "Unexpected extraction type") ? void (0) : __assert_fail
("VT.getSizeInBits() >= SrcSVT.getSizeInBits() && \"Unexpected extraction type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 30708, __extension__ __PRETTY_FUNCTION__))

30708

"Unexpected extraction type")(static_cast <bool> (VT.getSizeInBits() >= SrcSVT.getSizeInBits
() && "Unexpected extraction type") ? void (0) : __assert_fail
("VT.getSizeInBits() >= SrcSVT.getSizeInBits() && \"Unexpected extraction type\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 30708, __extension__ __PRETTY_FUNCTION__));

30709

unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);

30710

SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,

30711

DAG.getIntPtrConstant(SrcIdx, dl));

30712

return DAG.getZExtOrTrunc(ExtOp, dl, VT);

30713

}

30714

30715

return SDValue();

30716

}

30717

30718

/// Detect vector gather/scatter index generation and convert it from being a

30719

/// bunch of shuffles and extracts into a somewhat faster sequence.

30720

/// For i686, the best sequence is apparently storing the value and loading

30721

/// scalars back, while for x64 we should use 64-bit extracts and shifts.

30722

static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,

30723

TargetLowering::DAGCombinerInfo &DCI,

30724

const X86Subtarget &Subtarget) {

30725

if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))

30726

return NewOp;

30727

30728

// TODO - Remove this once we can handle the implicit zero-extension of

30729

// X86ISD::PEXTRW/X86ISD::PEXTRB in:

30730

// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and

30731

// combineBasicSADPattern.

30732

if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)

30733

return SDValue();

30734

30735

if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))

30736

return NewOp;

30737

30738

SDValue InputVector = N->getOperand(0);

30739

SDValue EltIdx = N->getOperand(1);

30740

30741

EVT SrcVT = InputVector.getValueType();

30742

EVT VT = N->getValueType(0);

30743

SDLoc dl(InputVector);

30744

30745

// Detect mmx extraction of all bits as a i64. It works better as a bitcast.

30746

if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&

30747

VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {

30748

SDValue MMXSrc = InputVector.getOperand(0);

30749

30750

// The bitcast source is a direct mmx result.

30751

if (MMXSrc.getValueType() == MVT::x86mmx)

30752

return DAG.getBitcast(VT, InputVector);

30753

}

30754

30755

// Detect mmx to i32 conversion through a v2i32 elt extract.

30756

if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&

30757

VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {

30758

SDValue MMXSrc = InputVector.getOperand(0);

30759

30760

// The bitcast source is a direct mmx result.

30761

if (MMXSrc.getValueType() == MVT::x86mmx)

30762

return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);

30763

}

30764

30765

if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&

30766

isa<ConstantSDNode>(EltIdx) &&

30767

isa<ConstantSDNode>(InputVector.getOperand(0))) {

30768

uint64_t ExtractedElt = N->getConstantOperandVal(1);

30769

uint64_t InputValue = InputVector.getConstantOperandVal(0);

30770

uint64_t Res = (InputValue >> ExtractedElt) & 1;

30771

return DAG.getConstant(Res, dl, MVT::i1);

30772

}

30773

30774

// Check whether this extract is the root of a sum of absolute differences

30775

// pattern. This has to be done here because we really want it to happen

30776

// pre-legalization,

30777

if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))

30778

return SAD;

30779

30780

// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.

30781

if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))

30782

return Cmp;

30783

30784

// Attempt to replace min/max v8i16 reductions with PHMINPOSUW.

30785

if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))

30786

return MinMax;

30787

30788

// Only operate on vectors of 4 elements, where the alternative shuffling

30789

// gets to be more expensive.

30790

if (SrcVT != MVT::v4i32)

30791

return SDValue();

30792

30793

// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a

30794

// single use which is a sign-extend or zero-extend, and all elements are

30795

// used.

30796

SmallVector<SDNode *, 4> Uses;

30797

unsigned ExtractedElements = 0;

30798

for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),

30799

UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {

30800

if (UI.getUse().getResNo() != InputVector.getResNo())

30801

return SDValue();

30802

30803

SDNode *Extract = *UI;

30804

if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)

30805

return SDValue();

30806

30807

if (Extract->getValueType(0) != MVT::i32)

30808

return SDValue();

30809

if (!Extract->hasOneUse())

30810

return SDValue();

30811

if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&

30812

Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)

30813

return SDValue();

30814

if (!isa<ConstantSDNode>(Extract->getOperand(1)))

30815

return SDValue();

30816

30817

// Record which element was extracted.

30818

ExtractedElements |= 1 << Extract->getConstantOperandVal(1);

30819

Uses.push_back(Extract);

30820

}

30821

30822

// If not all the elements were used, this may not be worthwhile.

30823

if (ExtractedElements != 15)

30824

return SDValue();

30825

30826

// Ok, we've now decided to do the transformation.

30827

// If 64-bit shifts are legal, use the extract-shift sequence,

30828

// otherwise bounce the vector off the cache.

30829

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

30830

SDValue Vals[4];

30831

30832

if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {

30833

SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);

30834

auto &DL = DAG.getDataLayout();

30835

EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);

30836

SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,

30837

DAG.getConstant(0, dl, VecIdxTy));

30838

SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,

30839

DAG.getConstant(1, dl, VecIdxTy));

30840

30841

SDValue ShAmt = DAG.getConstant(

30842

32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));

30843

Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);

30844

Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,

30845

DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));

30846

Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);

30847

Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,

30848

DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));

30849

} else {

30850

// Store the value to a temporary stack slot.

30851

SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);

30852

SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,

30853

MachinePointerInfo());

30854

30855

EVT ElementType = SrcVT.getVectorElementType();

30856

unsigned EltSize = ElementType.getSizeInBits() / 8;

30857

30858

// Replace each use (extract) with a load of the appropriate element.

30859

for (unsigned i = 0; i < 4; ++i) {

30860

uint64_t Offset = EltSize * i;

30861

auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());

30862

SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

30863

30864

SDValue ScalarAddr =

30865

DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

30866

30867

// Load the scalar.

30868

Vals[i] =

30869

DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());

30870

}

30871

}

30872

30873

// Replace the extracts

30874

for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),

30875

UE = Uses.end(); UI != UE; ++UI) {

30876

SDNode *Extract = *UI;

30877

30878

uint64_t IdxVal = Extract->getConstantOperandVal(1);

30879

DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);

30880

}

30881

30882

// The replacement was made in place; don't return anything.

30883

return SDValue();

30884

}

30885

30886

/// If a vector select has an operand that is -1 or 0, try to simplify the

30887

/// select to a bitwise logic operation.

30888

/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?

30889

static SDValue

30890

combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,

30891

TargetLowering::DAGCombinerInfo &DCI,

30892

const X86Subtarget &Subtarget) {

30893

SDValue Cond = N->getOperand(0);

30894

SDValue LHS = N->getOperand(1);

30895

SDValue RHS = N->getOperand(2);

30896

EVT VT = LHS.getValueType();

30897

EVT CondVT = Cond.getValueType();

30898

SDLoc DL(N);

30899

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

30900

30901

if (N->getOpcode() != ISD::VSELECT)

30902

return SDValue();

30903

30904

assert(CondVT.isVector() && "Vector select expects a vector selector!")(static_cast <bool> (CondVT.isVector() && "Vector select expects a vector selector!"
) ? void (0) : __assert_fail ("CondVT.isVector() && \"Vector select expects a vector selector!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 30904, __extension__ __PRETTY_FUNCTION__));

30905

30906

bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());

30907

// Check if the first operand is all zeros and Cond type is vXi1.

30908

// This situation only applies to avx512.

30909

if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&

30910

CondVT.getVectorElementType() == MVT::i1) {

30911

// Invert the cond to not(cond) : xor(op,allones)=not(op)

30912

SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,

30913

DAG.getAllOnesConstant(DL, CondVT));

30914

// Vselect cond, op1, op2 = Vselect not(cond), op2, op1

30915

return DAG.getSelect(DL, VT, CondNew, RHS, LHS);

30916

}

30917

30918

// To use the condition operand as a bitwise mask, it must have elements that

30919

// are the same size as the select elements. Ie, the condition operand must

30920

// have already been promoted from the IR select condition type <N x i1>.

30921

// Don't check if the types themselves are equal because that excludes

30922

// vector floating-point selects.

30923

if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())

30924

return SDValue();

30925

30926

bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());

30927

bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

30928

30929

// Try to invert the condition if true value is not all 1s and false value is

30930

// not all 0s.

30931

if (!TValIsAllOnes && !FValIsAllZeros &&

30932

// Check if the selector will be produced by CMPP*/PCMP*.

30933

Cond.getOpcode() == ISD::SETCC &&

30934

// Check if SETCC has already been promoted.

30935

TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==

30936

CondVT) {

30937

bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

30938

30939

if (TValIsAllZeros || FValIsAllOnes) {

30940

SDValue CC = Cond.getOperand(2);

30941

ISD::CondCode NewCC =

30942

ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),

30943

Cond.getOperand(0).getValueType().isInteger());

30944

Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),

30945

NewCC);

30946

std::swap(LHS, RHS);

30947

TValIsAllOnes = FValIsAllOnes;

30948

FValIsAllZeros = TValIsAllZeros;

30949

}

30950

}

30951

30952

// Cond value must be 'sign splat' to be converted to a logical op.

30953

if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())

30954

return SDValue();

30955

30956

// vselect Cond, 111..., 000... -> Cond

30957

if (TValIsAllOnes && FValIsAllZeros)

30958

return DAG.getBitcast(VT, Cond);

30959

30960

if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))

30961

return SDValue();

30962

30963

// vselect Cond, 111..., X -> or Cond, X

30964

if (TValIsAllOnes) {

30965

SDValue CastRHS = DAG.getBitcast(CondVT, RHS);

30966

SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);

30967

return DAG.getBitcast(VT, Or);

30968

}

30969

30970

// vselect Cond, X, 000... -> and Cond, X

30971

if (FValIsAllZeros) {

30972

SDValue CastLHS = DAG.getBitcast(CondVT, LHS);

30973

SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);

30974

return DAG.getBitcast(VT, And);

30975

}

30976

30977

// vselect Cond, 000..., X -> andn Cond, X

30978

if (TValIsAllZeros) {

30979

MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);

30980

SDValue CastCond = DAG.getBitcast(AndNVT, Cond);

30981

SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);

30982

SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);

30983

return DAG.getBitcast(VT, AndN);

30984

}

30985

30986

return SDValue();

30987

}

30988

30989

/// Fold a scalar select between two integer constants into arithmetic on the
/// zero-extended condition bit:
///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
///
/// \param N   The select node; operands are (condition, true, false).
/// \param DAG The DAG in which replacement nodes are created.
/// \returns the replacement value, or an empty SDValue when the operands are
///          not both constants, the types are unsuitable, or the constant
///          difference is not a cheap multiplier.
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  SDValue FalseOp = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
  auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
  if (!(TrueC && FalseC))
    return SDValue();

  // Don't do this for crazy integer types.
  EVT VT = N->getValueType(0);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // The condition bit is going to feed math or logic ops. A wider condition
  // value could be allowed (post-legalization it becomes an i8), but if
  // nothing is creating selects that late, it doesn't matter.
  if (Cond.getValueType() != MVT::i1)
    return SDValue();

  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply
  // by 3, 5, or 9 with i32/i64, so those get transformed too.
  // TODO: For constants that overflow or do not differ by power-of-2 or small
  // multiplier, convert to 'and' + 'add'.
  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  bool Overflowed;
  APInt Delta = TrueVal.ssub_ov(FalseVal, Overflowed);
  if (Overflowed)
    return SDValue();

  APInt Scale = Delta.abs();
  bool IsCheapScale =
      Scale.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) &&
                             (Scale == 3 || Scale == 5 || Scale == 9));
  if (!IsCheapScale)
    return SDValue();

  // Shift/LEA codegen needs a positive multiplier. The 'not' of the condition
  // can usually be folded into a compare predicate, but even without that the
  // sequence should be cheaper than a CMOV alternative.
  if (TrueVal.slt(FalseVal)) {
    Cond = DAG.getNOT(DL, Cond, MVT::i1);
    std::swap(TrueC, FalseC);
  }

  // Start from zext(Cond), then scale and offset it as needed.
  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

  // Multiply condition by the difference if non-one.
  if (!Scale.isOneValue()) {
    SDValue ScaleC = DAG.getConstant(Scale, DL, VT);
    Result = DAG.getNode(ISD::MUL, DL, VT, Result, ScaleC);
  }

  // Add the base if non-zero.
  if (!FalseC->isNullValue())
    Result = DAG.getNode(ISD::ADD, DL, VT, Result, SDValue(FalseC, 0));

  return Result;
}

31051

31052

// If this is a bitcasted op that can be represented as another type, push the

31053

// the bitcast to the inputs. This allows more opportunities for pattern

31054

// matching masked instructions. This is called when we know that the operation

31055

// is used as one of the inputs of a vselect.

31056

static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,

31057

TargetLowering::DAGCombinerInfo &DCI) {

31058

// Make sure we have a bitcast.

31059

if (OrigOp.getOpcode() != ISD::BITCAST)

31060

return false;

31061

31062

SDValue Op = OrigOp.getOperand(0);

31063

31064

// If the operation is used by anything other than the bitcast, we shouldn't

31065

// do this combine as that would replicate the operation.

31066

if (!Op.hasOneUse())

31067

return false;

31068

31069

MVT VT = OrigOp.getSimpleValueType();

31070

MVT EltVT = VT.getVectorElementType();

31071

SDLoc DL(Op.getNode());

31072

31073

auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,

31074

SDValue Op2) {

31075

Op0 = DAG.getBitcast(VT, Op0);

31076

DCI.AddToWorklist(Op0.getNode());

31077

Op1 = DAG.getBitcast(VT, Op1);

31078

DCI.AddToWorklist(Op1.getNode());

31079

DCI.CombineTo(OrigOp.getNode(),

31080

DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));

31081

return true;

31082

};

31083

31084

unsigned Opcode = Op.getOpcode();

31085

switch (Opcode) {

31086

case X86ISD::SHUF128: {

31087

if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)

31088

return false;

31089

// Only change element size, not type.

31090

if (VT.isInteger() != Op.getSimpleValueType().isInteger())

31091

return false;

31092

return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),

31093

Op.getOperand(2));

31094

}

31095

case X86ISD::SUBV_BROADCAST: {

31096

unsigned EltSize = EltVT.getSizeInBits();

31097

if (EltSize != 32 && EltSize != 64)

31098

return false;

31099

// Only change element size, not type.

31100

if (VT.isInteger() != Op.getSimpleValueType().isInteger())

31101

return false;

31102

SDValue Op0 = Op.getOperand(0);

31103

MVT Op0VT = MVT::getVectorVT(EltVT,

31104

Op0.getSimpleValueType().getSizeInBits() / EltSize);

31105

Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));

31106

DCI.AddToWorklist(Op0.getNode());

31107

DCI.CombineTo(OrigOp.getNode(),

31108

DAG.getNode(Opcode, DL, VT, Op0));

31109

return true;

31110

}

31111

}

31112

31113

return false;

31114

}

31115

31116

/// Do target-specific dag combines on SELECT and VSELECT nodes.

31117

static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,

31118

TargetLowering::DAGCombinerInfo &DCI,

31119

const X86Subtarget &Subtarget) {

31120

SDLoc DL(N);

31121

SDValue Cond = N->getOperand(0);

31122

// Get the LHS/RHS of the select.

31123

SDValue LHS = N->getOperand(1);

31124

SDValue RHS = N->getOperand(2);

31125

EVT VT = LHS.getValueType();

31126

EVT CondVT = Cond.getValueType();

31127

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

31128

31129

// If we have SSE[12] support, try to form min/max nodes. SSE min/max

31130

// instructions match the semantics of the common C idiom x<y?x:y but not

31131

// x<=y?x:y, because of how they handle negative zero (which can be

31132

// ignored in unsafe-math mode).

31133

// We also try to create v2f32 min/max nodes, which we later widen to v4f32.

31134

if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&

31135

VT != MVT::f80 && VT != MVT::f128 &&

31136

(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&

31137

(Subtarget.hasSSE2() ||

31138

(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {

31139

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

31140

31141

unsigned Opcode = 0;

31142

// Check for x CC y ? x : y.

31143

if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&

31144

DAG.isEqualTo(RHS, Cond.getOperand(1))) {

31145

switch (CC) {

31146

default: break;

31147

case ISD::SETULT:

31148

// Converting this to a min would handle NaNs incorrectly, and swapping

31149

// the operands would cause it to handle comparisons between positive

31150

// and negative zero incorrectly.

31151

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {

31152

if (!DAG.getTarget().Options.UnsafeFPMath &&

31153

!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))

31154

break;

31155

std::swap(LHS, RHS);

31156

}

31157

Opcode = X86ISD::FMIN;

31158

break;

31159

case ISD::SETOLE:

31160

// Converting this to a min would handle comparisons between positive

31161

// and negative zero incorrectly.

31162

if (!DAG.getTarget().Options.UnsafeFPMath &&

31163

!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))

31164

break;

31165

Opcode = X86ISD::FMIN;

31166

break;

31167

case ISD::SETULE:

31168

// Converting this to a min would handle both negative zeros and NaNs

31169

// incorrectly, but we can swap the operands to fix both.

31170

std::swap(LHS, RHS);

31171

LLVM_FALLTHROUGH[[clang::fallthrough]];

31172

case ISD::SETOLT:

31173

case ISD::SETLT:

31174

case ISD::SETLE:

31175

Opcode = X86ISD::FMIN;

31176

break;

31177

31178

case ISD::SETOGE:

31179

// Converting this to a max would handle comparisons between positive

31180

// and negative zero incorrectly.

31181

if (!DAG.getTarget().Options.UnsafeFPMath &&

31182

!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))

31183

break;

31184

Opcode = X86ISD::FMAX;

31185

break;

31186

case ISD::SETUGT:

31187

// Converting this to a max would handle NaNs incorrectly, and swapping

31188

// the operands would cause it to handle comparisons between positive

31189

// and negative zero incorrectly.

31190

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {

31191

if (!DAG.getTarget().Options.UnsafeFPMath &&

31192

!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))

31193

break;

31194

std::swap(LHS, RHS);

31195

}

31196

Opcode = X86ISD::FMAX;

31197

break;

31198

case ISD::SETUGE:

31199

// Converting this to a max would handle both negative zeros and NaNs

31200

// incorrectly, but we can swap the operands to fix both.

31201

std::swap(LHS, RHS);

31202

LLVM_FALLTHROUGH[[clang::fallthrough]];

31203

case ISD::SETOGT:

31204

case ISD::SETGT:

31205

case ISD::SETGE:

31206

Opcode = X86ISD::FMAX;

31207

break;

31208

}

31209

// Check for x CC y ? y : x -- a min/max with reversed arms.

31210

} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&

31211

DAG.isEqualTo(RHS, Cond.getOperand(0))) {

31212

switch (CC) {

31213

default: break;

31214

case ISD::SETOGE:

31215

// Converting this to a min would handle comparisons between positive

31216

// and negative zero incorrectly, and swapping the operands would

31217

// cause it to handle NaNs incorrectly.

31218

if (!DAG.getTarget().Options.UnsafeFPMath &&

31219

!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {

31220

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

31221

break;

31222

std::swap(LHS, RHS);

31223

}

31224

Opcode = X86ISD::FMIN;

31225

break;

31226

case ISD::SETUGT:

31227

// Converting this to a min would handle NaNs incorrectly.

31228

if (!DAG.getTarget().Options.UnsafeFPMath &&

31229

(!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))

31230

break;

31231

Opcode = X86ISD::FMIN;

31232

break;

31233

case ISD::SETUGE:

31234

// Converting this to a min would handle both negative zeros and NaNs

31235

// incorrectly, but we can swap the operands to fix both.

31236

std::swap(LHS, RHS);

31237

LLVM_FALLTHROUGH[[clang::fallthrough]];

31238

case ISD::SETOGT:

31239

case ISD::SETGT:

31240

case ISD::SETGE:

31241

Opcode = X86ISD::FMIN;

31242

break;

31243

31244

case ISD::SETULT:

31245

// Converting this to a max would handle NaNs incorrectly.

31246

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

31247

break;

31248

Opcode = X86ISD::FMAX;

31249

break;

31250

case ISD::SETOLE:

31251

// Converting this to a max would handle comparisons between positive

31252

// and negative zero incorrectly, and swapping the operands would

31253

// cause it to handle NaNs incorrectly.

31254

if (!DAG.getTarget().Options.UnsafeFPMath &&

31255

!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {

31256

if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))

31257

break;

31258

std::swap(LHS, RHS);

31259

}

31260

Opcode = X86ISD::FMAX;

31261

break;

31262

case ISD::SETULE:

31263

// Converting this to a max would handle both negative zeros and NaNs

31264

// incorrectly, but we can swap the operands to fix both.

31265

std::swap(LHS, RHS);

31266

LLVM_FALLTHROUGH[[clang::fallthrough]];

31267

case ISD::SETOLT:

31268

case ISD::SETLT:

31269

case ISD::SETLE:

31270

Opcode = X86ISD::FMAX;

31271

break;

31272

}

31273

}

31274

31275

if (Opcode)

31276

return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);

31277

}

31278

31279

// v16i8 (select v16i1, v16i8, v16i8) does not have a proper

31280

// lowering on KNL. In this case we convert it to

31281

// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.

31282

// The same situation for all 128 and 256-bit vectors of i8 and i16.

31283

// Since SKX these selects have a proper lowering.

31284

if (Subtarget.hasAVX512() && CondVT.isVector() &&

31285

CondVT.getVectorElementType() == MVT::i1 &&

31286

(VT.is128BitVector() || VT.is256BitVector()) &&

31287

(VT.getVectorElementType() == MVT::i8 ||

31288

VT.getVectorElementType() == MVT::i16) &&

31289

!(Subtarget.hasBWI() && Subtarget.hasVLX())) {

31290

Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);

31291

DCI.AddToWorklist(Cond.getNode());

31292

return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);

31293

}

31294

31295

if (SDValue V = combineSelectOfTwoConstants(N, DAG))

31296

return V;

31297

31298

// Canonicalize max and min:

31299

// (x > y) ? x : y -> (x >= y) ? x : y

31300

// (x < y) ? x : y -> (x <= y) ? x : y

31301

// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates

31302

// the need for an extra compare

31303

// against zero. e.g.

31304

// (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0

31305

// subl %esi, %edi

31306

// testl %edi, %edi

31307

// movl $0, %eax

31308

// cmovgl %edi, %eax

31309

// =>

31310

// xorl %eax, %eax

31311

// subl %esi, $edi

31312

// cmovsl %eax, %edi

31313

if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&

31314

DAG.isEqualTo(LHS, Cond.getOperand(0)) &&

31315

DAG.isEqualTo(RHS, Cond.getOperand(1))) {

31316

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

31317

switch (CC) {

31318

default: break;

31319

case ISD::SETLT:

31320

case ISD::SETGT: {

31321

ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;

31322

Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),

31323

Cond.getOperand(0), Cond.getOperand(1), NewCC);

31324

return DAG.getSelect(DL, VT, Cond, LHS, RHS);

31325

}

31326

}

31327

}

31328

31329

// Early exit check

31330

if (!TLI.isTypeLegal(VT))

31331

return SDValue();

31332

31333

// Match VSELECTs into subs with unsigned saturation.

31334

if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&

31335

// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.

31336

((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||

31337

(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {

31338

ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

31339

31340

// Check if one of the arms of the VSELECT is a zero vector. If it's on the

31341

// left side invert the predicate to simplify logic below.

31342

SDValue Other;

31343

if (ISD::isBuildVectorAllZeros(LHS.getNode())) {

31344

Other = RHS;

31345

CC = ISD::getSetCCInverse(CC, true);

31346

} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {

31347

Other = LHS;

31348

}

31349

31350

if (Other.getNode() && Other->getNumOperands() == 2 &&

31351

DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {

31352

SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);

31353

SDValue CondRHS = Cond->getOperand(1);

31354

31355

// Look for a general sub with unsigned saturation first.

31356

// x >= y ? x-y : 0 --> subus x, y

31357

// x > y ? x-y : 0 --> subus x, y

31358

if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&

31359

Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))

31360

return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

31361

31362

if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))

31363

if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {

31364

if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))

31365

if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())

31366

// If the RHS is a constant we have to reverse the const

31367

// canonicalization.

31368

// x > C-1 ? x+-C : 0 --> subus x, C

31369

if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&

31370

CondRHSConst->getAPIntValue() ==

31371

(-OpRHSConst->getAPIntValue() - 1))

31372

return DAG.getNode(

31373

X86ISD::SUBUS, DL, VT, OpLHS,

31374

DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

31375

31376

// Another special case: If C was a sign bit, the sub has been

31377

// canonicalized into a xor.

31378

// FIXME: Would it be better to use computeKnownBits to determine

31379

// whether it's safe to decanonicalize the xor?

31380

// x s< 0 ? x^C : 0 --> subus x, C

31381

if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&

31382

ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&

31383

OpRHSConst->getAPIntValue().isSignMask())

31384

// Note that we have to rebuild the RHS constant here to ensure we

31385

// don't rely on particular values of undef lanes.

31386

return DAG.getNode(

31387

X86ISD::SUBUS, DL, VT, OpLHS,

31388

DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));

31389

}

31390

}

31391

}

31392

31393

if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))

31394

return V;

31395

31396

// If this is a *dynamic* select (non-constant condition) and we can match

31397

// this node with one of the variable blend instructions, restructure the

31398

// condition so that blends can use the high (sign) bit of each element and

31399

// use SimplifyDemandedBits to simplify the condition operand.

31400

if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&

31401

!DCI.isBeforeLegalize() &&

31402

!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {

31403

unsigned BitWidth = Cond.getScalarValueSizeInBits();

31404

31405

// Don't optimize vector selects that map to mask-registers.

31406

if (BitWidth == 1)

31407

return SDValue();

31408

31409

// We can only handle the cases where VSELECT is directly legal on the

31410

// subtarget. We custom lower VSELECT nodes with constant conditions and

31411

// this makes it hard to see whether a dynamic VSELECT will correctly

31412

// lower, so we both check the operation's status and explicitly handle the

31413

// cases where a *dynamic* blend will fail even though a constant-condition

31414

// blend could be custom lowered.

31415

// FIXME: We should find a better way to handle this class of problems.

31416

// Potentially, we should combine constant-condition vselect nodes

31417

// pre-legalization into shuffles and not mark as many types as custom

31418

// lowered.

31419

if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))

31420

return SDValue();

31421

// FIXME: We don't support i16-element blends currently. We could and

31422

// should support them by making *all* the bits in the condition be set

31423

// rather than just the high bit and using an i8-element blend.

31424

if (VT.getVectorElementType() == MVT::i16)

31425

return SDValue();

31426

// Dynamic blending was only available from SSE4.1 onward.

31427

if (VT.is128BitVector() && !Subtarget.hasSSE41())

31428

return SDValue();

31429

// Byte blends are only available in AVX2

31430

if (VT == MVT::v32i8 && !Subtarget.hasAVX2())

31431

return SDValue();

31432

// There are no 512-bit blend instructions that use sign bits.

31433

if (VT.is512BitVector())

31434

return SDValue();

31435

31436

assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size")(static_cast <bool> (BitWidth >= 8 && BitWidth
<= 64 && "Invalid mask size") ? void (0) : __assert_fail
("BitWidth >= 8 && BitWidth <= 64 && \"Invalid mask size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 31436, __extension__ __PRETTY_FUNCTION__));

31437

APInt DemandedMask(APInt::getSignMask(BitWidth));

31438

KnownBits Known;

31439

TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),

31440

!DCI.isBeforeLegalizeOps());

31441

if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||

31442

TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {

31443

// If we changed the computation somewhere in the DAG, this change will

31444

// affect all users of Cond. Make sure it is fine and update all the nodes

31445

// so that we do not use the generic VSELECT anymore. Otherwise, we may

31446

// perform wrong optimizations as we messed with the actual expectation

31447

// for the vector boolean values.

31448

if (Cond != TLO.Old) {

31449

// Check all uses of the condition operand to check whether it will be

31450

// consumed by non-BLEND instructions. Those may require that all bits

31451

// are set properly.

31452

for (SDNode *U : Cond->uses()) {

31453

// TODO: Add other opcodes eventually lowered into BLEND.

31454

if (U->getOpcode() != ISD::VSELECT)

31455

return SDValue();

31456

}

31457

31458

// Update all users of the condition before committing the change, so

31459

// that the VSELECT optimizations that expect the correct vector boolean

31460

// value will not be triggered.

31461

for (SDNode *U : Cond->uses()) {

31462

SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),

31463

U->getValueType(0), Cond, U->getOperand(1),

31464

U->getOperand(2));

31465

DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);

31466

}

31467

DCI.CommitTargetLoweringOpt(TLO);

31468

return SDValue();

31469

}

31470

// Only Cond (rather than other nodes in the computation chain) was

31471

// changed. Change the condition just for N to keep the opportunity to

31472

// optimize all other users their own way.

31473

SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);

31474

DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);

31475

return SDValue();

31476

}

31477

}

31478

31479

// Look for vselects with LHS/RHS being bitcasted from an operation that

31480

// can be executed on another type. Push the bitcast to the inputs of

31481

// the operation. This exposes opportunities for using masking instructions.

31482

if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&

31483

CondVT.getVectorElementType() == MVT::i1) {

31484

if (combineBitcastForMaskedOp(LHS, DAG, DCI))

31485

return SDValue(N, 0);

31486

if (combineBitcastForMaskedOp(RHS, DAG, DCI))

31487

return SDValue(N, 0);

31488

}

31489

31490

// Custom action for SELECT MMX

31491

if (VT == MVT::x86mmx) {

31492

LHS = DAG.getBitcast(MVT::i64, LHS);

31493

RHS = DAG.getBitcast(MVT::i64, RHS);

31494

SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);

31495

return DAG.getBitcast(VT, newSelect);

31496

}

31497

31498

return SDValue();

31499

}

31500

31501

/// Combine:

31502

/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)

31503

/// to:

31504

/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)

31505

/// i.e., reusing the EFLAGS produced by the LOCKed instruction.

31506

/// Note that this is only legal for some op/cc combinations.

31507

static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,

31508

SelectionDAG &DAG,

31509

const X86Subtarget &Subtarget) {

31510

// This combine only operates on CMP-like nodes.

31511

if (!(Cmp.getOpcode() == X86ISD::CMP ||

31512

(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))

31513

return SDValue();

31514

31515

// Can't replace the cmp if it has more uses than the one we're looking at.

31516

// FIXME: We would like to be able to handle this, but would need to make sure

31517

// all uses were updated.

31518

if (!Cmp.hasOneUse())

31519

return SDValue();

31520

31521

// This only applies to variations of the common case:

31522

// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)

31523

// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)

31524

// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)

31525

// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)

31526

// Using the proper condcodes (see below), overflow is checked for.

31527

31528

// FIXME: We can generalize both constraints:

31529

// - XOR/OR/AND (if they were made to survive AtomicExpand)

31530

// - LHS != 1

31531

// if the result is compared.

31532

31533

SDValue CmpLHS = Cmp.getOperand(0);

31534

SDValue CmpRHS = Cmp.getOperand(1);

31535

31536

if (!CmpLHS.hasOneUse())

31537

return SDValue();

31538

31539

unsigned Opc = CmpLHS.getOpcode();

31540

if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)

31541

return SDValue();

31542

31543

SDValue OpRHS = CmpLHS.getOperand(2);

31544

auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);

31545

if (!OpRHSC)

31546

return SDValue();

31547

31548

APInt Addend = OpRHSC->getAPIntValue();

31549

if (Opc == ISD::ATOMIC_LOAD_SUB)

31550

Addend = -Addend;

31551

31552

auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);

31553

if (!CmpRHSC)

31554

return SDValue();

31555

31556

APInt Comparison = CmpRHSC->getAPIntValue();

31557

31558

// If the addend is the negation of the comparison value, then we can do

31559

// a full comparison by emitting the atomic arithmetic as a locked sub.

31560

if (Comparison == -Addend) {

31561

// The CC is fine, but we need to rewrite the LHS of the comparison as an

31562

// atomic sub.

31563

auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());

31564

auto AtomicSub = DAG.getAtomic(

31565

ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),

31566

/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),

31567

/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),

31568

AN->getMemOperand());

31569

// If the comparision uses the CF flag we can't use INC/DEC instructions.

31570

bool NeedCF = false;

31571

switch (CC) {

31572

default: break;

31573

case X86::COND_A: case X86::COND_AE:

31574

case X86::COND_B: case X86::COND_BE:

31575

NeedCF = true;

31576

break;

31577

}

31578

auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);

31579

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),

31580

DAG.getUNDEF(CmpLHS.getValueType()));

31581

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));

31582

return LockOp;

31583

}

31584

31585

// We can handle comparisons with zero in a number of cases by manipulating

31586

// the CC used.

31587

if (!Comparison.isNullValue())

31588

return SDValue();

31589

31590

if (CC == X86::COND_S && Addend == 1)

31591

CC = X86::COND_LE;

31592

else if (CC == X86::COND_NS && Addend == 1)

31593

CC = X86::COND_G;

31594

else if (CC == X86::COND_G && Addend == -1)

31595

CC = X86::COND_GE;

31596

else if (CC == X86::COND_LE && Addend == -1)

31597

CC = X86::COND_L;

31598

else

31599

return SDValue();

31600

31601

SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);

31602

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),

31603

DAG.getUNDEF(CmpLHS.getValueType()));

31604

DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));

31605

return LockOp;

31606

}

31607

31608

// Check whether a boolean test is testing a boolean value generated by

31609

// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition

31610

// code.

31611

31612

// Simplify the following patterns:

31613

// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or

31614

// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)

31615

// to (Op EFLAGS Cond)

31616

31617

// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or

31618

// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)

31619

// to (Op EFLAGS !Cond)

31620

31621

// where Op could be BRCOND or CMOV.

31622

31623

static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {

31624

// This combine only operates on CMP-like nodes.

31625

if (!(Cmp.getOpcode() == X86ISD::CMP ||

31626

(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))

31627

return SDValue();

31628

31629

// Quit if not used as a boolean value.

31630

if (CC != X86::COND_E && CC != X86::COND_NE)

31631

return SDValue();

31632

31633

// Check CMP operands. One of them should be 0 or 1 and the other should be

31634

// an SetCC or extended from it.

31635

SDValue Op1 = Cmp.getOperand(0);

31636

SDValue Op2 = Cmp.getOperand(1);

31637

31638

SDValue SetCC;

31639

const ConstantSDNode* C = nullptr;

31640

bool needOppositeCond = (CC == X86::COND_E);

31641

bool checkAgainstTrue = false; // Is it a comparison against 1?

31642

31643

if ((C = dyn_cast<ConstantSDNode>(Op1)))

31644

SetCC = Op2;

31645

else if ((C = dyn_cast<ConstantSDNode>(Op2)))

31646

SetCC = Op1;

31647

else // Quit if all operands are not constants.

31648

return SDValue();

31649

31650

if (C->getZExtValue() == 1) {

31651

needOppositeCond = !needOppositeCond;

31652

checkAgainstTrue = true;

31653

} else if (C->getZExtValue() != 0)

31654

// Quit if the constant is neither 0 or 1.

31655

return SDValue();

31656

31657

bool truncatedToBoolWithAnd = false;

31658

// Skip (zext $x), (trunc $x), or (and $x, 1) node.

31659

while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||

31660

SetCC.getOpcode() == ISD::TRUNCATE ||

31661

SetCC.getOpcode() == ISD::AND) {

31662

if (SetCC.getOpcode() == ISD::AND) {

31663

int OpIdx = -1;

31664

if (isOneConstant(SetCC.getOperand(0)))

31665

OpIdx = 1;

31666

if (isOneConstant(SetCC.getOperand(1)))

31667

OpIdx = 0;

31668

if (OpIdx < 0)

31669

break;

31670

SetCC = SetCC.getOperand(OpIdx);

31671

truncatedToBoolWithAnd = true;

31672

} else

31673

SetCC = SetCC.getOperand(0);

31674

}

31675

31676

switch (SetCC.getOpcode()) {

31677

case X86ISD::SETCC_CARRY:

31678

// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to

31679

// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,

31680

// i.e. it's a comparison against true but the result of SETCC_CARRY is not

31681

// truncated to i1 using 'and'.

31682

if (checkAgainstTrue && !truncatedToBoolWithAnd)

31683

break;

31684

assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&(static_cast <bool> (X86::CondCode(SetCC.getConstantOperandVal
(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!")
? void (0) : __assert_fail ("X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && \"Invalid use of SETCC_CARRY!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 31685, __extension__ __PRETTY_FUNCTION__))

31685

"Invalid use of SETCC_CARRY!")(static_cast <bool> (X86::CondCode(SetCC.getConstantOperandVal
(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!")
? void (0) : __assert_fail ("X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && \"Invalid use of SETCC_CARRY!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 31685, __extension__ __PRETTY_FUNCTION__));

31686

LLVM_FALLTHROUGH[[clang::fallthrough]];

31687

case X86ISD::SETCC:

31688

// Set the condition code or opposite one if necessary.

31689

CC = X86::CondCode(SetCC.getConstantOperandVal(0));

31690

if (needOppositeCond)

31691

CC = X86::GetOppositeBranchCondition(CC);

31692

return SetCC.getOperand(1);

31693

case X86ISD::CMOV: {

31694

// Check whether false/true value has canonical one, i.e. 0 or 1.

31695

ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));

31696

ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));

31697

// Quit if true value is not a constant.

31698

if (!TVal)

31699

return SDValue();

31700

// Quit if false value is not a constant.

31701

if (!FVal) {

31702

SDValue Op = SetCC.getOperand(0);

31703

// Skip 'zext' or 'trunc' node.

31704

if (Op.getOpcode() == ISD::ZERO_EXTEND ||

31705

Op.getOpcode() == ISD::TRUNCATE)

31706

Op = Op.getOperand(0);

31707

// A special case for rdrand/rdseed, where 0 is set if false cond is

31708

// found.

31709

if ((Op.getOpcode() != X86ISD::RDRAND &&

31710

Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)

31711

return SDValue();

31712

}

31713

// Quit if false value is not the constant 0 or 1.

31714

bool FValIsFalse = true;

31715

if (FVal && FVal->getZExtValue() != 0) {

31716

if (FVal->getZExtValue() != 1)

31717

return SDValue();

31718

// If FVal is 1, opposite cond is needed.

31719

needOppositeCond = !needOppositeCond;

31720

FValIsFalse = false;

31721

}

31722

// Quit if TVal is not the constant opposite of FVal.

31723

if (FValIsFalse && TVal->getZExtValue() != 1)

31724

return SDValue();

31725

if (!FValIsFalse && TVal->getZExtValue() != 0)

31726

return SDValue();

31727

CC = X86::CondCode(SetCC.getConstantOperandVal(2));

31728

if (needOppositeCond)

31729

CC = X86::GetOppositeBranchCondition(CC);

31730

return SetCC.getOperand(3);

31731

}

31732

}

31733

31734

return SDValue();

31735

}

31736

31737

/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.

31738

/// Match:

31739

/// (X86or (X86setcc) (X86setcc))

31740

/// (X86cmp (and (X86setcc) (X86setcc)), 0)

31741

static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,

31742

X86::CondCode &CC1, SDValue &Flags,

31743

bool &isAnd) {

31744

if (Cond->getOpcode() == X86ISD::CMP) {

31745

if (!isNullConstant(Cond->getOperand(1)))

31746

return false;

31747

31748

Cond = Cond->getOperand(0);

31749

}

31750

31751

isAnd = false;

31752

31753

SDValue SetCC0, SetCC1;

31754

switch (Cond->getOpcode()) {

31755

default: return false;

31756

case ISD::AND:

31757

case X86ISD::AND:

31758

isAnd = true;

31759

LLVM_FALLTHROUGH[[clang::fallthrough]];

31760

case ISD::OR:

31761

case X86ISD::OR:

31762

SetCC0 = Cond->getOperand(0);

31763

SetCC1 = Cond->getOperand(1);

31764

break;

31765

};

31766

31767

// Make sure we have SETCC nodes, using the same flags value.

31768

if (SetCC0.getOpcode() != X86ISD::SETCC ||

31769

SetCC1.getOpcode() != X86ISD::SETCC ||

31770

SetCC0->getOperand(1) != SetCC1->getOperand(1))

31771

return false;

31772

31773

CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);

31774

CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);

31775

Flags = SetCC0->getOperand(1);

31776

return true;

31777

}

31778

31779

// When legalizing carry, we create carries via add X, -1

31780

// If that comes from an actual carry, via setcc, we use the

31781

// carry directly.

31782

static SDValue combineCarryThroughADD(SDValue EFLAGS) {

31783

if (EFLAGS.getOpcode() == X86ISD::ADD) {

31784

if (isAllOnesConstant(EFLAGS.getOperand(1))) {

31785

SDValue Carry = EFLAGS.getOperand(0);

31786

while (Carry.getOpcode() == ISD::TRUNCATE ||

31787

Carry.getOpcode() == ISD::ZERO_EXTEND ||

31788

Carry.getOpcode() == ISD::SIGN_EXTEND ||

31789

Carry.getOpcode() == ISD::ANY_EXTEND ||

31790

(Carry.getOpcode() == ISD::AND &&

31791

isOneConstant(Carry.getOperand(1))))

31792

Carry = Carry.getOperand(0);

31793

if (Carry.getOpcode() == X86ISD::SETCC ||

31794

Carry.getOpcode() == X86ISD::SETCC_CARRY) {

31795

if (Carry.getConstantOperandVal(0) == X86::COND_B)

31796

return Carry.getOperand(1);

31797

}

31798

}

31799

}

31800

31801

return SDValue();

31802

}

31803

31804

/// Optimize an EFLAGS definition used according to the condition code \p CC

31805

/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing

31806

/// uses of chain values.

31807

static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,

31808

SelectionDAG &DAG,

31809

const X86Subtarget &Subtarget) {

31810

if (CC == X86::COND_B)

31811

if (SDValue Flags = combineCarryThroughADD(EFLAGS))

31812

return Flags;

31813

31814

if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))

31815

return R;

31816

return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);

31817

}

31818

31819

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]

31820

static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,

31821

TargetLowering::DAGCombinerInfo &DCI,

31822

const X86Subtarget &Subtarget) {

31823

SDLoc DL(N);

31824

31825

SDValue FalseOp = N->getOperand(0);

31826

SDValue TrueOp = N->getOperand(1);

31827

X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

31828

SDValue Cond = N->getOperand(3);

31829

31830

if (CC == X86::COND_E || CC == X86::COND_NE) {

31831

switch (Cond.getOpcode()) {

31832

default: break;

31833

case X86ISD::BSR:

31834

case X86ISD::BSF:

31835

// If operand of BSR / BSF are proven never zero, then ZF cannot be set.

31836

if (DAG.isKnownNeverZero(Cond.getOperand(0)))

31837

return (CC == X86::COND_E) ? FalseOp : TrueOp;

31838

}

31839

}

31840

31841

// Try to simplify the EFLAGS and condition code operands.

31842

// We can't always do this as FCMOV only supports a subset of X86 cond.

31843

if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {

31844

if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {

31845

SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),

31846

Flags};

31847

return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

31848

}

31849

}

31850

31851

// If this is a select between two integer constants, try to do some

31852

// optimizations. Note that the operands are ordered the opposite of SELECT

31853

// operands.

31854

if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {

31855

if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {

31856

// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is

31857

// larger than FalseC (the false value).

31858

if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {

31859

CC = X86::GetOppositeBranchCondition(CC);

31860

std::swap(TrueC, FalseC);

31861

std::swap(TrueOp, FalseOp);

31862

}

31863

31864

// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.

31865

// This is efficient for any integer data type (including i8/i16) and

31866

// shift amount.

31867

if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {

31868

Cond = getSETCC(CC, Cond, DL, DAG);

31869

31870

// Zero extend the condition if needed.

31871

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

31872

31873

unsigned ShAmt = TrueC->getAPIntValue().logBase2();

31874

Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,

31875

DAG.getConstant(ShAmt, DL, MVT::i8));

31876

return Cond;

31877

}

31878

31879

// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient

31880

// for any integer data type, including i8/i16.

31881

if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {

31882

Cond = getSETCC(CC, Cond, DL, DAG);

31883

31884

// Zero extend the condition if needed.

31885

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,

31886

FalseC->getValueType(0), Cond);

31887

Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,

31888

SDValue(FalseC, 0));

31889

return Cond;

31890

}

31891

31892

// Optimize cases that will turn into an LEA instruction. This requires

31893

// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).

31894

if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {

31895

uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();

31896

if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

31897

31898

bool isFastMultiplier = false;

31899

if (Diff < 10) {

31900

switch ((unsigned char)Diff) {

31901

default: break;

31902

case 1: // result = add base, cond

31903

case 2: // result = lea base( , cond*2)

31904

case 3: // result = lea base(cond, cond*2)

31905

case 4: // result = lea base( , cond*4)

31906

case 5: // result = lea base(cond, cond*4)

31907

case 8: // result = lea base( , cond*8)

31908

case 9: // result = lea base(cond, cond*8)

31909

isFastMultiplier = true;

31910

break;

31911

}

31912

}

31913

31914

if (isFastMultiplier) {

31915

APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();

31916

Cond = getSETCC(CC, Cond, DL ,DAG);

31917

// Zero extend the condition if needed.

31918

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),

31919

Cond);

31920

// Scale the condition by the difference.

31921

if (Diff != 1)

31922

Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,

31923

DAG.getConstant(Diff, DL, Cond.getValueType()));

31924

31925

// Add the base if non-zero.

31926

if (FalseC->getAPIntValue() != 0)

31927

Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,

31928

SDValue(FalseC, 0));

31929

return Cond;

31930

}

31931

}

31932

}

31933

}

31934

31935

// Handle these cases:

31936

// (select (x != c), e, c) -> select (x != c), e, x),

31937

// (select (x == c), c, e) -> select (x == c), x, e)

31938

// where the c is an integer constant, and the "select" is the combination

31939

// of CMOV and CMP.

31940

31941

// The rationale for this change is that the conditional-move from a constant

31942

// needs two instructions, however, conditional-move from a register needs

31943

// only one instruction.

31944

31945

// CAVEAT: By replacing a constant with a symbolic value, it may obscure

31946

// some instruction-combining opportunities. This opt needs to be

31947

// postponed as late as possible.

31948

31949

if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {

31950

// the DCI.xxxx conditions are provided to postpone the optimization as

31951

// late as possible.

31952

31953

ConstantSDNode *CmpAgainst = nullptr;

31954

if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&

31955

(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&

31956

!isa<ConstantSDNode>(Cond.getOperand(0))) {

31957

31958

if (CC == X86::COND_NE &&

31959

CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {

31960

CC = X86::GetOppositeBranchCondition(CC);

31961

std::swap(TrueOp, FalseOp);

31962

}

31963

31964

if (CC == X86::COND_E &&

31965

CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {

31966

SDValue Ops[] = { FalseOp, Cond.getOperand(0),

31967

DAG.getConstant(CC, DL, MVT::i8), Cond };

31968

return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

31969

}

31970

}

31971

}

31972

31973

// Fold and/or of setcc's to double CMOV:

31974

// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)

31975

// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)

31976

31977

// This combine lets us generate:

31978

// cmovcc1 (jcc1 if we don't have CMOV)

31979

// cmovcc2 (same)

31980

// instead of:

31981

// setcc1

31982

// setcc2

31983

// and/or

31984

// cmovne (jne if we don't have CMOV)

31985

// When we can't use the CMOV instruction, it might increase branch

31986

// mispredicts.

31987

// When we can use CMOV, or when there is no mispredict, this improves

31988

// throughput and reduces register pressure.

31989

31990

if (CC == X86::COND_NE) {

31991

SDValue Flags;

31992

X86::CondCode CC0, CC1;

31993

bool isAndSetCC;

31994

if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {

31995

if (isAndSetCC) {

31996

std::swap(FalseOp, TrueOp);

31997

CC0 = X86::GetOppositeBranchCondition(CC0);

31998

CC1 = X86::GetOppositeBranchCondition(CC1);

31999

}

32000

32001

SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),

32002

Flags};

32003

SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);

32004

SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};

32005

SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);

32006

return CMOV;

32007

}

32008

}

32009

32010

return SDValue();

32011

}

32012

32013

/// Different mul shrinking modes.

32014

enum ShrinkMode {
  MULS8,  // Operand ranges fit -128 ~ 127.
  MULU8,  // Operand ranges fit 0 ~ 255.
  MULS16, // Operand ranges fit -32768 ~ 32767.
  MULU16  // Operand ranges fit 0 ~ 65535.
};

32015

32016

/// Decide whether the vector multiply \p N (32-bit elements) has operands
/// narrow enough to be computed with 16-bit multiplies, and if so report
/// through \p Mode which shrinking mode applies. \p Mode is only written when
/// true is returned.
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
  EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;

  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

  // Running summary over both operands: the smaller sign-bit count, and
  // whether every operand is known to be non-negative.
  unsigned MinSignBits = 32;
  bool AllPositive = true;

  for (unsigned OpNo = 0; OpNo != 2; ++OpNo) {
    SDValue Opd = N->getOperand(OpNo);
    unsigned OpSignBits = 1;
    bool OpIsPositive = false;

    switch (Opd.getOpcode()) {
    case ISD::ANY_EXTEND: {
      // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
      // compute signbits for it separately. For anyextend, it is safe to
      // assume an appropriate number of leading sign/zero bits.
      EVT SrcEltVT = Opd.getOperand(0).getValueType().getVectorElementType();
      if (SrcEltVT == MVT::i8)
        OpSignBits = 25;
      else if (SrcEltVT == MVT::i16)
        OpSignBits = 17;
      else
        return false;
      OpIsPositive = true;
      break;
    }
    case ISD::BUILD_VECTOR: {
      // All the operands of BUILD_VECTOR need to be int constant (undef
      // elements are ignored). Find the smallest value range which all the
      // operands belong to.
      OpSignBits = 32;
      OpIsPositive = true;
      for (const SDValue &Elt : Opd.getNode()->op_values()) {
        if (Elt.isUndef())
          continue;
        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
        if (!EltC)
          return false;
        APInt EltVal = EltC->getAPIntValue();
        if (EltVal.isNegative())
          OpIsPositive = false;
        OpSignBits = std::min(OpSignBits, EltVal.getNumSignBits());
      }
      break;
    }
    default:
      OpSignBits = DAG.ComputeNumSignBits(Opd);
      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
        OpIsPositive = true;
      break;
    }

    MinSignBits = std::min(MinSignBits, OpSignBits);
    AllPositive = AllPositive && OpIsPositive;
  }

  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25) {
    Mode = MULS8;
    return true;
  }
  // When ranges are from 0 ~ 255, use MULU8 mode.
  if (AllPositive && MinSignBits >= 24) {
    Mode = MULU8;
    return true;
  }
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  if (MinSignBits >= 17) {
    Mode = MULS16;
    return true;
  }
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  if (AllPositive && MinSignBits >= 16) {
    Mode = MULU16;
    return true;
  }
  return false;
}

32081

32082

/// When the operands of vector mul are extended from smaller size values,

32083

/// like i8 and i16, the type of mul may be shrinked to generate more

32084

/// efficient code. Two typical patterns are handled:

32085

/// Pattern1:

32086

/// %2 = sext/zext <N x i8> %1 to <N x i32>

32087

/// %4 = sext/zext <N x i8> %3 to <N x i32>

32088

// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)

32089

/// %5 = mul <N x i32> %2, %4

32090

///

32091

/// Pattern2:

32092

/// %2 = zext/sext <N x i16> %1 to <N x i32>

32093

/// %4 = zext/sext <N x i16> %3 to <N x i32>

32094

/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)

32095

/// %5 = mul <N x i32> %2, %4

32096

///

32097

/// There are four mul shrinking modes:

32098

/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is

32099

/// -128 to 128, and the scalar value range of %4 is also -128 to 128,

32100

/// generate pmullw+sext32 for it (MULS8 mode).

32101

/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is

32102

/// 0 to 255, and the scalar value range of %4 is also 0 to 255,

32103

/// generate pmullw+zext32 for it (MULU8 mode).

32104

/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is

32105

/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,

32106

/// generate pmullw+pmulhw for it (MULS16 mode).

32107

/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is

32108

/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,

32109

/// generate pmullw+pmulhuw for it (MULU16 mode).

32110

static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,

32111

const X86Subtarget &Subtarget) {

32112

// Check for legality

32113

// pmullw/pmulhw are not supported by SSE.

32114

if (!Subtarget.hasSSE2())

32115

return SDValue();

32116

32117

// Check for profitability

32118

// pmulld is supported since SSE41. It is better to use pmulld

32119

// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than

32120

// the expansion.

32121

bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();

32122

if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))

32123

return SDValue();

32124

32125

ShrinkMode Mode;

32126

if (!canReduceVMulWidth(N, DAG, Mode))

32127

return SDValue();

32128

32129

SDLoc DL(N);

32130

SDValue N0 = N->getOperand(0);

32131

SDValue N1 = N->getOperand(1);

32132

EVT VT = N->getOperand(0).getValueType();

32133

unsigned NumElts = VT.getVectorNumElements();

32134

if ((NumElts % 2) != 0)

32135

return SDValue();

32136

32137

unsigned RegSize = 128;

32138

MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);

32139

EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

32140

32141

// Shrink the operands of mul.

32142

SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);

32143

SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

32144

32145

if (NumElts >= OpsVT.getVectorNumElements()) {

32146

// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the

32147

// lower part is needed.

32148

SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);

32149

if (Mode == MULU8 || Mode == MULS8) {

32150

return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,

32151

DL, VT, MulLo);

32152

} else {

32153

MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);

32154

// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,

32155

// the higher part is also needed.

32156

SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,

32157

ReducedVT, NewN0, NewN1);

32158

32159

// Repack the lower part and higher part result of mul into a wider

32160

// result.

32161

// Generate shuffle functioning as punpcklwd.

32162

SmallVector<int, 16> ShuffleMask(NumElts);

32163

for (unsigned i = 0, e = NumElts / 2; i < e; i++) {

32164

ShuffleMask[2 * i] = i;

32165

ShuffleMask[2 * i + 1] = i + NumElts;

32166

}

32167

SDValue ResLo =

32168

DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);

32169

ResLo = DAG.getBitcast(ResVT, ResLo);

32170

// Generate shuffle functioning as punpckhwd.

32171

for (unsigned i = 0, e = NumElts / 2; i < e; i++) {

32172

ShuffleMask[2 * i] = i + NumElts / 2;

32173

ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;

32174

}

32175

SDValue ResHi =

32176

DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);

32177

ResHi = DAG.getBitcast(ResVT, ResHi);

32178

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);

32179

}

32180

} else {

32181

// When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want

32182

// to legalize the mul explicitly because implicit legalization for type

32183

// <4 x i16> to <4 x i32> sometimes involves unnecessary unpack

32184

// instructions which will not exist when we explicitly legalize it by

32185

// extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with

32186

// <4 x i16> undef).

32187

32188

// Legalize the operands of mul.

32189

// FIXME: We may be able to handle non-concatenated vectors by insertion.

32190

unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();

32191

if ((RegSize % ReducedSizeInBits) != 0)

32192

return SDValue();

32193

32194

SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,

32195

DAG.getUNDEF(ReducedVT));

32196

Ops[0] = NewN0;

32197

NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

32198

Ops[0] = NewN1;

32199

NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

32200

32201

if (Mode == MULU8 || Mode == MULS8) {

32202

// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower

32203

// part is needed.

32204

SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

32205

32206

// convert the type of mul result to VT.

32207

MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);

32208

SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG

32209

: ISD::SIGN_EXTEND_VECTOR_INREG,

32210

DL, ResVT, Mul);

32211

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,

32212

DAG.getIntPtrConstant(0, DL));

32213

} else {

32214

// Generate the lower and higher part of mul: pmulhw/pmulhuw. For

32215

// MULU16/MULS16, both parts are needed.

32216

SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

32217

SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,

32218

OpsVT, NewN0, NewN1);

32219

32220

// Repack the lower part and higher part result of mul into a wider

32221

// result. Make sure the type of mul result is VT.

32222

MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);

32223

SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);

32224

Res = DAG.getBitcast(ResVT, Res);

32225

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,

32226

DAG.getIntPtrConstant(0, DL));

32227

}

32228

}

32229

}

32230

32231

/// Expand a multiply of N's first operand by one of a fixed set of small
/// constants into a short chain of cheaper nodes.
///
/// \param MulAmt the constant multiplier to expand.
/// \param N      the multiply node; only its operand 0 is read.
/// \param DAG    the DAG in which the replacement nodes are created.
/// \param VT     the value type given to every arithmetic node created here.
/// \param DL     the debug location attached to every node created here.
/// \returns the root of the replacement expression, or an empty SDValue when
///          \p MulAmt is not one of the constants handled below.
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, const SDLoc &DL) {
  // Builds ((x * Mult) << Shift) + x when isAdd is set, and
  // ((x * Mult) << Shift) - x otherwise. The shift amount is an i8 constant.
  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mult, DL, VT));
    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
                         DAG.getConstant(Shift, DL, MVT::i8));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  // Builds ((x * 9) * 3) + x when isAdd is set, and ((x * 9) * 3) - x
  // otherwise.
  auto combineMulMulAddOrSub = [&](bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(9, DL, VT));
    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  switch (MulAmt) {
  default:
    break;
  case 11:
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
  case 21:
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
  case 22:
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
  case 19:
    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
  case 13:
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
  case 23:
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
  case 14:
    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
  case 26:
    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ false);
  case 28:
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ true);
  case 29:
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulMulAddOrSub(/*isAdd*/ true));
  case 30:
    // mul x, 30 => sub (sub ((shl x, 5), x), x)
    return DAG.getNode(
        ISD::SUB, DL, VT,
        DAG.getNode(ISD::SUB, DL, VT,
                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                DAG.getConstant(5, DL, MVT::i8)),
                    N->getOperand(0)),
        N->getOperand(0));
  }
  return SDValue();
}

32301

32302

/// Optimize a single multiply with constant into two operations in order to

32303

/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.

32304

static SDValue combineMul(SDNode *N, SelectionDAG &DAG,

32305

TargetLowering::DAGCombinerInfo &DCI,

32306

const X86Subtarget &Subtarget) {

32307

EVT VT = N->getValueType(0);

32308

if (DCI.isBeforeLegalize() && VT.isVector())

32309

return reduceVMULWidth(N, DAG, Subtarget);

32310

32311

if (!MulConstantOptimization)

32312

return SDValue();

32313

// An imul is usually smaller than the alternative sequence.

32314

if (DAG.getMachineFunction().getFunction()->optForMinSize())

32315

return SDValue();

32316

32317

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())

32318

return SDValue();

32319

32320

if (VT != MVT::i64 && VT != MVT::i32)

32321

return SDValue();

32322

32323

ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

32324

if (!C)

32325

return SDValue();

32326

uint64_t MulAmt = C->getZExtValue();

32327

if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)

32328

return SDValue();

32329

32330

uint64_t MulAmt1 = 0;

32331

uint64_t MulAmt2 = 0;

32332

if ((MulAmt % 9) == 0) {

32333

MulAmt1 = 9;

32334

MulAmt2 = MulAmt / 9;

32335

} else if ((MulAmt % 5) == 0) {

32336

MulAmt1 = 5;

32337

MulAmt2 = MulAmt / 5;

32338

} else if ((MulAmt % 3) == 0) {

32339

MulAmt1 = 3;

32340

MulAmt2 = MulAmt / 3;

32341

}

32342

32343

SDLoc DL(N);

32344

SDValue NewMul;

32345

if (MulAmt2 &&

32346

(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){

32347

32348

if (isPowerOf2_64(MulAmt2) &&

32349

!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))

32350

// If second multiplifer is pow2, issue it first. We want the multiply by

32351

// 3, 5, or 9 to be folded into the addressing mode unless the lone use

32352

// is an add.

32353

std::swap(MulAmt1, MulAmt2);

32354

32355

if (isPowerOf2_64(MulAmt1))

32356

NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

32357

DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));

32358

else

32359

NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),

32360

DAG.getConstant(MulAmt1, DL, VT));

32361

32362

if (isPowerOf2_64(MulAmt2))

32363

NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,

32364

DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));

32365

else

32366

NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,

32367

DAG.getConstant(MulAmt2, DL, VT));

32368

} else if (!Subtarget.slowLEA())

32369

NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

32370

32371

if (!NewMul) {

32372

assert(MulAmt != 0 &&(static_cast <bool> (MulAmt != 0 && MulAmt != (
VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32375, __extension__ __PRETTY_FUNCTION__))

32373

MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&(static_cast <bool> (MulAmt != 0 && MulAmt != (
VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32375, __extension__ __PRETTY_FUNCTION__))

32374

"Both cases that could cause potential overflows should have "(static_cast <bool> (MulAmt != 0 && MulAmt != (
VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32375, __extension__ __PRETTY_FUNCTION__))

32375

"already been handled.")(static_cast <bool> (MulAmt != 0 && MulAmt != (
VT == MVT::i64 ? (18446744073709551615UL) : (4294967295U)) &&
"Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32375, __extension__ __PRETTY_FUNCTION__));

32376

int64_t SignMulAmt = C->getSExtValue();

32377

if ((SignMulAmt != INT64_MIN(-9223372036854775807L -1)) && (SignMulAmt != INT64_MAX(9223372036854775807L)) &&

32378

(SignMulAmt != -INT64_MAX(9223372036854775807L))) {

32379

int NumSign = SignMulAmt > 0 ? 1 : -1;

32380

bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);

32381

bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);

32382

if (IsPowerOf2_64PlusOne) {

32383

// (mul x, 2^N + 1) => (add (shl x, N), x)

32384

NewMul = DAG.getNode(

32385

ISD::ADD, DL, VT, N->getOperand(0),

32386

DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

32387

DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,

32388

MVT::i8)));

32389

} else if (IsPowerOf2_64MinusOne) {

32390

// (mul x, 2^N - 1) => (sub (shl x, N), x)

32391

NewMul = DAG.getNode(

32392

ISD::SUB, DL, VT,

32393

DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),

32394

DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,

32395

MVT::i8)),

32396

N->getOperand(0));

32397

}

32398

// To negate, subtract the number from zero

32399

if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)

32400

NewMul =

32401

DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);

32402

}

32403

}

32404

32405

if (NewMul)

32406

// Do not add new nodes to DAG combiner worklist.

32407

DCI.CombineTo(N, NewMul, false);

32408

32409

return SDValue();

32410

}

32411

32412

/// Target-specific simplifications for a left-shift node.
///
/// Two rewrites are attempted, in this order:
///  * a scalar integer (shl (and X, C1), C2) whose X is a SETCC_CARRY,
///    possibly behind an extension, becomes (and X, C1 << C2);
///  * a shift by a build-vector splat of the constant 1 becomes (add V, V).
/// An empty SDValue is returned when neither applies.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // This is valid because setcc_c produces either all zeros or all ones.
  const bool IsScalarInt = VT.isInteger() && !VT.isVector();
  if (IsScalarInt && N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue Src = N0.getOperand(0);
    APInt ShiftedMask =
        cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    ShiftedMask <<= N1C->getAPIntValue();

    // The setcc_c may sit behind a bit-widening node. For a zero (or any)
    // extension the fold is only safe while c1 << c2 still fits in the width
    // of the underlying setcc_c. For example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    bool MaskOK = false;
    switch (Src.getOpcode()) {
    case X86ISD::SETCC_CARRY:
      MaskOK = true;
      break;
    case ISD::SIGN_EXTEND:
      MaskOK = Src.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY;
      break;
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      if (Src.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)
        MaskOK = ShiftedMask.isIntN(Src.getOperand(0).getValueSizeInBits());
      break;
    default:
      break;
    }

    if (MaskOK && ShiftedMask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, Src,
                         DAG.getConstant(ShiftedMask, DL, VT));
    }
  }

  // Hardware support for vector shifts is sparse, which makes us scalarize
  // the vector operations in many cases. Also, on sandybridge ADD is faster
  // than shl.
  // (shl V, 1) -> add V,V
  auto *N1BV = dyn_cast<BuildVectorSDNode>(N1);
  if (!N1BV)
    return SDValue();

  auto *N1SplatC = N1BV->getConstantSplatNode();
  if (!N1SplatC)
    return SDValue();

  assert(N0.getValueType().isVector() && "Invalid vector shift type");
  // Every element is shifted by one, which is better expressed as adding the
  // value to itself.
  if (N1SplatC->getAPIntValue() == 1)
    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);

  return SDValue();
}

32470

32471

/// Simplify (sra (shl X, ShlConst), SarConst) on scalar integers when
/// ShlConst equals the value width minus the width of an 8..64-bit integer
/// type.
///
/// The shl is replaced by a SIGN_EXTEND_INREG of X from that narrower type.
/// Depending on the sign of (SarConst - ShlConst), the result is then used
/// directly (zero), shifted left (negative), or shifted right arithmetically
/// (positive). An empty SDValue is returned when the pattern does not match.
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // sexts on X86 are MOVs. They have the same code size as the SHIFTs they
  // replace (only a SHIFT by 1 is smaller), but MOVs have two advantages:
  //  1. MOVs can write to a register that differs from the source.
  //  2. MOVs accept memory operands.

  // Require a scalar integer with a constant shift amount.
  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant)
    return SDValue();
  // Require a single-use shl by a constant as the shifted value.
  if (N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue ShlSrc = N0.getOperand(0);
  SDValue ShlAmt = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(ShlAmt))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : MVT::integer_valuetypes()) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Only 8..64-bit types have a matching sext, and the shl amount must be
    // exactly the number of bits dropped when narrowing to SVT.
    const bool HasSExt = ShiftSize >= 8 && ShiftSize <= 64;
    if (!HasSExt || ShlConst != Size - ShiftSize)
      continue;

    SDLoc DL(N);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, ShlSrc,
                              DAG.getValueType(SVT));
    // Shift amount that remains once the shl has been absorbed.
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return Ext;
    if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, Ext,
                         DAG.getConstant(-SarConst, DL, CVT));
    return DAG.getNode(ISD::SRA, DL, VT, Ext,
                       DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}

32523

32524

/// Reorder (srl (and X, AndC), ShiftC) into
/// (and (srl X, ShiftC), AndC >> ShiftC) when the shifted mask needs at most
/// 8 or at most 32 signed bits and the original mask needed more.
///
/// Only applies when the 'and' has a single use and both AndC and ShiftC are
/// constants; otherwise an empty SDValue is returned.
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();

  // Shrinking the mask below 8 or 32 bits should reduce code size. It may also
  // enable secondary transforms from improved known-bits analysis or
  // instruction selection.
  APInt MaskVal = AndC->getAPIntValue();
  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
  unsigned OldMaskSize = MaskVal.getMinSignedBits();
  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();

  const bool NowFits8 = OldMaskSize > 8 && NewMaskSize <= 8;
  const bool NowFits32 = OldMaskSize > 32 && NewMaskSize <= 32;
  if (!NowFits8 && !NowFits32)
    return SDValue();

  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
  SDLoc DL(N);
  SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
  SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
  return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
}

32558

32559

/// \brief Returns a vector of 0s if the node in input is a vector logical

32560

/// shift by a constant amount which is known to be bigger than or equal

32561

/// to the vector element size in bits.

32562

static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,

32563

const X86Subtarget &Subtarget) {

32564

EVT VT = N->getValueType(0);

32565

32566

if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&

32567

(!Subtarget.hasInt256() ||

32568

(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))

32569

return SDValue();

32570

32571

SDValue Amt = N->getOperand(1);

32572

SDLoc DL(N);

32573

if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))

32574

if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {

32575

const APInt &ShiftAmt = AmtSplat->getAPIntValue();

32576

unsigned MaxAmount =

32577

VT.getSimpleVT().getScalarSizeInBits();

32578

32579

// SSE2/AVX2 logical shifts always return a vector of 0s

32580

// if the shift amount is bigger than or equal to

32581

// the element size. The constant shift amount will be

32582

// encoded as a 8-bit immediate.

32583

if (ShiftAmt.trunc(8).uge(MaxAmount))

32584

return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);

32585

}

32586

32587

return SDValue();

32588

}

32589

32590

/// Dispatch a shift node to the combine matching its opcode.
///
/// SHL, SRA and SRL each try their own combine first. Every opcode other
/// than SRA then tries performShiftToAllZeros. Returns the first non-empty
/// result, or an empty SDValue when nothing applied.
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  const unsigned Opc = N->getOpcode();

  SDValue Folded;
  switch (Opc) {
  case ISD::SHL:
    Folded = combineShiftLeft(N, DAG);
    break;
  case ISD::SRA:
    Folded = combineShiftRightArithmetic(N, DAG);
    break;
  case ISD::SRL:
    Folded = combineShiftRightLogical(N, DAG);
    break;
  default:
    break;
  }
  if (Folded)
    return Folded;

  // Try to fold this logical shift into a zero vector.
  if (Opc != ISD::SRA)
    return performShiftToAllZeros(N, DAG, Subtarget);

  return SDValue();
}

32612

32613

/// Combine a PACKSS/PACKUS node.
///
/// If both inputs are constants (or undef) used only by this node, the pack
/// is folded to a constant vector: each source element is truncated to the
/// destination element width with signed (PACKSS) or unsigned (PACKUS)
/// saturation, lane by lane. Otherwise the node is offered to the recursive
/// shuffle combiner; a result from there is installed with DCI.CombineTo.
///
/// \returns the folded constant vector, or an empty SDValue (including when
///          the shuffle combiner replaced the node).
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumDstElts = VT.getVectorNumElements();
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
    bool IsSigned = (X86ISD::PACKSS == Opcode);

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        // Within each lane the low half of the result comes from the first
        // operand and the high half from the second.
        const bool FromSecondOp = Elt >= NumSrcEltsPerLane;
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        unsigned DstIdx = Lane * NumDstEltsPerLane + Elt;
        const APInt &UndefElts = FromSecondOp ? UndefElts1 : UndefElts0;
        const SmallVectorImpl<APInt> &EltBits =
            FromSecondOp ? EltBits1 : EltBits0;

        if (UndefElts[SrcIdx]) {
          Undefs.setBit(DstIdx);
          continue;
        }

        const APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Bits[DstIdx] = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Bits[DstIdx] = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Bits[DstIdx] = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Bits[DstIdx] = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Bits[DstIdx] = APInt::getNullValue(DstBitsPerElt);
          else
            Bits[DstIdx] = APInt::getAllOnesValue(DstBitsPerElt);
        }
      }
    }

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
    return SDValue();
  }

  return SDValue();
}

32696

32697

/// Combine a vector shift-by-immediate node (VSHLI, VSRLI or VSRAI).
///
/// The following are tried in order: out-of-range amounts, a zero amount, a
/// zero input, (VSRLI (VSRAI X, Y), EltBits-1), (VSRAI (VSHLI X, C), C) with
/// enough sign bits, whole-byte logical shifts via the shuffle combiner, and
/// constant folding of a constant input used only by this node.
///
/// \returns the simplified value, or an empty SDValue (including when the
///          shuffle combiner replaced the node through DCI.CombineTo).
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  const bool IsLogical = Opcode == X86ISD::VSHLI || Opcode == X86ISD::VSRLI;
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
         "Unexpected value type");

  // An out-of-range logical shift always yields zero. An out-of-range
  // arithmetic shift splats the sign bit, which is the same as shifting by
  // the element width minus one.
  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (IsLogical)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    ShiftVal = NumBitsPerElt - 1;
  }

  // Shift N0 by zero -> N0.
  if (ShiftVal == 0)
    return N0;

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
  // The outer shift keeps only the sign bit, which VSRAI leaves untouched.
  // TODO - support other sra opcodes as needed.
  const bool KeepsOnlySignBit =
      Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt;
  if (KeepsOnlySignBit && N0.getOpcode() == X86ISD::VSRAI)
    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

  // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
      N1 == N0.getOperand(1)) {
    SDValue Inner = N0.getOperand(0);
    unsigned NumSignBits = DAG.ComputeNumSignBits(Inner);
    if (ShiftVal.ult(NumSignBits))
      return Inner;
  }

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (IsLogical && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    SDValue Res = combineX86ShufflesRecursively(
        {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
        /*HasVarMask*/ false, DAG, DCI, Subtarget);
    if (Res) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (!N->isOnlyUserOf(N0.getNode()) ||
      !getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits))
    return SDValue();

  assert(EltBits.size() == VT.getVectorNumElements() &&
         "Unexpected shift value type");
  unsigned ShiftImm = ShiftVal.getZExtValue();
  for (APInt &Elt : EltBits) {
    switch (Opcode) {
    case X86ISD::VSHLI:
      Elt <<= ShiftImm;
      break;
    case X86ISD::VSRAI:
      Elt.ashrInPlace(ShiftImm);
      break;
    default:
      Elt.lshrInPlace(ShiftImm);
      break;
    }
  }
  return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}

32778

32779

/// Combine a PINSRB (v16i8) or PINSRW (v8i16) node by offering it to the
/// recursive shuffle combiner.
///
/// A result from the shuffle combiner is installed with DCI.CombineTo. This
/// function itself always returns an empty SDValue.
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
       (N->getOpcode() == X86ISD::PINSRW &&
        N->getValueType(0) == MVT::v8i16)) &&
      "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  SDValue Res = combineX86ShufflesRecursively(
      {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
      /*HasVarMask*/ false, DAG, DCI, Subtarget);
  if (Res)
    DCI.CombineTo(N, Res);

  return SDValue();
}

32799

32800

/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs

32801

/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for

32802

/// OR -> CMPNEQSS.

32803

static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,

32804

TargetLowering::DAGCombinerInfo &DCI,

32805

const X86Subtarget &Subtarget) {

32806

unsigned opcode;

32807

32808

// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but

32809

// we're requiring SSE2 for both.

32810

if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {

32811

SDValue N0 = N->getOperand(0);

32812

SDValue N1 = N->getOperand(1);

32813

SDValue CMP0 = N0->getOperand(1);

32814

SDValue CMP1 = N1->getOperand(1);

32815

SDLoc DL(N);

32816

32817

// The SETCCs should both refer to the same CMP.

32818

if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)

32819

return SDValue();

32820

32821

SDValue CMP00 = CMP0->getOperand(0);

32822

SDValue CMP01 = CMP0->getOperand(1);

32823

EVT VT = CMP00.getValueType();

32824

32825

if (VT == MVT::f32 || VT == MVT::f64) {

32826

bool ExpectingFlags = false;

32827

// Check for any users that want flags:

32828

for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();

32829

!ExpectingFlags && UI != UE; ++UI)

32830

switch (UI->getOpcode()) {

32831

default:

32832

case ISD::BR_CC:

32833

case ISD::BRCOND:

32834

case ISD::SELECT:

32835

ExpectingFlags = true;

32836

break;

32837

case ISD::CopyToReg:

32838

case ISD::SIGN_EXTEND:

32839

case ISD::ZERO_EXTEND:

32840

case ISD::ANY_EXTEND:

32841

break;

32842

}

32843

32844

if (!ExpectingFlags) {

32845

enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);

32846

enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

32847

32848

if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {

32849

X86::CondCode tmp = cc0;

32850

cc0 = cc1;

32851

cc1 = tmp;

32852

}

32853

32854

if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||

32855

(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {

32856

// FIXME: need symbolic constants for these magic numbers.

32857

// See X86ATTInstPrinter.cpp:printSSECC().

32858

unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;

32859

if (Subtarget.hasAVX512()) {

32860

SDValue FSetCC =

32861

DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,

32862

DAG.getConstant(x86cc, DL, MVT::i8));

32863

return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),

32864

FSetCC, DAG.getIntPtrConstant(0, DL));

32865

}

32866

SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,

32867

CMP00.getValueType(), CMP00, CMP01,

32868

DAG.getConstant(x86cc, DL,

32869

MVT::i8));

32870

32871

bool is64BitFP = (CMP00.getValueType() == MVT::f64);

32872

MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

32873

32874

if (is64BitFP && !Subtarget.is64Bit()) {

32875

// On a 32-bit target, we cannot bitcast the 64-bit float to a

32876

// 64-bit integer, since that's not a legal type. Since

32877

// OnesOrZeroesF is all ones of all zeroes, we don't need all the

32878

// bits, but can do this little dance to extract the lowest 32 bits

32879

// and work with those going forward.

32880

SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,

32881

OnesOrZeroesF);

32882

SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);

32883

OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,

32884

Vector32, DAG.getIntPtrConstant(0, DL));

32885

IntVT = MVT::i32;

32886

}

32887

32888

SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);

32889

SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,

32890

DAG.getConstant(1, DL, IntVT));

32891

SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,

32892

ANDed);

32893

return OneBitOfTruth;

32894

}

32895

}

32896

}

32897

}

32898

return SDValue();

32899

}

32900

32901

/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).

32902

static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {

32903

assert(N->getOpcode() == ISD::AND)(static_cast <bool> (N->getOpcode() == ISD::AND) ? void
(0) : __assert_fail ("N->getOpcode() == ISD::AND", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32903, __extension__ __PRETTY_FUNCTION__));

32904

32905

EVT VT = N->getValueType(0);

32906

SDValue N0 = N->getOperand(0);

32907

SDValue N1 = N->getOperand(1);

32908

SDLoc DL(N);

32909

32910

if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)

32911

return SDValue();

32912

32913

if (N0.getOpcode() == ISD::XOR &&

32914

ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))

32915

return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

32916

32917

if (N1.getOpcode() == ISD::XOR &&

32918

ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))

32919

return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

32920

32921

return SDValue();

32922

}

32923

32924

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized

32925

// register. In most cases we actually compare or select YMM-sized registers

32926

// and mixing the two types creates horrible code. This method optimizes

32927

// some of the transition sequences.

32928

static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,

32929

TargetLowering::DAGCombinerInfo &DCI,

32930

const X86Subtarget &Subtarget) {

32931

EVT VT = N->getValueType(0);

32932

if (!VT.is256BitVector())

32933

return SDValue();

32934

32935

assert((N->getOpcode() == ISD::ANY_EXTEND ||(static_cast <bool> ((N->getOpcode() == ISD::ANY_EXTEND
|| N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode(
) == ISD::SIGN_EXTEND) && "Invalid Node") ? void (0) :
__assert_fail ("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32937, __extension__ __PRETTY_FUNCTION__))

32936

N->getOpcode() == ISD::ZERO_EXTEND ||(static_cast <bool> ((N->getOpcode() == ISD::ANY_EXTEND
|| N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode(
) == ISD::SIGN_EXTEND) && "Invalid Node") ? void (0) :
__assert_fail ("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32937, __extension__ __PRETTY_FUNCTION__))

32937

N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node")(static_cast <bool> ((N->getOpcode() == ISD::ANY_EXTEND
|| N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode(
) == ISD::SIGN_EXTEND) && "Invalid Node") ? void (0) :
__assert_fail ("(N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && \"Invalid Node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 32937, __extension__ __PRETTY_FUNCTION__));

32938

32939

SDValue Narrow = N->getOperand(0);

32940

EVT NarrowVT = Narrow->getValueType(0);

32941

if (!NarrowVT.is128BitVector())

32942

return SDValue();

32943

32944

if (Narrow->getOpcode() != ISD::XOR &&

32945

Narrow->getOpcode() != ISD::AND &&

32946

Narrow->getOpcode() != ISD::OR)

32947

return SDValue();

32948

32949

SDValue N0 = Narrow->getOperand(0);

32950

SDValue N1 = Narrow->getOperand(1);

32951

SDLoc DL(Narrow);

32952

32953

// The Left side has to be a trunc.

32954

if (N0.getOpcode() != ISD::TRUNCATE)

32955

return SDValue();

32956

32957

// The type of the truncated inputs.

32958

EVT WideVT = N0->getOperand(0)->getValueType(0);

32959

if (WideVT != VT)

32960

return SDValue();

32961

32962

// The right side has to be a 'trunc' or a constant vector.

32963

bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;

32964

ConstantSDNode *RHSConstSplat = nullptr;

32965

if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))

32966

RHSConstSplat = RHSBV->getConstantSplatNode();

32967

if (!RHSTrunc && !RHSConstSplat)

32968

return SDValue();

32969

32970

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

32971

32972

if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))

32973

return SDValue();

32974

32975

// Set N0 and N1 to hold the inputs to the new wide operation.

32976

N0 = N0->getOperand(0);

32977

if (RHSConstSplat) {

32978

N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),

32979

SDValue(RHSConstSplat, 0));

32980

N1 = DAG.getSplatBuildVector(WideVT, DL, N1);

32981

} else if (RHSTrunc) {

32982

N1 = N1->getOperand(0);

32983

}

32984

32985

// Generate the wide operation.

32986

SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);

32987

unsigned Opcode = N->getOpcode();

32988

switch (Opcode) {

32989

case ISD::ANY_EXTEND:

32990

return Op;

32991

case ISD::ZERO_EXTEND: {

32992

unsigned InBits = NarrowVT.getScalarSizeInBits();

32993

APInt Mask = APInt::getAllOnesValue(InBits);

32994

Mask = Mask.zext(VT.getScalarSizeInBits());

32995

return DAG.getNode(ISD::AND, DL, VT,

32996

Op, DAG.getConstant(Mask, DL, VT));

32997

}

32998

case ISD::SIGN_EXTEND:

32999

return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,

33000

Op, DAG.getValueType(NarrowVT));

33001

default:

33002

llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33002);

33003

}

33004

}

33005

33006

/// If both input operands of a logic op are being cast from floating point

33007

/// types, try to convert this into a floating point logic node to avoid

33008

/// unnecessary moves from SSE to integer registers.

33009

static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,

33010

const X86Subtarget &Subtarget) {

33011

unsigned FPOpcode = ISD::DELETED_NODE;

33012

if (N->getOpcode() == ISD::AND)

33013

FPOpcode = X86ISD::FAND;

33014

else if (N->getOpcode() == ISD::OR)

33015

FPOpcode = X86ISD::FOR;

33016

else if (N->getOpcode() == ISD::XOR)

33017

FPOpcode = X86ISD::FXOR;

33018

33019

assert(FPOpcode != ISD::DELETED_NODE &&(static_cast <bool> (FPOpcode != ISD::DELETED_NODE &&
"Unexpected input node for FP logic conversion") ? void (0) :
__assert_fail ("FPOpcode != ISD::DELETED_NODE && \"Unexpected input node for FP logic conversion\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33020, __extension__ __PRETTY_FUNCTION__))

33020

"Unexpected input node for FP logic conversion")(static_cast <bool> (FPOpcode != ISD::DELETED_NODE &&
"Unexpected input node for FP logic conversion") ? void (0) :
__assert_fail ("FPOpcode != ISD::DELETED_NODE && \"Unexpected input node for FP logic conversion\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33020, __extension__ __PRETTY_FUNCTION__));

33021

33022

EVT VT = N->getValueType(0);

33023

SDValue N0 = N->getOperand(0);

33024

SDValue N1 = N->getOperand(1);

33025

SDLoc DL(N);

33026

if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&

33027

((Subtarget.hasSSE1() && VT == MVT::i32) ||

33028

(Subtarget.hasSSE2() && VT == MVT::i64))) {

33029

SDValue N00 = N0.getOperand(0);

33030

SDValue N10 = N1.getOperand(0);

33031

EVT N00Type = N00.getValueType();

33032

EVT N10Type = N10.getValueType();

33033

if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {

33034

SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);

33035

return DAG.getBitcast(VT, FPLogic);

33036

}

33037

}

33038

return SDValue();

33039

}

33040

33041

/// If this is a zero/all-bits result that is bitwise-anded with a low bits

33042

/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'

33043

/// with a shift-right to eliminate loading the vector constant mask value.

33044

static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,

33045

const X86Subtarget &Subtarget) {

33046

SDValue Op0 = peekThroughBitcasts(N->getOperand(0));

33047

SDValue Op1 = peekThroughBitcasts(N->getOperand(1));

33048

EVT VT0 = Op0.getValueType();

33049

EVT VT1 = Op1.getValueType();

33050

33051

if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())

33052

return SDValue();

33053

33054

APInt SplatVal;

33055

if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||

33056

!SplatVal.isMask())

33057

return SDValue();

33058

33059

if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))

33060

return SDValue();

33061

33062

unsigned EltBitWidth = VT0.getScalarSizeInBits();

33063

if (EltBitWidth != DAG.ComputeNumSignBits(Op0))

33064

return SDValue();

33065

33066

SDLoc DL(N);

33067

unsigned ShiftVal = SplatVal.countTrailingOnes();

33068

SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);

33069

SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);

33070

return DAG.getBitcast(N->getValueType(0), Shift);

33071

}

33072

33073

// Get the index node from the lowered DAG of a GEP IR instruction with one

33074

// indexing dimension.

33075

static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {

33076

if (Ld->isIndexed())

33077

return SDValue();

33078

33079

SDValue Base = Ld->getBasePtr();

33080

33081

if (Base.getOpcode() != ISD::ADD)

33082

return SDValue();

33083

33084

SDValue ShiftedIndex = Base.getOperand(0);

33085

33086

if (ShiftedIndex.getOpcode() != ISD::SHL)

33087

return SDValue();

33088

33089

return ShiftedIndex.getOperand(0);

33090

33091

}

33092

33093

static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {

33094

if (Subtarget.hasBMI2() && VT.isScalarInteger()) {

33095

switch (VT.getSizeInBits()) {

33096

default: return false;

33097

case 64: return Subtarget.is64Bit() ? true : false;

33098

case 32: return true;

33099

}

33100

}

33101

return false;

33102

}

33103

33104

// This function recognizes cases where X86 bzhi instruction can replace and

33105

// 'and-load' sequence.

33106

// In case of loading integer value from an array of constants which is defined

33107

// as follows:

33108

33109

// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}

33110

33111

// then applying a bitwise and on the result with another input.

33112

// It's equivalent to performing bzhi (zero high bits) on the input, with the

33113

// same index of the load.

33114

static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,

33115

const X86Subtarget &Subtarget) {

33116

MVT VT = Node->getSimpleValueType(0);

33117

SDLoc dl(Node);

33118

33119

// Check if subtarget has BZHI instruction for the node's type

33120

if (!hasBZHI(Subtarget, VT))

33121

return SDValue();

33122

33123

// Try matching the pattern for both operands.

33124

for (unsigned i = 0; i < 2; i++) {

33125

SDValue N = Node->getOperand(i);

33126

LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

33127

33128

// continue if the operand is not a load instruction

33129

if (!Ld)

33130

return SDValue();

33131

33132

const Value *MemOp = Ld->getMemOperand()->getValue();

33133

33134

if (!MemOp)

33135

return SDValue();

33136

33137

if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {

33138

if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {

33139

if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

33140

33141

Constant *Init = GV->getInitializer();

33142

Type *Ty = Init->getType();

33143

if (!isa<ConstantDataArray>(Init) ||

33144

!Ty->getArrayElementType()->isIntegerTy() ||

33145

Ty->getArrayElementType()->getScalarSizeInBits() !=

33146

VT.getSizeInBits() ||

33147

Ty->getArrayNumElements() >

33148

Ty->getArrayElementType()->getScalarSizeInBits())

33149

continue;

33150

33151

// Check if the array's constant elements are suitable to our case.

33152

uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();

33153

bool ConstantsMatch = true;

33154

for (uint64_t j = 0; j < ArrayElementCount; j++) {

33155

ConstantInt *Elem =

33156

dyn_cast<ConstantInt>(Init->getAggregateElement(j));

33157

if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {

33158

ConstantsMatch = false;

33159

break;

33160

}

33161

}

33162

if (!ConstantsMatch)

33163

continue;

33164

33165

// Do the transformation (For 32-bit type):

33166

// -> (and (load arr[idx]), inp)

33167

// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))

33168

// that will be replaced with one bzhi instruction.

33169

SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);

33170

SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

33171

33172

// Get the Node which indexes into the array.

33173

SDValue Index = getIndexFromUnindexedLoad(Ld);

33174

if (!Index)

33175

return SDValue();

33176

Index = DAG.getZExtOrTrunc(Index, dl, VT);

33177

33178

SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

33179

33180

SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);

33181

SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

33182

33183

return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);

33184

}

33185

}

33186

}

33187

}

33188

return SDValue();

33189

}

33190

33191

/// Target combine entry point for an AND node (the node kind is inferred from
/// the helpers called below). Tries a sequence of independent folds and
/// returns the first replacement found, or an empty SDValue if none applies
/// (or if the node was already replaced through DCI.CombineTo).
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1 only convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    SDValue FPLHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
    SDValue FPRHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
    SDValue FPAnd =
        DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, FPLHS, FPRHS);
    return DAG.getBitcast(MVT::v4i32, FPAnd);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Folded = combineCompareEqual(N, DAG, DCI, Subtarget))
    return Folded;

  if (SDValue Folded = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return Folded;

  if (SDValue Folded = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return Folded;

  if (SDValue Folded = combineAndMaskToShift(N, DAG, Subtarget))
    return Folded;

  if (SDValue Folded = combineAndLoadToBZHI(N, DAG, Subtarget))
    return Folded;

  const bool ByteSizedElts = (VT.getScalarSizeInBits() % 8) == 0;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && ByteSizedElts) {
    SDValue Root(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Root}, 0, Root, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      // The replacement is installed directly; report "no further change".
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  SDValue Extract = N->getOperand(0);
  if (!ByteSizedElts || Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  SDValue BitMask = N->getOperand(1);
  SDValue SrcVec = Extract.getOperand(0);
  EVT SrcVecVT = SrcVec.getValueType();

  // Check that the constant bitmask masks whole bytes.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (VT != SrcVecVT.getScalarType() ||
      !Extract->isOnlyUserOf(SrcVec.getNode()) ||
      !getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) ||
      !llvm::all_of(EltBits, [](const APInt &M) {
        return M.isNullValue() || M.isAllOnesValue();
      }))
    return SDValue();

  const unsigned NumElts = SrcVecVT.getVectorNumElements();
  const unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
  const unsigned Idx = Extract.getConstantOperandVal(1);

  // Create a root shuffle mask from the byte mask and the extracted index.
  // Only the bytes of the extracted element are constrained; everything else
  // stays undef.
  SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
  for (unsigned Byte = 0; Byte != Scale; ++Byte) {
    if (UndefElts[Byte])
      continue;
    const int VecIdx = Scale * Idx + Byte;
    if (EltBits[Byte].isNullValue())
      ShuffleMask[VecIdx] = SM_SentinelZero;
    else
      ShuffleMask[VecIdx] = VecIdx;
  }

  SDValue Shuffle = combineX86ShufflesRecursively(
      {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
      /*HasVarMask*/ false, DAG, DCI, Subtarget);
  if (!Shuffle)
    return SDValue();

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                     Extract.getOperand(1));
}

33274

33275

// Try to fold:

33276

// (or (and (m, y), (pandn m, x)))

33277

// into:

33278

// (vselect m, x, y)

33279

// As a special case, try to fold:

33280

// (or (and (m, (sub 0, x)), (pandn m, x)))

33281

// into:

33282

// (sub (xor X, M), M)

33283

static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,

33284

const X86Subtarget &Subtarget) {

33285

assert(N->getOpcode() == ISD::OR && "Unexpected Opcode")(static_cast <bool> (N->getOpcode() == ISD::OR &&
"Unexpected Opcode") ? void (0) : __assert_fail ("N->getOpcode() == ISD::OR && \"Unexpected Opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33285, __extension__ __PRETTY_FUNCTION__));

33286

33287

SDValue N0 = N->getOperand(0);

33288

SDValue N1 = N->getOperand(1);

33289

EVT VT = N->getValueType(0);

33290

33291

if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||

33292

(VT.is256BitVector() && Subtarget.hasInt256())))

33293

return SDValue();

33294

33295

// Canonicalize AND to LHS.

33296

if (N1.getOpcode() == ISD::AND)

33297

std::swap(N0, N1);

33298

33299

// TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for

33300

// ANDNP combine allows other combines to happen that prevent matching.

33301

if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)

33302

return SDValue();

33303

33304

SDValue Mask = N1.getOperand(0);

33305

SDValue X = N1.getOperand(1);

33306

SDValue Y;

33307

if (N0.getOperand(0) == Mask)

33308

Y = N0.getOperand(1);

33309

if (N0.getOperand(1) == Mask)

33310

Y = N0.getOperand(0);

33311

33312

// Check to see if the mask appeared in both the AND and ANDNP.

33313

if (!Y.getNode())

33314

return SDValue();

33315

33316

// Validate that X, Y, and Mask are bitcasts, and see through them.

33317

Mask = peekThroughBitcasts(Mask);

33318

X = peekThroughBitcasts(X);

33319

Y = peekThroughBitcasts(Y);

33320

33321

EVT MaskVT = Mask.getValueType();

33322

unsigned EltBits = MaskVT.getScalarSizeInBits();

33323

33324

// TODO: Attempt to handle floating point cases as well?

33325

if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)

33326

return SDValue();

33327

33328

SDLoc DL(N);

33329

33330

// Try to match:

33331

// (or (and (M, (sub 0, X)), (pandn M, X)))

33332

// which is a special case of vselect:

33333

// (vselect M, (sub 0, X), X)

33334

// Per:

33335

// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate

33336

// We know that, if fNegate is 0 or 1:

33337

// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)

33338

33339

// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:

33340

// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))

33341

// ( M ? -X : X) == ((X ^ M ) + (M & 1))

33342

// This lets us transform our vselect to:

33343

// (add (xor X, M), (and M, 1))

33344

// And further to:

33345

// (sub (xor X, M), M)

33346

if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&

33347

DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {

33348

auto IsNegV = [](SDNode *N, SDValue V) {

33349

return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&

33350

ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());

33351

};

33352

SDValue V;

33353

if (IsNegV(Y.getNode(), X))

33354

V = X;

33355

else if (IsNegV(X.getNode(), Y))

33356

V = Y;

33357

33358

if (V) {

33359

SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);

33360

SDValue SubOp2 = Mask;

33361

33362

// If the negate was on the false side of the select, then

33363

// the operands of the SUB need to be swapped. PR 27251.

33364

// This is because the pattern being matched above is

33365

// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)

33366

// but if the pattern matched was

33367

// (vselect M, X, (sub (0, X))), that is really negation of the pattern

33368

// above, -(vselect M, (sub 0, X), X), and therefore the replacement

33369

// pattern also needs to be a negation of the replacement pattern above.

33370

// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the

33371

// sub accomplishes the negation of the replacement pattern.

33372

if (V == Y)

33373

std::swap(SubOp1, SubOp2);

33374

33375

SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);

33376

return DAG.getBitcast(VT, Res);

33377

}

33378

}

33379

33380

// PBLENDVB is only available on SSE 4.1.

33381

if (!Subtarget.hasSSE41())

33382

return SDValue();

33383

33384

MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

33385

33386

X = DAG.getBitcast(BlendVT, X);

33387

Y = DAG.getBitcast(BlendVT, Y);

33388

Mask = DAG.getBitcast(BlendVT, Mask);

33389

Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);

33390

return DAG.getBitcast(VT, Mask);

33391

}

33392

33393

// Helper function for combineOrCmpEqZeroToCtlzSrl

33394

// Transforms:

33395

// seteq(cmp x, 0)

33396

// into:

33397

// srl(ctlz x), log2(bitsize(x))

33398

// Input pattern is checked by caller.

33399

static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,

33400

SelectionDAG &DAG) {

33401

SDValue Cmp = Op.getOperand(1);

33402

EVT VT = Cmp.getOperand(0).getValueType();

33403

unsigned Log2b = Log2_32(VT.getSizeInBits());

33404

SDLoc dl(Op);

33405

SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));

33406

// The result of the shift is true or false, and on X86, the 32-bit

33407

// encoding of shr and lzcnt is more desirable.

33408

SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);

33409

SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,

33410

DAG.getConstant(Log2b, dl, VT));

33411

return DAG.getZExtOrTrunc(Scc, dl, ExtTy);

33412

}

33413

33414

// Try to transform:

33415

// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))

33416

// into:

33417

// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))

33418

// Will also attempt to match more generic cases, eg:

33419

// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))

33420

// Only applies if the target supports the FastLZCNT feature.

33421

static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,

33422

TargetLowering::DAGCombinerInfo &DCI,

33423

const X86Subtarget &Subtarget) {

33424

if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())

33425

return SDValue();

33426

33427

auto isORCandidate = [](SDValue N) {

33428

return (N->getOpcode() == ISD::OR && N->hasOneUse());

33429

};

33430

33431

// Check the zero extend is extending to 32-bit or more. The code generated by

33432

// srl(ctlz) for 16-bit or less variants of the pattern would require extra

33433

// instructions to clear the upper bits.

33434

if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||

33435

!isORCandidate(N->getOperand(0)))

33436

return SDValue();

33437

33438

// Check the node matches: setcc(eq, cmp 0)

33439

auto isSetCCCandidate = [](SDValue N) {

33440

return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&

33441

X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&

33442

N->getOperand(1).getOpcode() == X86ISD::CMP &&

33443

isNullConstant(N->getOperand(1).getOperand(1)) &&

33444

N->getOperand(1).getValueType().bitsGE(MVT::i32);

33445

};

33446

33447

SDNode *OR = N->getOperand(0).getNode();

33448

SDValue LHS = OR->getOperand(0);

33449

SDValue RHS = OR->getOperand(1);

33450

33451

// Save nodes matching or(or, setcc(eq, cmp 0)).

33452

SmallVector<SDNode *, 2> ORNodes;

33453

while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||

33454

(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {

33455

ORNodes.push_back(OR);

33456

OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();

33457

LHS = OR->getOperand(0);

33458

RHS = OR->getOperand(1);

33459

}

33460

33461

// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).

33462

if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||

33463

!isORCandidate(SDValue(OR, 0)))

33464

return SDValue();

33465

33466

// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it

33467

// to

33468

// or(srl(ctlz),srl(ctlz)).

33469

// The dag combiner can then fold it into:

33470

// srl(or(ctlz, ctlz)).

33471

EVT VT = OR->getValueType(0);

33472

SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);

33473

SDValue Ret, NewRHS;

33474

if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))

33475

Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

33476

33477

if (!Ret)

33478

return SDValue();

33479

33480

// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.

33481

while (ORNodes.size() > 0) {

33482

OR = ORNodes.pop_back_val();

33483

LHS = OR->getOperand(0);

33484

RHS = OR->getOperand(1);

33485

// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).

33486

if (RHS->getOpcode() == ISD::OR)

33487

std::swap(LHS, RHS);

33488

EVT VT = OR->getValueType(0);

33489

SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);

33490

if (!NewRHS)

33491

return SDValue();

33492

Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);

33493

}

33494

33495

if (Ret)

33496

Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

33497

33498

return Ret;

33499

}

33500

33501

/// Target-specific DAG combine for the OR node \p N.
///
/// The following transforms are attempted, in order:
///  1. With SSE1 but no SSE2, a v4i32 OR is rewritten as an X86ISD::FOR on
///     v4f32 (operands and result are bitcast) to avoid scalarization.
///  2. Nothing further is tried before operation legalization.
///  3. combineCompareEqual, convertIntLogicToFPLogic and
///     combineLogicBlendIntoPBLENDV each get a chance.
///  4. For i16/i32/i64 results, an OR of a single-use SHL and a single-use SRL
///     whose shift amounts are complementary is folded into X86ISD::SHLD or
///     X86ISD::SHRD, unless SHLD is slow on the subtarget and the function is
///     not optimized for size.
///
/// \returns the replacement value, or an empty SDValue when nothing matched.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  const EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // If this is SSE1 only convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    SDValue FPOr = DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
                               DAG.getBitcast(MVT::v4f32, N0),
                               DAG.getBitcast(MVT::v4f32, N1));
    return DAG.getBitcast(MVT::v4i32, FPOr);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Cmp = combineCompareEqual(N, DAG, DCI, Subtarget))
    return Cmp;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue Blend = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return Blend;

  // Everything below is the double-shift fold, which only handles scalar
  // i16/i32/i64:
  //   fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  const bool IsDoubleShiftTy =
      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
  if (!IsDoubleShiftTy)
    return SDValue();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent series of
  // shifts/or that would otherwise be generated. Only accept the higher
  // latency when optimizing for size.
  const bool OptForSize =
      DAG.getMachineFunction().getFunction()->optForSize();
  if (Subtarget.isSHLDSlow() && !OptForSize)
    return SDValue();

  // Canonicalize so that N0 is the SHL and N1 is the SRL.
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  // Returns the operand of a TRUNCATE, or the value itself otherwise.
  auto PeekThroughTruncate = [](SDValue V) {
    return V.getOpcode() == ISD::TRUNCATE ? V.getOperand(0) : V;
  };

  // Both shift amounts must be i8; the i8 check is made before looking
  // through any truncate.
  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  ShAmt0 = PeekThroughTruncate(ShAmt0);
  ShAmt1 = PeekThroughTruncate(ShAmt1);

  SDLoc DL(N);
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  unsigned Opc = X86ISD::SHLD;

  // If the SHL amount is the "complement" expression (SUB or XOR), the plain
  // amount belongs to the SRL, so this is a SHRD with the roles exchanged.
  const unsigned ShlAmtOpc = ShAmt0.getOpcode();
  if (ShlAmtOpc == ISD::SUB || ShlAmtOpc == ISD::XOR) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  // Emits Opc(A, B, trunc-to-i8(ShAmt0)).
  auto BuildDoubleShift = [&](SDValue A, SDValue B) {
    SDValue Amt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0);
    return DAG.getNode(Opc, DL, VT, A, B, Amt);
  };

  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
  const unsigned Bits = VT.getSizeInBits();

  // Case 1: the other amount is (Bits - ShAmt0).
  if (ShAmt1.getOpcode() == ISD::SUB) {
    auto *SumC = dyn_cast<ConstantSDNode>(ShAmt1.getOperand(0));
    if (!SumC)
      return SDValue();
    SDValue Subtrahend = PeekThroughTruncate(ShAmt1.getOperand(1));
    if (SumC->getSExtValue() == Bits && Subtrahend == ShAmt0)
      return BuildDoubleShift(Op0, Op1);
    return SDValue();
  }

  // Case 2: both amounts are constants that add up to the bit width.
  if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return BuildDoubleShift(N0.getOperand(0), N1.getOperand(0));
    return SDValue();
  }

  // Case 3: the other amount is (ShAmt0 ^ (Bits - 1)) applied to a value that
  // was already shifted by one in the same direction.
  if (ShAmt1.getOpcode() != ISD::XOR)
    return SDValue();

  auto *MaskC = dyn_cast<ConstantSDNode>(ShAmt1.getOperand(1));
  if (!MaskC)
    return SDValue();

  const unsigned InnerShift = (Opc == X86ISD::SHLD) ? ISD::SRL : ISD::SHL;
  SDValue XorLHS = PeekThroughTruncate(ShAmt1.getOperand(0));
  if (MaskC->getSExtValue() != (Bits - 1) || XorLHS != ShAmt0)
    return SDValue();

  if (Op1.getOpcode() == InnerShift &&
      isa<ConstantSDNode>(Op1.getOperand(1)) &&
      Op1.getConstantOperandVal(1) == 1)
    return BuildDoubleShift(Op0, Op1.getOperand(0));

  // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
  if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
      Op1.getOperand(0) == Op1.getOperand(1))
    return BuildDoubleShift(Op0, Op1.getOperand(0));

  return SDValue();
}

33621

33622

/// Try to turn tests against the signbit in the form of:

33623

/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)

33624

/// into:

33625

/// SETGT(X, -1)

33626

static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {

33627

// This is only worth doing if the output type is i8 or i1.

33628

EVT ResultType = N->getValueType(0);

33629

if (ResultType != MVT::i8 && ResultType != MVT::i1)

33630

return SDValue();

33631

33632

SDValue N0 = N->getOperand(0);

33633

SDValue N1 = N->getOperand(1);

33634

33635

// We should be performing an xor against a truncated shift.

33636

if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())

33637

return SDValue();

33638

33639

// Make sure we are performing an xor against one.

33640

if (!isOneConstant(N1))

33641

return SDValue();

33642

33643

// SetCC on x86 zero extends so only act on this if it's a logical shift.

33644

SDValue Shift = N0.getOperand(0);

33645

if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())

33646

return SDValue();

33647

33648

// Make sure we are truncating from one of i16, i32 or i64.

33649

EVT ShiftTy = Shift.getValueType();

33650

if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)

33651

return SDValue();

33652

33653

// Make sure the shift amount extracts the sign bit.

33654

if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||

33655

Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)

33656

return SDValue();

33657

33658

// Create a greater-than comparison against -1.

33659

// N.B. Using SETGE against 0 works but we want a canonical looking

33660

// comparison, using SETGT matches up with what TranslateX86CC.

33661

SDLoc DL(N);

33662

SDValue ShiftOp = Shift.getOperand(0);

33663

EVT ShiftOpTy = ShiftOp.getValueType();

33664

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

33665

EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),

33666

*DAG.getContext(), ResultType);

33667

SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,

33668

DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);

33669

if (SetCCResultType != ResultType)

33670

Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);

33671

return Cond;

33672

}

33673

33674

/// Turn vector tests of the signbit in the form of:

33675

/// xor (sra X, elt_size(X)-1), -1

33676

/// into:

33677

/// pcmpgt X, -1

33678

///

33679

/// This should be called before type legalization because the pattern may not

33680

/// persist after that.

33681

static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,

33682

const X86Subtarget &Subtarget) {

33683

EVT VT = N->getValueType(0);

33684

if (!VT.isSimple())

33685

return SDValue();

33686

33687

switch (VT.getSimpleVT().SimpleTy) {

33688

default: return SDValue();

33689

case MVT::v16i8:

33690

case MVT::v8i16:

33691

case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;

33692

case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;

33693

case MVT::v32i8:

33694

case MVT::v16i16:

33695

case MVT::v8i32:

33696

case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;

33697

}

33698

33699

// There must be a shift right algebraic before the xor, and the xor must be a

33700

// 'not' operation.

33701

SDValue Shift = N->getOperand(0);

33702

SDValue Ones = N->getOperand(1);

33703

if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||

33704

!ISD::isBuildVectorAllOnes(Ones.getNode()))

33705

return SDValue();

33706

33707

// The shift should be smearing the sign bit across each vector element.

33708

auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));

33709

if (!ShiftBV)

33710

return SDValue();

33711

33712

EVT ShiftEltTy = Shift.getValueType().getVectorElementType();

33713

auto *ShiftAmt = ShiftBV->getConstantSplatNode();

33714

if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)

33715

return SDValue();

33716

33717

// Create a greater-than comparison against -1. We don't use the more obvious

33718

// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.

33719

return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);

33720

}

33721

33722

/// Check if truncation with saturation form type \p SrcVT to \p DstVT

33723

/// is valid for the given \p Subtarget.

33724

static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,

33725

const X86Subtarget &Subtarget) {

33726

if (!Subtarget.hasAVX512())

33727

return false;

33728

33729

// FIXME: Scalar type may be supported if we move it to vector register.

33730

if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)

33731

return false;

33732

33733

EVT SrcElVT = SrcVT.getScalarType();

33734

EVT DstElVT = DstVT.getScalarType();

33735

if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)

33736

return false;

33737

if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)

33738

return false;

33739

if (SrcVT.is512BitVector() || Subtarget.hasVLX())

33740

return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();

33741

return false;

33742

}

33743

33744

/// Detect a pattern of truncation with saturation:

33745

/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).

33746

/// Return the source value to be truncated or SDValue() if the pattern was not

33747

/// matched.

33748

static SDValue detectUSatPattern(SDValue In, EVT VT) {

33749

if (In.getOpcode() != ISD::UMIN)

33750

return SDValue();

33751

33752

//Saturation with truncation. We truncate from InVT to VT.

33753

assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&(static_cast <bool> (In.getScalarValueSizeInBits() >
VT.getScalarSizeInBits() && "Unexpected types for truncate operation"
) ? void (0) : __assert_fail ("In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && \"Unexpected types for truncate operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33754, __extension__ __PRETTY_FUNCTION__))

33754

"Unexpected types for truncate operation")(static_cast <bool> (In.getScalarValueSizeInBits() >
VT.getScalarSizeInBits() && "Unexpected types for truncate operation"
) ? void (0) : __assert_fail ("In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && \"Unexpected types for truncate operation\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 33754, __extension__ __PRETTY_FUNCTION__));

33755

33756

APInt C;

33757

if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {

33758

// C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according

33759

// the element size of the destination type.

33760

return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :

33761

SDValue();

33762

}

33763

return SDValue();

33764

}

33765

33766

/// Detect a pattern of truncation with saturation:

33767

/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).

33768

/// The types should allow to use VPMOVUS* instruction on AVX512.

33769

/// Return the source value to be truncated or SDValue() if the pattern was not

33770

/// matched.

33771

static SDValue detectAVX512USatPattern(SDValue In, EVT VT,

33772

const X86Subtarget &Subtarget) {

33773

if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))

33774

return SDValue();

33775

return detectUSatPattern(In, VT);

33776

}

33777

33778

/// Turn a truncate of \p In to \p VT into X86ISD::VTRUNCUS when \p In matches
/// the unsigned-saturation pattern recognised by detectUSatPattern.
///
/// Both the input type and \p VT must be legal, and the type pair must be
/// accepted by isSATValidOnAVX512Subtarget. Returns an empty SDValue
/// otherwise.
static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
                        const X86Subtarget &Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const EVT InVT = In.getValueType();
  if (!TLI.isTypeLegal(InVT) || !TLI.isTypeLegal(VT))
    return SDValue();

  // Match the saturation pattern first, then verify the subtarget can
  // actually perform this saturating truncate.
  SDValue USatVal = detectUSatPattern(In, VT);
  if (!USatVal)
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
}

33789

33790

/// This function detects the AVG pattern between vectors of unsigned i8/i16,

33791

/// which is c = (a + b + 1) / 2, and replace this operation with the efficient

33792

/// X86ISD::AVG instruction.

33793

static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,

33794

const X86Subtarget &Subtarget,

33795

const SDLoc &DL) {

33796

if (!VT.isVector() || !VT.isSimple())

33797

return SDValue();

33798

EVT InVT = In.getValueType();

33799

unsigned NumElems = VT.getVectorNumElements();

33800

33801

EVT ScalarVT = VT.getVectorElementType();

33802

if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&

33803

isPowerOf2_32(NumElems)))

33804

return SDValue();

33805

33806

// InScalarVT is the intermediate type in AVG pattern and it should be greater

33807

// than the original input type (i8/i16).

33808

EVT InScalarVT = InVT.getVectorElementType();

33809

if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())

33810

return SDValue();

33811

33812

if (!Subtarget.hasSSE2())

33813

return SDValue();

33814

if (Subtarget.hasBWI()) {

33815

if (VT.getSizeInBits() > 512)

33816

return SDValue();

33817

} else if (Subtarget.hasAVX2()) {

33818

if (VT.getSizeInBits() > 256)

33819

return SDValue();

33820

} else {

33821

if (VT.getSizeInBits() > 128)

33822

return SDValue();

33823

}

33824

33825

// Detect the following pattern:

33826

33827

// %1 = zext <N x i8> %a to <N x i32>

33828

// %2 = zext <N x i8> %b to <N x i32>

33829

// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>

33830

// %4 = add nuw nsw <N x i32> %3, %2

33831

// %5 = lshr <N x i32> %N, <i32 1 x N>

33832

// %6 = trunc <N x i32> %5 to <N x i8>

33833

33834

// In AVX512, the last instruction can also be a trunc store.

33835

33836

if (In.getOpcode() != ISD::SRL)

33837

return SDValue();

33838

33839

// A lambda checking the given SDValue is a constant vector and each element

33840

// is in the range [Min, Max].

33841

auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {

33842

BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);

33843

if (!BV || !BV->isConstant())

33844

return false;

33845

for (SDValue Op : V->ops()) {

33846

ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);

33847

if (!C)

33848

return false;

33849

uint64_t Val = C->getZExtValue();

33850

if (Val < Min || Val > Max)

33851

return false;

33852

}

33853

return true;

33854

};

33855

33856

// Check if each element of the vector is left-shifted by one.

33857

auto LHS = In.getOperand(0);

33858

auto RHS = In.getOperand(1);

33859

if (!IsConstVectorInRange(RHS, 1, 1))

33860

return SDValue();

33861

if (LHS.getOpcode() != ISD::ADD)

33862

return SDValue();

33863

33864

// Detect a pattern of a + b + 1 where the order doesn't matter.

33865

SDValue Operands[3];

33866

Operands[0] = LHS.getOperand(0);

33867

Operands[1] = LHS.getOperand(1);

33868

33869

// Take care of the case when one of the operands is a constant vector whose

33870

// element is in the range [1, 256].

33871

if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&

33872

Operands[0].getOpcode() == ISD::ZERO_EXTEND &&

33873

Operands[0].getOperand(0).getValueType() == VT) {

33874

// The pattern is detected. Subtract one from the constant vector, then

33875

// demote it and emit X86ISD::AVG instruction.

33876

SDValue VecOnes = DAG.getConstant(1, DL, InVT);

33877

Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);

33878

Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);

33879

return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),

33880

Operands[1]);

33881

}

33882

33883

if (Operands[0].getOpcode() == ISD::ADD)

33884

std::swap(Operands[0], Operands[1]);

33885

else if (Operands[1].getOpcode() != ISD::ADD)

33886

return SDValue();

33887

Operands[2] = Operands[1].getOperand(0);

33888

Operands[1] = Operands[1].getOperand(1);

33889

33890

// Now we have three operands of two additions. Check that one of them is a

33891

// constant vector with ones, and the other two are promoted from i8/i16.

33892

for (int i = 0; i < 3; ++i) {

33893

if (!IsConstVectorInRange(Operands[i], 1, 1))

33894

continue;

33895

std::swap(Operands[i], Operands[2]);

33896

33897

// Check if Operands[0] and Operands[1] are results of type promotion.

33898

for (int j = 0; j < 2; ++j)

33899

if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||

33900

Operands[j].getOperand(0).getValueType() != VT)

33901

return SDValue();

33902

33903

// The pattern is detected, emit X86ISD::AVG instruction.

33904

return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),

33905

Operands[1].getOperand(0));

33906

}

33907

33908

return SDValue();

33909

}

33910

33911

/// Target-specific DAG combine for a load node \p N.
///
/// For chips with slow 32-byte unaligned loads, break the 32-byte operation
/// into two 16-byte operations. Also split non-temporal aligned loads on
/// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
///
/// The split is only done for non-extending 256-bit vector loads once
/// operation legalization has started. The two halves are re-assembled with
/// insert128BitVector and their chains are joined with a TokenFactor.
///
/// \returns the result of DCI.CombineTo when the load was split, otherwise an
/// empty SDValue.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  ISD::LoadExtType Ext = Ld->getExtensionType();
  // Written by allowsMemoryAccess; only read when that call returned true.
  bool Fast = false;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               AddressSpace, Alignment, &Fast) && !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    // Byte offset of the upper half of the 32-byte access.
    const unsigned HalfOffset = 16;

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems/2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, HalfOffset, dl);
    // The second half reads bytes [16, 32) of the original access, so its
    // pointer info must carry the same offset as the address; otherwise the
    // memory operand would describe the first half again.
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                    Ld->getPointerInfo().getWithOffset(HalfOffset),
                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}

33960

33961

/// If V is a build vector of boolean constants and exactly one of those

33962

/// constants is true, return the operand index of that true element.

33963

/// Otherwise, return -1.

33964

static int getOneTrueElt(SDValue V) {

33965

// This needs to be a build vector of booleans.

33966

// TODO: Checking for the i1 type matches the IR definition for the mask,

33967

// but the mask check could be loosened to i8 or other types. That might

33968

// also require checking more than 'allOnesValue'; eg, the x86 HW

33969

// instructions only require that the MSB is set for each mask element.

33970

// The ISD::MSTORE comments/definition do not specify how the mask operand

33971

// is formatted.

33972

auto *BV = dyn_cast<BuildVectorSDNode>(V);

33973

if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)

33974

return -1;

33975

33976

int TrueIndex = -1;

33977

unsigned NumElts = BV->getValueType(0).getVectorNumElements();

33978

for (unsigned i = 0; i < NumElts; ++i) {

33979

const SDValue &Op = BV->getOperand(i);

33980

if (Op.isUndef())

33981

continue;

33982

auto *ConstNode = dyn_cast<ConstantSDNode>(Op);

33983

if (!ConstNode)

33984

return -1;

33985

if (ConstNode->getAPIntValue().isAllOnesValue()) {

33986

// If we already found a one, this is too many.

33987

if (TrueIndex >= 0)

33988

return -1;

33989

TrueIndex = i;

33990

}

33991

}

33992

return TrueIndex;

33993

}

33994

33995

/// Given a masked memory load/store operation, return true if it has one mask

33996

/// bit set. If it has one mask bit set, then also return the memory address of

33997

/// the scalar element to load/store, the vector index to insert/extract that

33998

/// scalar element, and the alignment for the scalar memory access.

33999

static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,

34000

SelectionDAG &DAG, SDValue &Addr,

34001

SDValue &Index, unsigned &Alignment) {

34002

int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());

34003

if (TrueMaskElt < 0)

34004

return false;

34005

34006

// Get the address of the one scalar element that is specified by the mask

34007

// using the appropriate offset from the base pointer.

34008

EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();

34009

Addr = MaskedOp->getBasePtr();

34010

if (TrueMaskElt != 0) {

34011

unsigned Offset = TrueMaskElt * EltVT.getStoreSize();

34012

Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));

34013

}

34014

34015

Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));

34016

Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());

34017

return true;

34018

}

34019

34020

/// If exactly one element of the mask is set for a non-extending masked load,

34021

/// it is a scalar load and vector insert.

34022

/// Note: It is expected that the degenerate cases of an all-zeros or all-ones

34023

/// mask have already been optimized in IR, so we don't bother with those here.

34024

static SDValue

34025

reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,

34026

TargetLowering::DAGCombinerInfo &DCI) {

34027

// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.

34028

// However, some target hooks may need to be added to know when the transform

34029

// is profitable. Endianness would also have to be considered.

34030

34031

SDValue Addr, VecIndex;

34032

unsigned Alignment;

34033

if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))

34034

return SDValue();

34035

34036

// Load the one scalar element that is specified by the mask using the

34037

// appropriate offset from the base pointer.

34038

SDLoc DL(ML);

34039

EVT VT = ML->getValueType(0);

34040

EVT EltVT = VT.getVectorElementType();

34041

SDValue Load =

34042

DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),

34043

Alignment, ML->getMemOperand()->getFlags());

34044

34045

// Insert the loaded element into the appropriate place in the vector.

34046

SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),

34047

Load, VecIndex);

34048

return DCI.CombineTo(ML, Insert, Load.getValue(1), true);

34049

}

34050

34051

/// Simplify a masked load whose mask is a build vector of constants.
///
/// Two rewrites are attempted:
///  - If both the first and the last mask elements are non-zero, the masked
///    load becomes an ordinary vector load followed by a select against the
///    pass-through value.
///  - Otherwise, unless the pass-through is already undef, the masked load is
///    re-emitted with an undef pass-through and the original pass-through is
///    applied with a select.
///
/// \returns the result of DCI.CombineTo, or an empty SDValue when no rewrite
/// applies.
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Mask = ML->getMask();
  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
    return SDValue();

  SDLoc DL(ML);
  const EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  const unsigned LastIdx = VT.getVectorNumElements() - 1;
  auto *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  const bool LoadsFirstElt = !isNullConstant(MaskBV->getOperand(0));
  const bool LoadsLastElt = !isNullConstant(MaskBV->getOperand(LastIdx));
  if (LoadsFirstElt && LoadsLastElt) {
    SDValue FullLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                 ML->getMemOperand());
    SDValue Select =
        DAG.getSelect(DL, VT, ML->getMask(), FullLd, ML->getSrc0());
    return DCI.CombineTo(ML, Select, FullLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue UndefPassThru = DAG.getUNDEF(VT);
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), UndefPassThru,
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Select = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Select, NewML.getValue(1), true);
}

34093

34094

/// Target-specific DAG combine for a masked load node \p N.
///
/// Expanding loads are left alone. Non-extending loads are offered to
/// reduceMaskedLoadToScalarLoad and, without AVX512, to
/// combineMaskedLoadConstantMask. Sign-extending loads are resolved into a
/// non-extending masked load of a same-sized vector with the narrow element
/// type, followed by an X86ISD::VSEXT; the pass-through value and the mask are
/// rearranged to match the narrow-element layout.
///
/// \returns the result of a successful combine, or an empty SDValue.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  const ISD::LoadExtType ExtTy = Mld->getExtensionType();
  if (ExtTy == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  // Only sign-extending loads are resolved below.
  if (ExtTy != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  const EVT VT = Mld->getValueType(0);
  const unsigned NumElems = VT.getVectorNumElements();
  const EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  const unsigned ToSz = VT.getScalarSizeInBits();
  const unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  const unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Number of narrow elements that fill a vector as wide as the result.
  const unsigned WideNumElts = NumElems * SizeRatio;

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(),
                                   WideNumElts);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Builds a shuffle mask that moves narrow element i * SizeRatio (the first
  // narrow element inside wide element i) to position i; all remaining
  // positions are left undefined (-1).
  auto MakeCompactingMask = [&]() {
    SmallVector<int, 16> Indices(WideNumElts, -1);
    for (unsigned Elt = 0; Elt != NumElems; ++Elt)
      Indices[Elt] = Elt * SizeRatio;
    return Indices;
  };

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec = MakeCompactingMask();

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }

  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type: compact it the same way as
    // the pass-through, and fill the tail with index WideNumElts, which is
    // the first element of the second shuffle operand (a zero vector).
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec = MakeCompactingMask();
    for (unsigned Elt = NumElems; Elt != WideNumElts; ++Elt)
      ShuffleVec[Elt] = WideNumElts;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    // Otherwise the mask is a vector of i1: widen it by concatenating zero
    // vectors after the original mask.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    EVT NewMaskVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideNumElts);

    const unsigned NumConcat = WideNumElts / NumElems;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Pieces(NumConcat, ZeroVal);
    Pieces[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Pieces);
  }

  // Emit the non-extending wide load and sign-extend its low elements back to
  // the original result type.
  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue Extended = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, Extended, WideLd.getValue(1), true);
}

34185

34186

/// If exactly one element of the mask is set for a non-truncating masked store,

34187

/// it is a vector extract and scalar store.

34188

/// Note: It is expected that the degenerate cases of an all-zeros or all-ones

34189

/// mask have already been optimized in IR, so we don't bother with those here.

34190

static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,

34191

SelectionDAG &DAG) {

34192

// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.

34193

// However, some target hooks may need to be added to know when the transform

34194

// is profitable. Endianness would also have to be considered.

34195

34196

SDValue Addr, VecIndex;

34197

unsigned Alignment;

34198

if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))

34199

return SDValue();

34200

34201

// Extract the one scalar element that is actually being stored.

34202

SDLoc DL(MS);

34203

EVT VT = MS->getValue().getValueType();

34204

EVT EltVT = VT.getVectorElementType();

34205

SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,

34206

MS->getValue(), VecIndex);

34207

34208

// Store that element at the appropriate offset from the base pointer.

34209

return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),

34210

Alignment, MS->getMemOperand()->getFlags());

34211

}

34212

34213

/// Target-specific DAG combine for masked store nodes.
///
/// Compressing stores are left untouched. For non-truncating stores this
/// tries, in order, the single-element scalar-store reduction and the removal
/// of a (pcmpgt 0, X) mask compare. Truncating stores that are not legal as-is
/// are rewritten as a shuffle that packs the narrow elements into the low part
/// of the register, followed by a non-truncating masked store with a mask
/// widened to match.
///
/// \param N          The masked store node (must be a MaskedStoreSDNode).
/// \param DAG        The DAG in which replacement nodes are created.
/// \param Subtarget  Unused in this function.
/// \returns The replacement node, or an empty SDValue if nothing was changed.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore()) {
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
      return ScalarStore;

    // If the mask is checking (0 > X), we're creating a vector with all-zeros
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
    // cares about the sign bit of each mask element, so eliminate the compare:
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
    // Note that by waiting to match an x86-specific PCMPGT node, we're
    // eliminating potentially more complex matching of a setcc node which has
    // a full range of predicates.
    SDValue Mask = Mst->getMask();
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
             "Unexpected type for PCMPGT");
      return DAG.getMaskedStore(
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
    }

    // TODO: AVX512 targets should also be able to simplify something like the
    // pattern above, but that pattern will be different. It will either need to
    // match setcc more generally or match PCMPGTM later (in tablegen?).

    return SDValue();
  }

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle: same total width as VT,
  // but with elements of the (narrower) memory scalar type.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems * SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Element i of the source becomes element i*SizeRatio of the wide view;
  // gather those into the low NumElems lanes, leaving the rest undef (-1).
  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    // Upper lanes select element 0 of the second shuffle operand, which is
    // the all-zeros constant, so they are never stored.
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    // Otherwise the mask must be a vector of i1; widen it by concatenating
    // zero vectors after the original mask.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems * SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  // The value is already packed, so the new store is non-truncating.
  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}

34327

34328

/// Target-specific DAG combine for ordinary store nodes.
///
/// Handles, in order:
///  - splitting a 256-bit vector store into two 128-bit stores when the
///    target reports the wide access as allowed but not fast;
///  - vector truncating stores (AVG / unsigned-saturation patterns, otherwise
///    shuffle the narrow elements together and store them in larger chunks);
///  - 64-bit load->store pairs (MMX types, or i64 on 32-bit targets), which
///    are turned into i64/f64 or paired i32 load/stores;
///  - an i64 store of an extracted vector element on 32-bit targets with
///    SSE2, which is re-expressed as an f64 extract + store.
///
/// \param N          The store node (must be a StoreSDNode).
/// \param DAG        The DAG in which replacement nodes are created.
/// \param Subtarget  Queried for 64-bit mode, soft-float and SSE2 support.
/// \returns The replacement node, or an empty SDValue if nothing was changed.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    // The high half lives 16 bytes past the base, so its pointer info must
    // carry that offset (the alignment is already clamped to 16).
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                     St->getPointerInfo().getWithOffset(16),
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
            detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    unsigned NumElems = VT.getVectorNumElements();

    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    // Create a type on which we perform the shuffle
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems * SizeRatio);

    // Element i of the source is element i*SizeRatio of the wide view; gather
    // those into the low NumElems lanes and leave the rest undef (-1).
    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits() / StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory. Each chunk is written at a
    // growing byte offset from the base, so describe that offset in the
    // pointer info and only claim the alignment that still holds there.
    const unsigned StoreSize = StoreType.getStoreSize();
    for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits();
         i != e; ++i) {
      const unsigned Offset = i * StoreSize;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
                       St->getPointerInfo().getWithOffset(Offset),
                       MinAlign(St->getAlignment(), Offset),
                       St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreSize, dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

34559

34560

/// Return 'true' if this vector operation is "horizontal"

34561

/// and return the operands for the horizontal operation in LHS and RHS. A

34562

/// horizontal operation performs the binary operation on successive elements

34563

/// of its first operand, then on successive elements of its second operand,

34564

/// returning the resulting values in a vector. For example, if

34565

/// A = < float a0, float a1, float a2, float a3 >

34566

/// and

34567

/// B = < float b0, float b1, float b2, float b3 >

34568

/// then the result of doing a horizontal operation on A and B is

34569

/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.

34570

/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form

34571

/// A horizontal-op B, for some already available A and B, and if so then LHS is

34572

/// set to A, RHS to B, and the routine returns 'true'.

34573

/// Note that the binary operation should have the property that if one of the

34574

/// operands is UNDEF then the result is UNDEF.

34575

static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {

34576

// Look for the following pattern: if

34577

// A = < float a0, float a1, float a2, float a3 >

34578

// B = < float b0, float b1, float b2, float b3 >

34579

// and

34580

// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>

34581

// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>

34582

// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >

34583

// which is A horizontal-op B.

34584

34585

// At least one of the operands should be a vector shuffle.

34586

if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&

34587

RHS.getOpcode() != ISD::VECTOR_SHUFFLE)

34588

return false;

34589

34590

MVT VT = LHS.getSimpleValueType();

34591

34592

assert((VT.is128BitVector() || VT.is256BitVector()) &&(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
()) && "Unsupported vector type for horizontal add/sub"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unsupported vector type for horizontal add/sub\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34593, __extension__ __PRETTY_FUNCTION__))

34593

"Unsupported vector type for horizontal add/sub")(static_cast <bool> ((VT.is128BitVector() || VT.is256BitVector
()) && "Unsupported vector type for horizontal add/sub"
) ? void (0) : __assert_fail ("(VT.is128BitVector() || VT.is256BitVector()) && \"Unsupported vector type for horizontal add/sub\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34593, __extension__ __PRETTY_FUNCTION__));

34594

34595

// Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to

34596

// operate independently on 128-bit lanes.

34597

unsigned NumElts = VT.getVectorNumElements();

34598

unsigned NumLanes = VT.getSizeInBits()/128;

34599

unsigned NumLaneElts = NumElts / NumLanes;

34600

assert((NumLaneElts % 2 == 0) &&(static_cast <bool> ((NumLaneElts % 2 == 0) && "Vector type should have an even number of elements in each lane"
) ? void (0) : __assert_fail ("(NumLaneElts % 2 == 0) && \"Vector type should have an even number of elements in each lane\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34601, __extension__ __PRETTY_FUNCTION__))

34601

"Vector type should have an even number of elements in each lane")(static_cast <bool> ((NumLaneElts % 2 == 0) && "Vector type should have an even number of elements in each lane"
) ? void (0) : __assert_fail ("(NumLaneElts % 2 == 0) && \"Vector type should have an even number of elements in each lane\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34601, __extension__ __PRETTY_FUNCTION__));

34602

unsigned HalfLaneElts = NumLaneElts/2;

34603

34604

// View LHS in the form

34605

// LHS = VECTOR_SHUFFLE A, B, LMask

34606

// If LHS is not a shuffle then pretend it is the shuffle

34607

// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>

34608

// NOTE: in what follows a default initialized SDValue represents an UNDEF of

34609

// type VT.

34610

SDValue A, B;

34611

SmallVector<int, 16> LMask(NumElts);

34612

if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {

34613

if (!LHS.getOperand(0).isUndef())

34614

A = LHS.getOperand(0);

34615

if (!LHS.getOperand(1).isUndef())

34616

B = LHS.getOperand(1);

34617

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();

34618

std::copy(Mask.begin(), Mask.end(), LMask.begin());

34619

} else {

34620

if (!LHS.isUndef())

34621

A = LHS;

34622

for (unsigned i = 0; i != NumElts; ++i)

34623

LMask[i] = i;

34624

}

34625

34626

// Likewise, view RHS in the form

34627

// RHS = VECTOR_SHUFFLE C, D, RMask

34628

SDValue C, D;

34629

SmallVector<int, 16> RMask(NumElts);

34630

if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {

34631

if (!RHS.getOperand(0).isUndef())

34632

C = RHS.getOperand(0);

34633

if (!RHS.getOperand(1).isUndef())

34634

D = RHS.getOperand(1);

34635

ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();

34636

std::copy(Mask.begin(), Mask.end(), RMask.begin());

34637

} else {

34638

if (!RHS.isUndef())

34639

C = RHS;

34640

for (unsigned i = 0; i != NumElts; ++i)

34641

RMask[i] = i;

34642

}

34643

34644

// Check that the shuffles are both shuffling the same vectors.

34645

if (!(A == C && B == D) && !(A == D && B == C))

34646

return false;

34647

34648

// If everything is UNDEF then bail out: it would be better to fold to UNDEF.

34649

if (!A.getNode() && !B.getNode())

34650

return false;

34651

34652

// If A and B occur in reverse order in RHS, then "swap" them (which means

34653

// rewriting the mask).

34654

if (A != C)

34655

ShuffleVectorSDNode::commuteMask(RMask);

34656

34657

// At this point LHS and RHS are equivalent to

34658

// LHS = VECTOR_SHUFFLE A, B, LMask

34659

// RHS = VECTOR_SHUFFLE A, B, RMask

34660

// Check that the masks correspond to performing a horizontal operation.

34661

for (unsigned l = 0; l != NumElts; l += NumLaneElts) {

34662

for (unsigned i = 0; i != NumLaneElts; ++i) {

34663

int LIdx = LMask[i+l], RIdx = RMask[i+l];

34664

34665

// Ignore any UNDEF components.

34666

if (LIdx < 0 || RIdx < 0 ||

34667

(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||

34668

(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))

34669

continue;

34670

34671

// Check that successive elements are being operated on. If not, this is

34672

// not a horizontal operation.

34673

unsigned Src = (i/HalfLaneElts); // each lane is split between srcs

34674

int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;

34675

if (!(LIdx == Index && RIdx == Index + 1) &&

34676

!(IsCommutative && LIdx == Index + 1 && RIdx == Index))

34677

return false;

34678

}

34679

}

34680

34681

LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.

34682

RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

34683

return true;

34684

}

34685

34686

/// Do target-specific dag combines on floating-point adds/subs.

34687

static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,

34688

const X86Subtarget &Subtarget) {

34689

EVT VT = N->getValueType(0);

34690

SDValue LHS = N->getOperand(0);

34691

SDValue RHS = N->getOperand(1);

34692

bool IsFadd = N->getOpcode() == ISD::FADD;

34693

assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode")(static_cast <bool> ((IsFadd || N->getOpcode() == ISD
::FSUB) && "Wrong opcode") ? void (0) : __assert_fail
("(IsFadd || N->getOpcode() == ISD::FSUB) && \"Wrong opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34693, __extension__ __PRETTY_FUNCTION__));

34694

34695

// Try to synthesize horizontal add/sub from adds/subs of shuffles.

34696

if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||

34697

(Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&

34698

isHorizontalBinOp(LHS, RHS, IsFadd)) {

34699

auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;

34700

return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);

34701

}

34702

return SDValue();

34703

}

34704

34705

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify

34706

/// the codegen.

34707

/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )

34708

static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,

34709

const X86Subtarget &Subtarget,

34710

SDLoc &DL) {

34711

assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode")(static_cast <bool> (N->getOpcode() == ISD::TRUNCATE
&& "Wrong opcode") ? void (0) : __assert_fail ("N->getOpcode() == ISD::TRUNCATE && \"Wrong opcode\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34711, __extension__ __PRETTY_FUNCTION__));

34712

SDValue Src = N->getOperand(0);

34713

unsigned Opcode = Src.getOpcode();

34714

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

34715

34716

EVT VT = N->getValueType(0);

34717

EVT SrcVT = Src.getValueType();

34718

34719

auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {

34720

unsigned TruncSizeInBits = VT.getScalarSizeInBits();

34721

34722

// Repeated operand, so we are only trading one output truncation for

34723

// one input truncation.

34724

if (Op0 == Op1)

34725

return true;

34726

34727

// See if either operand has been extended from a smaller/equal size to

34728

// the truncation size, allowing a truncation to combine with the extend.

34729

unsigned Opcode0 = Op0.getOpcode();

34730

if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||

34731

Opcode0 == ISD::ZERO_EXTEND) &&

34732

Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)

34733

return true;

34734

34735

unsigned Opcode1 = Op1.getOpcode();

34736

if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||

34737

Opcode1 == ISD::ZERO_EXTEND) &&

34738

Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)

34739

return true;

34740

34741

// See if either operand is a single use constant which can be constant

34742

// folded.

34743

SDValue BC0 = peekThroughOneUseBitcasts(Op0);

34744

SDValue BC1 = peekThroughOneUseBitcasts(Op1);

34745

return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||

34746

ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());

34747

};

34748

34749

auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {

34750

SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);

34751

SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);

34752

return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);

34753

};

34754

34755

// Don't combine if the operation has other uses.

34756

if (!N->isOnlyUserOf(Src.getNode()))

34757

return SDValue();

34758

34759

// Only support vector truncation for now.

34760

// TODO: i64 scalar math would benefit as well.

34761

if (!VT.isVector())

34762

return SDValue();

34763

34764

// In most cases its only worth pre-truncating if we're only facing the cost

34765

// of one truncation.

34766

// i.e. if one of the inputs will constant fold or the input is repeated.

34767

switch (Opcode) {

34768

case ISD::AND:

34769

case ISD::XOR:

34770

case ISD::OR: {

34771

SDValue Op0 = Src.getOperand(0);

34772

SDValue Op1 = Src.getOperand(1);

34773

if (TLI.isOperationLegalOrPromote(Opcode, VT) &&

34774

IsRepeatedOpOrFreeTruncation(Op0, Op1))

34775

return TruncateArithmetic(Op0, Op1);

34776

break;

34777

}

34778

34779

case ISD::MUL:

34780

// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its

34781

// better to truncate if we have the chance.

34782

if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&

34783

!TLI.isOperationLegal(Opcode, SrcVT))

34784

return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));

34785

LLVM_FALLTHROUGH[[clang::fallthrough]];

34786

case ISD::ADD: {

34787

// TODO: ISD::SUB should be here but interferes with combineSubToSubus.

34788

SDValue Op0 = Src.getOperand(0);

34789

SDValue Op1 = Src.getOperand(1);

34790

if (TLI.isOperationLegal(Opcode, VT) &&

34791

IsRepeatedOpOrFreeTruncation(Op0, Op1))

34792

return TruncateArithmetic(Op0, Op1);

34793

break;

34794

}

34795

}

34796

34797

return SDValue();

34798

}

34799

34800

/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.

34801

static SDValue

34802

combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,

34803

SmallVector<SDValue, 8> &Regs) {

34804

assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||(static_cast <bool> (Regs.size() > 0 && (Regs
[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() ==
MVT::v2i64)) ? void (0) : __assert_fail ("Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() == MVT::v2i64)"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34805, __extension__ __PRETTY_FUNCTION__))

34805

Regs[0].getValueType() == MVT::v2i64))(static_cast <bool> (Regs.size() > 0 && (Regs
[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() ==
MVT::v2i64)) ? void (0) : __assert_fail ("Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() == MVT::v2i64)"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34805, __extension__ __PRETTY_FUNCTION__));

34806

EVT OutVT = N->getValueType(0);

34807

EVT OutSVT = OutVT.getVectorElementType();

34808

EVT InVT = Regs[0].getValueType();

34809

EVT InSVT = InVT.getVectorElementType();

34810

SDLoc DL(N);

34811

34812

// First, use mask to unset all bits that won't appear in the result.

34813

assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&(static_cast <bool> ((OutSVT == MVT::i8 || OutSVT == MVT
::i16) && "OutSVT can only be either i8 or i16.") ? void
(0) : __assert_fail ("(OutSVT == MVT::i8 || OutSVT == MVT::i16) && \"OutSVT can only be either i8 or i16.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34814, __extension__ __PRETTY_FUNCTION__))

34814

"OutSVT can only be either i8 or i16.")(static_cast <bool> ((OutSVT == MVT::i8 || OutSVT == MVT
::i16) && "OutSVT can only be either i8 or i16.") ? void
(0) : __assert_fail ("(OutSVT == MVT::i8 || OutSVT == MVT::i16) && \"OutSVT can only be either i8 or i16.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34814, __extension__ __PRETTY_FUNCTION__));

34815

APInt Mask =

34816

APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());

34817

SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);

34818

for (auto &Reg : Regs)

34819

Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

34820

34821

MVT UnpackedVT, PackedVT;

34822

if (OutSVT == MVT::i8) {

34823

UnpackedVT = MVT::v8i16;

34824

PackedVT = MVT::v16i8;

34825

} else {

34826

UnpackedVT = MVT::v4i32;

34827

PackedVT = MVT::v8i16;

34828

}

34829

34830

// In each iteration, truncate the type by a half size.

34831

auto RegNum = Regs.size();

34832

for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();

34833

j < e; j *= 2, RegNum /= 2) {

34834

for (unsigned i = 0; i < RegNum; i++)

34835

Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);

34836

for (unsigned i = 0; i < RegNum / 2; i++)

34837

Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],

34838

Regs[i * 2 + 1]);

34839

}

34840

34841

// If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and

34842

// then extract a subvector as the result since v8i8 is not a legal type.

34843

if (OutVT == MVT::v8i8) {

34844

Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);

34845

Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],

34846

DAG.getIntPtrConstant(0, DL));

34847

return Regs[0];

34848

} else if (RegNum > 1) {

34849

Regs.resize(RegNum);

34850

return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);

34851

} else

34852

return Regs[0];

34853

}

34854

34855

/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.

34856

static SDValue

34857

combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,

34858

SelectionDAG &DAG,

34859

SmallVector<SDValue, 8> &Regs) {

34860

assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32)(static_cast <bool> (Regs.size() > 0 && Regs
[0].getValueType() == MVT::v4i32) ? void (0) : __assert_fail (
"Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 34860, __extension__ __PRETTY_FUNCTION__));

34861

EVT OutVT = N->getValueType(0);

34862

SDLoc DL(N);

34863

34864

// Shift left by 16 bits, then arithmetic-shift right by 16 bits.

34865

SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);

34866

for (auto &Reg : Regs) {

34867

Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,

34868

Subtarget, DAG);

34869

Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,

34870

Subtarget, DAG);

34871

}

34872

34873

for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)

34874

Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],

34875

Regs[i * 2 + 1]);

34876

34877

if (Regs.size() > 2) {

34878

Regs.resize(Regs.size() / 2);

34879

return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);

34880

} else

34881

return Regs[0];

34882

}

34883

34884

/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into

34885

/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type

34886

/// legalization the truncation will be translated into a BUILD_VECTOR with each

34887

/// element that is extracted from a vector and then truncated, and it is

34888

/// difficult to do this optimization based on them.

34889

static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,

34890

const X86Subtarget &Subtarget) {

34891

EVT OutVT = N->getValueType(0);

34892

if (!OutVT.isVector())

34893

return SDValue();

34894

34895

SDValue In = N->getOperand(0);

34896

if (!In.getValueType().isSimple())

34897

return SDValue();

34898

34899

EVT InVT = In.getValueType();

34900

unsigned NumElems = OutVT.getVectorNumElements();

34901

34902

// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on

34903

// SSE2, and we need to take care of it specially.

34904

// AVX512 provides vpmovdb.

34905

if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())

34906

return SDValue();

34907

34908

EVT OutSVT = OutVT.getVectorElementType();

34909

EVT InSVT = InVT.getVectorElementType();

34910

if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&

34911

(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&

34912

NumElems >= 8))

34913

return SDValue();

34914

34915

// SSSE3's pshufb results in less instructions in the cases below.

34916

if (Subtarget.hasSSSE3() && NumElems == 8 &&

34917

((OutSVT == MVT::i8 && InSVT != MVT::i64) ||

34918

(InSVT == MVT::i32 && OutSVT == MVT::i16)))

34919

return SDValue();

34920

34921

SDLoc DL(N);

34922

34923

// Split a long vector into vectors of legal type.

34924

unsigned RegNum = InVT.getSizeInBits() / 128;

34925

SmallVector<SDValue, 8> SubVec(RegNum);

34926

unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();

34927

EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

34928

34929

for (unsigned i = 0; i < RegNum; i++)

34930

SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,

34931

DAG.getIntPtrConstant(i * NumSubRegElts, DL));

34932

34933

// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS

34934

// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to

34935

// truncate 2 x v4i32 to v8i16.

34936

if (Subtarget.hasSSE41() || OutSVT == MVT::i8)

34937

return combineVectorTruncationWithPACKUS(N, DAG, SubVec);

34938

else if (InSVT == MVT::i32)

34939

return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);

34940

else

34941

return SDValue();

34942

}

34943

34944

/// This function transforms vector truncation of 'extended sign-bits' or

34945

/// 'extended zero-bits' values.

34946

/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.

34947

static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,

34948

SelectionDAG &DAG,

34949

const X86Subtarget &Subtarget) {

34950

// Requires SSE2 but AVX512 has fast truncate.

34951

if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())

34952

return SDValue();

34953

34954

if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())

34955

return SDValue();

34956

34957

SDValue In = N->getOperand(0);

34958

if (!In.getValueType().isSimple())

34959

return SDValue();

34960

34961

MVT VT = N->getValueType(0).getSimpleVT();

34962

MVT SVT = VT.getScalarType();

34963

34964

MVT InVT = In.getValueType().getSimpleVT();

34965

MVT InSVT = InVT.getScalarType();

34966

34967

// Check we have a truncation suited for PACKSS.

34968

if (!VT.is128BitVector() && !VT.is256BitVector())

34969

return SDValue();

34970

if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)

34971

return SDValue();

34972

if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)

34973

return SDValue();

34974

34975

// Use PACKSS if the input has sign-bits that extend all the way to the

34976

// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.

34977

unsigned NumSignBits = DAG.ComputeNumSignBits(In);

34978

unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);

34979

if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))

34980

return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

34981

34982

// Use PACKUS if the input has zero-bits that extend all the way to the

34983

// packed/truncated value. e.g. masks, zext_in_reg, etc.

34984

KnownBits Known;

34985

DAG.computeKnownBits(In, Known);

34986

unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();

34987

NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;

34988

if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))

34989

return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

34990

34991

return SDValue();

34992

}

34993

34994

/// Do target-specific dag combines on truncate nodes. Each strategy below is
/// tried in order; the first one that produces a value wins.
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue Arith = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return Arith;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue USat = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return USat;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx
  if (VT == MVT::i32 && Src.getOpcode() == ISD::BITCAST &&
      Src.getOperand(0).getValueType() == MVT::x86mmx)
    return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, Src.getOperand(0));

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue Pack = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return Pack;

  return combineVectorTruncation(N, DAG, Subtarget);
}

35026

35027

/// Returns the negated value if the node \p N flips sign of FP value.

35028

///

35029

/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).

35030

/// AVX512F does not have FXOR, so FNEG is lowered as

35031

/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).

35032

/// In this case we go though all bitcasts.

35033

static SDValue isFNEG(SDNode *N) {

35034

if (N->getOpcode() == ISD::FNEG)

35035

return N->getOperand(0);

35036

35037

SDValue Op = peekThroughBitcasts(SDValue(N, 0));

35038

if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)

35039

return SDValue();

35040

35041

SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));

35042

if (!Op1.getValueType().isFloatingPoint())

35043

return SDValue();

35044

35045

SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

35046

35047

unsigned EltBits = Op1.getScalarValueSizeInBits();

35048

auto isSignMask = [&](const ConstantFP *C) {

35049

return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);

35050

};

35051

35052

// There is more than one way to represent the same constant on

35053

// the different X86 targets. The type of the node may also depend on size.

35054

// - load scalar value and broadcast

35055

// - BUILD_VECTOR node

35056

// - load from a constant pool.

35057

// We check all variants here.

35058

if (Op1.getOpcode() == X86ISD::VBROADCAST) {

35059

if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))

35060

if (isSignMask(cast<ConstantFP>(C)))

35061

return Op0;

35062

35063

} else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {

35064

if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())

35065

if (isSignMask(CN->getConstantFPValue()))

35066

return Op0;

35067

35068

} else if (auto *C = getTargetConstantFromNode(Op1)) {

35069

if (C->getType()->isVectorTy()) {

35070

if (auto *SplatV = C->getSplatValue())

35071

if (isSignMask(cast<ConstantFP>(SplatV)))

35072

return Op0;

35073

} else if (auto *FPConst = dyn_cast<ConstantFP>(C))

35074

if (isSignMask(FPConst))

35075

return Op0;

35076

}

35077

return SDValue();

35078

}

35079

35080

/// Do target-specific dag combines on floating point negations.

35081

static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,

35082

const X86Subtarget &Subtarget) {

35083

EVT OrigVT = N->getValueType(0);

35084

SDValue Arg = isFNEG(N);

35085

assert(Arg.getNode() && "N is expected to be an FNEG node")(static_cast <bool> (Arg.getNode() && "N is expected to be an FNEG node"
) ? void (0) : __assert_fail ("Arg.getNode() && \"N is expected to be an FNEG node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35085, __extension__ __PRETTY_FUNCTION__));

35086

35087

EVT VT = Arg.getValueType();

35088

EVT SVT = VT.getScalarType();

35089

SDLoc DL(N);

35090

35091

// Let legalize expand this if it isn't a legal type yet.

35092

if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))

35093

return SDValue();

35094

35095

// If we're negating a FMUL node on a target with FMA, then we can avoid the

35096

// use of a constant by performing (-0 - A*B) instead.

35097

// FIXME: Check rounding control flags as well once it becomes available.

35098

if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&

35099

Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {

35100

SDValue Zero = DAG.getConstantFP(0.0, DL, VT);

35101

SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),

35102

Arg.getOperand(1), Zero);

35103

return DAG.getBitcast(OrigVT, NewNode);

35104

}

35105

35106

// If we're negating an FMA node, then we can adjust the

35107

// instruction to include the extra negation.

35108

unsigned NewOpcode = 0;

35109

if (Arg.hasOneUse()) {

35110

switch (Arg.getOpcode()) {

35111

case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;

35112

case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;

35113

case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;

35114

case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;

35115

case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;

35116

case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;

35117

case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;

35118

case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;

35119

// We can't handle scalar intrinsic node here because it would only

35120

// invert one element and not the whole vector. But we could try to handle

35121

// a negation of the lower element only.

35122

}

35123

}

35124

if (NewOpcode)

35125

return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,

35126

Arg.getNode()->ops()));

35127

35128

return SDValue();

35129

}

35130

35131

/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the matching integer
/// logic op on an i64-element vector of the same size, bitcast back to the
/// original type. Returns an empty SDValue for scalars or without SSE2.
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);

  // If we have integer vector types available, use the integer opcodes.
  if (!VT.isVector() || !Subtarget.hasSSE2())
    return SDValue();

  SDLoc dl(N);
  MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

  SDValue LHS = DAG.getBitcast(IntVT, N->getOperand(0));
  SDValue RHS = DAG.getBitcast(IntVT, N->getOperand(1));

  unsigned IntOpcode;
  switch (N->getOpcode()) {
  case X86ISD::FOR:
    IntOpcode = ISD::OR;
    break;
  case X86ISD::FXOR:
    IntOpcode = ISD::XOR;
    break;
  case X86ISD::FAND:
    IntOpcode = ISD::AND;
    break;
  case X86ISD::FANDN:
    IntOpcode = X86ISD::ANDNP;
    break;
  default:
    llvm_unreachable("Unexpected FP logic op");
  }

  return DAG.getBitcast(VT, DAG.getNode(IntOpcode, dl, IntVT, LHS, RHS));
}

35155

35156

35157

/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)

35158

static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {

35159

if (N->getOpcode() != ISD::XOR)

35160

return SDValue();

35161

35162

SDValue LHS = N->getOperand(0);

35163

auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));

35164

if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)

35165

return SDValue();

35166

35167

X86::CondCode NewCC = X86::GetOppositeBranchCondition(

35168

X86::CondCode(LHS->getConstantOperandVal(0)));

35169

SDLoc DL(N);

35170

return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);

35171

}

35172

35173

/// Do target-specific dag combines on XOR nodes. The folds are attempted in
/// the order written; several of them only run once operations have been
/// legalized.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // If this is SSE1 only convert to FXOR to avoid scalarization.
  const bool SSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (SSE1Only && N->getValueType(0) == MVT::v4i32) {
    SDValue FPXor =
        DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
                    DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                    DAG.getBitcast(MVT::v4f32, N->getOperand(1)));
    return DAG.getBitcast(MVT::v4i32, FPXor);
  }

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  // Everything below is deferred until after operation legalization.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Inverted = foldXor1SetCC(N, DAG))
    return Inverted;

  if (SDValue ShiftCmp = foldXorTruncShiftIntoCmp(N, DAG))
    return ShiftCmp;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (!isFNEG(N))
    return SDValue();
  return combineFneg(N, DAG, Subtarget);
}

35204

35205

35206

/// Return true if \p V satisfies isNullFPConstant or is a build vector
/// accepted by ISD::isBuildVectorAllZeros.
static bool isNullFPScalarOrVectorConst(SDValue V) {
  if (isNullFPConstant(V))
    return true;
  return ISD::isBuildVectorAllZeros(V.getNode());
}

35209

35210

/// If a value is a scalar FP zero or a vector FP zero (potentially including

35211

/// undefined elements), return a zero constant that may be used to fold away

35212

/// that value. In the case of a vector, the returned constant will not contain

35213

/// undefined elements even if the input parameter does. This makes it suitable

35214

/// to be used as a replacement operand with operations (eg, bitwise-and) where

35215

/// an undef should not propagate.

35216

static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,

35217

const X86Subtarget &Subtarget) {

35218

if (!isNullFPScalarOrVectorConst(V))

35219

return SDValue();

35220

35221

if (V.getValueType().isVector())

35222

return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

35223

35224

return V;

35225

}

35226

35227

/// Fold an FAND whose operand is an FXOR with all-ones into X86ISD::FANDN.
/// Only f32 (SSE1), f64 (SSE2) and v4f32 on SSE1-without-SSE2 are handled.
/// Returns an empty SDValue otherwise.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  const bool ScalarF32 = VT == MVT::f32 && Subtarget.hasSSE1();
  const bool ScalarF64 = VT == MVT::f64 && Subtarget.hasSSE2();
  const bool SSE1OnlyV4F32 =
      VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (!ScalarF32 && !ScalarF64 && !SSE1OnlyV4F32)
    return SDValue();

  // True for an all-ones build vector or an all-ones scalar FP constant.
  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    if (auto *CFP = dyn_cast<ConstantFPSDNode>(V))
      return CFP->getConstantFPValue()->isAllOnesValue();
    return false;
  };

  // True when V is (fxor X, -1).
  auto isFNot = [&](SDValue V) {
    return V.getOpcode() == X86ISD::FXOR &&
           isAllOnesConstantFP(V.getOperand(1));
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (isFNot(LHS))
    return DAG.getNode(X86ISD::FANDN, DL, VT, LHS.getOperand(0), RHS);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (isFNot(RHS))
    return DAG.getNode(X86ISD::FANDN, DL, VT, RHS.getOperand(0), LHS);

  return SDValue();
}

35257

35258

/// Do target-specific dag combines on X86ISD::FAND nodes.

35259

static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,

35260

const X86Subtarget &Subtarget) {

35261

// FAND(0.0, x) -> 0.0

35262

if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))

35263

return V;

35264

35265

// FAND(x, 0.0) -> 0.0

35266

if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))

35267

return V;

35268

35269

if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))

35270

return V;

35271

35272

return lowerX86FPLogicOp(N, DAG, Subtarget);

35273

}

35274

35275

/// Do target-specific dag combines on X86ISD::FANDN nodes.

35276

static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,

35277

const X86Subtarget &Subtarget) {

35278

// FANDN(0.0, x) -> x

35279

if (isNullFPScalarOrVectorConst(N->getOperand(0)))

35280

return N->getOperand(1);

35281

35282

// FANDN(x, 0.0) -> 0.0

35283

if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))

35284

return V;

35285

35286

return lowerX86FPLogicOp(N, DAG, Subtarget);

35287

}

35288

35289

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.

35290

static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,

35291

const X86Subtarget &Subtarget) {

35292

assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR)(static_cast <bool> (N->getOpcode() == X86ISD::FOR ||
N->getOpcode() == X86ISD::FXOR) ? void (0) : __assert_fail
("N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35292, __extension__ __PRETTY_FUNCTION__));

35293

35294

// F[X]OR(0.0, x) -> x

35295

if (isNullFPScalarOrVectorConst(N->getOperand(0)))

35296

return N->getOperand(1);

35297

35298

// F[X]OR(x, 0.0) -> x

35299

if (isNullFPScalarOrVectorConst(N->getOperand(1)))

35300

return N->getOperand(0);

35301

35302

if (isFNEG(N))

35303

if (SDValue NewVal = combineFneg(N, DAG, Subtarget))

35304

return NewVal;

35305

35306

return lowerX86FPLogicOp(N, DAG, Subtarget);

35307

}

35308

35309

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.

35310

static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {

35311

assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX)(static_cast <bool> (N->getOpcode() == X86ISD::FMIN ||
N->getOpcode() == X86ISD::FMAX) ? void (0) : __assert_fail
("N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35311, __extension__ __PRETTY_FUNCTION__));

35312

35313

// Only perform optimizations if UnsafeMath is used.

35314

if (!DAG.getTarget().Options.UnsafeFPMath)

35315

return SDValue();

35316

35317

// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes

35318

// into FMINC and FMAXC, which are Commutative operations.

35319

unsigned NewOp = 0;

35320

switch (N->getOpcode()) {

35321

default: llvm_unreachable("unknown opcode")::llvm::llvm_unreachable_internal("unknown opcode", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35321);

35322

case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;

35323

case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;

35324

}

35325

35326

return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),

35327

N->getOperand(0), N->getOperand(1));

35328

}

35329

35330

/// Lower ISD::FMAXNUM / ISD::FMINNUM to an X86ISD::FMAX / FMIN followed by a
/// select that filters out a NaN first operand. Returns an empty SDValue for
/// soft-float, unsupported types, or scalar code optimized for minimum size.
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  const bool SSE1Type =
      Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32);
  const bool SSE2Type =
      Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64);
  const bool AVXType =
      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
  if (!SSE1Type && !SSE2Type && !AVXType)
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  unsigned MinMaxOp = X86ISD::FMIN;
  if (N->getOpcode() == ISD::FMAXNUM)
    MinMaxOp = X86ISD::FMAX;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

35384

35385

/// Do target-specific dag combines on X86ISD::ANDNP nodes.

35386

static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,

35387

TargetLowering::DAGCombinerInfo &DCI,

35388

const X86Subtarget &Subtarget) {

35389

// ANDNP(0, x) -> x

35390

if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))

35391

return N->getOperand(1);

35392

35393

// ANDNP(x, 0) -> 0

35394

if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))

35395

return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

35396

35397

EVT VT = N->getValueType(0);

35398

35399

// Attempt to recursively combine a bitmask ANDNP with shuffles.

35400

if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {

35401

SDValue Op(N, 0);

35402

if (SDValue Res = combineX86ShufflesRecursively(

35403

{Op}, 0, Op, {0}, {}, /*Depth*/ 1,

35404

/*HasVarMask*/ false, DAG, DCI, Subtarget)) {

35405

DCI.CombineTo(N, Res);

35406

return SDValue();

35407

}

35408

}

35409

35410

return SDValue();

35411

}

35412

35413

/// Simplify the bit-index operand of an X86ISD::BT node using only the low
/// Log2(BitWidth) bits it actually needs. Returns an empty SDValue when the
/// index cannot be simplified. \p DCI is not used.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  SDValue BitIdx = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  const unsigned BitWidth = BitIdx.getValueSizeInBits();
  const APInt DemandedMask =
      APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));

  SDValue NarrowedIdx = DAG.GetDemandedBits(BitIdx, DemandedMask);
  if (!NarrowedIdx)
    return SDValue();

  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, NarrowedIdx);
}

35426

35427

/// Do target-specific dag combines on vector SIGN_EXTEND_INREG nodes: move a
/// v4i64 sext_in_reg of an extended v4i32 value onto the v4i32 source.
/// Returns an empty SDValue when the pattern does not match.
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
  // both SSE and AVX2 since there is no sign-extended shift right
  // operation on a vector with 64-bit elements.
  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
  if (VT != MVT::v4i64)
    return SDValue();

  const unsigned ExtOpc = N0.getOpcode();
  if (ExtOpc != ISD::ANY_EXTEND && ExtOpc != ISD::SIGN_EXTEND)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // EXTLOAD has a better solution on AVX2,
  // it may be replaced with X86ISD::VSEXT node.
  if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
      !ISD::isNormalLoad(N00.getNode()))
    return SDValue();

  if (N00.getValueType() != MVT::v4i32 || ExtraVT.getSizeInBits() >= 128)
    return SDValue();

  SDValue NarrowSExt =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, NarrowSExt);
}

35461

35462

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)

35463

/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)

35464

/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes

35465

/// opportunities to combine math ops, use an LEA, or use a complex addressing

35466

/// mode. This can eliminate extend, add, and shift instructions.

35467

static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,

35468

const X86Subtarget &Subtarget) {

35469

if (Ext->getOpcode() != ISD::SIGN_EXTEND &&

35470

Ext->getOpcode() != ISD::ZERO_EXTEND)

35471

return SDValue();

35472

35473

// TODO: This should be valid for other integer types.

35474

EVT VT = Ext->getValueType(0);

35475

if (VT != MVT::i64)

35476

return SDValue();

35477

35478

SDValue Add = Ext->getOperand(0);

35479

if (Add.getOpcode() != ISD::ADD)

35480

return SDValue();

35481

35482

bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;

35483

bool NSW = Add->getFlags().hasNoSignedWrap();

35484

bool NUW = Add->getFlags().hasNoUnsignedWrap();

35485

35486

// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding

35487

// into the 'zext'

35488

if ((Sext && !NSW) || (!Sext && !NUW))

35489

return SDValue();

35490

35491

// Having a constant operand to the 'add' ensures that we are not increasing

35492

// the instruction count because the constant is extended for free below.

35493

// A constant operand can also become the displacement field of an LEA.

35494

auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));

35495

if (!AddOp1)

35496

return SDValue();

35497

35498

// Don't make the 'add' bigger if there's no hope of combining it with some

35499

// other 'add' or 'shl' instruction.

35500

// TODO: It may be profitable to generate simpler LEA instructions in place

35501

// of single 'add' instructions, but the cost model for selecting an LEA

35502

// currently has a high threshold.

35503

bool HasLEAPotential = false;

35504

for (auto *User : Ext->uses()) {

35505

if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {

35506

HasLEAPotential = true;

35507

break;

35508

}

35509

}

35510

if (!HasLEAPotential)

35511

return SDValue();

35512

35513

// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.

35514

int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();

35515

SDValue AddOp0 = Add.getOperand(0);

35516

SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);

35517

SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

35518

35519

// The wider add is guaranteed to not wrap because both operands are

35520

// sign-extended.

35521

SDNodeFlags Flags;

35522

Flags.setNoSignedWrap(NSW);

35523

Flags.setNoUnsignedWrap(NUW);

35524

return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);

35525

}

35526

35527

/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->

35528

/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)

35529

/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly

35530

/// extends from AH (which we otherwise need to do contortions to access).

35531

static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {

35532

SDValue N0 = N->getOperand(0);

35533

auto OpcodeN = N->getOpcode();

35534

auto OpcodeN0 = N0.getOpcode();

35535

if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||

35536

(OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))

35537

return SDValue();

35538

35539

EVT VT = N->getValueType(0);

35540

EVT InVT = N0.getValueType();

35541

if (N0.getResNo() != 1 || InVT != MVT::i8 ||

35542

!(VT == MVT::i32 || VT == MVT::i64))

35543

return SDValue();

35544

35545

SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);

35546

auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG

35547

: X86ISD::UDIVREM8_ZEXT_HREG;

35548

SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),

35549

N0.getOperand(1));

35550

DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));

35551

// If this was a 64-bit extend, complete it.

35552

if (VT == MVT::i64)

35553

return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));

35554

return R.getValue(1);

35555

}

35556

35557

// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant

35558

// operands and the result of CMOV is not used anywhere else - promote CMOV

35559

// itself instead of promoting its result. This could be beneficial, because:

35560

// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two

35561

// (or more) pseudo-CMOVs only when they go one-after-another and

35562

// getting rid of result extension code after CMOV will help that.

35563

// 2) Promotion of constant CMOV arguments is free, hence the

35564

// {ANY,SIGN,ZERO}_EXTEND will just be deleted.

35565

// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this

35566

// promotion is also good in terms of code-size.

35567

// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit

35568

// promotion).

35569

static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {

35570

SDValue CMovN = Extend->getOperand(0);

35571

if (CMovN.getOpcode() != X86ISD::CMOV)

35572

return SDValue();

35573

35574

EVT TargetVT = Extend->getValueType(0);

35575

unsigned ExtendOpcode = Extend->getOpcode();

35576

SDLoc DL(Extend);

35577

35578

EVT VT = CMovN.getValueType();

35579

SDValue CMovOp0 = CMovN.getOperand(0);

35580

SDValue CMovOp1 = CMovN.getOperand(1);

35581

35582

bool DoPromoteCMOV =

35583

(VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&

35584

CMovN.hasOneUse() &&

35585

(isa<ConstantSDNode>(CMovOp0.getNode()) &&

35586

isa<ConstantSDNode>(CMovOp1.getNode()));

35587

35588

if (!DoPromoteCMOV)

35589

return SDValue();

35590

35591

CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);

35592

CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

35593

35594

return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,

35595

CMovN.getOperand(2), CMovN.getOperand(3));

35596

}

35597

35598

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).

35599

// This is more or less the reverse of combineBitcastvxi1.

35600

static SDValue

35601

combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,

35602

TargetLowering::DAGCombinerInfo &DCI,

35603

const X86Subtarget &Subtarget) {

35604

unsigned Opcode = N->getOpcode();

35605

if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&

35606

Opcode != ISD::ANY_EXTEND)

35607

return SDValue();

35608

if (!DCI.isBeforeLegalizeOps())

35609

return SDValue();

35610

if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())

35611

return SDValue();

35612

35613

SDValue N0 = N->getOperand(0);

35614

EVT VT = N->getValueType(0);

35615

EVT SVT = VT.getScalarType();

35616

EVT InSVT = N0.getValueType().getScalarType();

35617

unsigned EltSizeInBits = SVT.getSizeInBits();

35618

35619

// Input type must be extending a bool vector (bit-casted from a scalar

35620

// integer) to legal integer types.

35621

if (!VT.isVector())

35622

return SDValue();

35623

if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)

35624

return SDValue();

35625

if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)

35626

return SDValue();

35627

35628

SDValue N00 = N0.getOperand(0);

35629

EVT SclVT = N0.getOperand(0).getValueType();

35630

if (!SclVT.isScalarInteger())

35631

return SDValue();

35632

35633

SDLoc DL(N);

35634

SDValue Vec;

35635

SmallVector<int, 32> ShuffleMask;

35636

unsigned NumElts = VT.getVectorNumElements();

35637

assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size")(static_cast <bool> (NumElts == SclVT.getSizeInBits() &&
"Unexpected bool vector size") ? void (0) : __assert_fail ("NumElts == SclVT.getSizeInBits() && \"Unexpected bool vector size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35637, __extension__ __PRETTY_FUNCTION__));

35638

35639

// Broadcast the scalar integer to the vector elements.

35640

if (NumElts > EltSizeInBits) {

35641

// If the scalar integer is greater than the vector element size, then we

35642

// must split it down into sub-sections for broadcasting. For example:

35643

// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.

35644

// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.

35645

assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale")(static_cast <bool> ((NumElts % EltSizeInBits) == 0 &&
"Unexpected integer scale") ? void (0) : __assert_fail ("(NumElts % EltSizeInBits) == 0 && \"Unexpected integer scale\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35645, __extension__ __PRETTY_FUNCTION__));

35646

unsigned Scale = NumElts / EltSizeInBits;

35647

EVT BroadcastVT =

35648

EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);

35649

Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);

35650

Vec = DAG.getBitcast(VT, Vec);

35651

35652

for (unsigned i = 0; i != Scale; ++i)

35653

ShuffleMask.append(EltSizeInBits, i);

35654

} else {

35655

// For smaller scalar integers, we can simply any-extend it to the vector

35656

// element size (we don't care about the upper bits) and broadcast it to all

35657

// elements.

35658

SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);

35659

Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);

35660

ShuffleMask.append(NumElts, 0);

35661

}

35662

Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

35663

35664

// Now, mask the relevant bit in each element.

35665

SmallVector<SDValue, 32> Bits;

35666

for (unsigned i = 0; i != NumElts; ++i) {

35667

int BitIdx = (i % EltSizeInBits);

35668

APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);

35669

Bits.push_back(DAG.getConstant(Bit, DL, SVT));

35670

}

35671

SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);

35672

Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

35673

35674

// Compare against the bitmask and extend the result.

35675

EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);

35676

Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);

35677

Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

35678

35679

// For SEXT, this is now done, otherwise shift the result down for

35680

// zero-extension.

35681

if (Opcode == ISD::SIGN_EXTEND)

35682

return Vec;

35683

return DAG.getNode(ISD::SRL, DL, VT, Vec,

35684

DAG.getConstant(EltSizeInBits - 1, DL, VT));

35685

}

35686

35687

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or

35688

/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating

35689

/// with UNDEFs) of the input to vectors of the same size as the target type

35690

/// which then extends the lowest elements.

35691

static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,

35692

TargetLowering::DAGCombinerInfo &DCI,

35693

const X86Subtarget &Subtarget) {

35694

unsigned Opcode = N->getOpcode();

35695

if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)

35696

return SDValue();

35697

if (!DCI.isBeforeLegalizeOps())

35698

return SDValue();

35699

if (!Subtarget.hasSSE2())

35700

return SDValue();

35701

35702

SDValue N0 = N->getOperand(0);

35703

EVT VT = N->getValueType(0);

35704

EVT SVT = VT.getScalarType();

35705

EVT InVT = N0.getValueType();

35706

EVT InSVT = InVT.getScalarType();

35707

35708

// Input type must be a vector and we must be extending legal integer types.

35709

if (!VT.isVector())

35710

return SDValue();

35711

if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)

35712

return SDValue();

35713

if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)

35714

return SDValue();

35715

35716

// On AVX2+ targets, if the input/output types are both legal then we will be

35717

// able to use SIGN_EXTEND/ZERO_EXTEND directly.

35718

if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&

35719

DAG.getTargetLoweringInfo().isTypeLegal(InVT))

35720

return SDValue();

35721

35722

SDLoc DL(N);

35723

35724

auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {

35725

EVT InVT = N.getValueType();

35726

EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),

35727

Size / InVT.getScalarSizeInBits());

35728

SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),

35729

DAG.getUNDEF(InVT));

35730

Opnds[0] = N;

35731

return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);

35732

};

35733

35734

// If target-size is less than 128-bits, extend to a type that would extend

35735

// to 128 bits, extend that and extract the original target vector.

35736

if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {

35737

unsigned Scale = 128 / VT.getSizeInBits();

35738

EVT ExVT =

35739

EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());

35740

SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());

35741

SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);

35742

return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,

35743

DAG.getIntPtrConstant(0, DL));

35744

}

35745

35746

// If target-size is 128-bits (or 256-bits on AVX2 target), then convert to

35747

// ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.

35748

// Also use this if we don't have SSE41 to allow the legalizer do its job.

35749

if (!Subtarget.hasSSE41() || VT.is128BitVector() ||

35750

(VT.is256BitVector() && Subtarget.hasInt256()) ||

35751

(VT.is512BitVector() && Subtarget.hasAVX512())) {

35752

SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());

35753

return Opcode == ISD::SIGN_EXTEND

35754

? DAG.getSignExtendVectorInReg(ExOp, DL, VT)

35755

: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);

35756

}

35757

35758

auto SplitAndExtendInReg = [&](unsigned SplitSize) {

35759

unsigned NumVecs = VT.getSizeInBits() / SplitSize;

35760

unsigned NumSubElts = SplitSize / SVT.getSizeInBits();

35761

EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);

35762

EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

35763

35764

SmallVector<SDValue, 8> Opnds;

35765

for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {

35766

SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,

35767

DAG.getIntPtrConstant(Offset, DL));

35768

SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);

35769

SrcVec = Opcode == ISD::SIGN_EXTEND

35770

? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)

35771

: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);

35772

Opnds.push_back(SrcVec);

35773

}

35774

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);

35775

};

35776

35777

// On pre-AVX2 targets, split into 128-bit nodes of

35778

// ISD::*_EXTEND_VECTOR_INREG.

35779

if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))

35780

return SplitAndExtendInReg(128);

35781

35782

// On pre-AVX512 targets, split into 256-bit nodes of

35783

// ISD::*_EXTEND_VECTOR_INREG.

35784

if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))

35785

return SplitAndExtendInReg(256);

35786

35787

return SDValue();

35788

}

35789

35790

/// Target combine for a sign-extension node. Tries the individual folds in a
/// fixed order and returns the first replacement found, or an empty SDValue
/// when none applies.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  // After operation legalization the only fold left is turning a sext of an
  // i1 into a select between all-ones and zero.
  if (!DCI.isBeforeLegalizeOps()) {
    if (SrcVT != MVT::i1)
      return SDValue();
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
    return DAG.getSelect(DL, VT, Src, AllOnes, Zero);
  }

  const bool IsSingleUseNotOfBool =
      SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
      isAllOnesConstant(Src.getOperand(1)) && Src.hasOneUse();
  if (IsSingleUseNotOfBool) {
    // Invert and sign-extend a boolean is the same as zero-extend and subtract
    // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
    // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector()) {
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;
  }

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

35838

35839

/// Fold FNEG operands of an FMADD-family node into the opcode: negated
/// multiplicands and/or a negated addend select the FMSUB / FNMADD / FNMSUB
/// flavour that belongs to the same family as the incoming node.
///
/// Returns the replacement node, or an empty SDValue when nothing was negated
/// and the opcode would stay the same (which would only recreate \p N).
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  const bool IsF32OrF64 = ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
    return SDValue();

  const unsigned Opc = N->getOpcode();
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  // If V is an FNEG, replace V by the negated value and report true.
  auto StripFNeg = [](SDValue &V) {
    SDValue Inner = isFNEG(V.getNode());
    if (!Inner)
      return false;
    V = Inner;
    return true;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  const bool IsScalar1 =
      Opc == X86ISD::FMADDS1 || Opc == X86ISD::FMADDS1_RND;
  const bool IsScalar3 =
      Opc == X86ISD::FMADDS3 || Opc == X86ISD::FMADDS3_RND;
  bool NegA = !IsScalar1 && StripFNeg(A);
  bool NegB = StripFNeg(B);
  bool NegC = !IsScalar3 && StripFNeg(C);

  // Negative multiplication when NegA xor NegB
  bool NegMul = (NegA != NegB);
  bool HasNeg = NegA || NegB || NegC;

  // Choose among the four sign variants of one opcode family.
  auto PickVariant = [&](unsigned MAdd, unsigned MSub, unsigned NMAdd,
                         unsigned NMSub) -> unsigned {
    if (!NegMul)
      return NegC ? MSub : MAdd;
    return NegC ? NMSub : NMAdd;
  };

  unsigned NewOpcode;
  switch (Opc) {
  case ISD::FMA:
    NewOpcode = PickVariant(ISD::FMA, X86ISD::FMSUB, X86ISD::FNMADD,
                            X86ISD::FNMSUB);
    break;
  case X86ISD::FMADD_RND:
    NewOpcode = PickVariant(X86ISD::FMADD_RND, X86ISD::FMSUB_RND,
                            X86ISD::FNMADD_RND, X86ISD::FNMSUB_RND);
    break;
  case X86ISD::FMADDS1:
    NewOpcode = PickVariant(X86ISD::FMADDS1, X86ISD::FMSUBS1,
                            X86ISD::FNMADDS1, X86ISD::FNMSUBS1);
    break;
  case X86ISD::FMADDS3:
    NewOpcode = PickVariant(X86ISD::FMADDS3, X86ISD::FMSUBS3,
                            X86ISD::FNMADDS3, X86ISD::FNMSUBS3);
    break;
  case X86ISD::FMADDS1_RND:
    NewOpcode = PickVariant(X86ISD::FMADDS1_RND, X86ISD::FMSUBS1_RND,
                            X86ISD::FNMADDS1_RND, X86ISD::FNMSUBS1_RND);
    break;
  case X86ISD::FMADDS3_RND:
    NewOpcode = PickVariant(X86ISD::FMADDS3_RND, X86ISD::FMSUBS3_RND,
                            X86ISD::FNMADDS3_RND, X86ISD::FNMSUBS3_RND);
    break;
  case X86ISD::FMADD4S:
    NewOpcode = PickVariant(X86ISD::FMADD4S, X86ISD::FMSUB4S,
                            X86ISD::FNMADD4S, X86ISD::FNMSUB4S);
    break;
  default:
    llvm_unreachable("Unexpected opcode!");
  }

  // Only return a node if the opcode was changed or one of the operands was
  // negated. If not, we would just recreate the node we started with.
  if (!HasNeg && NewOpcode == Opc)
    return SDValue();

  // Keep the fourth operand of the non-generic nodes that carry one; the
  // generic ISD::FMA path always builds the three-operand form.
  if (Opc != ISD::FMA && N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}

35948

35949

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)

35950

static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,

35951

const X86Subtarget &Subtarget) {

35952

SDLoc dl(N);

35953

EVT VT = N->getValueType(0);

35954

35955

SDValue NegVal = isFNEG(N->getOperand(2).getNode());

35956

if (!NegVal)

35957

return SDValue();

35958

35959

unsigned NewOpcode;

35960

switch (N->getOpcode()) {

35961

default: llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 35961);

35962

case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;

35963

case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;

35964

case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;

35965

case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;

35966

}

35967

35968

if (N->getNumOperands() == 4)

35969

return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),

35970

NegVal, N->getOperand(3));

35971

return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),

35972

NegVal);

35973

}

35974

35975

/// Target combine for a zero-extension node. Tries the individual folds in a
/// fixed order and returns the first replacement found, or an empty SDValue
/// when none applies.
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Rebuild a SETCC_CARRY at the extended type and mask it down to bit 0.
  auto WidenSetCCCarry = [&](SDValue Carry) {
    SDValue WideCarry = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                    Carry.getOperand(0), Carry.getOperand(1));
    return DAG.getNode(ISD::AND, dl, VT, WideCarry,
                       DAG.getConstant(1, dl, VT));
  };

  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  // (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  // A single-use TRUNCATE of a single-use SETCC_CARRY is rewritten the same
  // way.
  const unsigned N0Opcode = N0.getOpcode();
  const bool IsAndOrTrunc =
      N0Opcode == ISD::AND || N0Opcode == ISD::TRUNCATE;
  if (IsAndOrTrunc && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      // For the AND form the mask has to be exactly 1; otherwise give up on
      // this node altogether.
      if (N0Opcode == ISD::AND && !isOneConstant(N0.getOperand(1)))
        return SDValue();
      return WidenSetCCCarry(N00);
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector()) {
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;
  }

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

36036

36037

/// Try to map a 128-bit or larger integer comparison to vector instructions

36038

/// before type legalization splits it up into chunks.

36039

static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,

36040

const X86Subtarget &Subtarget) {

36041

ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();

36042

assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate")(static_cast <bool> ((CC == ISD::SETNE || CC == ISD::SETEQ
) && "Bad comparison predicate") ? void (0) : __assert_fail
("(CC == ISD::SETNE || CC == ISD::SETEQ) && \"Bad comparison predicate\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 36042, __extension__ __PRETTY_FUNCTION__));

36043

36044

// We're looking for an oversized integer equality comparison, but ignore a

36045

// comparison with zero because that gets special treatment in EmitTest().

36046

SDValue X = SetCC->getOperand(0);

36047

SDValue Y = SetCC->getOperand(1);

36048

EVT OpVT = X.getValueType();

36049

unsigned OpSize = OpVT.getSizeInBits();

36050

if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))

36051

return SDValue();

36052

36053

// Bail out if we know that this is not really just an oversized integer.

36054

if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||

36055

peekThroughBitcasts(Y).getValueType() == MVT::f128)

36056

return SDValue();

36057

36058

// TODO: Use PXOR + PTEST for SSE4.1 or later?

36059

// TODO: Add support for AVX-512.

36060

EVT VT = SetCC->getValueType(0);

36061

SDLoc DL(SetCC);

36062

if ((OpSize == 128 && Subtarget.hasSSE2()) ||

36063

(OpSize == 256 && Subtarget.hasAVX2())) {

36064

EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;

36065

SDValue VecX = DAG.getBitcast(VecVT, X);

36066

SDValue VecY = DAG.getBitcast(VecVT, Y);

36067

36068

// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.

36069

// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq

36070

// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne

36071

// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq

36072

// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne

36073

SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);

36074

SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);

36075

SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,

36076

MVT::i32);

36077

return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);

36078

}

36079

36080

return SDValue();

36081

}

36082

36083

/// Target combine for ISD::SETCC. Handles comparisons against a negated
/// value, oversized integer equality, comparisons of a sign-extended i1 vector
/// against zero, and early lowering of v4f32 compares on SSE1-only targets.
/// Returns an empty SDValue when no fold applies.
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  const bool IsEqualityCC = CC == ISD::SETNE || CC == ISD::SETEQ;

  if (IsEqualityCC) {
    EVT OpVT = LHS.getValueType();

    // If Neg is a single-use (0 - v), compare (Other + v) against zero
    // instead:
    //   0-x == y --> x+y == 0        x == 0-y --> x+y == 0
    //   0-x != y --> x+y != 0        x != 0-y --> x+y != 0
    auto FoldNegatedSide = [&](SDValue Neg, SDValue Other) -> SDValue {
      if (Neg.getOpcode() != ISD::SUB || !isNullConstant(Neg.getOperand(0)) ||
          !Neg.hasOneUse())
        return SDValue();
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, Other, Neg.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    };

    if (SDValue V = FoldNegatedSide(LHS, RHS))
      return V;
    if (SDValue V = FoldNegatedSide(RHS, LHS))
      return V;

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (IsEqualityCC || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    const bool LHSIsSExtOfI1 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    const bool RHSIsZeroVector = ISD::isBuildVectorAllZeros(RHS.getNode());

    // Comparing a sign-extended i1 vector against zero folds to a constant,
    // to the mask itself, or to its inverse.
    if (LHSIsSExtOfI1 && RHSIsZeroVector) {
      SDValue Mask = LHS.getOperand(0);
      assert(VT == Mask.getValueType() && "Uexpected operand type");

      switch (CC) {
      case ISD::SETGT:
        return DAG.getConstant(0, DL, VT);
      case ISD::SETLE:
        return DAG.getConstant(1, DL, VT);
      case ISD::SETEQ:
      case ISD::SETGE:
        return DAG.getNOT(DL, Mask, VT);
      default:
        assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
               "Unexpected condition code!");
        return Mask;
      }
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (IsSSE1Only && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

36149

36150

/// Target combine for X86ISD::MOVMSK: simplify the source operand given that
/// only the sign bit of each element is demanded. Returns \p N itself when the
/// source was simplified, otherwise an empty SDValue.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // MOVMSK only uses the MSB from each vector element.
  KnownBits Known;
  APInt SignBitOnly(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
  if (!TLI.SimplifyDemandedBits(Src, SignBitOnly, Known, TLO))
    return SDValue();

  // Queue the source for another visit and commit the simplification.
  DCI.AddToWorklist(Src.getNode());
  DCI.CommitTargetLoweringOpt(TLO);
  return SDValue(N, 0);
}

36170

36171

/// Target DAG combine for a gather/scatter node.
///
/// Operand 2 of N is read as the mask and operand 4 as the index. The
/// following rewrites are tried in order:
///  1. Before type legalization, index elements wider than 64 bits are
///     truncated to i64.
///  2. Before operation legalization, an index that is a sign extension from
///     i32 to i64 elements is replaced by the narrow source.
///  3. With AVX512, a SIGN_EXTEND_INREG feeding the mask is stripped.
///  4. Without AVX512, the mask is simplified given that only its sign bit is
///     demanded.
///
/// \returns SDValue(N, 0) when N (or its mask operand) was updated in place,
/// otherwise an empty SDValue.
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // Pre-shrink oversized index elements to avoid triggering scalarization.
  if (DCI.isBeforeLegalize()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() > 64) {
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
                                     Index.getValueType().getVectorNumElements());
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Trunc;
      DAG.UpdateNodeOperands(N, NewOps);
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Try to remove sign extends from i32 to i64 on the index.
  // Only do this before legalize in case we are relying on it for
  // legalization.
  // TODO: We should maybe remove any sign extend once we learn how to sign
  // extend narrow index during lowering.
  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() == 64 &&
        Index.getOpcode() == ISD::SIGN_EXTEND &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index.getOperand(0);
      DAG.UpdateNodeOperands(N, NewOps);
      // The original sign extend has fewer users now; add it back to the
      // worklist in case it needs to be removed.
      DCI.AddToWorklist(Index.getNode());
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
    // N was rewritten in place: report that to the combiner, as the index
    // rewrites above do, instead of falling through to "no change". The old
    // mask node lost a user, so queue it in case it is now removable.
    DCI.AddToWorklist(Mask.getNode());
    return SDValue(N, 0);
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

36238

36239

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT

36240

static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,

36241

const X86Subtarget &Subtarget) {

36242

SDLoc DL(N);

36243

X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));

36244

SDValue EFLAGS = N->getOperand(1);

36245

36246

// Try to simplify the EFLAGS and condition code operands.

36247

if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))

36248

return getSETCC(CC, Flags, DL, DAG);

36249

36250

return SDValue();

36251

}

36252

36253

/// Optimize branch condition evaluation.

36254

static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,

36255

const X86Subtarget &Subtarget) {

36256

SDLoc DL(N);

36257

SDValue EFLAGS = N->getOperand(3);

36258

X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

36259

36260

// Try to simplify the EFLAGS and condition code operands.

36261

// Make sure to not keep references to operands, as combineSetCCEFLAGS can

36262

// RAUW them under us.

36263

if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {

36264

SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);

36265

return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),

36266

N->getOperand(1), Cond, Flags);

36267

}

36268

36269

return SDValue();

36270

}

36271

36272

/// Fold a unary operation into the constant operand of a masked vector
/// compare.
///
/// Vector comparisons produce 0 or -1 in each lane, so:
///   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
///     AND(VECTOR_CMP(x,y), constant2), with constant2 = UNARYOP(constant)
///
/// \returns the rewritten value (bitcast back to N's type), or an empty
/// SDValue if the pattern does not match.
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Bail out unless this is a vector operation whose operand is a bitwise AND
  // of a SETCC, with the AND being as wide as the result.
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue AndOp = N->getOperand(0);
  if (AndOp->getOpcode() != ISD::AND)
    return SDValue();
  if (AndOp->getOperand(0)->getOpcode() != ISD::SETCC)
    return SDValue();
  if (VT.getSizeInBits() != AndOp->getValueType(0).getSizeInBits())
    return SDValue();

  // The other AND operand must be a constant build vector. Non-constant
  // splats could be handled too, but that would not eliminate any operation;
  // it would only add a scalar step before moving to the vector unit.
  auto *ConstVec = dyn_cast<BuildVectorSDNode>(AndOp->getOperand(1));
  if (!ConstVec || !ConstVec->isConstant())
    return SDValue();

  // Everything checks out. Apply the unary op to the constant, then redo the
  // AND in the integer type with bitcasts to/from it around the AND.
  SDLoc DL(N);
  EVT IntVT = ConstVec->getValueType(0);
  SDValue ConvertedConst =
      DAG.getNode(N->getOpcode(), DL, VT, SDValue(ConstVec, 0));
  SDValue NewMask = DAG.getBitcast(IntVT, ConvertedConst);
  SDValue Masked =
      DAG.getNode(ISD::AND, DL, IntVT, AndOp->getOperand(0), NewMask);
  return DAG.getBitcast(VT, Masked);
}

36317

36318

/// Rewrite UINT_TO_FP as SINT_TO_FP where the two are equivalent.
///
///   UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
///   UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
///   UINT_TO_FP(x)     -> SINT_TO_FP(x) when the sign bit of x is known zero
///
/// \returns the SINT_TO_FP node, or an empty SDValue if neither case applies.
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcEltVT = SrcVT.getScalarType();
  SDLoc DL(N);

  const bool IsNarrowIntVector =
      SrcVT.isVector() && (SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16);
  if (IsNarrowIntVector) {
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Src);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Widened);
  }

  // Since UINT_TO_FP is legal (it's marked custom), the generic dag combiner
  // won't turn it into SINT_TO_FP when the sign bit is known zero, so that
  // optimization is performed here.
  if (!DAG.SignBitIsZero(Src))
    return SDValue();

  return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Src);
}

36345

36346

/// Target DAG combine for SINT_TO_FP.
///
/// Tries, in order:
///  1. folding the conversion into a masked vector-compare constant
///     (combineVectorCompareAndMaskUnaryOp);
///  2. sign-extending vXi8 / vXi16 (and vXi1 when that type is not legal)
///     inputs to vXi32 first;
///  3. without AVX512DQ, truncating inputs wider than 32 bits to i32 when the
///     upper bits are known to be copies of the sign bit;
///  4. on 32-bit targets, turning a conversion of a loaded i64 into an x87
///     FILD via BuildFILD.
///
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Folded;

  // Now move on to more general possibilities.
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcEltVT = SrcVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  const bool NeedsWidening =
      SrcVT.isVector() &&
      (SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16 ||
       (SrcEltVT == MVT::i1 &&
        !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)));
  if (NeedsWidening) {
    SDLoc DL(N);
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, Src);
    return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Widened);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (SrcVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    const unsigned EltBits = SrcVT.getScalarSizeInBits();
    const unsigned SignBits = DAG.ComputeNumSignBits(Src);
    if (SignBits >= (EltBits - 31)) {
      EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (SrcVT.isVector())
        NarrowVT = EVT::getVectorVT(*DAG.getContext(), NarrowVT,
                                    SrcVT.getVectorNumElements());
      SDLoc DL(N);
      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Src);
      return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Narrowed);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (Subtarget.useSoftFloat() || Src.getOpcode() != ISD::LOAD)
    return SDValue();

  LoadSDNode *Load = cast<LoadSDNode>(Src.getNode());
  EVT LoadVT = Load->getValueType(0);

  // This transformation is not supported if the result type is f16 or f128.
  if (VT == MVT::f16 || VT == MVT::f128)
    return SDValue();

  const bool CanUseFILD = !Load->isVolatile() && !VT.isVector() &&
                          ISD::isNON_EXTLoad(Src.getNode()) &&
                          Src.hasOneUse() && !Subtarget.is64Bit() &&
                          LoadVT == MVT::i64;
  if (!CanUseFILD)
    return SDValue();

  SDValue FILD = Subtarget.getTargetLowering()->BuildFILD(
      SDValue(N, 0), LoadVT, Load->getChain(), Src, DAG);
  // Redirect users of the load's result 1 to result 1 of the FILD.
  DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILD.getValue(1));
  return FILD;
}

36410

36411

/// Rebuild an X86ISD::SBB with a simplified flags input.
///
/// If combineCarryThroughADD yields a replacement for operand 2 of N, a new
/// SBB is created from operands 0 and 1 of N and that replacement.
///
/// \returns the new SBB, or an empty SDValue if operand 2 was not simplified.
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  SDValue NewFlags = combineCarryThroughADD(N->getOperand(2));
  if (!NewFlags)
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  // Result list: the value plus an i32 second result.
  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
  return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultVTs, N->getOperand(0),
                     N->getOperand(1), NewFlags);
}

36422

36423

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS

36424

static SDValue combineADC(SDNode *N, SelectionDAG &DAG,

36425

TargetLowering::DAGCombinerInfo &DCI) {

36426

// If the LHS and RHS of the ADC node are zero, then it can't overflow and

36427

// the result is either zero or one (depending on the input carry bit).

36428

// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.

36429

if (X86::isZeroNode(N->getOperand(0)) &&

36430

X86::isZeroNode(N->getOperand(1)) &&

36431

// We don't have a good way to replace an EFLAGS use, so only do this when

36432

// dead right now.

36433

SDValue(N, 1).use_empty()) {

36434

SDLoc DL(N);

36435

EVT VT = N->getValueType(0);

36436

SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));

36437

SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,

36438

DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

36439

DAG.getConstant(X86::COND_B, DL,

36440

MVT::i8),

36441

N->getOperand(2)),

36442

DAG.getConstant(1, DL, VT));

36443

return DCI.CombineTo(N, Res1, CarryOut);

36444

}

36445

36446

if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {

36447

MVT VT = N->getSimpleValueType(0);

36448

SDVTList VTs = DAG.getVTList(VT, MVT::i32);

36449

return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,

36450

N->getOperand(0), N->getOperand(1),

36451

Flags);

36452

}

36453

36454

return SDValue();

36455

}

36456

36457

/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit

36458

/// which is more useful than 0/1 in some cases.

36459

static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {

36460

SDLoc DL(N);

36461

// "Condition code B" is also known as "the carry flag" (CF).

36462

SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);

36463

SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);

36464

MVT VT = N->getSimpleValueType(0);

36465

if (VT == MVT::i8)

36466

return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

36467

36468

assert(VT == MVT::i1 && "Unexpected type for SETCC node")(static_cast <bool> (VT == MVT::i1 && "Unexpected type for SETCC node"
) ? void (0) : __assert_fail ("VT == MVT::i1 && \"Unexpected type for SETCC node\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 36468, __extension__ __PRETTY_FUNCTION__));

36469

return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);

36470

}

36471

36472

/// If this is an add or subtract where one operand is produced by a cmp+setcc,

36473

/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}

36474

/// with CMP+{ADC, SBB}.

36475

static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {

36476

bool IsSub = N->getOpcode() == ISD::SUB;

36477

SDValue X = N->getOperand(0);

36478

SDValue Y = N->getOperand(1);

36479

36480

// If this is an add, canonicalize a zext operand to the RHS.

36481

// TODO: Incomplete? What if both sides are zexts?

36482

if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&

36483

Y.getOpcode() != ISD::ZERO_EXTEND)

36484

std::swap(X, Y);

36485

36486

// Look through a one-use zext.

36487

bool PeekedThroughZext = false;

36488

if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {

36489

Y = Y.getOperand(0);

36490

PeekedThroughZext = true;

36491

}

36492

36493

// If this is an add, canonicalize a setcc operand to the RHS.

36494

// TODO: Incomplete? What if both sides are setcc?

36495

// TODO: Should we allow peeking through a zext of the other operand?

36496

if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&

36497

Y.getOpcode() != X86ISD::SETCC)

36498

std::swap(X, Y);

36499

36500

if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())

36501

return SDValue();

36502

36503

SDLoc DL(N);

36504

EVT VT = N->getValueType(0);

36505

X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

36506

36507

// If X is -1 or 0, then we have an opportunity to avoid constants required in

36508

// the general case below.

36509

auto *ConstantX = dyn_cast<ConstantSDNode>(X);

36510

if (ConstantX) {

36511

if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||

36512

(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {

36513

// This is a complicated way to get -1 or 0 from the carry flag:

36514

// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax

36515

// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax

36516

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

36517

DAG.getConstant(X86::COND_B, DL, MVT::i8),

36518

Y.getOperand(1));

36519

}

36520

36521

if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||

36522

(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {

36523

SDValue EFLAGS = Y->getOperand(1);

36524

if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&

36525

EFLAGS.getValueType().isInteger() &&

36526

!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {

36527

// Swap the operands of a SUB, and we have the same pattern as above.

36528

// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB

36529

// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB

36530

SDValue NewSub = DAG.getNode(

36531

X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),

36532

EFLAGS.getOperand(1), EFLAGS.getOperand(0));

36533

SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());

36534

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

36535

DAG.getConstant(X86::COND_B, DL, MVT::i8),

36536

NewEFLAGS);

36537

}

36538

}

36539

}

36540

36541

if (CC == X86::COND_B) {

36542

// X + SETB Z --> X + (mask SBB Z, Z)

36543

// X - SETB Z --> X - (mask SBB Z, Z)

36544

// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?

36545

SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);

36546

if (SBB.getValueSizeInBits() != VT.getSizeInBits())

36547

SBB = DAG.getZExtOrTrunc(SBB, DL, VT);

36548

return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);

36549

}

36550

36551

if (CC == X86::COND_A) {

36552

SDValue EFLAGS = Y->getOperand(1);

36553

// Try to convert COND_A into COND_B in an attempt to facilitate

36554

// materializing "setb reg".

36555

36556

// Do not flip "e > c", where "c" is a constant, because Cmp instruction

36557

// cannot take an immediate as its first operand.

36558

36559

if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&

36560

EFLAGS.getValueType().isInteger() &&

36561

!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {

36562

SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),

36563

EFLAGS.getNode()->getVTList(),

36564

EFLAGS.getOperand(1), EFLAGS.getOperand(0));

36565

SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());

36566

SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);

36567

if (SBB.getValueSizeInBits() != VT.getSizeInBits())

36568

SBB = DAG.getZExtOrTrunc(SBB, DL, VT);

36569

return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);

36570

}

36571

}

36572

36573

if (CC != X86::COND_E && CC != X86::COND_NE)

36574

return SDValue();

36575

36576

SDValue Cmp = Y.getOperand(1);

36577

if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||

36578

!X86::isZeroNode(Cmp.getOperand(1)) ||

36579

!Cmp.getOperand(0).getValueType().isInteger())

36580

return SDValue();

36581

36582

SDValue Z = Cmp.getOperand(0);

36583

EVT ZVT = Z.getValueType();

36584

36585

// If X is -1 or 0, then we have an opportunity to avoid constants required in

36586

// the general case below.

36587

if (ConstantX) {

36588

// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with

36589

// fake operands:

36590

// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)

36591

// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)

36592

if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||

36593

(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {

36594

SDValue Zero = DAG.getConstant(0, DL, ZVT);

36595

SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);

36596

SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);

36597

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

36598

DAG.getConstant(X86::COND_B, DL, MVT::i8),

36599

SDValue(Neg.getNode(), 1));

36600

}

36601

36602

// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'

36603

// with fake operands:

36604

// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)

36605

// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)

36606

if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||

36607

(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {

36608

SDValue One = DAG.getConstant(1, DL, ZVT);

36609

SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

36610

return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,

36611

DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);

36612

}

36613

}

36614

36615

// (cmp Z, 1) sets the carry flag if Z is 0.

36616

SDValue One = DAG.getConstant(1, DL, ZVT);

36617

SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

36618

36619

// Add the flags type for ADC/SBB nodes.

36620

SDVTList VTs = DAG.getVTList(VT, MVT::i32);

36621

36622

// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)

36623

// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)

36624

if (CC == X86::COND_NE)

36625

return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,

36626

DAG.getConstant(-1ULL, DL, VT), Cmp1);

36627

36628

// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)

36629

// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)

36630

return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,

36631

DAG.getConstant(0, DL, VT), Cmp1);

36632

}

36633

36634

/// Try to turn add(mul(a, b), phi) into add(concat(VPMADDWD(a', b'), 0), phi),
/// where a' and b' are the multiply operands truncated to i16 elements.
///
/// Requires SSE2, a multiply accepted by canReduceVMulWidth in a mode other
/// than MULU16, and a result whose element count times 16 bits is at least 128
/// and at most the register width picked below (512 with BWI, 256 with AVX2,
/// else 128).
///
/// \returns the new add, or an empty SDValue if the pattern does not apply.
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  // One operand must be the multiply; the other one is treated as the phi.
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  const bool MulIsOp0 = Op0.getOpcode() == ISD::MUL;
  if (!MulIsOp0 && Op1.getOpcode() != ISD::MUL)
    return SDValue();
  SDValue MulOp = MulIsOp0 ? Op0 : Op1;
  SDValue Phi = MulIsOp0 ? Op1 : Op0;

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  const unsigned RegSize =
      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned VectorSize = NumElts * 16;
  // If the vector size is less than 128, or greater than the supported RegSize,
  // do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT NarrowVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

  // Shrink the operands of mul.
  SDValue NarrowLHS =
      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, MulOp->getOperand(0));
  SDValue NarrowRHS =
      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, MulOp->getOperand(1));

  // Madd vector size is half of the original vector size
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, NarrowLHS, NarrowRHS);
  // Fill the rest of the output with 0
  SDValue ZeroHalf =
      getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, ZeroHalf);
  return DAG.getNode(ISD::ADD, DL, VT, Widened, Phi);
}

36681

36682

/// Try to turn a reduction add of a zero-extended absolute difference (matched
/// through a VSELECT operand by detectZextAbsDiff) into PSADBW plus an add.
///
/// Requires SSE2 and a simple vector result type with i32 elements whose size
/// divided by 4 does not exceed the register width picked below (512 with
/// BWI, 256 with AVX2, else 128).
///
/// \returns the new add, or an empty SDValue if the pattern does not apply.
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      VT.getVectorElementType() != MVT::i32)
    return SDValue();

  const unsigned RegSize =
      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  const bool SelectIsOp0 = Op0.getOpcode() == ISD::VSELECT;
  if (!SelectIsOp0 && Op1.getOpcode() != ISD::VSELECT)
    return SDValue();
  SDValue SelectOp = SelectIsOp0 ? Op0 : Op1;
  SDValue Phi = SelectIsOp0 ? Op1 : Op0;

  // Check whether we have an abs-diff pattern feeding into the select.
  // NOTE(review): Op0/Op1 are handed over as lvalues and then used as the
  // PSADBW inputs, so detectZextAbsDiff presumably overwrites them with the
  // abs-diff operands - confirm against its declaration.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we could only update
  // part of elements in the reduction vector.
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT SadAsI32VT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  const unsigned ReductionBits = VT.getSizeInBits();
  const unsigned SadBits = SadAsI32VT.getSizeInBits();
  if (ReductionBits >= SadBits)
    Sad = DAG.getNode(ISD::BITCAST, DL, SadAsI32VT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (ReductionBits > SadBits) {
    // Fill the upper elements with zero to match the add width.
    SDValue ZeroVec = DAG.getConstant(0, DL, VT);
    Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, ZeroVec, Sad,
                      DAG.getIntPtrConstant(0, DL));
  }

  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}

36752

36753

/// Convert vector increment or decrement to sub/add with an all-ones constant:

36754

/// add X, <1, 1...> --> sub X, <-1, -1...>

36755

/// sub X, <1, 1...> --> add X, <-1, -1...>

36756

/// The all-ones vector constant can be materialized using a pcmpeq instruction

36757

/// that is commonly recognized as an idiom (has no register dependency), so

36758

/// that's better/smaller than loading a splat 1 constant.

36759

static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {

36760

assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&(static_cast <bool> ((N->getOpcode() == ISD::ADD || N
->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"
) ? void (0) : __assert_fail ("(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && \"Unexpected opcode for increment/decrement transform\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 36761, __extension__ __PRETTY_FUNCTION__))

36761

"Unexpected opcode for increment/decrement transform")(static_cast <bool> ((N->getOpcode() == ISD::ADD || N
->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"
) ? void (0) : __assert_fail ("(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && \"Unexpected opcode for increment/decrement transform\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 36761, __extension__ __PRETTY_FUNCTION__));

36762

36763

// Pseudo-legality check: getOnesVector() expects one of these types, so bail

36764

// out and wait for legalization if we have an unsupported vector length.

36765

EVT VT = N->getValueType(0);

36766

if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())

36767

return SDValue();

36768

36769

SDNode *N1 = N->getOperand(1).getNode();

36770

APInt SplatVal;

36771

if (!ISD::isConstantSplatVector(N1, SplatVal) ||

36772

!SplatVal.isOneValue())

36773

return SDValue();

36774

36775

SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));

36776

unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;

36777

return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);

36778

}

36779

36780

/// Target DAG combine for ADD.
///
/// Tries, in order: the SAD and MADD loop-reduction patterns (only when N
/// carries the vector-reduction flag), a horizontal add, the vector
/// increment/decrement rewrite, and finally the ADC/SBB conversion.
///
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  if (N->getFlags().hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  const bool HasHAddForVT =
      (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
      (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32));
  if (HasHAddForVT && isHorizontalBinOp(LHS, RHS, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, LHS, RHS);

  if (SDValue IncDec = combineIncDecVector(N, DAG))
    return IncDec;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

36804

36805

/// Try to convert umax(a,b) - b or a - umin(a,b) into X86ISD::SUBUS(a,b).
///
/// For v8i32, v16i32 and v8i64 the operation is performed in a narrower
/// element type, which is only done when enough leading bits of the SUBUS
/// left-hand side are known to be zero; the result is then extended back.
///
/// \returns the SUBUS-based replacement, or an empty SDValue if the type is
/// not handled on this subtarget or the pattern does not match.
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // PSUBUS is supported, starting from SSE2, but special preprocessing
  // for v8i32 requires umin, which appears in SSE41.
  const bool IsHandledType =
      (Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
      (Subtarget.hasSSE41() && (VT == MVT::v8i32)) ||
      (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) ||
      (Subtarget.hasAVX512() && Subtarget.hasBWI() &&
       (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
        VT == MVT::v8i64));
  if (!IsHandledType)
    return SDValue();

  // Given a two-operand min/max node and a value expected to be one of its
  // operands, return the other operand, or an empty SDValue if the value is
  // neither operand.
  auto OtherOperandOf = [](SDValue MinMax, SDValue Shared) {
    if (MinMax.getOperand(0) == Shared)
      return MinMax.getOperand(1);
    if (MinMax.getOperand(1) == Shared)
      return MinMax.getOperand(0);
    return SDValue();
  };

  // Try to find umax(a,b) - b or a - umin(a,b) patterns
  // they may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  SDValue SubusLHS, SubusRHS;
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SubusLHS = OtherOperandOf(Op0, Op1);
    if (!SubusLHS)
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SubusRHS = OtherOperandOf(Op1, Op0);
    if (!SubusRHS)
      return SDValue();
  } else {
    return SDValue();
  }

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases.
  const bool NeedsShrinking =
      VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64;
  if (!NeedsShrinking)
    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

  // Special preprocessing case can be only applied
  // if the value was zero extended from 16 bit,
  // so we require first 16 bits to be zeros for 32 bit
  // values, or first 48 bits for 64 bit values.
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
  const unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType = MVT::v8i16;
  if (VT != MVT::v8i32 && VT != MVT::v8i64)
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zero-extended, clamp SubusRHS to the narrow range:
  // SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue ClampedRHS = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType,
                                   SubusRHS, SaturationConst);
  SDValue NarrowLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NarrowRHS =
      DAG.getZExtOrTrunc(ClampedRHS, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
                               NarrowLHS, NarrowRHS);
  // Zero extend the result, it may be used somewhere as 32 bit,
  // if not zext and following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

36887

36888

/// Target DAG combine for ISD::SUB nodes.
///
/// Tries, in order: folding a constant LHS into a preceding XOR, forming a
/// horizontal subtract, the vector inc/dec combine, the PSUBUS combine, and
/// finally the ADC/SBB combine. Returns an empty SDValue when nothing applies.
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // A SUB whose LHS is an immediate cannot be encoded on X86. When the RHS is
  // a single-use XOR with a constant, rewrite
  //   C - (X ^ K)  -->  (X ^ ~K) + (C + 1)
  // (i.e. X-Y -> X+~Y+1), which saves a register.
  auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
  if (LHSC && RHS->hasOneUse() && RHS.getOpcode() == ISD::XOR) {
    if (auto *XorImm = dyn_cast<ConstantSDNode>(RHS.getOperand(1))) {
      EVT OpVT = LHS.getValueType();
      SDLoc XorDL(RHS);
      SDLoc SubDL(N);
      SDValue InvertedImm =
          DAG.getConstant(~XorImm->getAPIntValue(), XorDL, OpVT);
      SDValue NewXor =
          DAG.getNode(ISD::XOR, XorDL, OpVT, RHS.getOperand(0), InvertedImm);
      SDValue BumpedImm =
          DAG.getConstant(LHSC->getAPIntValue() + 1, SubDL, OpVT);
      return DAG.getNode(ISD::ADD, SubDL, OpVT, NewXor, BumpedImm);
    }
  }

  // Subs of shuffles may be expressible as a horizontal sub.
  EVT VT = N->getValueType(0);
  const bool IsSSSE3Type =
      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
  const bool IsAVX2Type =
      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
  if ((IsSSSE3Type || IsAVX2Type) && isHorizontalBinOp(LHS, RHS, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, LHS, RHS);

  if (SDValue IncDec = combineIncDecVector(N, DAG))
    return IncDec;

  // A SUB fed by a max/min may become PSUBUS.
  if (SDValue Subus = combineSubToSubus(N, DAG, Subtarget))
    return Subus;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

36927

36928

/// Target DAG combine for the vector extension nodes that PerformDAGCombine
/// routes here (X86ISD::VSEXT, X86ISD::VZEXT, and the
/// SIGN/ZERO_EXTEND_VECTOR_INREG opcodes).
///
/// Does nothing before legalization. Afterwards it constant-folds the
/// extension, and for X86ISD::VZEXT additionally collapses nested vzexts and
/// bypasses an extract/re-insert of element 0 of a wider vector.
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  const unsigned Opcode = N->getOpcode();
  MVT ResVT = N->getSimpleValueType(0);
  unsigned NumResElts = ResVT.getVectorNumElements();
  unsigned ResEltBits = ResVT.getVectorElementType().getSizeInBits();

  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT SrcEltVT = SrcVT.getVectorElementType();
  unsigned SrcEltBits = SrcEltVT.getSizeInBits();
  // Only the low NumResElts source elements are extended.
  unsigned InputBits = SrcEltBits * NumResElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt SrcUndefs;
  SmallVector<APInt, 64> SrcElts;
  if (getTargetConstantBitsFromNode(Src, SrcEltBits, SrcUndefs, SrcElts)) {
    const bool ZeroExtend = Opcode == X86ISD::VZEXT ||
                            Opcode == ISD::ZERO_EXTEND_VECTOR_INREG;
    APInt ResUndefs(NumResElts, 0);
    SmallVector<APInt, 4> ResElts(NumResElts, APInt(ResEltBits, 0));
    for (unsigned Elt = 0; Elt != NumResElts; ++Elt) {
      if (SrcUndefs[Elt])
        ResUndefs.setBit(Elt);
      else if (ZeroExtend)
        ResElts[Elt] = SrcElts[Elt].zextOrTrunc(ResEltBits);
      else
        ResElts[Elt] = SrcElts[Elt].sextOrTrunc(ResEltBits);
    }
    return getConstVector(ResElts, ResUndefs, ResVT, DAG, DL);
  }

  // Everything below only handles X86ISD::VZEXT.
  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode != X86ISD::VZEXT)
    return SDValue();

  SDValue Inner = peekThroughBitcasts(Src);

  // (vzext (bitcast (vzext (x)) -> (vzext x)
  if (Inner != Src && Inner.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = Inner.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // Matching element sizes mean one larger vzext does the job. This is
    // always an exact type match as vzext operates on integer types.
    if (SrcEltVT == InnerEltVT) {
      assert(SrcVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, ResVT, Inner.getOperand(0));
    }

    // Otherwise the two can only be merged when the outer vzext reads a
    // single element of the inner one.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // The inner vzext is dead because only bits of its low element are
    // inspected: apply the outer vzext to a bitcast of the inner's value.
    return DAG.getNode(X86ISD::VZEXT, DL, ResVT,
                       DAG.getBitcast(SrcVT, Inner));
  }

  // Bypass extracting and re-inserting element 0 of an input vector:
  //   (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  if (Inner.getOpcode() != ISD::SCALAR_TO_VECTOR)
    return SDValue();

  SDValue Extract = Inner.getOperand(0);
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Extract.getSimpleValueType().getSizeInBits() != InputBits)
    return SDValue();

  if (!isNullConstant(Extract.getOperand(1)))
    return SDValue();

  SDValue Wide = Extract.getOperand(0);
  MVT WideVT = Wide.getSimpleValueType();
  // Extract a subvector if necessary...
  if (WideVT.getSizeInBits() > SrcVT.getSizeInBits()) {
    int Ratio = WideVT.getSizeInBits() / SrcVT.getSizeInBits();
    WideVT = MVT::getVectorVT(WideVT.getVectorElementType(),
                              WideVT.getVectorNumElements() / Ratio);
    Wide = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, WideVT, Wide,
                       DAG.getIntPtrConstant(0, DL));
  }
  return DAG.getNode(X86ISD::VZEXT, DL, ResVT, DAG.getBitcast(SrcVT, Wide));
}

37019

37020

/// Target DAG combine for X86ISD::TESTM.
///
/// Simplifies a test of an AND against itself into a test of the AND's
/// operands, and folds a test against an all-zeros build vector to a zero
/// vector. Returns an empty SDValue otherwise.
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  MVT ResVT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // TEST (AND a, b) ,(AND a, b) -> TEST a, b
  const bool IsSelfAnd = LHS == RHS && RHS->getOpcode() == ISD::AND;
  if (IsSelfAnd)
    return DAG.getNode(X86ISD::TESTM, DL, ResVT, LHS->getOperand(0),
                       LHS->getOperand(1));

  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
  if (ISD::isBuildVectorAllZeros(LHS.getNode()))
    return getZeroVector(ResVT, Subtarget, DAG, DL);
  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
    return getZeroVector(ResVT, Subtarget, DAG, DL);

  return SDValue();
}

37041

37042

/// Target DAG combine for X86ISD::PCMPEQ / X86ISD::PCMPGT.
///
/// When both operands are the same value the result is known: PCMPEQ yields
/// an all-ones vector and PCMPGT yields a zero vector. Returns an empty
/// SDValue in every other case.
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT ResVT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) != N->getOperand(1))
    return SDValue();

  switch (N->getOpcode()) {
  case X86ISD::PCMPEQ:
    return getOnesVector(ResVT, DAG, DL);
  case X86ISD::PCMPGT:
    return getZeroVector(ResVT, Subtarget, DAG, DL);
  default:
    return SDValue();
  }
}

37056

37057

/// Target DAG combine for ISD::INSERT_SUBVECTOR.
///
/// Runs only after operation legalization and skips i1 (mask) vectors. The
/// combines handled are, in order:
///  * inserts into an all-zeros vector (nop, nested insert, bitcast source),
///  * insert-of-extract from a same-typed vector, turned into a shuffle,
///  * an insert into the upper half on top of an insert at index 0: merged
///    loads, subvector broadcasts, zero upper half, and forcing the base
///    vector to undef.
/// Returns an empty SDValue when no combine applies.
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT ResVT = N->getSimpleValueType(0);

  // Early out for mask vectors.
  if (ResVT.getVectorElementType() == MVT::i1)
    return SDValue();

  SDLoc DL(N);
  SDValue Base = N->getOperand(0);
  SDValue Sub = N->getOperand(1);

  unsigned InsIdx = N->getConstantOperandVal(2);
  MVT SubVT = Sub.getSimpleValueType();

  if (ISD::isBuildVectorAllZeros(Base.getNode())) {
    // Inserting zeros into zeros is a nop.
    if (ISD::isBuildVectorAllZeros(Sub.getNode()))
      return Base;

    // An insert into zeros that is itself inserted into larger zeros can go
    // straight into the larger zero vector.
    if (Sub.getOpcode() == ISD::INSERT_SUBVECTOR &&
        ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode())) {
      unsigned InnerIdx = Sub.getConstantOperandVal(2);
      SDValue NewIdx = DAG.getIntPtrConstant(InsIdx + InnerIdx, DL);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Base,
                         Sub.getOperand(1), NewIdx);
    }

    // When a bitcast is inserted into zeros, perform the insert in the
    // bitcast's source element type and cast the result instead. This helps
    // with detecting zero extending during isel.
    // TODO: Is this useful for other indices than 0?
    if (Sub.getOpcode() == ISD::BITCAST && InsIdx == 0) {
      SDValue CastSrc = Sub.getOperand(0);
      MVT CastSrcVT = CastSrc.getSimpleValueType();
      unsigned NumNewElts =
          ResVT.getSizeInBits() / CastSrcVT.getScalarSizeInBits();
      MVT NewVT =
          MVT::getVectorVT(CastSrcVT.getVectorElementType(), NumNewElts);
      SDValue NewInsert =
          DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                      DAG.getBitcast(NewVT, Base), CastSrc, N->getOperand(2));
      return DAG.getBitcast(ResVT, NewInsert);
    }
  }

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subregister operation.
  if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      Sub.getOperand(0).getSimpleValueType() == ResVT &&
      (InsIdx != 0 || !Base.isUndef())) {
    int ExtIdx = Sub.getConstantOperandVal(1);
    if (ExtIdx != 0) {
      int NumElts = ResVT.getVectorNumElements();
      int NumSubElts = SubVT.getVectorNumElements();

      // Start from the identity mask over Base...
      SmallVector<int, 64> ShufMask(NumElts);
      for (int Elt = 0; Elt != NumElts; ++Elt)
        ShufMask[Elt] = Elt;
      // ...then redirect the inserted lanes to the extract's source, which is
      // the second shuffle operand (hence the NumElts offset).
      for (int Elt = 0; Elt != NumSubElts; ++Elt)
        ShufMask[Elt + InsIdx] = Elt + ExtIdx + NumElts;

      return DAG.getVectorShuffle(ResVT, DL, Base, Sub.getOperand(0),
                                  ShufMask);
    }
  }

  // The remaining combines need an insert into the upper half whose base is
  // itself an insert of a half-width subvector at index 0. They cover:
  //
  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  //   (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                     (load16 addr + 16), Elts/2)
  //   --> load32 addr
  // or:
  //   (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                     (load32 addr + 32), Elts/2)
  //   --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  //   (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                     (load16 addr), Elts/2)
  //   --> X86SubVBroadcast(load16 addr)
  // or:
  //   (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                     (load32 addr), Elts/2)
  //   --> X86SubVBroadcast(load32 addr)
  if (InsIdx != ResVT.getVectorNumElements() / 2 ||
      Base.getOpcode() != ISD::INSERT_SUBVECTOR ||
      ResVT.getSizeInBits() != SubVT.getSizeInBits() * 2)
    return SDValue();

  auto *LoIdx = dyn_cast<ConstantSDNode>(Base.getOperand(2));
  if (!LoIdx || LoIdx->getZExtValue() != 0)
    return SDValue();

  SDValue LoSub = Base.getOperand(1);

  // If needed, look through bitcasts to get to the load.
  if (auto *LoLoad = dyn_cast<LoadSDNode>(peekThroughBitcasts(LoSub))) {
    bool Fast; // Only read once allowsMemoryAccess() has returned true.
    unsigned Alignment = LoLoad->getAlignment();
    unsigned AddrSpace = LoLoad->getAddressSpace();
    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
    if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), ResVT,
                                AddrSpace, Alignment, &Fast) &&
        Fast) {
      SDValue Halves[] = {LoSub, Sub};
      if (SDValue Merged = EltsFromConsecutiveLoads(ResVT, Halves, DL, DAG,
                                                    Subtarget, false))
        return Merged;
    }
  }

  // If lower/upper loads are the same and the only users of the load, then
  // lower to a VBROADCASTF128/VBROADCASTI128/etc.
  if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(LoSub))) {
    if (LoSub == Sub && ISD::isNormalLoad(Ld) &&
        SDNode::areOnlyUsersOf({N, Base.getNode()}, LoSub.getNode()))
      return DAG.getNode(X86ISD::SUBV_BROADCAST, DL, ResVT, Sub);
  }

  // The same subv_broadcast in both halves becomes one larger subv_broadcast.
  if (Sub.getOpcode() == X86ISD::SUBV_BROADCAST && Sub == LoSub)
    return DAG.getNode(X86ISD::SUBV_BROADCAST, DL, ResVT, Sub.getOperand(0));

  // If we're inserting all zeros into the upper half, change this to an
  // insert into an all zeros vector. We will match this to a move with
  // implicit upper bit zeroing during isel.
  if (ISD::isBuildVectorAllZeros(Sub.getNode()))
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT,
                       getZeroVector(ResVT, Subtarget, DAG, DL), LoSub,
                       Base.getOperand(2));

  // Both halves are being written, so the starting vector should be undef.
  // If it isn't, make it so, but only when the earlier insert has no other
  // uses.
  // TODO: Should this be a generic DAG combine?
  if (!Base.getOperand(0).isUndef() && Base.hasOneUse()) {
    Base = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, DAG.getUNDEF(ResVT),
                       LoSub, Base.getOperand(2));
    DCI.AddToWorklist(Base.getNode());
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Base, Sub,
                       N->getOperand(2));
  }

  return SDValue();
}

37202

37203

/// Target DAG combine for ISD::EXTRACT_SUBVECTOR.
///
/// Runs only after operation legalization. Extracting from an all-zeros or
/// all-ones build vector yields the matching constant of the result type, and
/// extracting from any other BUILD_VECTOR yields a BUILD_VECTOR of the
/// selected operands. Returns an empty SDValue otherwise.
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  MVT ResVT = N->getSimpleValueType(0);
  SDValue Src = N->getOperand(0);
  unsigned FirstElt = N->getConstantOperandVal(1);
  SDNode *SrcNode = Src.getNode();

  if (ISD::isBuildVectorAllZeros(SrcNode))
    return getZeroVector(ResVT, Subtarget, DAG, DL);

  if (ISD::isBuildVectorAllOnes(SrcNode)) {
    // i1 vectors take a plain constant 1 rather than the ones-vector helper.
    const bool IsMask = ResVT.getScalarType() == MVT::i1;
    return IsMask ? DAG.getConstant(1, DL, ResVT)
                  : getOnesVector(ResVT, DAG, DL);
  }

  if (Src.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Rebuild from just the operands covered by the extracted range.
  unsigned NumResElts = ResVT.getVectorNumElements();
  return DAG.getBuildVector(ResVT, DL,
                            SrcNode->ops().slice(FirstElt, NumResElts));
}

37229

37230

/// Entry point for the X86 target-specific DAG combines.
///
/// Dispatches on the opcode of \p N to the matching combine* helper and
/// returns whatever that helper returns. Opcodes without a handler yield an
/// empty SDValue.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // Element / subvector extraction and insertion.
  case ISD::EXTRACT_VECTOR_ELT:
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::EXTRACT_SUBVECTOR:
    return combineExtractSubvector(N, DAG, DCI, Subtarget);

  // Selects, casts and conditional moves.
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND:
    return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:
    return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:
    return combineCMov(N, DAG, DCI, Subtarget);

  // Integer arithmetic, shifts and logic.
  case ISD::ADD:
    return combineAdd(N, DAG, Subtarget);
  case ISD::SUB:
    return combineSub(N, DAG, Subtarget);
  case X86ISD::SBB:
    return combineSBB(N, DAG);
  case X86ISD::ADC:
    return combineADC(N, DAG, DCI);
  case ISD::MUL:
    return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND:
    return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:
    return combineXor(N, DAG, DCI, Subtarget);

  // Memory operations.
  case ISD::LOAD:
    return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:
    return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:
    return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE:
    return combineMaskedStore(N, DAG, Subtarget);

  // Floating point and int<->fp conversions.
  case ISD::SINT_TO_FP:
    return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:
    return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:
    return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:
    return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE:
    return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP:
    return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:
    return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:
    return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:
    return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:
    return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return combineFMinNumFMaxNum(N, DAG, Subtarget);

  // Bit tests, scalar extensions, comparisons and branches.
  case X86ISD::BT:
    return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC:
    return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:
    return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:
    return combineBrCond(N, DAG, Subtarget);

  // Vector packs, immediate shifts, extensions and inserts.
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
    return combineVectorPack(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT:
    return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
    return combineVectorInsert(N, DAG, DCI, Subtarget);

  // Handle all target specific shuffles.
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VBROADCAST:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE:
    return combineShuffle(N, DAG, DCI, Subtarget);

  // Fused multiply-add families.
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case X86ISD::FMADDS1:
  case X86ISD::FMADDS3:
  case X86ISD::FMADD4S:
  case ISD::FMA:
    return combineFMA(N, DAG, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:
  case X86ISD::FMSUBADD:
    return combineFMADDSUB(N, DAG, Subtarget);

  // Mask extraction, gather/scatter, and vector tests/compares.
  case X86ISD::MOVMSK:
    return combineMOVMSK(N, DAG, DCI);
  case X86ISD::MGATHER:
  case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER:
    return combineGatherScatter(N, DAG, DCI, Subtarget);
  case X86ISD::TESTM:
    return combineTestM(N, DAG, Subtarget);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:
    return combineVectorCompare(N, DAG, Subtarget);

  default:
    break;
  }

  return SDValue();
}

37354

37355

/// Return true if the target has native support for the specified value type

37356

/// and it is 'desirable' to use the type for the given node type. e.g. On x86

37357

/// i16 is legal, but undesirable since i16 instruction encodings are longer and

37358

/// some i16 instructions are slow.

37359

bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {

37360

if (!isTypeLegal(VT))

37361

return false;

37362

if (VT != MVT::i16)

37363

return true;

37364

37365

switch (Opc) {

37366

default:

37367

return true;

37368

case ISD::LOAD:

37369

case ISD::SIGN_EXTEND:

37370

case ISD::ZERO_EXTEND:

37371

case ISD::ANY_EXTEND:

37372

case ISD::SHL:

37373

case ISD::SRL:

37374

case ISD::SUB:

37375

case ISD::ADD:

37376

case ISD::MUL:

37377

case ISD::AND:

37378

case ISD::OR:

37379

case ISD::XOR:

37380

return false;

37381

}

37382

}

37383

37384

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We

37385

/// know that the code that lowers COPY of EFLAGS has to use the stack, and if

37386

/// we don't adjust the stack we clobber the first frame index.

37387

/// See X86InstrInfo::copyPhysReg.

37388

static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {

37389

const MachineRegisterInfo &MRI = MF.getRegInfo();

37390

return any_of(MRI.reg_instructions(X86::EFLAGS),

37391

[](const MachineInstr &RI) { return RI.isCopy(); });

37392

}

37393

37394

/// Record on the frame info whether \p MF contains a COPY of EFLAGS, then run
/// the generic TargetLoweringBase finalization.
void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  // The flag is only ever set here, never cleared.
  if (hasCopyImplyingStackAdjustment(MF))
    MF.getFrameInfo().setHasCopyImplyingStackAdjustment(true);

  TargetLoweringBase::finalizeLowering(MF);
}

37402

37403

/// This method query the target whether it is beneficial for dag combiner to

37404

/// promote the specified node. If true, it should return the desired promotion

37405

/// type by reference.

37406

bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {

37407

EVT VT = Op.getValueType();

37408

if (VT != MVT::i16)

37409

return false;

37410

37411

bool Promote = false;

37412

bool Commute = false;

37413

switch (Op.getOpcode()) {

37414

default: break;

37415

case ISD::SIGN_EXTEND:

37416

case ISD::ZERO_EXTEND:

37417

case ISD::ANY_EXTEND:

37418

Promote = true;

37419

break;

37420

case ISD::SHL:

37421

case ISD::SRL: {

37422

SDValue N0 = Op.getOperand(0);

37423

// Look out for (store (shl (load), x)).

37424

if (MayFoldLoad(N0) && MayFoldIntoStore(Op))

37425

return false;

37426

Promote = true;

37427

break;

37428

}

37429

case ISD::ADD:

37430

case ISD::MUL:

37431

case ISD::AND:

37432

case ISD::OR:

37433

case ISD::XOR:

37434

Commute = true;

37435

LLVM_FALLTHROUGH[[clang::fallthrough]];

37436

case ISD::SUB: {

37437

SDValue N0 = Op.getOperand(0);

37438

SDValue N1 = Op.getOperand(1);

37439

if (!Commute && MayFoldLoad(N1))

37440

return false;

37441

// Avoid disabling potential load folding opportunities.

37442

if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))

37443

return false;

37444

if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))

37445

return false;

37446

Promote = true;

37447

}

37448

}

37449

37450

PVT = MVT::i32;

37451

return Promote;

37452

}

37453

37454

bool X86TargetLowering::

37455

isDesirableToCombineBuildVectorToShuffleTruncate(

37456

ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

37457

37458

assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&(static_cast <bool> (SrcVT.getVectorNumElements() == ShuffleMask
.size() && "Element count mismatch") ? void (0) : __assert_fail
("SrcVT.getVectorNumElements() == ShuffleMask.size() && \"Element count mismatch\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 37459, __extension__ __PRETTY_FUNCTION__))

37459

"Element count mismatch")(static_cast <bool> (SrcVT.getVectorNumElements() == ShuffleMask
.size() && "Element count mismatch") ? void (0) : __assert_fail
("SrcVT.getVectorNumElements() == ShuffleMask.size() && \"Element count mismatch\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 37459, __extension__ __PRETTY_FUNCTION__));

37460

assert((static_cast <bool> (Subtarget.getTargetLowering()->
isShuffleMaskLegal(ShuffleMask, SrcVT) && "Shuffle Mask expected to be legal"
) ? void (0) : __assert_fail ("Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && \"Shuffle Mask expected to be legal\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 37462, __extension__ __PRETTY_FUNCTION__))

37461

Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&(static_cast <bool> (Subtarget.getTargetLowering()->
isShuffleMaskLegal(ShuffleMask, SrcVT) && "Shuffle Mask expected to be legal"
) ? void (0) : __assert_fail ("Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && \"Shuffle Mask expected to be legal\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 37462, __extension__ __PRETTY_FUNCTION__))

37462

"Shuffle Mask expected to be legal")(static_cast <bool> (Subtarget.getTargetLowering()->
isShuffleMaskLegal(ShuffleMask, SrcVT) && "Shuffle Mask expected to be legal"
) ? void (0) : __assert_fail ("Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && \"Shuffle Mask expected to be legal\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 37462, __extension__ __PRETTY_FUNCTION__));

37463

37464

// For 32-bit elements VPERMD is better than shuffle+truncate.

37465

// TODO: After we improve lowerBuildVector, add execption for VPERMW.

37466

if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())

37467

return false;

37468

37469

if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))

37470

return false;

37471

37472

return true;

37473

}

37474

37475

//===----------------------------------------------------------------------===//

37476

// X86 Inline Assembly Support

37477

//===----------------------------------------------------------------------===//

37478

37479

// Helper to match a string separated by whitespace.

37480

static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {

37481

S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

37482

37483

for (StringRef Piece : Pieces) {

37484

if (!S.startswith(Piece)) // Check if the piece matches.

37485

return false;

37486

37487

S = S.substr(Piece.size());

37488

StringRef::size_type Pos = S.find_first_not_of(" \t");

37489

if (Pos == 0) // We matched a prefix.

37490

return false;

37491

37492

S = S.substr(Pos);

37493

}

37494

37495

return S.empty();

37496

}

37497

37498

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

37499

37500

if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {

37501

if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&

37502

std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&

37503

std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

37504

37505

if (AsmPieces.size() == 3)

37506

return true;

37507

else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))

37508

return true;

37509

}

37510

}

37511

return false;

37512

}

37513

37514

/// Recognize inline-asm byte-swap idioms in the call \p CI and hand them to
/// IntrinsicLowering::LowerToByteSwap.
///
/// Only calls returning an integer whose width is a multiple of 16 bits are
/// considered. The asm string is split on ';' and newlines; one-statement and
/// three-statement sequences are matched against the patterns below. Returns
/// the result of LowerToByteSwap on a match and false otherwise.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;
  const unsigned BitWidth = Ty->getBitWidth();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  // Constraint prefix required by the rotate-based idioms.
  auto HasTiedRegConstraint = [IA]() {
    return StringRef(IA->getConstraintString()).startswith("=r,0,");
  };

  // Replaces AsmPieces with the sorted constraints that follow the "=r,0,"
  // prefix and checks that they are exactly the flag-register clobbers.
  auto OnlyClobbersFlags = [IA, &AsmPieces]() {
    AsmPieces.clear();
    StringRef Constraints = IA->getConstraintString();
    SplitString(Constraints.substr(5), AsmPieces, ",");
    array_pod_sort(AsmPieces.begin(), AsmPieces.end());
    return clobbersFlagRegisters(AsmPieces);
  };

  switch (AsmPieces.size()) {
  default:
    return false;

  case 1: {
    // FIXME: this should verify that we are targeting a 486 or better. If
    // not, we will turn this bswap into something that will be lowered to
    // logical ops instead of emitting the bswap asm. For now, we don't
    // support 486 or lower so don't worry about this.
    // bswap $0
    // No need to check constraints, nothing other than the equivalent of
    // "=r,0" would be valid here.
    for (const char *Operand : {"$0", "${0:q}"})
      for (const char *Mnemonic : {"bswap", "bswapl", "bswapq"})
        if (matchAsm(AsmPieces[0], {Mnemonic, Operand}))
          return IntrinsicLowering::LowerToByteSwap(CI);

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (BitWidth == 16 && HasTiedRegConstraint() &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      if (OnlyClobbersFlags())
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  case 3: {
    // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} on a 32-bit value.
    if (BitWidth == 32 && HasTiedRegConstraint() &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      if (OnlyClobbersFlags())
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (BitWidth != 64)
      break;

    InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
    const bool IsATiedToOutput =
        Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 &&
        Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 &&
        Constraints[1].Codes[0] == "0";
    if (!IsATiedToOutput)
      break;

    // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
    if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
        matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
        matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
      return IntrinsicLowering::LowerToByteSwap(CI);
    break;
  }
  }
  return false;
}

37589

37590

/// Given a constraint letter, return the type of constraint for this target.

37591

X86TargetLowering::ConstraintType

37592

X86TargetLowering::getConstraintType(StringRef Constraint) const {

37593

if (Constraint.size() == 1) {

37594

switch (Constraint[0]) {

37595

case 'R':

37596

case 'q':

37597

case 'Q':

37598

case 'f':

37599

case 't':

37600

case 'u':

37601

case 'y':

37602

case 'x':

37603

case 'v':

37604

case 'Y':

37605

case 'l':

37606

case 'k': // AVX512 masking registers.

37607

return C_RegisterClass;

37608

case 'a':

37609

case 'b':

37610

case 'c':

37611

case 'd':

37612

case 'S':

37613

case 'D':

37614

case 'A':

37615

return C_Register;

37616

case 'I':

37617

case 'J':

37618

case 'K':

37619

case 'L':

37620

case 'M':

37621

case 'N':

37622

case 'G':

37623

case 'C':

37624

case 'e':

37625

case 'Z':

37626

return C_Other;

37627

default:

37628

break;

37629

}

37630

}

37631

else if (Constraint.size() == 2) {

37632

switch (Constraint[0]) {

37633

default:

37634

break;

37635

case 'Y':

37636

switch (Constraint[1]) {

37637

default:

37638

break;

37639

case 'z':

37640

case '0':

37641

return C_Register;

37642

case 'i':

37643

case 'm':

37644

case 'k':

37645

case 't':

37646

case '2':

37647

return C_RegisterClass;

37648

}

37649

}

37650

}

37651

return TargetLowering::getConstraintType(Constraint);

37652

}

37653

37654

/// Examine constraint type and operand type and determine a weight value.

37655

/// This object must already have been set up with the operand type

37656

/// and the current alternative constraint selected.

37657

TargetLowering::ConstraintWeight

37658

X86TargetLowering::getSingleConstraintMatchWeight(

37659

AsmOperandInfo &info, const char *constraint) const {

37660

ConstraintWeight weight = CW_Invalid;

37661

Value *CallOperandVal = info.CallOperandVal;

37662

// If we don't have a value, we can't do a match,

37663

// but allow it at the lowest weight.

37664

if (!CallOperandVal)

37665

return CW_Default;

37666

Type *type = CallOperandVal->getType();

37667

// Look at the constraint type.

37668

switch (*constraint) {

37669

default:

37670

weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);

37671

LLVM_FALLTHROUGH[[clang::fallthrough]];

37672

case 'R':

37673

case 'q':

37674

case 'Q':

37675

case 'a':

37676

case 'b':

37677

case 'c':

37678

case 'd':

37679

case 'S':

37680

case 'D':

37681

case 'A':

37682

if (CallOperandVal->getType()->isIntegerTy())

37683

weight = CW_SpecificReg;

37684

break;

37685

case 'f':

37686

case 't':

37687

case 'u':

37688

if (type->isFloatingPointTy())

37689

weight = CW_SpecificReg;

37690

break;

37691

case 'y':

37692

if (type->isX86_MMXTy() && Subtarget.hasMMX())

37693

weight = CW_SpecificReg;

37694

break;

37695

case 'Y': {

37696

unsigned Size = StringRef(constraint).size();

37697

// Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'

37698

char NextChar = Size == 2 ? constraint[1] : 'i';

37699

if (Size > 2)

37700

break;

37701

switch (NextChar) {

37702

default:

37703

return CW_Invalid;

37704

// XMM0

37705

case 'z':

37706

case '0':

37707

if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())

37708

return CW_SpecificReg;

37709

return CW_Invalid;

37710

// Conditional OpMask regs (AVX512)

37711

case 'k':

37712

if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())

37713

return CW_Register;

37714

return CW_Invalid;

37715

// Any MMX reg

37716

case 'm':

37717

if (type->isX86_MMXTy() && Subtarget.hasMMX())

37718

return weight;

37719

return CW_Invalid;

37720

// Any SSE reg when ISA >= SSE2, same as 'Y'

37721

case 'i':

37722

case 't':

37723

case '2':

37724

if (!Subtarget.hasSSE2())

37725

return CW_Invalid;

37726

break;

37727

}

37728

// Fall through (handle "Y" constraint).

37729

LLVM_FALLTHROUGH[[clang::fallthrough]];

37730

}

37731

case 'v':

37732

if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())

37733

weight = CW_Register;

37734

LLVM_FALLTHROUGH[[clang::fallthrough]];

37735

case 'x':

37736

if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||

37737

((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))

37738

weight = CW_Register;

37739

break;

37740

case 'k':

37741

// Enable conditional vector operations using %k<#> registers.

37742

if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())

37743

weight = CW_Register;

37744

break;

37745

case 'I':

37746

if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {

37747

if (C->getZExtValue() <= 31)

37748

weight = CW_Constant;

37749

}

37750

break;

37751

case 'J':

37752

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37753

if (C->getZExtValue() <= 63)

37754

weight = CW_Constant;

37755

}

37756

break;

37757

case 'K':

37758

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37759

if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))

37760

weight = CW_Constant;

37761

}

37762

break;

37763

case 'L':

37764

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37765

if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))

37766

weight = CW_Constant;

37767

}

37768

break;

37769

case 'M':

37770

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37771

if (C->getZExtValue() <= 3)

37772

weight = CW_Constant;

37773

}

37774

break;

37775

case 'N':

37776

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37777

if (C->getZExtValue() <= 0xff)

37778

weight = CW_Constant;

37779

}

37780

break;

37781

case 'G':

37782

case 'C':

37783

if (isa<ConstantFP>(CallOperandVal)) {

37784

weight = CW_Constant;

37785

}

37786

break;

37787

case 'e':

37788

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37789

if ((C->getSExtValue() >= -0x80000000LL) &&

37790

(C->getSExtValue() <= 0x7fffffffLL))

37791

weight = CW_Constant;

37792

}

37793

break;

37794

case 'Z':

37795

if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {

37796

if (C->getZExtValue() <= 0xffffffff)

37797

weight = CW_Constant;

37798

}

37799

break;

37800

}

37801

return weight;

37802

}

37803

37804

/// Try to replace an X constraint, which matches anything, with another that

37805

/// has more specific requirements based on the type of the corresponding

37806

/// operand.

37807

const char *X86TargetLowering::

37808

LowerXConstraint(EVT ConstraintVT) const {

37809

// FP X constraints get lowered to SSE1/2 registers if available, otherwise

37810

// 'f' like normal targets.

37811

if (ConstraintVT.isFloatingPoint()) {

37812

if (Subtarget.hasSSE2())

37813

return "Y";

37814

if (Subtarget.hasSSE1())

37815

return "x";

37816

}

37817

37818

return TargetLowering::LowerXConstraint(ConstraintVT);

37819

}

37820

37821

/// Lower the specified operand into the Ops vector.

37822

/// If it is invalid, don't add anything to Ops.

37823

void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,

37824

std::string &Constraint,

37825

std::vector<SDValue>&Ops,

37826

SelectionDAG &DAG) const {

37827

SDValue Result;

37828

37829

// Only support length 1 constraints for now.

37830

if (Constraint.length() > 1) return;

37831

37832

char ConstraintLetter = Constraint[0];

37833

switch (ConstraintLetter) {

37834

default: break;

37835

case 'I':

37836

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37837

if (C->getZExtValue() <= 31) {

37838

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37839

Op.getValueType());

37840

break;

37841

}

37842

}

37843

return;

37844

case 'J':

37845

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37846

if (C->getZExtValue() <= 63) {

37847

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37848

Op.getValueType());

37849

break;

37850

}

37851

}

37852

return;

37853

case 'K':

37854

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37855

if (isInt<8>(C->getSExtValue())) {

37856

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37857

Op.getValueType());

37858

break;

37859

}

37860

}

37861

return;

37862

case 'L':

37863

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37864

if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||

37865

(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {

37866

Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),

37867

Op.getValueType());

37868

break;

37869

}

37870

}

37871

return;

37872

case 'M':

37873

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37874

if (C->getZExtValue() <= 3) {

37875

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37876

Op.getValueType());

37877

break;

37878

}

37879

}

37880

return;

37881

case 'N':

37882

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37883

if (C->getZExtValue() <= 255) {

37884

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37885

Op.getValueType());

37886

break;

37887

}

37888

}

37889

return;

37890

case 'O':

37891

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37892

if (C->getZExtValue() <= 127) {

37893

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37894

Op.getValueType());

37895

break;

37896

}

37897

}

37898

return;

37899

case 'e': {

37900

// 32-bit signed value

37901

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37902

if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),

37903

C->getSExtValue())) {

37904

// Widen to 64 bits here to get it sign extended.

37905

Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);

37906

break;

37907

}

37908

// FIXME gcc accepts some relocatable values here too, but only in certain

37909

// memory models; it's complicated.

37910

}

37911

return;

37912

}

37913

case 'Z': {

37914

// 32-bit unsigned value

37915

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {

37916

if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),

37917

C->getZExtValue())) {

37918

Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),

37919

Op.getValueType());

37920

break;

37921

}

37922

}

37923

// FIXME gcc accepts some relocatable values here too, but only in certain

37924

// memory models; it's complicated.

37925

return;

37926

}

37927

case 'i': {

37928

// Literal immediates are always ok.

37929

if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {

37930

// Widen to 64 bits here to get it sign extended.

37931

Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);

37932

break;

37933

}

37934

37935

// In any sort of PIC mode addresses need to be computed at runtime by

37936

// adding in a register or some sort of table lookup. These can't

37937

// be used as immediates.

37938

if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())

37939

return;

37940

37941

// If we are in non-pic codegen mode, we allow the address of a global (with

37942

// an optional displacement) to be used with 'i'.

37943

GlobalAddressSDNode *GA = nullptr;

37944

int64_t Offset = 0;

37945

37946

// Match either (GA), (GA+C), (GA+C1+C2), etc.

37947

while (1) {

37948

if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {

37949

Offset += GA->getOffset();

37950

break;

37951

} else if (Op.getOpcode() == ISD::ADD) {

37952

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {

37953

Offset += C->getZExtValue();

37954

Op = Op.getOperand(0);

37955

continue;

37956

}

37957

} else if (Op.getOpcode() == ISD::SUB) {

37958

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {

37959

Offset += -C->getZExtValue();

37960

Op = Op.getOperand(0);

37961

continue;

37962

}

37963

}

37964

37965

// Otherwise, this isn't something we can handle, reject it.

37966

return;

37967

}

37968

37969

const GlobalValue *GV = GA->getGlobal();

37970

// If we require an extra load to get this address, as in PIC mode, we

37971

// can't accept it.

37972

if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))

37973

return;

37974

37975

Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),

37976

GA->getValueType(0), Offset);

37977

break;

37978

}

37979

}

37980

37981

if (Result.getNode()) {

37982

Ops.push_back(Result);

37983

return;

37984

}

37985

return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);

37986

}

37987

37988

/// Check if \p RC is a general purpose register class.

37989

/// I.e., GR* or one of their variant.

37990

static bool isGRClass(const TargetRegisterClass &RC) {

37991

return RC.hasSuperClassEq(&X86::GR8RegClass) ||

37992

RC.hasSuperClassEq(&X86::GR16RegClass) ||

37993

RC.hasSuperClassEq(&X86::GR32RegClass) ||

37994

RC.hasSuperClassEq(&X86::GR64RegClass) ||

37995

RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);

37996

}

37997

37998

/// Check if \p RC is a vector register class.

37999

/// I.e., FR* / VR* or one of their variant.

38000

static bool isFRClass(const TargetRegisterClass &RC) {

38001

return RC.hasSuperClassEq(&X86::FR32XRegClass) ||

38002

RC.hasSuperClassEq(&X86::FR64XRegClass) ||

38003

RC.hasSuperClassEq(&X86::VR128XRegClass) ||

38004

RC.hasSuperClassEq(&X86::VR256XRegClass) ||

38005

RC.hasSuperClassEq(&X86::VR512RegClass);

38006

}

38007

38008

std::pair<unsigned, const TargetRegisterClass *>

38009

X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

38010

StringRef Constraint,

38011

MVT VT) const {

38012

// First, see if this is a constraint that directly corresponds to an LLVM

38013

// register class.

38014

if (Constraint.size() == 1) {

38015

// GCC Constraint Letters

38016

switch (Constraint[0]) {

38017

default: break;

38018

// TODO: Slight differences here in allocation order and leaving

38019

// RIP in the class. Do they matter any more here than they do

38020

// in the normal allocation?

38021

case 'k':

38022

if (Subtarget.hasAVX512()) {

38023

// Only supported in AVX512 or later.

38024

switch (VT.SimpleTy) {

38025

default: break;

38026

case MVT::i32:

38027

return std::make_pair(0U, &X86::VK32RegClass);

38028

case MVT::i16:

38029

return std::make_pair(0U, &X86::VK16RegClass);

38030

case MVT::i8:

38031

return std::make_pair(0U, &X86::VK8RegClass);

38032

case MVT::i1:

38033

return std::make_pair(0U, &X86::VK1RegClass);

38034

case MVT::i64:

38035

return std::make_pair(0U, &X86::VK64RegClass);

38036

}

38037

}

38038

break;

38039

case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.

38040

if (Subtarget.is64Bit()) {

38041

if (VT == MVT::i32 || VT == MVT::f32)

38042

return std::make_pair(0U, &X86::GR32RegClass);

38043

if (VT == MVT::i16)

38044

return std::make_pair(0U, &X86::GR16RegClass);

38045

if (VT == MVT::i8 || VT == MVT::i1)

38046

return std::make_pair(0U, &X86::GR8RegClass);

38047

if (VT == MVT::i64 || VT == MVT::f64)

38048

return std::make_pair(0U, &X86::GR64RegClass);

38049

break;

38050

}

38051

LLVM_FALLTHROUGH[[clang::fallthrough]];

38052

// 32-bit fallthrough

38053

case 'Q': // Q_REGS

38054

if (VT == MVT::i32 || VT == MVT::f32)

38055

return std::make_pair(0U, &X86::GR32_ABCDRegClass);

38056

if (VT == MVT::i16)

38057

return std::make_pair(0U, &X86::GR16_ABCDRegClass);

38058

if (VT == MVT::i8 || VT == MVT::i1)

38059

return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);

38060

if (VT == MVT::i64)

38061

return std::make_pair(0U, &X86::GR64_ABCDRegClass);

38062

break;

38063

case 'r': // GENERAL_REGS

38064

case 'l': // INDEX_REGS

38065

if (VT == MVT::i8 || VT == MVT::i1)

38066

return std::make_pair(0U, &X86::GR8RegClass);

38067

if (VT == MVT::i16)

38068

return std::make_pair(0U, &X86::GR16RegClass);

38069

if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())

38070

return std::make_pair(0U, &X86::GR32RegClass);

38071

return std::make_pair(0U, &X86::GR64RegClass);

38072

case 'R': // LEGACY_REGS

38073

if (VT == MVT::i8 || VT == MVT::i1)

38074

return std::make_pair(0U, &X86::GR8_NOREXRegClass);

38075

if (VT == MVT::i16)

38076

return std::make_pair(0U, &X86::GR16_NOREXRegClass);

38077

if (VT == MVT::i32 || !Subtarget.is64Bit())

38078

return std::make_pair(0U, &X86::GR32_NOREXRegClass);

38079

return std::make_pair(0U, &X86::GR64_NOREXRegClass);

38080

case 'f': // FP Stack registers.

38081

// If SSE is enabled for this VT, use f80 to ensure the isel moves the

38082

// value to the correct fpstack register class.

38083

if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))

38084

return std::make_pair(0U, &X86::RFP32RegClass);

38085

if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))

38086

return std::make_pair(0U, &X86::RFP64RegClass);

38087

return std::make_pair(0U, &X86::RFP80RegClass);

38088

case 'y': // MMX_REGS if MMX allowed.

38089

if (!Subtarget.hasMMX()) break;

38090

return std::make_pair(0U, &X86::VR64RegClass);

38091

case 'Y': // SSE_REGS if SSE2 allowed

38092

if (!Subtarget.hasSSE2()) break;

38093

LLVM_FALLTHROUGH[[clang::fallthrough]];

38094

case 'v':

38095

case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed

38096

if (!Subtarget.hasSSE1()) break;

38097

bool VConstraint = (Constraint[0] == 'v');

38098

38099

switch (VT.SimpleTy) {

38100

default: break;

38101

// Scalar SSE types.

38102

case MVT::f32:

38103

case MVT::i32:

38104

if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())

38105

return std::make_pair(0U, &X86::FR32XRegClass);

38106

return std::make_pair(0U, &X86::FR32RegClass);

38107

case MVT::f64:

38108

case MVT::i64:

38109

if (VConstraint && Subtarget.hasVLX())

38110

return std::make_pair(0U, &X86::FR64XRegClass);

38111

return std::make_pair(0U, &X86::FR64RegClass);

38112

// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.

38113

// Vector types.

38114

case MVT::v16i8:

38115

case MVT::v8i16:

38116

case MVT::v4i32:

38117

case MVT::v2i64:

38118

case MVT::v4f32:

38119

case MVT::v2f64:

38120

if (VConstraint && Subtarget.hasVLX())

38121

return std::make_pair(0U, &X86::VR128XRegClass);

38122

return std::make_pair(0U, &X86::VR128RegClass);

38123

// AVX types.

38124

case MVT::v32i8:

38125

case MVT::v16i16:

38126

case MVT::v8i32:

38127

case MVT::v4i64:

38128

case MVT::v8f32:

38129

case MVT::v4f64:

38130

if (VConstraint && Subtarget.hasVLX())

38131

return std::make_pair(0U, &X86::VR256XRegClass);

38132

return std::make_pair(0U, &X86::VR256RegClass);

38133

case MVT::v8f64:

38134

case MVT::v16f32:

38135

case MVT::v16i32:

38136

case MVT::v8i64:

38137

return std::make_pair(0U, &X86::VR512RegClass);

38138

}

38139

break;

38140

}

38141

} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {

38142

switch (Constraint[1]) {

38143

default:

38144

break;

38145

case 'i':

38146

case 't':

38147

case '2':

38148

return getRegForInlineAsmConstraint(TRI, "Y", VT);

38149

case 'm':

38150

if (!Subtarget.hasMMX()) break;

38151

return std::make_pair(0U, &X86::VR64RegClass);

38152

case 'z':

38153

case '0':

38154

if (!Subtarget.hasSSE1()) break;

38155

return std::make_pair(X86::XMM0, &X86::VR128RegClass);

38156

case 'k':

38157

// This register class doesn't allocate k0 for masked vector operation.

38158

if (Subtarget.hasAVX512()) { // Only supported in AVX512.

38159

switch (VT.SimpleTy) {

38160

default: break;

38161

case MVT::i32:

38162

return std::make_pair(0U, &X86::VK32WMRegClass);

38163

case MVT::i16:

38164

return std::make_pair(0U, &X86::VK16WMRegClass);

38165

case MVT::i8:

38166

return std::make_pair(0U, &X86::VK8WMRegClass);

38167

case MVT::i1:

38168

return std::make_pair(0U, &X86::VK1WMRegClass);

38169

case MVT::i64:

38170

return std::make_pair(0U, &X86::VK64WMRegClass);

38171

}

38172

}

38173

break;

38174

}

38175

}

38176

38177

// Use the default implementation in TargetLowering to convert the register

38178

// constraint into a member of a register class.

38179

std::pair<unsigned, const TargetRegisterClass*> Res;

38180

Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

38181

38182

// Not found as a standard register?

38183

if (!Res.second) {

38184

// Map st(0) -> st(7) -> ST0

38185

if (Constraint.size() == 7 && Constraint[0] == '{' &&

38186

tolower(Constraint[1]) == 's' &&

38187

tolower(Constraint[2]) == 't' &&

38188

Constraint[3] == '(' &&

38189

(Constraint[4] >= '0' && Constraint[4] <= '7') &&

38190

Constraint[5] == ')' &&

38191

Constraint[6] == '}') {

38192

38193

Res.first = X86::FP0+Constraint[4]-'0';

38194

Res.second = &X86::RFP80RegClass;

38195

return Res;

38196

}

38197

38198

// GCC allows "st(0)" to be called just plain "st".

38199

if (StringRef("{st}").equals_lower(Constraint)) {

38200

Res.first = X86::FP0;

38201

Res.second = &X86::RFP80RegClass;

38202

return Res;

38203

}

38204

38205

// flags -> EFLAGS

38206

if (StringRef("{flags}").equals_lower(Constraint)) {

38207

Res.first = X86::EFLAGS;

38208

Res.second = &X86::CCRRegClass;

38209

return Res;

38210

}

38211

38212

// 'A' means [ER]AX + [ER]DX.

38213

if (Constraint == "A") {

38214

if (Subtarget.is64Bit()) {

38215

Res.first = X86::RAX;

38216

Res.second = &X86::GR64_ADRegClass;

38217

} else {

38218

assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&(static_cast <bool> ((Subtarget.is32Bit() || Subtarget.
is16Bit()) && "Expecting 64, 32 or 16 bit subtarget")
? void (0) : __assert_fail ("(Subtarget.is32Bit() || Subtarget.is16Bit()) && \"Expecting 64, 32 or 16 bit subtarget\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 38219, __extension__ __PRETTY_FUNCTION__))

38219

"Expecting 64, 32 or 16 bit subtarget")(static_cast <bool> ((Subtarget.is32Bit() || Subtarget.
is16Bit()) && "Expecting 64, 32 or 16 bit subtarget")
? void (0) : __assert_fail ("(Subtarget.is32Bit() || Subtarget.is16Bit()) && \"Expecting 64, 32 or 16 bit subtarget\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 38219, __extension__ __PRETTY_FUNCTION__));

38220

Res.first = X86::EAX;

38221

Res.second = &X86::GR32_ADRegClass;

38222

}

38223

return Res;

38224

}

38225

return Res;

38226

}

38227

38228

// Otherwise, check to see if this is a register class of the wrong value

38229

// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to

38230

// turn into {ax},{dx}.

38231

// MVT::Other is used to specify clobber names.

38232

if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)

38233

return Res; // Correct type already, nothing to do.

38234

38235

// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should

38236

// return "eax". This should even work for things like getting 64bit integer

38237

// registers when given an f64 type.

38238

const TargetRegisterClass *Class = Res.second;

38239

// The generic code will match the first register class that contains the

38240

// given register. Thus, based on the ordering of the tablegened file,

38241

// the "plain" GR classes might not come first.

38242

// Therefore, use a helper method.

38243

if (isGRClass(*Class)) {

38244

unsigned Size = VT.getSizeInBits();

38245

if (Size == 1) Size = 8;

38246

unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);

38247

if (DestReg > 0) {

38248

bool is64Bit = Subtarget.is64Bit();

38249

const TargetRegisterClass *RC =

38250

Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)

38251

: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)

38252

: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)

38253

: &X86::GR64RegClass;

38254

if (RC->contains(DestReg))

38255

Res = std::make_pair(DestReg, RC);

38256

} else {

38257

// No register found/type mismatch.

38258

Res.first = 0;

38259

Res.second = nullptr;

38260

}

38261

} else if (isFRClass(*Class)) {

38262

// Handle references to XMM physical registers that got mapped into the

38263

// wrong class. This can happen with constraints like {xmm0} where the

38264

// target independent register mapper will just pick the first match it can

38265

// find, ignoring the required type.

38266

38267

// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.

38268

if (VT == MVT::f32 || VT == MVT::i32)

38269

Res.second = &X86::FR32RegClass;

38270

else if (VT == MVT::f64 || VT == MVT::i64)

38271

Res.second = &X86::FR64RegClass;

38272

else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))

38273

Res.second = &X86::VR128RegClass;

38274

else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))

38275

Res.second = &X86::VR256RegClass;

38276

else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))

38277

Res.second = &X86::VR512RegClass;

38278

else {

38279

// Type mismatch and not a clobber: Return an error;

38280

Res.first = 0;

38281

Res.second = nullptr;

38282

}

38283

}

38284

38285

return Res;

38286

}

38287

38288

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,

38289

const AddrMode &AM, Type *Ty,

38290

unsigned AS) const {

38291

// Scaling factors are not free at all.

38292

// An indexed folded instruction, i.e., inst (reg1, reg2, scale),

38293

// will take 2 allocations in the out of order engine instead of 1

38294

// for plain addressing mode, i.e. inst (reg1).

38295

// E.g.,

38296

// vaddps (%rsi,%drx), %ymm0, %ymm1

38297

// Requires two allocations (one for the load, one for the computation)

38298

// whereas:

38299

// vaddps (%rsi), %ymm0, %ymm1

38300

// Requires just 1 allocation, i.e., freeing allocations for other operations

38301

// and having less micro operations to execute.

38302

38303

// For some X86 architectures, this is even worse because for instance for

38304

// stores, the complex addressing mode forces the instruction to use the

38305

// "load" ports instead of the dedicated "store" port.

38306

// E.g., on Haswell:

38307

// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.

38308

// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.

38309

if (isLegalAddressingMode(DL, AM, Ty, AS))

38310

// Scale represents reg2 * scale, thus account for 1

38311

// as soon as we use a second register.

38312

return AM.Scale != 0;

38313

return -1;

38314

}

38315

38316

/// Report whether integer division of type \p VT should be treated as cheap.
///
/// \returns true only when the function-level attributes in \p Attr include
///          MinSize and \p VT is not a vector type.
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  if (!Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize))
    return false;

  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  return !VT.isVector();
}

38328

38329

/// On 64-bit subtargets, set the IsSplitCSR flag in the
/// X86MachineFunctionInfo of the function that owns \p Entry. Does nothing on
/// other subtargets.
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget.is64Bit()) {
    // Update IsSplitCSR in X86MachineFunctionInfo.
    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
    FuncInfo->setIsSplitCSR(true);
  }
}

38338

38339

void X86TargetLowering::insertCopiesSplitCSR(

38340

MachineBasicBlock *Entry,

38341

const SmallVectorImpl<MachineBasicBlock *> &Exits) const {

38342

const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

38343

const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

38344

if (!IStart)

38345

return;

38346

38347

const TargetInstrInfo *TII = Subtarget.getInstrInfo();

38348

MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();

38349

MachineBasicBlock::iterator MBBI = Entry->begin();

38350

for (const MCPhysReg *I = IStart; *I; ++I) {

38351

const TargetRegisterClass *RC = nullptr;

38352

if (X86::GR64RegClass.contains(*I))

38353

RC = &X86::GR64RegClass;

38354

else

38355

38356

38357

unsigned NewVR = MRI->createVirtualRegister(RC);

38358

// Create copy from CSR to a virtual register.

38359

// FIXME: this currently does not emit CFI pseudo-instructions, it works

38360

// fine for CXX_FAST_TLS since the C++-style TLS access functions should be

38361

// nounwind. If we want to generalize this later, we may need to emit

38362

// CFI pseudo-instructions.

38363

assert(Entry->getParent()->getFunction()->hasFnAttribute((static_cast <bool> (Entry->getParent()->getFunction
()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? void (0) : __assert_fail ("Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 38365, __extension__ __PRETTY_FUNCTION__))

38364

Attribute::NoUnwind) &&(static_cast <bool> (Entry->getParent()->getFunction
()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? void (0) : __assert_fail ("Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 38365, __extension__ __PRETTY_FUNCTION__))

38365

"Function should be nounwind in insertCopiesSplitCSR!")(static_cast <bool> (Entry->getParent()->getFunction
()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"
) ? void (0) : __assert_fail ("Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && \"Function should be nounwind in insertCopiesSplitCSR!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 38365, __extension__ __PRETTY_FUNCTION__));

38366

Entry->addLiveIn(*I);

38367

BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)

38368

.addReg(*I);

38369

38370

// Insert the copy-back instructions right before the terminator.

38371

for (auto *Exit : Exits)

38372

BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),

38373

TII->get(TargetOpcode::COPY), *I)

38374

.addReg(NewVR);

38375

}

38376

}

38377

38378

bool X86TargetLowering::supportSwiftError() const {

38379

return Subtarget.is64Bit();

38380

}

38381

38382

/// Returns the name of the symbol used to emit stack probes or the empty

38383

/// string if not applicable.

38384

StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {

38385

// If the function specifically requests stack probes, emit them.

38386

if (MF.getFunction()->hasFnAttribute("probe-stack"))

38387

return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();

38388

38389

// Generally, if we aren't on Windows, the platform ABI does not include

38390

// support for stack probes, so don't emit them.

38391

if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())

38392

return "";

38393

38394

// We need a stack probe to conform to the Windows ABI. Choose the right

38395

// symbol.

38396

if (Subtarget.is64Bit())

38397

return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";

38398

return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";

38399

}

←

/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h

//===- llvm/ADT/SmallBitVector.h - 'Normally small' bit vectors -*- C++ -*-===//

// The LLVM Compiler Infrastructure

// This file is distributed under the University of Illinois Open Source

// License. See LICENSE.TXT for details.

//===----------------------------------------------------------------------===//

// This file implements the SmallBitVector class.

//===----------------------------------------------------------------------===//

#ifndef LLVM_ADT_SMALLBITVECTOR_H

#define LLVM_ADT_SMALLBITVECTOR_H

#include "llvm/ADT/BitVector.h"

#include "llvm/ADT/iterator_range.h"

#include "llvm/Support/MathExtras.h"

#include <algorithm>

#include <cassert>

#include <climits>

#include <cstddef>

#include <cstdint>

#include <limits>

#include <utility>

namespace llvm {

/// This is a 'bitvector' (really, a variable-sized bit array), optimized for

/// the case when the array is small. It contains one pointer-sized field, which

/// is directly used as a plain collection of bits when possible, or as a

/// pointer to a larger heap-allocated array when necessary. This allows normal

/// "small" cases to be fast without losing generality for large inputs.

class SmallBitVector {

// TODO: In "large" mode, a pointer to a BitVector is used, leading to an

// unnecessary level of indirection. It would be more efficient to use a

// pointer to memory containing size, allocation size, and the array of bits.

uintptr_t X = 1;

enum {

// The number of bits in this class.

NumBaseBits = sizeof(uintptr_t) * CHAR_BIT8,

// One bit is used to discriminate between small and large mode. The

// remaining bits are used for the small-mode representation.

SmallNumRawBits = NumBaseBits - 1,

// A few more bits are used to store the size of the bit set in small mode.

// Theoretically this is a ceil-log2. These bits are encoded in the most

// significant bits of the raw bits.

SmallNumSizeBits = (NumBaseBits == 32 ? 5 :

NumBaseBits == 64 ? 6 :

SmallNumRawBits),

// The remaining bits are used to store the actual set in small mode.

SmallNumDataBits = SmallNumRawBits - SmallNumSizeBits

};

static_assert(NumBaseBits == 64 || NumBaseBits == 32,

"Unsupported word size");

public:

using size_type = unsigned;

// Encapsulation of a single bit.

class reference {

SmallBitVector &TheVector;

unsigned BitPos;

public:

reference(SmallBitVector &b, unsigned Idx) : TheVector(b), BitPos(Idx) {}

reference(const reference&) = default;

reference& operator=(reference t) {

*this = bool(t);

return *this;

}

reference& operator=(bool t) {

if (t)

TheVector.set(BitPos);

else

TheVector.reset(BitPos);

return *this;

}

operator bool() const {

return const_cast<const SmallBitVector &>(TheVector).operator[](BitPos);

}

};

private:

bool isSmall() const {

return X & uintptr_t(1);

}

BitVector *getPointer() const {

100

assert(!isSmall())(static_cast <bool> (!isSmall()) ? void (0) : __assert_fail
("!isSmall()", "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 100, __extension__ __PRETTY_FUNCTION__));

101

return reinterpret_cast<BitVector *>(X);

102

}

103

104

void switchToSmall(uintptr_t NewSmallBits, size_t NewSize) {

105

X = 1;

106

setSmallSize(NewSize);

107

setSmallBits(NewSmallBits);

108

}

109

110

void switchToLarge(BitVector *BV) {

111

X = reinterpret_cast<uintptr_t>(BV);

112

assert(!isSmall() && "Tried to use an unaligned pointer")(static_cast <bool> (!isSmall() && "Tried to use an unaligned pointer"
) ? void (0) : __assert_fail ("!isSmall() && \"Tried to use an unaligned pointer\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 112, __extension__ __PRETTY_FUNCTION__));

113

}

114

115

// Return all the bits used for the "small" representation; this includes

116

// bits for the size as well as the element bits.

117

uintptr_t getSmallRawBits() const {

118

assert(isSmall())(static_cast <bool> (isSmall()) ? void (0) : __assert_fail
("isSmall()", "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 118, __extension__ __PRETTY_FUNCTION__));

119

return X >> 1;

120

}

121

122

void setSmallRawBits(uintptr_t NewRawBits) {

123

124

X = (NewRawBits << 1) | uintptr_t(1);

125

}

126

127

// Return the size.

128

size_t getSmallSize() const { return getSmallRawBits() >> SmallNumDataBits; }

129

130

void setSmallSize(size_t Size) {

131

setSmallRawBits(getSmallBits() | (Size << SmallNumDataBits));

132

}

133

134

// Return the element bits.

135

uintptr_t getSmallBits() const {

136

return getSmallRawBits() & ~(~uintptr_t(0) << getSmallSize());

137

}

138

139

void setSmallBits(uintptr_t NewBits) {

140

setSmallRawBits((NewBits & ~(~uintptr_t(0) << getSmallSize())) |

141

(getSmallSize() << SmallNumDataBits));

142

}

143

144

public:

145

/// Creates an empty bitvector.

146

SmallBitVector() = default;

147

148

/// Creates a bitvector of specified number of bits. All bits are initialized

149

/// to the specified value.

150

explicit SmallBitVector(unsigned s, bool t = false) {

151

if (s <= SmallNumDataBits)

←

Taking false branch

→

152

switchToSmall(t ? ~uintptr_t(0) : 0, s);

153

else

154

switchToLarge(new BitVector(s, t));

←

Memory is allocated

→

155

}

156

157

/// SmallBitVector copy ctor.

158

SmallBitVector(const SmallBitVector &RHS) {

159

if (RHS.isSmall())

160

X = RHS.X;

161

else

162

switchToLarge(new BitVector(*RHS.getPointer()));

163

}

164

165

SmallBitVector(SmallBitVector &&RHS) : X(RHS.X) {

166

RHS.X = 1;

167

}

168

169

~SmallBitVector() {

170

if (!isSmall())

171

delete getPointer();

172

}

173

174

using const_set_bits_iterator = const_set_bits_iterator_impl<SmallBitVector>;

175

using set_iterator = const_set_bits_iterator;

176

177

const_set_bits_iterator set_bits_begin() const {

178

return const_set_bits_iterator(*this);

179

}

180

181

const_set_bits_iterator set_bits_end() const {

182

return const_set_bits_iterator(*this, -1);

183

}

184

185

iterator_range<const_set_bits_iterator> set_bits() const {

186

return make_range(set_bits_begin(), set_bits_end());

187

}

188

189

/// Tests whether there are no bits in this bitvector.

190

bool empty() const {

191

return isSmall() ? getSmallSize() == 0 : getPointer()->empty();

192

}

193

194

/// Returns the number of bits in this bitvector.

195

size_t size() const {

196

return isSmall() ? getSmallSize() : getPointer()->size();

197

}

198

199

/// Returns the number of bits which are set.

200

size_type count() const {

201

if (isSmall()) {

202

uintptr_t Bits = getSmallBits();

203

return countPopulation(Bits);

204

}

205

return getPointer()->count();

206

}

207

208

/// Returns true if any bit is set.

209

bool any() const {

210

if (isSmall())

211

return getSmallBits() != 0;

212

return getPointer()->any();

213

}

214

215

/// Returns true if all bits are set.

216

bool all() const {

217

if (isSmall())

218

return getSmallBits() == (uintptr_t(1) << getSmallSize()) - 1;

219

return getPointer()->all();

220

}

221

222

/// Returns true if none of the bits are set.

223

bool none() const {

224

if (isSmall())

225

return getSmallBits() == 0;

226

return getPointer()->none();

227

}

228

229

/// Returns the index of the first set bit, -1 if none of the bits are set.

230

int find_first() const {

231

if (isSmall()) {

232

uintptr_t Bits = getSmallBits();

233

if (Bits == 0)

234

return -1;

235

return countTrailingZeros(Bits);

236

}

237

return getPointer()->find_first();

238

}

239

240

int find_last() const {

241

if (isSmall()) {

242

uintptr_t Bits = getSmallBits();

243

if (Bits == 0)

244

return -1;

245

return NumBaseBits - countLeadingZeros(Bits);

246

}

247

return getPointer()->find_last();

248

}

249

250

/// Returns the index of the first unset bit, -1 if all of the bits are set.

251

int find_first_unset() const {

252

if (isSmall()) {

253

if (count() == getSmallSize())

254

return -1;

255

256

uintptr_t Bits = getSmallBits();

257

return countTrailingOnes(Bits);

258

}

259

return getPointer()->find_first_unset();

260

}

261

262

int find_last_unset() const {

263

if (isSmall()) {

264

if (count() == getSmallSize())

265

return -1;

266

267

uintptr_t Bits = getSmallBits();

268

return NumBaseBits - countLeadingOnes(Bits);

269

}

270

return getPointer()->find_last_unset();

271

}

272

273

/// Returns the index of the next set bit following the "Prev" bit.

274

/// Returns -1 if the next set bit is not found.

275

int find_next(unsigned Prev) const {

276

if (isSmall()) {

277

uintptr_t Bits = getSmallBits();

278

// Mask off previous bits.

279

Bits &= ~uintptr_t(0) << (Prev + 1);

280

if (Bits == 0 || Prev + 1 >= getSmallSize())

281

return -1;

282

return countTrailingZeros(Bits);

283

}

284

return getPointer()->find_next(Prev);

285

}

286

287

/// Returns the index of the next unset bit following the "Prev" bit.

288

/// Returns -1 if the next unset bit is not found.

289

int find_next_unset(unsigned Prev) const {

290

if (isSmall()) {

291

++Prev;

292

uintptr_t Bits = getSmallBits();

293

// Mask in previous bits.

294

uintptr_t Mask = (1 << Prev) - 1;

295

Bits |= Mask;

296

297

if (Bits == ~uintptr_t(0) || Prev + 1 >= getSmallSize())

298

return -1;

299

return countTrailingOnes(Bits);

300

}

301

return getPointer()->find_next_unset(Prev);

302

}

303

304

/// find_prev - Returns the index of the first set bit that precedes the

305

/// the bit at \p PriorTo. Returns -1 if all previous bits are unset.

306

int find_prev(unsigned PriorTo) const {

307

if (isSmall()) {

308

if (PriorTo == 0)

309

return -1;

310

311

--PriorTo;

312

uintptr_t Bits = getSmallBits();

313

Bits &= maskTrailingOnes<uintptr_t>(PriorTo + 1);

314

if (Bits == 0)

315

return -1;

316

317

return NumBaseBits - countLeadingZeros(Bits) - 1;

318

}

319

return getPointer()->find_prev(PriorTo);

320

}

321

322

/// Clear all bits.

323

void clear() {

324

if (!isSmall())

325

delete getPointer();

326

switchToSmall(0, 0);

327

}

328

329

/// Grow or shrink the bitvector.

330

void resize(unsigned N, bool t = false) {

331

if (!isSmall()) {

332

getPointer()->resize(N, t);

333

} else if (SmallNumDataBits >= N) {

334

uintptr_t NewBits = t ? ~uintptr_t(0) << getSmallSize() : 0;

335

setSmallSize(N);

336

setSmallBits(NewBits | getSmallBits());

337

} else {

338

BitVector *BV = new BitVector(N, t);

339

uintptr_t OldBits = getSmallBits();

340

for (size_t i = 0, e = getSmallSize(); i != e; ++i)

341

(*BV)[i] = (OldBits >> i) & 1;

342

switchToLarge(BV);

343

}

344

}

345

346

void reserve(unsigned N) {

347

if (isSmall()) {

348

if (N > SmallNumDataBits) {

349

uintptr_t OldBits = getSmallRawBits();

350

size_t SmallSize = getSmallSize();

351

BitVector *BV = new BitVector(SmallSize);

352

for (size_t i = 0; i < SmallSize; ++i)

353

if ((OldBits >> i) & 1)

354

BV->set(i);

355

BV->reserve(N);

356

switchToLarge(BV);

357

}

358

} else {

359

getPointer()->reserve(N);

360

}

361

}

362

363

// Set, reset, flip

364

SmallBitVector &set() {

365

if (isSmall())

366

setSmallBits(~uintptr_t(0));

367

else

368

getPointer()->set();

369

return *this;

370

}

371

372

SmallBitVector &set(unsigned Idx) {

373

if (isSmall()) {

374

assert(Idx <= static_cast<unsigned>((static_cast <bool> (Idx <= static_cast<unsigned>
( std::numeric_limits<uintptr_t>::digits) && "undefined behavior"
) ? void (0) : __assert_fail ("Idx <= static_cast<unsigned>( std::numeric_limits<uintptr_t>::digits) && \"undefined behavior\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 376, __extension__ __PRETTY_FUNCTION__))

375

std::numeric_limits<uintptr_t>::digits) &&(static_cast <bool> (Idx <= static_cast<unsigned>
( std::numeric_limits<uintptr_t>::digits) && "undefined behavior"
) ? void (0) : __assert_fail ("Idx <= static_cast<unsigned>( std::numeric_limits<uintptr_t>::digits) && \"undefined behavior\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 376, __extension__ __PRETTY_FUNCTION__))

376

"undefined behavior")(static_cast <bool> (Idx <= static_cast<unsigned>
( std::numeric_limits<uintptr_t>::digits) && "undefined behavior"
) ? void (0) : __assert_fail ("Idx <= static_cast<unsigned>( std::numeric_limits<uintptr_t>::digits) && \"undefined behavior\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 376, __extension__ __PRETTY_FUNCTION__));

377

setSmallBits(getSmallBits() | (uintptr_t(1) << Idx));

378

}

379

else

380

getPointer()->set(Idx);

381

return *this;

382

}

383

384

/// Efficiently set a range of bits in [I, E)

385

SmallBitVector &set(unsigned I, unsigned E) {

386

assert(I <= E && "Attempted to set backwards range!")(static_cast <bool> (I <= E && "Attempted to set backwards range!"
) ? void (0) : __assert_fail ("I <= E && \"Attempted to set backwards range!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 386, __extension__ __PRETTY_FUNCTION__));

387

assert(E <= size() && "Attempted to set out-of-bounds range!")(static_cast <bool> (E <= size() && "Attempted to set out-of-bounds range!"
) ? void (0) : __assert_fail ("E <= size() && \"Attempted to set out-of-bounds range!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 387, __extension__ __PRETTY_FUNCTION__));

388

if (I == E) return *this;

389

if (isSmall()) {

390

uintptr_t EMask = ((uintptr_t)1) << E;

391

uintptr_t IMask = ((uintptr_t)1) << I;

392

uintptr_t Mask = EMask - IMask;

393

setSmallBits(getSmallBits() | Mask);

394

} else

395

getPointer()->set(I, E);

396

return *this;

397

}

398

399

SmallBitVector &reset() {

400

if (isSmall())

401

setSmallBits(0);

402

else

403

getPointer()->reset();

404

return *this;

405

}

406

407

SmallBitVector &reset(unsigned Idx) {

408

if (isSmall())

409

setSmallBits(getSmallBits() & ~(uintptr_t(1) << Idx));

410

else

411

getPointer()->reset(Idx);

412

return *this;

413

}

414

415

/// Efficiently reset a range of bits in [I, E)

416

SmallBitVector &reset(unsigned I, unsigned E) {

417

assert(I <= E && "Attempted to reset backwards range!")(static_cast <bool> (I <= E && "Attempted to reset backwards range!"
) ? void (0) : __assert_fail ("I <= E && \"Attempted to reset backwards range!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 417, __extension__ __PRETTY_FUNCTION__));

418

assert(E <= size() && "Attempted to reset out-of-bounds range!")(static_cast <bool> (E <= size() && "Attempted to reset out-of-bounds range!"
) ? void (0) : __assert_fail ("E <= size() && \"Attempted to reset out-of-bounds range!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 418, __extension__ __PRETTY_FUNCTION__));

419

if (I == E) return *this;

420

if (isSmall()) {

421

uintptr_t EMask = ((uintptr_t)1) << E;

422

uintptr_t IMask = ((uintptr_t)1) << I;

423

uintptr_t Mask = EMask - IMask;

424

setSmallBits(getSmallBits() & ~Mask);

425

} else

426

getPointer()->reset(I, E);

427

return *this;

428

}

429

430

SmallBitVector &flip() {

431

if (isSmall())

432

setSmallBits(~getSmallBits());

433

else

434

getPointer()->flip();

435

return *this;

436

}

437

438

SmallBitVector &flip(unsigned Idx) {

439

if (isSmall())

440

setSmallBits(getSmallBits() ^ (uintptr_t(1) << Idx));

441

else

442

getPointer()->flip(Idx);

443

return *this;

444

}

445

446

// No argument flip.

447

SmallBitVector operator~() const {

448

return SmallBitVector(*this).flip();

449

}

450

451

// Indexing.

452

reference operator[](unsigned Idx) {

453

assert(Idx < size() && "Out-of-bounds Bit access.")(static_cast <bool> (Idx < size() && "Out-of-bounds Bit access."
) ? void (0) : __assert_fail ("Idx < size() && \"Out-of-bounds Bit access.\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 453, __extension__ __PRETTY_FUNCTION__));

454

return reference(*this, Idx);

455

}

456

457

bool operator[](unsigned Idx) const {

458

459

if (isSmall())

460

return ((getSmallBits() >> Idx) & 1) != 0;

461

return getPointer()->operator[](Idx);

462

}

463

464

bool test(unsigned Idx) const {

465

return (*this)[Idx];

466

}

467

468

/// Test if any common bits are set.

469

bool anyCommon(const SmallBitVector &RHS) const {

470

if (isSmall() && RHS.isSmall())

471

return (getSmallBits() & RHS.getSmallBits()) != 0;

472

if (!isSmall() && !RHS.isSmall())

473

return getPointer()->anyCommon(*RHS.getPointer());

474

475

for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)

476

if (test(i) && RHS.test(i))

477

return true;

478

return false;

479

}

480

481

// Comparison operators.

482

bool operator==(const SmallBitVector &RHS) const {

483

if (size() != RHS.size())

484

return false;

485

if (isSmall())

486

return getSmallBits() == RHS.getSmallBits();

487

else

488

return *getPointer() == *RHS.getPointer();

489

}

490

491

bool operator!=(const SmallBitVector &RHS) const {

492

return !(*this == RHS);

493

}

494

495

// Intersection, union, disjoint union.

496

SmallBitVector &operator&=(const SmallBitVector &RHS) {

497

resize(std::max(size(), RHS.size()));

498

if (isSmall())

499

setSmallBits(getSmallBits() & RHS.getSmallBits());

500

else if (!RHS.isSmall())

501

getPointer()->operator&=(*RHS.getPointer());

502

else {

503

SmallBitVector Copy = RHS;

504

Copy.resize(size());

505

getPointer()->operator&=(*Copy.getPointer());

506

}

507

return *this;

508

}

509

510

/// Reset bits that are set in RHS. Same as *this &= ~RHS.

511

SmallBitVector &reset(const SmallBitVector &RHS) {

512

if (isSmall() && RHS.isSmall())

513

setSmallBits(getSmallBits() & ~RHS.getSmallBits());

514

else if (!isSmall() && !RHS.isSmall())

515

getPointer()->reset(*RHS.getPointer());

516

else

517

for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)

518

if (RHS.test(i))

519

reset(i);

520

521

return *this;

522

}

523

524

/// Check if (This - RHS) is zero. This is the same as reset(RHS) and any().

525

bool test(const SmallBitVector &RHS) const {

526

if (isSmall() && RHS.isSmall())

527

return (getSmallBits() & ~RHS.getSmallBits()) != 0;

528

if (!isSmall() && !RHS.isSmall())

529

return getPointer()->test(*RHS.getPointer());

530

531

unsigned i, e;

532

for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i)

533

if (test(i) && !RHS.test(i))

534

return true;

535

536

for (e = size(); i != e; ++i)

537

if (test(i))

538

return true;

539

540

return false;

541

}

542

543

SmallBitVector &operator|=(const SmallBitVector &RHS) {

544

resize(std::max(size(), RHS.size()));

545

if (isSmall())

546

setSmallBits(getSmallBits() | RHS.getSmallBits());

547

else if (!RHS.isSmall())

548

getPointer()->operator|=(*RHS.getPointer());

549

else {

550

SmallBitVector Copy = RHS;

551

Copy.resize(size());

552

getPointer()->operator|=(*Copy.getPointer());

553

}

554

return *this;

555

}

556

557

SmallBitVector &operator^=(const SmallBitVector &RHS) {

558

resize(std::max(size(), RHS.size()));

559

if (isSmall())

560

setSmallBits(getSmallBits() ^ RHS.getSmallBits());

561

else if (!RHS.isSmall())

562

getPointer()->operator^=(*RHS.getPointer());

563

else {

564

SmallBitVector Copy = RHS;

565

Copy.resize(size());

566

getPointer()->operator^=(*Copy.getPointer());

567

}

568

return *this;

569

}

570

571

SmallBitVector &operator<<=(unsigned N) {

572

if (isSmall())

573

setSmallBits(getSmallBits() << N);

574

else

575

getPointer()->operator<<=(N);

576

return *this;

577

}

578

579

SmallBitVector &operator>>=(unsigned N) {

580

if (isSmall())

581

setSmallBits(getSmallBits() >> N);

582

else

583

getPointer()->operator>>=(N);

584

return *this;

585

}

586

587

// Assignment operator.

588

const SmallBitVector &operator=(const SmallBitVector &RHS) {

589

if (isSmall()) {

590

if (RHS.isSmall())

591

X = RHS.X;

592

else

593

switchToLarge(new BitVector(*RHS.getPointer()));

594

} else {

595

if (!RHS.isSmall())

596

*getPointer() = *RHS.getPointer();

597

else {

598

delete getPointer();

599

X = RHS.X;

600

}

601

}

602

return *this;

603

}

604

605

const SmallBitVector &operator=(SmallBitVector &&RHS) {

606

if (this != &RHS) {

607

clear();

608

swap(RHS);

609

}

610

return *this;

611

}

612

613

void swap(SmallBitVector &RHS) {

614

std::swap(X, RHS.X);

615

}

616

617

/// Add '1' bits from Mask to this vector. Don't resize.

618

/// This computes "*this |= Mask".

619

void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {

620

if (isSmall())

621

applyMask<true, false>(Mask, MaskWords);

622

else

623

getPointer()->setBitsInMask(Mask, MaskWords);

624

}

625

626

/// Clear any bits in this vector that are set in Mask. Don't resize.

627

/// This computes "*this &= ~Mask".

628

void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {

629

if (isSmall())

630

applyMask<false, false>(Mask, MaskWords);

631

else

632

getPointer()->clearBitsInMask(Mask, MaskWords);

633

}

634

635

/// Add a bit to this vector for every '0' bit in Mask. Don't resize.

636

/// This computes "*this |= ~Mask".

637

void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {

638

if (isSmall())

639

applyMask<true, true>(Mask, MaskWords);

640

else

641

getPointer()->setBitsNotInMask(Mask, MaskWords);

642

}

643

644

/// Clear a bit in this vector for every '0' bit in Mask. Don't resize.

645

/// This computes "*this &= Mask".

646

void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {

647

if (isSmall())

648

applyMask<false, true>(Mask, MaskWords);

649

else

650

getPointer()->clearBitsNotInMask(Mask, MaskWords);

651

}

652

653

private:

654

template <bool AddBits, bool InvertMask>

655

void applyMask(const uint32_t *Mask, unsigned MaskWords) {

656

assert(MaskWords <= sizeof(uintptr_t) && "Mask is larger than base!")(static_cast <bool> (MaskWords <= sizeof(uintptr_t) &&
"Mask is larger than base!") ? void (0) : __assert_fail ("MaskWords <= sizeof(uintptr_t) && \"Mask is larger than base!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h"
, 656, __extension__ __PRETTY_FUNCTION__));

657

uintptr_t M = Mask[0];

658

if (NumBaseBits == 64)

659

M |= uint64_t(Mask[1]) << 32;

660

if (InvertMask)

661

M = ~M;

662

if (AddBits)

663

setSmallBits(getSmallBits() | M);

664

else

665

setSmallBits(getSmallBits() & ~M);

666

}

667

};

668

669

/// Returns the intersection of \p LHS and \p RHS as a new vector; neither
/// operand is modified. The work is delegated to SmallBitVector::operator&=.
inline SmallBitVector operator&(const SmallBitVector &LHS,
                                const SmallBitVector &RHS) {
  SmallBitVector Intersection = LHS;
  Intersection &= RHS;
  return Intersection;
}

675

676

/// Returns the union of \p LHS and \p RHS as a new vector; neither operand is
/// modified. The work is delegated to SmallBitVector::operator|=.
inline SmallBitVector operator|(const SmallBitVector &LHS,
                                const SmallBitVector &RHS) {
  SmallBitVector Union = LHS;
  Union |= RHS;
  return Union;
}

682

683

/// Returns the disjoint union (xor) of \p LHS and \p RHS as a new vector;
/// neither operand is modified. The work is delegated to
/// SmallBitVector::operator^=.
inline SmallBitVector operator^(const SmallBitVector &LHS,
                                const SmallBitVector &RHS) {
  SmallBitVector Difference = LHS;
  Difference ^= RHS;
  return Difference;
}

689

690

} // end namespace llvm

691

692

namespace std {

693

694

/// Implement std::swap for llvm::SmallBitVector in terms of the member
/// SmallBitVector::swap, which exchanges the contents of \p LHS and \p RHS.

695

inline void

696

swap(llvm::SmallBitVector &LHS, llvm::SmallBitVector &RHS) {

697

LHS.swap(RHS);

698

}

699

700

} // end namespace std

701

702

#endif // LLVM_ADT_SMALLBITVECTOR_H