Bug Summary

File: lib/Target/X86/X86ISelLowering.cpp
Location: line 6075, column 33
Description: Called C++ object pointer is null
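The message "Called C++ object pointer is null" means the analyzer found at least one feasible path on which a non-static member function is invoked through a pointer whose value is null at the call site (here, the call at line 6075, column 33 of this file). As a rough, hypothetical sketch of this defect class — the names below are illustrative only and are not taken from X86ISelLowering.cpp:

  // Hypothetical reduction of the warning, not the actual LLVM code.
  struct Lowering {
    void lower() {}                           // any non-static member function
  };

  Lowering *lookup(bool found) {
    return found ? new Lowering() : nullptr;  // may return null
  }

  void run(bool found) {
    Lowering *L = lookup(found);              // analyzer tracks the possibly-null result
    L->lower();                               // warning: called C++ object pointer is null
  }

Such reports are typically resolved by guarding the call (e.g. "if (L) L->lower();") or by asserting the invariant that rules out the null path, assuming that invariant really holds for the code in question.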

Annotated Source Code

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
16#include "Utils/X86ShuffleDecode.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallBitVector.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/StringSwitch.h"
28#include "llvm/ADT/VariadicFunction.h"
29#include "llvm/CodeGen/IntrinsicLowering.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/IR/CallSite.h"
37#include "llvm/IR/CallingConv.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DerivedTypes.h"
40#include "llvm/IR/Function.h"
41#include "llvm/IR/GlobalAlias.h"
42#include "llvm/IR/GlobalVariable.h"
43#include "llvm/IR/Instructions.h"
44#include "llvm/IR/Intrinsics.h"
45#include "llvm/MC/MCAsmInfo.h"
46#include "llvm/MC/MCContext.h"
47#include "llvm/MC/MCExpr.h"
48#include "llvm/MC/MCSymbol.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/Debug.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Support/MathExtras.h"
53#include "llvm/Target/TargetOptions.h"
54#include "X86IntrinsicsInfo.h"
55#include <bitset>
56#include <numeric>
57#include <cctype>
58using namespace llvm;
59
60#define DEBUG_TYPE "x86-isel"
61
62STATISTIC(NumTailCalls, "Number of tail calls");
63
64static cl::opt<bool> ExperimentalVectorWideningLegalization(
65 "x86-experimental-vector-widening-legalization", cl::init(false),
66 cl::desc("Enable an experimental vector type legalization through widening "
67 "rather than promotion."),
68 cl::Hidden);
69
70static cl::opt<bool> ExperimentalVectorShuffleLowering(
71 "x86-experimental-vector-shuffle-lowering", cl::init(true),
72 cl::desc("Enable an experimental vector shuffle lowering code path."),
73 cl::Hidden);
74
75static cl::opt<bool> ExperimentalVectorShuffleLegality(
76 "x86-experimental-vector-shuffle-legality", cl::init(false),
77 cl::desc("Enable experimental shuffle legality based on the experimental "
78 "shuffle lowering. Should only be used with the experimental "
79 "shuffle lowering."),
80 cl::Hidden);
81
82static cl::opt<int> ReciprocalEstimateRefinementSteps(
83 "x86-recip-refinement-steps", cl::init(1),
84 cl::desc("Specify the number of Newton-Raphson iterations applied to the "
85 "result of the hardware reciprocal estimate instruction."),
86 cl::NotHidden);
87
88// Forward declarations.
89static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
90 SDValue V2);
91
92static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
93 SelectionDAG &DAG, SDLoc dl,
94 unsigned vectorWidth) {
95 assert((vectorWidth == 128 || vectorWidth == 256) &&
96 "Unsupported vector width");
97 EVT VT = Vec.getValueType();
98 EVT ElVT = VT.getVectorElementType();
99 unsigned Factor = VT.getSizeInBits()/vectorWidth;
100 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
101 VT.getVectorNumElements()/Factor);
102
103 // Extract from UNDEF is UNDEF.
104 if (Vec.getOpcode() == ISD::UNDEF)
105 return DAG.getUNDEF(ResultVT);
106
107 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
108 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
109
110 // This is the index of the first element of the vectorWidth-bit chunk
111 // we want.
112 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
113 * ElemsPerChunk);
114
115 // If the input is a buildvector just emit a smaller one.
116 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
117 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
118 makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
119 ElemsPerChunk));
120
121 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
122 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
123}
124
125/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
126/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
127/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
128/// instructions or a simple subregister reference. Idx is an index in the
129/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
130/// lowering EXTRACT_VECTOR_ELT operations easier.
131static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
132 SelectionDAG &DAG, SDLoc dl) {
133 assert((Vec.getValueType().is256BitVector() ||
134 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
135 return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
136}
137
138/// Generate a DAG to grab 256-bits from a 512-bit vector.
139static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
140 SelectionDAG &DAG, SDLoc dl) {
141 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
142 return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
143}
144
145static SDValue InsertSubVector(SDValue Result, SDValue Vec,
146 unsigned IdxVal, SelectionDAG &DAG,
147 SDLoc dl, unsigned vectorWidth) {
148 assert((vectorWidth == 128 || vectorWidth == 256) &&
149 "Unsupported vector width");
150 // Inserting UNDEF is Result
151 if (Vec.getOpcode() == ISD::UNDEF)
152 return Result;
153 EVT VT = Vec.getValueType();
154 EVT ElVT = VT.getVectorElementType();
155 EVT ResultVT = Result.getValueType();
156
157 // Insert the relevant vectorWidth bits.
158 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
159
160 // This is the index of the first element of the vectorWidth-bit chunk
161 // we want.
162 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
163 * ElemsPerChunk);
164
165 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
166 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
167}
168
169/// Generate a DAG to put 128-bits into a vector > 128 bits. This
170/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
171/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
172/// simple superregister reference. Idx is an index in the 128 bits
173/// we want. It need not be aligned to a 128-bit boundary. That makes
174/// lowering INSERT_VECTOR_ELT operations easier.
175static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
176 SelectionDAG &DAG,SDLoc dl) {
177 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
178 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
179}
180
181static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
182 SelectionDAG &DAG, SDLoc dl) {
183 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
184 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
185}
186
187/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
188/// instructions. This is used because creating CONCAT_VECTOR nodes of
189/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
190/// large BUILD_VECTORS.
191static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
192 unsigned NumElems, SelectionDAG &DAG,
193 SDLoc dl) {
194 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
195 return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
196}
197
198static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
199 unsigned NumElems, SelectionDAG &DAG,
200 SDLoc dl) {
201 SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
202 return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
203}
204
205X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
206 : TargetLowering(TM) {
207 Subtarget = &TM.getSubtarget<X86Subtarget>();
208 X86ScalarSSEf64 = Subtarget->hasSSE2();
209 X86ScalarSSEf32 = Subtarget->hasSSE1();
210 TD = getDataLayout();
211
212 // Set up the TargetLowering object.
213 static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
214
215 // X86 is weird. It always uses i8 for shift amounts and setcc results.
216 setBooleanContents(ZeroOrOneBooleanContent);
217 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
218 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
219
220 // For 64-bit, since we have so many registers, use the ILP scheduler.
221 // For 32-bit, use the register pressure specific scheduling.
222 // For Atom, always use ILP scheduling.
223 if (Subtarget->isAtom())
224 setSchedulingPreference(Sched::ILP);
225 else if (Subtarget->is64Bit())
226 setSchedulingPreference(Sched::ILP);
227 else
228 setSchedulingPreference(Sched::RegPressure);
229 const X86RegisterInfo *RegInfo =
230 TM.getSubtarget<X86Subtarget>().getRegisterInfo();
231 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
232
233 // Bypass expensive divides on Atom when compiling with O2.
234 if (TM.getOptLevel() >= CodeGenOpt::Default) {
235 if (Subtarget->hasSlowDivide32())
236 addBypassSlowDiv(32, 8);
237 if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
238 addBypassSlowDiv(64, 16);
239 }
240
241 if (Subtarget->isTargetKnownWindowsMSVC()) {
242 // Setup Windows compiler runtime calls.
243 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
244 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
245 setLibcallName(RTLIB::SREM_I64, "_allrem");
246 setLibcallName(RTLIB::UREM_I64, "_aullrem");
247 setLibcallName(RTLIB::MUL_I64, "_allmul");
248 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
249 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
250 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
251 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
252 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
253
254 // The _ftol2 runtime function has an unusual calling conv, which
255 // is modeled by a special pseudo-instruction.
256 setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
257 setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
258 setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
259 setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
260 }
261
262 if (Subtarget->isTargetDarwin()) {
263 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
264 setUseUnderscoreSetJmp(false);
265 setUseUnderscoreLongJmp(false);
266 } else if (Subtarget->isTargetWindowsGNU()) {
267 // MS runtime is weird: it exports _setjmp, but longjmp!
268 setUseUnderscoreSetJmp(true);
269 setUseUnderscoreLongJmp(false);
270 } else {
271 setUseUnderscoreSetJmp(true);
272 setUseUnderscoreLongJmp(true);
273 }
274
275 // Set up the register classes.
276 addRegisterClass(MVT::i8, &X86::GR8RegClass);
277 addRegisterClass(MVT::i16, &X86::GR16RegClass);
278 addRegisterClass(MVT::i32, &X86::GR32RegClass);
279 if (Subtarget->is64Bit())
280 addRegisterClass(MVT::i64, &X86::GR64RegClass);
281
282 for (MVT VT : MVT::integer_valuetypes())
283 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
284
285 // We don't accept any truncstore of integer registers.
286 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
287 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
288 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
289 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
290 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
291 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
292
293 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
294
295 // SETOEQ and SETUNE require checking two conditions.
296 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
297 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
298 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
299 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
300 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
301 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
302
303 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
304 // operation.
305 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
306 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
307 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
308
309 if (Subtarget->is64Bit()) {
310 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
311 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
312 } else if (!TM.Options.UseSoftFloat) {
313 // We have an algorithm for SSE2->double, and we turn this into a
314 // 64-bit FILD followed by conditional FADD for other targets.
315 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
316 // We have an algorithm for SSE2, and we turn this into a 64-bit
317 // FILD for other targets.
318 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
319 }
320
321 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
322 // this operation.
323 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
324 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
325
326 if (!TM.Options.UseSoftFloat) {
327 // SSE has no i16 to fp conversion, only i32
328 if (X86ScalarSSEf32) {
329 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
330 // f32 and f64 cases are Legal, f80 case is not
331 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
332 } else {
333 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
334 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
335 }
336 } else {
337 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
338 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
339 }
340
341 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
342 // are Legal, f80 is custom lowered.
343 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
344 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
345
346 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
347 // this operation.
348 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
349 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
350
351 if (X86ScalarSSEf32) {
352 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
353 // f32 and f64 cases are Legal, f80 case is not
354 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
355 } else {
356 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
357 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
358 }
359
360 // Handle FP_TO_UINT by promoting the destination to a larger signed
361 // conversion.
362 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
363 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
364 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
365
366 if (Subtarget->is64Bit()) {
367 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
368 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
369 } else if (!TM.Options.UseSoftFloat) {
370 // Since AVX is a superset of SSE3, only check for SSE here.
371 if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
372 // Expand FP_TO_UINT into a select.
373 // FIXME: We would like to use a Custom expander here eventually to do
374 // the optimal thing for SSE vs. the default expansion in the legalizer.
375 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
376 else
377 // With SSE3 we can use fisttpll to convert to a signed i64; without
378 // SSE, we're stuck with a fistpll.
379 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
380 }
381
382 if (isTargetFTOL()) {
383 // Use the _ftol2 runtime function, which has a pseudo-instruction
384 // to handle its weird calling convention.
385 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
386 }
387
388 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
389 if (!X86ScalarSSEf64) {
390 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
391 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
392 if (Subtarget->is64Bit()) {
393 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
394 // Without SSE, i64->f64 goes through memory.
395 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
396 }
397 }
398
399 // Scalar integer divide and remainder are lowered to use operations that
400 // produce two results, to match the available instructions. This exposes
401 // the two-result form to trivial CSE, which is able to combine x/y and x%y
402 // into a single instruction.
403 //
404 // Scalar integer multiply-high is also lowered to use two-result
405 // operations, to match the available instructions. However, plain multiply
406 // (low) operations are left as Legal, as there are single-result
407 // instructions for this in x86. Using the two-result multiply instructions
408 // when both high and low results are needed must be arranged by dagcombine.
409 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
410 MVT VT = IntVTs[i];
411 setOperationAction(ISD::MULHS, VT, Expand);
412 setOperationAction(ISD::MULHU, VT, Expand);
413 setOperationAction(ISD::SDIV, VT, Expand);
414 setOperationAction(ISD::UDIV, VT, Expand);
415 setOperationAction(ISD::SREM, VT, Expand);
416 setOperationAction(ISD::UREM, VT, Expand);
417
418 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
419 setOperationAction(ISD::ADDC, VT, Custom);
420 setOperationAction(ISD::ADDE, VT, Custom);
421 setOperationAction(ISD::SUBC, VT, Custom);
422 setOperationAction(ISD::SUBE, VT, Custom);
423 }
424
425 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
426 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
427 setOperationAction(ISD::BR_CC , MVT::f32, Expand);
428 setOperationAction(ISD::BR_CC , MVT::f64, Expand);
429 setOperationAction(ISD::BR_CC , MVT::f80, Expand);
430 setOperationAction(ISD::BR_CC , MVT::i8, Expand);
431 setOperationAction(ISD::BR_CC , MVT::i16, Expand);
432 setOperationAction(ISD::BR_CC , MVT::i32, Expand);
433 setOperationAction(ISD::BR_CC , MVT::i64, Expand);
434 setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
435 setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
436 setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
437 setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
438 setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
439 setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
440 setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
441 if (Subtarget->is64Bit())
442 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
443 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
444 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
445 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
446 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
447 setOperationAction(ISD::FREM , MVT::f32 , Expand);
448 setOperationAction(ISD::FREM , MVT::f64 , Expand);
449 setOperationAction(ISD::FREM , MVT::f80 , Expand);
450 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
451
452 // Promote the i8 variants and force them on up to i32 which has a shorter
453 // encoding.
454 setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
455 AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
456 setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
457 AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
458 if (Subtarget->hasBMI()) {
459 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
460 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
461 if (Subtarget->is64Bit())
462 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
463 } else {
464 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
465 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
466 if (Subtarget->is64Bit())
467 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
468 }
469
470 if (Subtarget->hasLZCNT()) {
471 // When promoting the i8 variants, force them to i32 for a shorter
472 // encoding.
473 setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
474 AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
475 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
476 AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
477 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
478 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
479 if (Subtarget->is64Bit())
480 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
481 } else {
482 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
483 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
484 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
485 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
486 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
487 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
488 if (Subtarget->is64Bit()) {
489 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
490 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
491 }
492 }
493
494 // Special handling for half-precision floating point conversions.
495 // If we don't have F16C support, then lower half float conversions
496 // into library calls.
497 if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
498 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
499 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
500 }
501
502 // There's never any support for operations beyond MVT::f32.
503 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
504 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
505 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
506 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
507
508 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
509 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
510 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
511 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
512 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
513 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
514
515 if (Subtarget->hasPOPCNT()) {
516 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
517 } else {
518 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
519 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
520 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
521 if (Subtarget->is64Bit())
522 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
523 }
524
525 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
526
527 if (!Subtarget->hasMOVBE())
528 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
529
530 // These should be promoted to a larger select which is supported.
531 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
532 // X86 wants to expand cmov itself.
533 setOperationAction(ISD::SELECT , MVT::i8 , Custom);
534 setOperationAction(ISD::SELECT , MVT::i16 , Custom);
535 setOperationAction(ISD::SELECT , MVT::i32 , Custom);
536 setOperationAction(ISD::SELECT , MVT::f32 , Custom);
537 setOperationAction(ISD::SELECT , MVT::f64 , Custom);
538 setOperationAction(ISD::SELECT , MVT::f80 , Custom);
539 setOperationAction(ISD::SETCC , MVT::i8 , Custom);
540 setOperationAction(ISD::SETCC , MVT::i16 , Custom);
541 setOperationAction(ISD::SETCC , MVT::i32 , Custom);
542 setOperationAction(ISD::SETCC , MVT::f32 , Custom);
543 setOperationAction(ISD::SETCC , MVT::f64 , Custom);
544 setOperationAction(ISD::SETCC , MVT::f80 , Custom);
545 if (Subtarget->is64Bit()) {
546 setOperationAction(ISD::SELECT , MVT::i64 , Custom);
547 setOperationAction(ISD::SETCC , MVT::i64 , Custom);
548 }
549 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
550 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
551 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
552 // support continuation, user-level threading, and etc.. As a result, no
553 // other SjLj exception interfaces are implemented and please don't build
554 // your own exception handling based on them.
555 // LLVM/Clang supports zero-cost DWARF exception handling.
556 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
557 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
558
559 // Darwin ABI issue.
560 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
561 setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
562 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
563 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
564 if (Subtarget->is64Bit())
565 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
566 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
567 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
568 if (Subtarget->is64Bit()) {
569 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
570 setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
571 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
572 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
573 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
574 }
575 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
576 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
577 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
578 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
579 if (Subtarget->is64Bit()) {
580 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
581 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
582 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
583 }
584
585 if (Subtarget->hasSSE1())
586 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
587
588 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
589
590 // Expand certain atomics
591 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
592 MVT VT = IntVTs[i];
593 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
594 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
595 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
596 }
597
598 if (Subtarget->hasCmpxchg16b()) {
599 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
600 }
601
602 // FIXME - use subtarget debug flags
603 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
604 !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
605 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
606 }
607
608 if (Subtarget->is64Bit()) {
609 setExceptionPointerRegister(X86::RAX);
610 setExceptionSelectorRegister(X86::RDX);
611 } else {
612 setExceptionPointerRegister(X86::EAX);
613 setExceptionSelectorRegister(X86::EDX);
614 }
615 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
616 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
617
618 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
619 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
620
621 setOperationAction(ISD::TRAP, MVT::Other, Legal);
622 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
623
624 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
625 setOperationAction(ISD::VASTART , MVT::Other, Custom);
626 setOperationAction(ISD::VAEND , MVT::Other, Expand);
627 if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
628 // TargetInfo::X86_64ABIBuiltinVaList
629 setOperationAction(ISD::VAARG , MVT::Other, Custom);
630 setOperationAction(ISD::VACOPY , MVT::Other, Custom);
631 } else {
632 // TargetInfo::CharPtrBuiltinVaList
633 setOperationAction(ISD::VAARG , MVT::Other, Expand);
634 setOperationAction(ISD::VACOPY , MVT::Other, Expand);
635 }
636
637 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
638 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
639
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
641
642 if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
643 // f32 and f64 use SSE.
644 // Set up the FP register classes.
645 addRegisterClass(MVT::f32, &X86::FR32RegClass);
646 addRegisterClass(MVT::f64, &X86::FR64RegClass);
647
648 // Use ANDPD to simulate FABS.
649 setOperationAction(ISD::FABS , MVT::f64, Custom);
650 setOperationAction(ISD::FABS , MVT::f32, Custom);
651
652 // Use XORP to simulate FNEG.
653 setOperationAction(ISD::FNEG , MVT::f64, Custom);
654 setOperationAction(ISD::FNEG , MVT::f32, Custom);
655
656 // Use ANDPD and ORPD to simulate FCOPYSIGN.
657 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
658 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
659
660 // Lower this to FGETSIGNx86 plus an AND.
661 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
662 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
663
664 // We don't support sin/cos/fmod
665 setOperationAction(ISD::FSIN , MVT::f64, Expand);
666 setOperationAction(ISD::FCOS , MVT::f64, Expand);
667 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
668 setOperationAction(ISD::FSIN , MVT::f32, Expand);
669 setOperationAction(ISD::FCOS , MVT::f32, Expand);
670 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
671
672 // Expand FP immediates into loads from the stack, except for the special
673 // cases we handle.
674 addLegalFPImmediate(APFloat(+0.0)); // xorpd
675 addLegalFPImmediate(APFloat(+0.0f)); // xorps
676 } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
677 // Use SSE for f32, x87 for f64.
678 // Set up the FP register classes.
679 addRegisterClass(MVT::f32, &X86::FR32RegClass);
680 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
681
682 // Use ANDPS to simulate FABS.
683 setOperationAction(ISD::FABS , MVT::f32, Custom);
684
685 // Use XORP to simulate FNEG.
686 setOperationAction(ISD::FNEG , MVT::f32, Custom);
687
688 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
689
690 // Use ANDPS and ORPS to simulate FCOPYSIGN.
691 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
692 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
693
694 // We don't support sin/cos/fmod
695 setOperationAction(ISD::FSIN , MVT::f32, Expand);
696 setOperationAction(ISD::FCOS , MVT::f32, Expand);
697 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
698
699 // Special cases we handle for FP constants.
700 addLegalFPImmediate(APFloat(+0.0f)); // xorps
701 addLegalFPImmediate(APFloat(+0.0)); // FLD0
702 addLegalFPImmediate(APFloat(+1.0)); // FLD1
703 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
704 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
705
706 if (!TM.Options.UnsafeFPMath) {
707 setOperationAction(ISD::FSIN , MVT::f64, Expand);
708 setOperationAction(ISD::FCOS , MVT::f64, Expand);
709 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
710 }
711 } else if (!TM.Options.UseSoftFloat) {
712 // f32 and f64 in x87.
713 // Set up the FP register classes.
714 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
715 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
716
717 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
718 setOperationAction(ISD::UNDEF, MVT::f32, Expand);
719 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
720 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
721
722 if (!TM.Options.UnsafeFPMath) {
723 setOperationAction(ISD::FSIN , MVT::f64, Expand);
724 setOperationAction(ISD::FSIN , MVT::f32, Expand);
725 setOperationAction(ISD::FCOS , MVT::f64, Expand);
726 setOperationAction(ISD::FCOS , MVT::f32, Expand);
727 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
729 }
730 addLegalFPImmediate(APFloat(+0.0)); // FLD0
731 addLegalFPImmediate(APFloat(+1.0)); // FLD1
732 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
733 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
734 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
735 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
736 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
737 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
738 }
739
740 // We don't support FMA.
741 setOperationAction(ISD::FMA, MVT::f64, Expand);
742 setOperationAction(ISD::FMA, MVT::f32, Expand);
743
744 // Long double always uses X87.
745 if (!TM.Options.UseSoftFloat) {
746 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
747 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
748 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
749 {
750 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
751 addLegalFPImmediate(TmpFlt); // FLD0
752 TmpFlt.changeSign();
753 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
754
755 bool ignored;
756 APFloat TmpFlt2(+1.0);
757 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
758 &ignored);
759 addLegalFPImmediate(TmpFlt2); // FLD1
760 TmpFlt2.changeSign();
761 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
762 }
763
764 if (!TM.Options.UnsafeFPMath) {
765 setOperationAction(ISD::FSIN , MVT::f80, Expand);
766 setOperationAction(ISD::FCOS , MVT::f80, Expand);
767 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
768 }
769
770 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
771 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
772 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
773 setOperationAction(ISD::FRINT, MVT::f80, Expand);
774 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
775 setOperationAction(ISD::FMA, MVT::f80, Expand);
776 }
777
778 // Always use a library call for pow.
779 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
780 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
781 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
782
783 setOperationAction(ISD::FLOG, MVT::f80, Expand);
784 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
785 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
786 setOperationAction(ISD::FEXP, MVT::f80, Expand);
787 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
788 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
789 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
790
791 // First set operation action for all vector types to either promote
792 // (for widening) or expand (for scalarization). Then we will selectively
793 // turn on ones that can be effectively codegen'd.
794 for (MVT VT : MVT::vector_valuetypes()) {
795 setOperationAction(ISD::ADD , VT, Expand);
796 setOperationAction(ISD::SUB , VT, Expand);
797 setOperationAction(ISD::FADD, VT, Expand);
798 setOperationAction(ISD::FNEG, VT, Expand);
799 setOperationAction(ISD::FSUB, VT, Expand);
800 setOperationAction(ISD::MUL , VT, Expand);
801 setOperationAction(ISD::FMUL, VT, Expand);
802 setOperationAction(ISD::SDIV, VT, Expand);
803 setOperationAction(ISD::UDIV, VT, Expand);
804 setOperationAction(ISD::FDIV, VT, Expand);
805 setOperationAction(ISD::SREM, VT, Expand);
806 setOperationAction(ISD::UREM, VT, Expand);
807 setOperationAction(ISD::LOAD, VT, Expand);
808 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
809 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
810 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
811 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
812 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
813 setOperationAction(ISD::FABS, VT, Expand);
814 setOperationAction(ISD::FSIN, VT, Expand);
815 setOperationAction(ISD::FSINCOS, VT, Expand);
816 setOperationAction(ISD::FCOS, VT, Expand);
817 setOperationAction(ISD::FSINCOS, VT, Expand);
818 setOperationAction(ISD::FREM, VT, Expand);
819 setOperationAction(ISD::FMA, VT, Expand);
820 setOperationAction(ISD::FPOWI, VT, Expand);
821 setOperationAction(ISD::FSQRT, VT, Expand);
822 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
823 setOperationAction(ISD::FFLOOR, VT, Expand);
824 setOperationAction(ISD::FCEIL, VT, Expand);
825 setOperationAction(ISD::FTRUNC, VT, Expand);
826 setOperationAction(ISD::FRINT, VT, Expand);
827 setOperationAction(ISD::FNEARBYINT, VT, Expand);
828 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
829 setOperationAction(ISD::MULHS, VT, Expand);
830 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
831 setOperationAction(ISD::MULHU, VT, Expand);
832 setOperationAction(ISD::SDIVREM, VT, Expand);
833 setOperationAction(ISD::UDIVREM, VT, Expand);
834 setOperationAction(ISD::FPOW, VT, Expand);
835 setOperationAction(ISD::CTPOP, VT, Expand);
836 setOperationAction(ISD::CTTZ, VT, Expand);
837 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
838 setOperationAction(ISD::CTLZ, VT, Expand);
839 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
840 setOperationAction(ISD::SHL, VT, Expand);
841 setOperationAction(ISD::SRA, VT, Expand);
842 setOperationAction(ISD::SRL, VT, Expand);
843 setOperationAction(ISD::ROTL, VT, Expand);
844 setOperationAction(ISD::ROTR, VT, Expand);
845 setOperationAction(ISD::BSWAP, VT, Expand);
846 setOperationAction(ISD::SETCC, VT, Expand);
847 setOperationAction(ISD::FLOG, VT, Expand);
848 setOperationAction(ISD::FLOG2, VT, Expand);
849 setOperationAction(ISD::FLOG10, VT, Expand);
850 setOperationAction(ISD::FEXP, VT, Expand);
851 setOperationAction(ISD::FEXP2, VT, Expand);
852 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
853 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
854 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
855 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
856 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
857 setOperationAction(ISD::TRUNCATE, VT, Expand);
858 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
859 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
860 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
861 setOperationAction(ISD::VSELECT, VT, Expand);
862 setOperationAction(ISD::SELECT_CC, VT, Expand);
863 for (MVT InnerVT : MVT::vector_valuetypes()) {
864 setTruncStoreAction(InnerVT, VT, Expand);
865
866 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
867 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
868
869 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
870 // types, we have to deal with them whether we ask for Expansion or not.
871 // Setting Expand causes its own optimisation problems though, so leave
872 // them legal.
873 if (VT.getVectorElementType() == MVT::i1)
874 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875 }
876 }
877
878 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879 // with -msoft-float, disable use of MMX as well.
880 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
881 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882 // No operations on x86mmx supported, everything uses intrinsics.
883 }
884
885 // MMX-sized vectors (other than x86mmx) are expected to be expanded
886 // into smaller operations.
887 setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
888 setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
889 setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
890 setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
891 setOperationAction(ISD::AND, MVT::v8i8, Expand);
892 setOperationAction(ISD::AND, MVT::v4i16, Expand);
893 setOperationAction(ISD::AND, MVT::v2i32, Expand);
894 setOperationAction(ISD::AND, MVT::v1i64, Expand);
895 setOperationAction(ISD::OR, MVT::v8i8, Expand);
896 setOperationAction(ISD::OR, MVT::v4i16, Expand);
897 setOperationAction(ISD::OR, MVT::v2i32, Expand);
898 setOperationAction(ISD::OR, MVT::v1i64, Expand);
899 setOperationAction(ISD::XOR, MVT::v8i8, Expand);
900 setOperationAction(ISD::XOR, MVT::v4i16, Expand);
901 setOperationAction(ISD::XOR, MVT::v2i32, Expand);
902 setOperationAction(ISD::XOR, MVT::v1i64, Expand);
903 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
904 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
905 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
906 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
907 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
908 setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
909 setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
910 setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
911 setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
912 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
913 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
914 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
915 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
916
917 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
918 addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
919
920 setOperationAction(ISD::FADD, MVT::v4f32, Legal);
921 setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
922 setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
923 setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
924 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
925 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
926 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
927 setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
928 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
929 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
931 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
932 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
933 }
934
935 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
936 addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
937
938 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
939 // registers cannot be used even for integer operations.
940 addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
941 addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
942 addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
943 addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
944
945 setOperationAction(ISD::ADD, MVT::v16i8, Legal);
946 setOperationAction(ISD::ADD, MVT::v8i16, Legal);
947 setOperationAction(ISD::ADD, MVT::v4i32, Legal);
948 setOperationAction(ISD::ADD, MVT::v2i64, Legal);
949 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
950 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
951 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
952 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
953 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
954 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
955 setOperationAction(ISD::SUB, MVT::v16i8, Legal);
956 setOperationAction(ISD::SUB, MVT::v8i16, Legal);
957 setOperationAction(ISD::SUB, MVT::v4i32, Legal);
958 setOperationAction(ISD::SUB, MVT::v2i64, Legal);
959 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
960 setOperationAction(ISD::FADD, MVT::v2f64, Legal);
961 setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
962 setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
963 setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
964 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
965 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
966 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
967
968 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
969 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
970 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
971 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
972
973 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
974 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
975 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
976 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
977 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
978
979 // Only provide customized ctpop vector bit twiddling for vector types we
980 // know to perform better than using the popcnt instructions on each vector
981 // element. If popcnt isn't supported, always provide the custom version.
982 if (!Subtarget->hasPOPCNT()) {
983 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
984 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
985 }
986
987 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
988 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
989 MVT VT = (MVT::SimpleValueType)i;
990 // Do not attempt to custom lower non-power-of-2 vectors
991 if (!isPowerOf2_32(VT.getVectorNumElements()))
992 continue;
993 // Do not attempt to custom lower non-128-bit vectors
994 if (!VT.is128BitVector())
995 continue;
996 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
997 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
998 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
999 }
1000
1001 // We support custom legalizing of sext and anyext loads for specific
1002 // memory vector types which we can load as a scalar (or sequence of
1003 // scalars) and extend in-register to a legal 128-bit vector type. For sext
1004 // loads these must work with a single scalar load.
1005 for (MVT VT : MVT::integer_vector_valuetypes()) {
1006 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1007 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1008 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1009 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1010 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1011 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1012 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1013 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1014 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1015 }
1016
1017 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
1018 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
1019 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
1020 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
1021 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
1022 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1023
1024 if (Subtarget->is64Bit()) {
1025 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
1026 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1027 }
1028
1029 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1030 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1031 MVT VT = (MVT::SimpleValueType)i;
1032
1033 // Do not attempt to promote non-128-bit vectors
1034 if (!VT.is128BitVector())
1035 continue;
1036
1037 setOperationAction(ISD::AND, VT, Promote);
1038 AddPromotedToType (ISD::AND, VT, MVT::v2i64);
1039 setOperationAction(ISD::OR, VT, Promote);
1040 AddPromotedToType (ISD::OR, VT, MVT::v2i64);
1041 setOperationAction(ISD::XOR, VT, Promote);
1042 AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
1043 setOperationAction(ISD::LOAD, VT, Promote);
1044 AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
1045 setOperationAction(ISD::SELECT, VT, Promote);
1046 AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1047 }
1048
1049 // Custom lower v2i64 and v2f64 selects.
1050 setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
1051 setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
1052 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1053 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1054
1055 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1056 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1057
1058 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
1059 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1060 // As there is no 64-bit GPR available, we need build a special custom
1061 // sequence to convert from v2i32 to v2f32.
1062 if (!Subtarget->is64Bit())
1063 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1064
1065 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1066 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1067
1068 for (MVT VT : MVT::fp_vector_valuetypes())
1069 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1070
1071 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1072 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1073 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1074 }
1075
1076 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1077 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1078 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1079 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1080 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1081 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1082 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1083 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1084 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1085 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1086 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1087
1088 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1089 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1090 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1091 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1092 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
1093 setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
1094 setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
1095 setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
1096 setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
1097 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
1098
1099 // FIXME: Do we need to handle scalar-to-vector here?
1100 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1101
1102 setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
1103 setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
1104 setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
1105 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1106 setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
1107 // There is no BLENDI for byte vectors. We don't need to custom lower
1108 // some vselects for now.
1109 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1110
1111 // SSE41 brings specific instructions for doing vector sign extend even in
1112 // cases where we don't have SRA.
1113 for (MVT VT : MVT::integer_vector_valuetypes()) {
1114 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1115 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1116 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1117 }
1118
1119 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1120 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
1121 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1122 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1123 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1124 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1125 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1126
1127 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
1128 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1129 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1130 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1131 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1132 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1133
1134 // i8 and i16 vectors are custom because the source register and source
1135 // source memory operand types are not the same width. f32 vectors are
1136 // custom since the immediate controlling the insert encodes additional
1137 // information.
1138 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1139 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1140 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1141 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1142
1143 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1144 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1145 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1146 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1147
1148 // FIXME: these should be Legal, but that's only for the case where
1149 // the index is constant. For now custom expand to deal with that.
1150 if (Subtarget->is64Bit()) {
1151 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
1152 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1153 }
1154 }
1155
1156 if (Subtarget->hasSSE2()) {
1157 setOperationAction(ISD::SRL, MVT::v8i16, Custom);
1158 setOperationAction(ISD::SRL, MVT::v16i8, Custom);
1159
1160 setOperationAction(ISD::SHL, MVT::v8i16, Custom);
1161 setOperationAction(ISD::SHL, MVT::v16i8, Custom);
1162
1163 setOperationAction(ISD::SRA, MVT::v8i16, Custom);
1164 setOperationAction(ISD::SRA, MVT::v16i8, Custom);
1165
1166 // In the customized shift lowering, the legal cases in AVX2 will be
1167 // recognized.
1168 setOperationAction(ISD::SRL, MVT::v2i64, Custom);
1169 setOperationAction(ISD::SRL, MVT::v4i32, Custom);
1170
1171 setOperationAction(ISD::SHL, MVT::v2i64, Custom);
1172 setOperationAction(ISD::SHL, MVT::v4i32, Custom);
1173
1174 setOperationAction(ISD::SRA, MVT::v4i32, Custom);
1175 }
1176
1177 if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1178 addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
1179 addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1180 addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
1181 addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
1182 addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
1183 addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
1184
1185 setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
1186 setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
1187 setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
1188
1189 setOperationAction(ISD::FADD, MVT::v8f32, Legal);
1190 setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
1191 setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
1192 setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
1193 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
1194 setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
1195 setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
1196 setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
1197 setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
1198 setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
1199 setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
1200 setOperationAction(ISD::FABS, MVT::v8f32, Custom);
1201
1202 setOperationAction(ISD::FADD, MVT::v4f64, Legal);
1203 setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
1204 setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
1205 setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
1206 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
1207 setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
1208 setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
1209 setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
1210 setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
1211 setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
1212 setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
1213 setOperationAction(ISD::FABS, MVT::v4f64, Custom);
1214
1215 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1216 // even though v8i16 is a legal type.
1217 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
1218 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
1219 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1220
1221 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
1222 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1223 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1224
1225 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1226 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1227
1228 for (MVT VT : MVT::fp_vector_valuetypes())
1229 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1230
1231 setOperationAction(ISD::SRL, MVT::v16i16, Custom);
1232 setOperationAction(ISD::SRL, MVT::v32i8, Custom);
1233
1234 setOperationAction(ISD::SHL, MVT::v16i16, Custom);
1235 setOperationAction(ISD::SHL, MVT::v32i8, Custom);
1236
1237 setOperationAction(ISD::SRA, MVT::v16i16, Custom);
1238 setOperationAction(ISD::SRA, MVT::v32i8, Custom);
1239
1240 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
1241 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
1242 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
1243 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
1244
1245 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1246 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1247 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1248
1249 setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
1250 setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
1251 setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
1252 setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
1253
1254 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1255 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
1256 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1257 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1258 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1259 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1260 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1261 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1262 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1263 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1264 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1265 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1266
1267 if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1268 setOperationAction(ISD::FMA, MVT::v8f32, Legal);
1269 setOperationAction(ISD::FMA, MVT::v4f64, Legal);
1270 setOperationAction(ISD::FMA, MVT::v4f32, Legal);
1271 setOperationAction(ISD::FMA, MVT::v2f64, Legal);
1272 setOperationAction(ISD::FMA, MVT::f32, Legal);
1273 setOperationAction(ISD::FMA, MVT::f64, Legal);
1274 }
1275
1276 if (Subtarget->hasInt256()) {
1277 setOperationAction(ISD::ADD, MVT::v4i64, Legal);
1278 setOperationAction(ISD::ADD, MVT::v8i32, Legal);
1279 setOperationAction(ISD::ADD, MVT::v16i16, Legal);
1280 setOperationAction(ISD::ADD, MVT::v32i8, Legal);
1281
1282 setOperationAction(ISD::SUB, MVT::v4i64, Legal);
1283 setOperationAction(ISD::SUB, MVT::v8i32, Legal);
1284 setOperationAction(ISD::SUB, MVT::v16i16, Legal);
1285 setOperationAction(ISD::SUB, MVT::v32i8, Legal);
1286
1287 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1288 setOperationAction(ISD::MUL, MVT::v8i32, Legal);
1289 setOperationAction(ISD::MUL, MVT::v16i16, Legal);
1290 // Don't lower v32i8 because there is no 128-bit byte mul
1291
1292 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1293 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1294 setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
1295 setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
1296
1297 setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
1298 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1299
1300 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1301 // when we have a 256bit-wide blend with immediate.
1302 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1303
1304 // Only provide customized ctpop vector bit twiddling for vector types we
1305 // know to perform better than using the popcnt instructions on each
1306 // vector element. If popcnt isn't supported, always provide the custom
1307 // version.
1308 if (!Subtarget->hasPOPCNT())
1309 setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
1310
1311 // Custom CTPOP always performs better on natively supported v8i32
1312 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
1313
1314 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1315 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1317 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1318 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1320 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1321
1322 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1324 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1325 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1326 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1327 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1328 } else {
1329 setOperationAction(ISD::ADD, MVT::v4i64, Custom);
1330 setOperationAction(ISD::ADD, MVT::v8i32, Custom);
1331 setOperationAction(ISD::ADD, MVT::v16i16, Custom);
1332 setOperationAction(ISD::ADD, MVT::v32i8, Custom);
1333
1334 setOperationAction(ISD::SUB, MVT::v4i64, Custom);
1335 setOperationAction(ISD::SUB, MVT::v8i32, Custom);
1336 setOperationAction(ISD::SUB, MVT::v16i16, Custom);
1337 setOperationAction(ISD::SUB, MVT::v32i8, Custom);
1338
1339 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1340 setOperationAction(ISD::MUL, MVT::v8i32, Custom);
1341 setOperationAction(ISD::MUL, MVT::v16i16, Custom);
1342 // Don't lower v32i8 because there is no 128-bit byte mul
1343 }
1344
1345 // In the customized shift lowering, the legal cases in AVX2 will be
1346 // recognized.
1347 setOperationAction(ISD::SRL, MVT::v4i64, Custom);
1348 setOperationAction(ISD::SRL, MVT::v8i32, Custom);
1349
1350 setOperationAction(ISD::SHL, MVT::v4i64, Custom);
1351 setOperationAction(ISD::SHL, MVT::v8i32, Custom);
1352
1353 setOperationAction(ISD::SRA, MVT::v8i32, Custom);
1354
1355 // Custom lower several nodes for 256-bit types.
1356 for (MVT VT : MVT::vector_valuetypes()) {
1357 if (VT.getScalarSizeInBits() >= 32) {
1358 setOperationAction(ISD::MLOAD, VT, Legal);
1359 setOperationAction(ISD::MSTORE, VT, Legal);
1360 }
1361 // Extract subvector is special because the value type
1362 // (result) is 128-bit but the source is 256-bit wide.
1363 if (VT.is128BitVector()) {
1364 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1365 }
1366 // Do not attempt to custom lower other non-256-bit vectors
1367 if (!VT.is256BitVector())
1368 continue;
1369
1370 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1371 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1372 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1373 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1374 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1375 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1376 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1377 }
1378
1379 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1380 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1381 MVT VT = (MVT::SimpleValueType)i;
1382
1383 // Do not attempt to promote non-256-bit vectors
1384 if (!VT.is256BitVector())
1385 continue;
1386
1387 setOperationAction(ISD::AND, VT, Promote);
1388 AddPromotedToType (ISD::AND, VT, MVT::v4i64);
1389 setOperationAction(ISD::OR, VT, Promote);
1390 AddPromotedToType (ISD::OR, VT, MVT::v4i64);
1391 setOperationAction(ISD::XOR, VT, Promote);
1392 AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
1393 setOperationAction(ISD::LOAD, VT, Promote);
1394 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
1395 setOperationAction(ISD::SELECT, VT, Promote);
1396 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1397 }
1398 }
1399
1400 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1401 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1402 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1404 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1405
1406 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1407 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1408 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1409
1410 for (MVT VT : MVT::fp_vector_valuetypes())
1411 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1412
1413 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1414 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1415 setOperationAction(ISD::XOR, MVT::i1, Legal);
1416 setOperationAction(ISD::OR, MVT::i1, Legal);
1417 setOperationAction(ISD::AND, MVT::i1, Legal);
1418 setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
1419 setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
1420 setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
1421 setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
1422 setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
1423
1424 setOperationAction(ISD::FADD, MVT::v16f32, Legal);
1425 setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
1426 setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
1427 setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
1428 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
1429 setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
1430
1431 setOperationAction(ISD::FADD, MVT::v8f64, Legal);
1432 setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
1433 setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
1434 setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
1435 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
1436 setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
1437 setOperationAction(ISD::FMA, MVT::v8f64, Legal);
1438 setOperationAction(ISD::FMA, MVT::v16f32, Legal);
1439
1440 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
1441 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
1442 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
1443 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
1444 if (Subtarget->is64Bit()) {
1445 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
1446 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
1447 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
1448 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
1449 }
1450 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1451 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1452 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1453 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1454 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1455 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1456 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1457 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1458 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1459 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1461 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1462 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1463 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1464
1465 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1468 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1471 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1472 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1473 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1474 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1475 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1477 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1478
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1481 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1482 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1483 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1484 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
1485
1486 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1487 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1488
1489 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1490
1491 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1492 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1493 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1494 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1495 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1496 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1497 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1498 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1499 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1500
1501 setOperationAction(ISD::ADD, MVT::v8i64, Legal);
1502 setOperationAction(ISD::ADD, MVT::v16i32, Legal);
1503
1504 setOperationAction(ISD::SUB, MVT::v8i64, Legal);
1505 setOperationAction(ISD::SUB, MVT::v16i32, Legal);
1506
1507 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1508
1509 setOperationAction(ISD::SRL, MVT::v8i64, Custom);
1510 setOperationAction(ISD::SRL, MVT::v16i32, Custom);
1511
1512 setOperationAction(ISD::SHL, MVT::v8i64, Custom);
1513 setOperationAction(ISD::SHL, MVT::v16i32, Custom);
1514
1515 setOperationAction(ISD::SRA, MVT::v8i64, Custom);
1516 setOperationAction(ISD::SRA, MVT::v16i32, Custom);
1517
1518 setOperationAction(ISD::AND, MVT::v8i64, Legal);
1519 setOperationAction(ISD::OR, MVT::v8i64, Legal);
1520 setOperationAction(ISD::XOR, MVT::v8i64, Legal);
1521 setOperationAction(ISD::AND, MVT::v16i32, Legal);
1522 setOperationAction(ISD::OR, MVT::v16i32, Legal);
1523 setOperationAction(ISD::XOR, MVT::v16i32, Legal);
1524
1525 if (Subtarget->hasCDI()) {
1526 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1527 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1528 }
1529
1530 // Custom lower several nodes.
1531 for (MVT VT : MVT::vector_valuetypes()) {
1532 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1533 // Extract subvector is special because the value type
1534 // (result) is 256/128-bit but the source is 512-bit wide.
1535 if (VT.is128BitVector() || VT.is256BitVector()) {
1536 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1537 }
1538 if (VT.getVectorElementType() == MVT::i1)
1539 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1540
1541 // Do not attempt to custom lower other non-512-bit vectors
1542 if (!VT.is512BitVector())
1543 continue;
1544
1545 if (EltSize >= 32) {
1546 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1551 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1552 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 }
1556 }
1557 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1558 MVT VT = (MVT::SimpleValueType)i;
1559
1560 // Do not attempt to promote non-512-bit vectors.
1561 if (!VT.is512BitVector())
1562 continue;
1563
1564 setOperationAction(ISD::SELECT, VT, Promote);
1565 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1566 }
1567 }// has AVX-512
1568
1569 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1570 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1571 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1572
1573 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1574 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1575
1576 setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
1577 setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
1578 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1579 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1580 setOperationAction(ISD::ADD, MVT::v32i16, Legal);
1581 setOperationAction(ISD::ADD, MVT::v64i8, Legal);
1582 setOperationAction(ISD::SUB, MVT::v32i16, Legal);
1583 setOperationAction(ISD::SUB, MVT::v64i8, Legal);
1584 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1585
1586 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1587 const MVT VT = (MVT::SimpleValueType)i;
1588
1589 const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1590
1591 // Do not attempt to promote non-512-bit vectors.
1592 if (!VT.is512BitVector())
1593 continue;
1594
1595 if (EltSize < 32) {
1596 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1597 setOperationAction(ISD::VSELECT, VT, Legal);
1598 }
1599 }
1600 }
1601
1602 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1603 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1604 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1605
1606 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1607 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1608 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
1609
1610 setOperationAction(ISD::AND, MVT::v8i32, Legal);
1611 setOperationAction(ISD::OR, MVT::v8i32, Legal);
1612 setOperationAction(ISD::XOR, MVT::v8i32, Legal);
1613 setOperationAction(ISD::AND, MVT::v4i32, Legal);
1614 setOperationAction(ISD::OR, MVT::v4i32, Legal);
1615 setOperationAction(ISD::XOR, MVT::v4i32, Legal);
1616 }
1617
1618 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1619 // of this type with custom code.
1620 for (MVT VT : MVT::vector_valuetypes())
1621 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1622
1623 // We want to custom lower some of our intrinsics.
1624 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1625 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1626 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1627 if (!Subtarget->is64Bit())
1628 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1629
1630 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1631 // handle type legalization for these operations here.
1632 //
1633 // FIXME: We really should do custom legalization for addition and
1634 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1635 // than generic legalization for 64-bit multiplication-with-overflow, though.
1636 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1637 // Add/Sub/Mul with overflow operations are custom lowered.
1638 MVT VT = IntVTs[i];
1639 setOperationAction(ISD::SADDO, VT, Custom);
1640 setOperationAction(ISD::UADDO, VT, Custom);
1641 setOperationAction(ISD::SSUBO, VT, Custom);
1642 setOperationAction(ISD::USUBO, VT, Custom);
1643 setOperationAction(ISD::SMULO, VT, Custom);
1644 setOperationAction(ISD::UMULO, VT, Custom);
1645 }
1646
1647
1648 if (!Subtarget->is64Bit()) {
1649 // These libcalls are not available in 32-bit.
1650 setLibcallName(RTLIB::SHL_I128, nullptr);
1651 setLibcallName(RTLIB::SRL_I128, nullptr);
1652 setLibcallName(RTLIB::SRA_I128, nullptr);
1653 }
1654
1655 // Combine sin / cos into one node or libcall if possible.
1656 if (Subtarget->hasSinCos()) {
1657 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1658 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1659 if (Subtarget->isTargetDarwin()) {
1660 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1661 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1662 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1663 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1664 }
1665 }
1666
1667 if (Subtarget->isTargetWin64()) {
1668 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1669 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1670 setOperationAction(ISD::SREM, MVT::i128, Custom);
1671 setOperationAction(ISD::UREM, MVT::i128, Custom);
1672 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1673 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1674 }
1675
1676 // We have target-specific dag combine patterns for the following nodes:
1677 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1678 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1679 setTargetDAGCombine(ISD::VSELECT);
1680 setTargetDAGCombine(ISD::SELECT);
1681 setTargetDAGCombine(ISD::SHL);
1682 setTargetDAGCombine(ISD::SRA);
1683 setTargetDAGCombine(ISD::SRL);
1684 setTargetDAGCombine(ISD::OR);
1685 setTargetDAGCombine(ISD::AND);
1686 setTargetDAGCombine(ISD::ADD);
1687 setTargetDAGCombine(ISD::FADD);
1688 setTargetDAGCombine(ISD::FSUB);
1689 setTargetDAGCombine(ISD::FMA);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::ZERO_EXTEND);
1696 setTargetDAGCombine(ISD::ANY_EXTEND);
1697 setTargetDAGCombine(ISD::SIGN_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699 setTargetDAGCombine(ISD::TRUNCATE);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::SETCC);
1702 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703 setTargetDAGCombine(ISD::BUILD_VECTOR);
1704 setTargetDAGCombine(ISD::MUL);
1705 setTargetDAGCombine(ISD::XOR);
1706
1707 computeRegisterProperties();
1708
1709 // On Darwin, -Os means optimize for size without hurting performance, so
1710 // do not reduce the limit.
1711 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717 setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719 // Predictable cmovs don't hurt on Atom because it's in-order.
1720 PredictableSelectIsExpensive = !Subtarget->isAtom();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724 verifyIntrinsicTables();
1725}
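
The constructor above is essentially populating a lookup table: for each (opcode, value type) pair it records whether the operation is Legal, should be Promoted or Expanded, or needs Custom lowering, and the legalizer later queries that table. As a rough standalone sketch of the idea (hypothetical types and ids, not the real LLVM TargetLowering API):

#include <cstdio>
#include <map>
#include <utility>

// Hypothetical, simplified stand-in for the (opcode, type) -> action table
// that calls like the setOperationAction() lines above populate.
enum class Action { Legal, Promote, Expand, Custom };

class ActionTable {
  std::map<std::pair<unsigned, unsigned>, Action> Actions;

public:
  void setOperationAction(unsigned Opcode, unsigned VT, Action A) {
    Actions[{Opcode, VT}] = A;
  }
  Action getOperationAction(unsigned Opcode, unsigned VT) const {
    auto It = Actions.find({Opcode, VT});
    return It == Actions.end() ? Action::Legal : It->second; // default: Legal
  }
};

int main() {
  enum { ISD_FNEG = 1 };   // hypothetical opcode id
  enum { MVT_v8f32 = 2 };  // hypothetical type id
  ActionTable TLI;
  TLI.setOperationAction(ISD_FNEG, MVT_v8f32, Action::Custom);
  std::printf("FNEG v8f32 custom? %d\n",
              TLI.getOperationAction(ISD_FNEG, MVT_v8f32) == Action::Custom);
}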
1726
1727// This has so far only been implemented for 64-bit MachO.
1728bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730}
1731
1732TargetLoweringBase::LegalizeTypeAction
1733X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1738
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
1740}
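
getPreferredVectorAction() above opts into type widening when the experimental flag is set: an illegal vector keeps its element type and grows its element count, rather than being promoted to a wider element type. A minimal sketch of the widening side of that trade-off, as a simplified model rather than LLVM's actual legalizer:

#include <iostream>

// Simplified model: widening rounds the element count up to the next power of
// two while keeping the element type, e.g. v3i32 -> v4i32 (promotion would
// instead have produced a vector with wider elements).
static unsigned widenNumElements(unsigned NumElts) {
  unsigned Widened = 1;
  while (Widened < NumElts)
    Widened <<= 1;
  return Widened;
}

int main() {
  std::cout << "v3i32 widens to v" << widenNumElements(3) << "i32\n"; // v4i32
  std::cout << "v6i16 widens to v" << widenNumElements(6) << "i16\n"; // v8i16
}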
1741
1742EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743 if (!VT.isVector())
1744 return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746 const unsigned NumElts = VT.getVectorNumElements();
1747 const EVT EltVT = VT.getVectorElementType();
1748 if (VT.is512BitVector()) {
1749 if (Subtarget->hasAVX512())
1750 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751 EltVT == MVT::f32 || EltVT == MVT::f64)
1752 switch(NumElts) {
1753 case 8: return MVT::v8i1;
1754 case 16: return MVT::v16i1;
1755 }
1756 if (Subtarget->hasBWI())
1757 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758 switch(NumElts) {
1759 case 32: return MVT::v32i1;
1760 case 64: return MVT::v64i1;
1761 }
1762 }
1763
1764 if (VT.is256BitVector() || VT.is128BitVector()) {
1765 if (Subtarget->hasVLX())
1766 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767 EltVT == MVT::f32 || EltVT == MVT::f64)
1768 switch(NumElts) {
1769 case 2: return MVT::v2i1;
1770 case 4: return MVT::v4i1;
1771 case 8: return MVT::v8i1;
1772 }
1773 if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775 switch(NumElts) {
1776 case 8: return MVT::v8i1;
1777 case 16: return MVT::v16i1;
1778 case 32: return MVT::v32i1;
1779 }
1780 }
1781
1782 return VT.changeVectorElementTypeToInteger();
1783}
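
getSetCCResultType() above keys the compare result type on the element count: when AVX-512 style mask registers are available a vector compare yields a vNi1 mask, otherwise the result is an integer vector of the same shape. A standalone sketch of that mapping (hypothetical helper, not part of LLVM):

#include <iostream>
#include <string>

// Hypothetical helper mirroring the switches above: element count -> name of
// the i1 mask type a vector compare would produce when mask registers exist.
static std::string maskTypeForNumElts(unsigned NumElts) {
  switch (NumElts) {
  case 2:  return "v2i1";
  case 4:  return "v4i1";
  case 8:  return "v8i1";
  case 16: return "v16i1";
  case 32: return "v32i1";
  case 64: return "v64i1";
  default: return "<same-shaped integer vector>";
  }
}

int main() {
  std::cout << "v16f32 compare -> " << maskTypeForNumElts(16) << "\n"; // v16i1
}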
1784
1785/// Helper for getByValTypeAlignment to determine
1786/// the desired ByVal argument alignment.
1787static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788 if (MaxAlign == 16)
1789 return;
1790 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791 if (VTy->getBitWidth() == 128)
1792 MaxAlign = 16;
1793 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794 unsigned EltAlign = 0;
1795 getMaxByValAlign(ATy->getElementType(), EltAlign);
1796 if (EltAlign > MaxAlign)
1797 MaxAlign = EltAlign;
1798 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800 unsigned EltAlign = 0;
1801 getMaxByValAlign(STy->getElementType(i), EltAlign);
1802 if (EltAlign > MaxAlign)
1803 MaxAlign = EltAlign;
1804 if (MaxAlign == 16)
1805 break;
1806 }
1807 }
1808}
1809
1810/// Return the desired alignment for ByVal aggregate
1811/// function arguments in the caller parameter area. For X86, aggregates
1812/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813/// are at 4-byte boundaries.
1814unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815 if (Subtarget->is64Bit()) {
1816 // Max of 8 and alignment of type.
1817 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818 if (TyAlign > 8)
1819 return TyAlign;
1820 return 8;
1821 }
1822
1823 unsigned Align = 4;
1824 if (Subtarget->hasSSE1())
1825 getMaxByValAlign(Ty, Align);
1826 return Align;
1827}
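
getByValTypeAlignment() and its getMaxByValAlign() helper walk a possibly nested aggregate and bump the alignment to 16 as soon as a 128-bit vector is found, stopping early once that cap is reached. A simplified standalone model of the recursion, using a hypothetical Type representation:

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical, simplified type node: a nonzero VectorBits marks a vector
// type; Members holds array/struct element types.
struct Ty {
  unsigned VectorBits;
  std::vector<Ty> Members;
};

static void maxByValAlign(const Ty &T, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;                        // already at the SSE alignment cap
  if (T.VectorBits == 128)
    MaxAlign = 16;                 // 128-bit SSE vectors want 16-byte slots
  for (const Ty &M : T.Members) {
    unsigned EltAlign = 0;
    maxByValAlign(M, EltAlign);
    MaxAlign = std::max(MaxAlign, EltAlign);
    if (MaxAlign == 16)
      break;                       // no point looking further
  }
}

int main() {
  Ty Scalar{0, {}};
  Ty Vec{128, {}};
  Ty Struct{0, {Scalar, Vec}};     // { scalar member, 128-bit vector member }
  unsigned Align = 4;              // 32-bit default byval alignment
  maxByValAlign(Struct, Align);
  std::printf("byval alignment = %u\n", Align); // 16
}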
1828
1829/// Returns the target specific optimal type for load
1830/// and store operations as a result of memset, memcpy, and memmove
1831/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1832/// constraint. Similarly, if SrcAlign is zero there is no need to check it
1833/// against an alignment requirement,
1834/// probably because the source does not need to be loaded. If 'IsMemset' is
1835/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837/// source is constant so it does not need to be loaded.
1838/// It returns EVT::Other if the type should be determined using generic
1839/// target-independent logic.
1840EVT
1841X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842 unsigned DstAlign, unsigned SrcAlign,
1843 bool IsMemset, bool ZeroMemset,
1844 bool MemcpyStrSrc,
1845 MachineFunction &MF) const {
1846 const Function *F = MF.getFunction();
1847 if ((!IsMemset || ZeroMemset) &&
1848 !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849 Attribute::NoImplicitFloat)) {
1850 if (Size >= 16 &&
1851 (Subtarget->isUnalignedMemAccessFast() ||
1852 ((DstAlign == 0 || DstAlign >= 16) &&
1853 (SrcAlign == 0 || SrcAlign >= 16)))) {
1854 if (Size >= 32) {
1855 if (Subtarget->hasInt256())
1856 return MVT::v8i32;
1857 if (Subtarget->hasFp256())
1858 return MVT::v8f32;
1859 }
1860 if (Subtarget->hasSSE2())
1861 return MVT::v4i32;
1862 if (Subtarget->hasSSE1())
1863 return MVT::v4f32;
1864 } else if (!MemcpyStrSrc && Size >= 8 &&
1865 !Subtarget->is64Bit() &&
1866 Subtarget->hasSSE2()) {
1867 // Do not use f64 to lower memcpy if the source is a string constant. It's
1868 // better to use i32 to avoid the loads.
1869 return MVT::f64;
1870 }
1871 }
1872 if (Subtarget->is64Bit() && Size >= 8)
1873 return MVT::i64;
1874 return MVT::i32;
1875}
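
getOptimalMemOpType() above picks the widest store type that the size, alignment, and available vector features allow when expanding memset/memcpy/memmove. A reduced standalone sketch of that decision, with hypothetical feature flags and a string result standing in for an EVT:

#include <cstdint>
#include <iostream>
#include <string>

struct Features { bool HasAVX2, HasAVX, HasSSE2, HasSSE1, Is64Bit; };

// Reduced model of the decision above: prefer 256-bit, then 128-bit vector
// stores when the copy is big enough and sufficiently aligned, otherwise fall
// back to the widest legal scalar type.
static std::string optimalMemOpType(std::uint64_t Size, unsigned Align,
                                    const Features &F) {
  if (Size >= 16 && (Align == 0 || Align >= 16)) {
    if (Size >= 32 && F.HasAVX2) return "v8i32";
    if (Size >= 32 && F.HasAVX)  return "v8f32";
    if (F.HasSSE2) return "v4i32";
    if (F.HasSSE1) return "v4f32";
  }
  if (F.Is64Bit && Size >= 8) return "i64";
  return "i32";
}

int main() {
  Features AVX2Machine{true, true, true, true, true};
  std::cout << optimalMemOpType(64, 32, AVX2Machine) << "\n"; // v8i32
  std::cout << optimalMemOpType(12, 4, AVX2Machine) << "\n";  // i64
}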
1876
1877bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878 if (VT == MVT::f32)
1879 return X86ScalarSSEf32;
1880 else if (VT == MVT::f64)
1881 return X86ScalarSSEf64;
1882 return true;
1883}
1884
1885bool
1886X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887 unsigned,
1888 unsigned,
1889 bool *Fast) const {
1890 if (Fast)
1891 *Fast = Subtarget->isUnalignedMemAccessFast();
1892 return true;
1893}
1894
1895/// Return the entry encoding for a jump table in the
1896/// current function. The returned value is a member of the
1897/// MachineJumpTableInfo::JTEntryKind enum.
1898unsigned X86TargetLowering::getJumpTableEncoding() const {
1899 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900 // symbol.
1901 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902 Subtarget->isPICStyleGOT())
1903 return MachineJumpTableInfo::EK_Custom32;
1904
1905 // Otherwise, use the normal jump table encoding heuristics.
1906 return TargetLowering::getJumpTableEncoding();
1907}
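
getJumpTableEncoding() above only deviates from the default when compiling PIC for a GOT-style target, where jump-table entries become @GOTOFF references. A tiny standalone sketch of that rule, using hypothetical enums in place of the MachineJumpTableInfo kinds:

// Hypothetical enum; the real code returns MachineJumpTableInfo::EK_Custom32
// for the GOT-style PIC case and defers to the default heuristics otherwise.
enum class JTEncoding { Default, Custom32 };

static JTEncoding jumpTableEncoding(bool IsPIC, bool IsPICStyleGOT) {
  return (IsPIC && IsPICStyleGOT) ? JTEncoding::Custom32 : JTEncoding::Default;
}

int main() {
  return jumpTableEncoding(true, true) == JTEncoding::Custom32 ? 0 : 1;
}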
1908
1909const MCExpr *
1910X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911 const MachineBasicBlock *MBB,
1912 unsigned uid,MCContext &Ctx) const{
1913 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914 Subtarget->isPICStyleGOT());
1915 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916 // entries.
1917 return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919}
1920
1921/// Returns relocation base for the given PIC jumptable.
1922SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923 SelectionDAG &DAG) const {
1924 if (!Subtarget->is64Bit())
1925 // This doesn't have an SDLoc associated with it, but it is not really the
1926 // same as a Register.
1927 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928 return Table;
1929}
1930
1931/// This returns the relocation base for the given PIC jumptable,
1932/// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933const MCExpr *X86TargetLowering::
1934getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935 MCContext &Ctx) const {
1936 // X86-64 uses RIP relative addressing based on the jump table label.
1937 if (Subtarget->isPICStyleRIPRel())
1938 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940 // Otherwise, the reference is relative to the PIC base.
1941 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942}
1943
1944// FIXME: Why this routine is here? Move to RegInfo!
1945std::pair<const TargetRegisterClass*, uint8_t>
1946X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947 const TargetRegisterClass *RRC = nullptr;
1948 uint8_t Cost = 1;
1949 switch (VT.SimpleTy) {
1950 default:
1951 return TargetLowering::findRepresentativeClass(VT);
1952 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954 break;
1955 case MVT::x86mmx:
1956 RRC = &X86::VR64RegClass;
1957 break;
1958 case MVT::f32: case MVT::f64:
1959 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960 case MVT::v4f32: case MVT::v2f64:
1961 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962 case MVT::v4f64:
1963 RRC = &X86::VR128RegClass;
1964 break;
1965 }
1966 return std::make_pair(RRC, Cost);
1967}
1968
1969bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970 unsigned &Offset) const {
1971 if (!Subtarget->isTargetLinux())
1972 return false;
1973
1974 if (Subtarget->is64Bit()) {
1975 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976 Offset = 0x28;
1977 if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978 AddressSpace = 256;
1979 else
1980 AddressSpace = 257;
1981 } else {
1982 // %gs:0x14 on i386
1983 Offset = 0x14;
1984 AddressSpace = 256;
1985 }
1986 return true;
1987}
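
getStackCookieLocation() above encodes the Linux convention: the cookie sits at %fs:0x28 on x86-64 (or %gs:0x28 under the kernel code model) and at %gs:0x14 on i386, expressed as an (address space, offset) pair where 256 means %gs and 257 means %fs. A standalone sketch of that mapping (simplified, hypothetical helper):

#include <cstdio>
#include <utility>

// Simplified stand-in: returns {address space, offset}. In X86 codegen,
// address space 256 corresponds to %gs and 257 to %fs.
static std::pair<unsigned, unsigned>
stackCookieLocation(bool Is64Bit, bool KernelCodeModel) {
  if (Is64Bit)
    return {KernelCodeModel ? 256u : 257u, 0x28u}; // %gs:0x28 or %fs:0x28
  return {256u, 0x14u};                            // %gs:0x14 on i386
}

int main() {
  auto Loc = stackCookieLocation(/*Is64Bit=*/true, /*KernelCodeModel=*/false);
  std::printf("addrspace %u, offset 0x%x\n", Loc.first, Loc.second); // 257, 0x28
}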
1988
1989bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990 unsigned DestAS) const {
1991 assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993 return SrcAS < 256 && DestAS < 256;
1994}
1995
1996//===----------------------------------------------------------------------===//
1997// Return Value Calling Convention Implementation
1998//===----------------------------------------------------------------------===//
1999
2000#include "X86GenCallingConv.inc"
2001
2002bool
2003X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004 MachineFunction &MF, bool isVarArg,
2005 const SmallVectorImpl<ISD::OutputArg> &Outs,
2006 LLVMContext &Context) const {
2007 SmallVector<CCValAssign, 16> RVLocs;
2008 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009 return CCInfo.CheckReturn(Outs, RetCC_X86);
2010}
2011
2012const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014 return ScratchRegs;
2015}
2016
2017SDValue
2018X86TargetLowering::LowerReturn(SDValue Chain,
2019 CallingConv::ID CallConv, bool isVarArg,
2020 const SmallVectorImpl<ISD::OutputArg> &Outs,
2021 const SmallVectorImpl<SDValue> &OutVals,
2022 SDLoc dl, SelectionDAG &DAG) const {
2023 MachineFunction &MF = DAG.getMachineFunction();
2024 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026 SmallVector<CCValAssign, 16> RVLocs;
2027 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030 SDValue Flag;
2031 SmallVector<SDValue, 6> RetOps;
2032 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033 // Operand #1 = Bytes To Pop
2034 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035 MVT::i16));
2036
2037 // Copy the result values into the output registers.
2038 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039 CCValAssign &VA = RVLocs[i];
2040 assert(VA.isRegLoc() && "Can only return in registers!");
2041 SDValue ValToCopy = OutVals[i];
2042 EVT ValVT = ValToCopy.getValueType();
2043
2044 // Promote values to the appropriate types.
2045 if (VA.getLocInfo() == CCValAssign::SExt)
2046 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047 else if (VA.getLocInfo() == CCValAssign::ZExt)
2048 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049 else if (VA.getLocInfo() == CCValAssign::AExt)
2050 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051 else if (VA.getLocInfo() == CCValAssign::BCvt)
2052 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055 "Unexpected FP-extend for return value.");
2056
2057 // If this is x86-64, and we disabled SSE, we can't return FP values,
2058 // or SSE or MMX vectors.
2059 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062 report_fatal_error("SSE register return with SSE disabled");
2063 }
2064 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2065 // llvm-gcc has never done it right and no one has noticed, so this
2066 // should be OK for now.
2067 if (ValVT == MVT::f64 &&
2068 (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069 report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072 // the RET instruction and handled by the FP Stackifier.
2073 if (VA.getLocReg() == X86::FP0 ||
2074 VA.getLocReg() == X86::FP1) {
2075 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076 // change the value to the FP stack register class.
2077 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079 RetOps.push_back(ValToCopy);
2080 // Don't emit a copytoreg.
2081 continue;
2082 }
2083
2084 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085 // which is returned in RAX / RDX.
2086 if (Subtarget->is64Bit()) {
2087 if (ValVT == MVT::x86mmx) {
2088 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091 ValToCopy);
2092 // If we don't have SSE2 available, convert to v4f32 so the generated
2093 // register is legal.
2094 if (!Subtarget->hasSSE2())
2095 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096 }
2097 }
2098 }
2099
2100 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101 Flag = Chain.getValue(1);
2102 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103 }
2104
2105 // The x86-64 ABIs require that for returning structs by value we copy
2106 // the sret argument into %rax/%eax (depending on ABI) for the return.
2107 // Win32 requires us to put the sret argument to %eax as well.
2108 // We saved the argument into a virtual register in the entry block,
2109 // so now we copy the value out and into %rax/%eax.
2110 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2111 (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2112 MachineFunction &MF = DAG.getMachineFunction();
2113 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2114 unsigned Reg = FuncInfo->getSRetReturnReg();
2115 assert(Reg &&
2116 "SRetReturnReg should have been set in LowerFormalArguments().");
2117 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2118
2119 unsigned RetValReg
2120 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2121 X86::RAX : X86::EAX;
2122 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2123 Flag = Chain.getValue(1);
2124
2125 // RAX/EAX now acts like a return value.
2126 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2127 }
2128
2129 RetOps[0] = Chain; // Update chain.
2130
2131 // Add the flag if we have it.
2132 if (Flag.getNode())
2133 RetOps.push_back(Flag);
2134
2135 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2136}
2137
2138bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2139 if (N->getNumValues() != 1)
2140 return false;
2141 if (!N->hasNUsesOfValue(1, 0))
2142 return false;
2143
2144 SDValue TCChain = Chain;
2145 SDNode *Copy = *N->use_begin();
2146 if (Copy->getOpcode() == ISD::CopyToReg) {
2147 // If the copy has a glue operand, we conservatively assume it isn't safe to
2148 // perform a tail call.
2149 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2150 return false;
2151 TCChain = Copy->getOperand(0);
2152 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2153 return false;
2154
2155 bool HasRet = false;
2156 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2157 UI != UE; ++UI) {
2158 if (UI->getOpcode() != X86ISD::RET_FLAG)
2159 return false;
2160 // If we are returning more than one value, we can definitely
2161 // not make a tail call; see PR19530.
2162 if (UI->getNumOperands() > 4)
2163 return false;
2164 if (UI->getNumOperands() == 4 &&
2165 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2166 return false;
2167 HasRet = true;
2168 }
2169
2170 if (!HasRet)
2171 return false;
2172
2173 Chain = TCChain;
2174 return true;
2175}
2176
2177EVT
2178X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2179 ISD::NodeType ExtendKind) const {
2180 MVT ReturnMVT;
2181 // TODO: Is this also valid on 32-bit?
2182 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2183 ReturnMVT = MVT::i8;
2184 else
2185 ReturnMVT = MVT::i32;
2186
2187 EVT MinVT = getRegisterType(Context, ReturnMVT);
2188 return VT.bitsLT(MinVT) ? MinVT : VT;
2189}
2190
2191/// Lower the result values of a call into the
2192/// appropriate copies out of appropriate physical registers.
2193///
2194SDValue
2195X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2196 CallingConv::ID CallConv, bool isVarArg,
2197 const SmallVectorImpl<ISD::InputArg> &Ins,
2198 SDLoc dl, SelectionDAG &DAG,
2199 SmallVectorImpl<SDValue> &InVals) const {
2200
2201 // Assign locations to each value returned by this call.
2202 SmallVector<CCValAssign, 16> RVLocs;
2203 bool Is64Bit = Subtarget->is64Bit();
2204 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2205 *DAG.getContext());
2206 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2207
2208 // Copy all of the result registers out of their specified physreg.
2209 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2210 CCValAssign &VA = RVLocs[i];
2211 EVT CopyVT = VA.getValVT();
2212
2213 // If this is x86-64, and we disabled SSE, we can't return FP values
2214 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2215 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2216 report_fatal_error("SSE register return with SSE disabled");
2217 }
2218
2219 // If we prefer to use the value in xmm registers, copy it out as f80 and
2220 // use a truncate to move it from fp stack reg to xmm reg.
2221 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2222 isScalarFPTypeInSSEReg(VA.getValVT()))
2223 CopyVT = MVT::f80;
2224
2225 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2226 CopyVT, InFlag).getValue(1);
2227 SDValue Val = Chain.getValue(0);
2228
2229 if (CopyVT != VA.getValVT())
2230 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2231 // This truncation won't change the value.
2232 DAG.getIntPtrConstant(1));
2233
2234 InFlag = Chain.getValue(2);
2235 InVals.push_back(Val);
2236 }
2237
2238 return Chain;
2239}
2240
2241//===----------------------------------------------------------------------===//
2242// C & StdCall & Fast Calling Convention implementation
2243//===----------------------------------------------------------------------===//
2244 // The StdCall calling convention is standard for many Windows API
2245 // routines. It differs from the C calling convention only slightly: the
2246 // callee, not the caller, cleans up the stack, and symbols are decorated
2247 // in a particular way. It doesn't support any vector arguments.
2248// For info on fast calling convention see Fast Calling Convention (tail call)
2249// implementation LowerX86_32FastCCCallTo.
2250
2251/// Determines whether a call uses struct return
2252/// semantics.
2253enum StructReturnType {
2254 NotStructReturn,
2255 RegStructReturn,
2256 StackStructReturn
2257};
2258static StructReturnType
2259callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2260 if (Outs.empty())
2261 return NotStructReturn;
2262
2263 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2264 if (!Flags.isSRet())
2265 return NotStructReturn;
2266 if (Flags.isInReg())
2267 return RegStructReturn;
2268 return StackStructReturn;
2269}
2270
2271/// Determines whether a function uses struct return semantics.
2272static StructReturnType
2273argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2274 if (Ins.empty())
2275 return NotStructReturn;
2276
2277 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2278 if (!Flags.isSRet())
2279 return NotStructReturn;
2280 if (Flags.isInReg())
2281 return RegStructReturn;
2282 return StackStructReturn;
2283}
2284
2285/// Make a copy of an aggregate at the address specified by "Src" to the
2286/// address "Dst", with size and alignment information specified by the
2287/// corresponding parameter attribute. The copy will be passed as a byval function parameter.
2288static SDValue
2289CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2290 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2291 SDLoc dl) {
2292 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2293
2294 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2295 /*isVolatile*/false, /*AlwaysInline=*/true,
2296 MachinePointerInfo(), MachinePointerInfo());
2297}
2298
2299/// Return true if the calling convention is one that
2300/// supports tail call optimization.
2301static bool IsTailCallConvention(CallingConv::ID CC) {
2302 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2303 CC == CallingConv::HiPE);
2304}
2305
2306/// \brief Return true if the calling convention is a C calling convention.
2307static bool IsCCallConvention(CallingConv::ID CC) {
2308 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2309 CC == CallingConv::X86_64_SysV);
2310}
2311
2312bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2313 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2314 return false;
2315
2316 CallSite CS(CI);
2317 CallingConv::ID CalleeCC = CS.getCallingConv();
2318 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2319 return false;
2320
2321 return true;
2322}
2323
2324/// Return true if the function is being made into
2325/// a tailcall target by changing its ABI.
2326static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2327 bool GuaranteedTailCallOpt) {
2328 return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2329}
2330
2331SDValue
2332X86TargetLowering::LowerMemArgument(SDValue Chain,
2333 CallingConv::ID CallConv,
2334 const SmallVectorImpl<ISD::InputArg> &Ins,
2335 SDLoc dl, SelectionDAG &DAG,
2336 const CCValAssign &VA,
2337 MachineFrameInfo *MFI,
2338 unsigned i) const {
2339 // Create the nodes corresponding to a load from this parameter slot.
2340 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2341 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2342 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2343 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2344 EVT ValVT;
2345
2346 // If value is passed by pointer we have address passed instead of the value
2347 // itself.
2348 if (VA.getLocInfo() == CCValAssign::Indirect)
2349 ValVT = VA.getLocVT();
2350 else
2351 ValVT = VA.getValVT();
2352
2353 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2354 // changed with more analysis.
2355 // In the case of tail call optimization, mark all arguments mutable, since
2356 // they could be overwritten by the lowering of arguments for a tail call.
2357 if (Flags.isByVal()) {
2358 unsigned Bytes = Flags.getByValSize();
2359 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2360 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2361 return DAG.getFrameIndex(FI, getPointerTy());
2362 } else {
2363 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2364 VA.getLocMemOffset(), isImmutable);
2365 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2366 return DAG.getLoad(ValVT, dl, Chain, FIN,
2367 MachinePointerInfo::getFixedStack(FI),
2368 false, false, false, 0);
2369 }
2370}
2371
2372// FIXME: Get this from tablegen.
2373static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2374 const X86Subtarget *Subtarget) {
2375 assert(Subtarget->is64Bit());
2376
2377 if (Subtarget->isCallingConvWin64(CallConv)) {
2378 static const MCPhysReg GPR64ArgRegsWin64[] = {
2379 X86::RCX, X86::RDX, X86::R8, X86::R9
2380 };
2381 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2382 }
2383
2384 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2385 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2386 };
2387 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2388}
2389
2390// FIXME: Get this from tablegen.
2391static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2392 CallingConv::ID CallConv,
2393 const X86Subtarget *Subtarget) {
2394 assert(Subtarget->is64Bit());
2395 if (Subtarget->isCallingConvWin64(CallConv)) {
2396 // The XMM registers which might contain var arg parameters are shadowed
2397 // in their paired GPRs, so we only need to save the GPRs to their home
2398 // slots.
2399 // TODO: __vectorcall will change this.
2400 return None;
2401 }
2402
2403 const Function *Fn = MF.getFunction();
2404 bool NoImplicitFloatOps = Fn->getAttributes().
2405 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2406 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2407 "SSE register cannot be used when SSE is disabled!");
2408 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2409 !Subtarget->hasSSE1())
2410 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2411 // registers.
2412 return None;
2413
2414 static const MCPhysReg XMMArgRegs64Bit[] = {
2415 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2416 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2417 };
2418 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2419}
2420
2421SDValue
2422X86TargetLowering::LowerFormalArguments(SDValue Chain,
2423 CallingConv::ID CallConv,
2424 bool isVarArg,
2425 const SmallVectorImpl<ISD::InputArg> &Ins,
2426 SDLoc dl,
2427 SelectionDAG &DAG,
2428 SmallVectorImpl<SDValue> &InVals)
2429 const {
2430 MachineFunction &MF = DAG.getMachineFunction();
2431 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2432
2433 const Function* Fn = MF.getFunction();
2434 if (Fn->hasExternalLinkage() &&
2435 Subtarget->isTargetCygMing() &&
2436 Fn->getName() == "main")
2437 FuncInfo->setForceFramePointer(true);
2438
2439 MachineFrameInfo *MFI = MF.getFrameInfo();
2440 bool Is64Bit = Subtarget->is64Bit();
2441 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2442
2443 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2444 "Var args not supported with calling convention fastcc, ghc or hipe");
2445
2446 // Assign locations to all of the incoming arguments.
2447 SmallVector<CCValAssign, 16> ArgLocs;
2448 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2449
2450 // Allocate shadow area for Win64
2451 if (IsWin64)
2452 CCInfo.AllocateStack(32, 8);
2453
2454 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2455
2456 unsigned LastVal = ~0U;
2457 SDValue ArgValue;
2458 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2459 CCValAssign &VA = ArgLocs[i];
2460 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2461 // places.
2462 assert(VA.getValNo() != LastVal &&
2463 "Don't support value assigned to multiple locs yet");
2464 (void)LastVal;
2465 LastVal = VA.getValNo();
2466
2467 if (VA.isRegLoc()) {
2468 EVT RegVT = VA.getLocVT();
2469 const TargetRegisterClass *RC;
2470 if (RegVT == MVT::i32)
2471 RC = &X86::GR32RegClass;
2472 else if (Is64Bit && RegVT == MVT::i64)
2473 RC = &X86::GR64RegClass;
2474 else if (RegVT == MVT::f32)
2475 RC = &X86::FR32RegClass;
2476 else if (RegVT == MVT::f64)
2477 RC = &X86::FR64RegClass;
2478 else if (RegVT.is512BitVector())
2479 RC = &X86::VR512RegClass;
2480 else if (RegVT.is256BitVector())
2481 RC = &X86::VR256RegClass;
2482 else if (RegVT.is128BitVector())
2483 RC = &X86::VR128RegClass;
2484 else if (RegVT == MVT::x86mmx)
2485 RC = &X86::VR64RegClass;
2486 else if (RegVT == MVT::i1)
2487 RC = &X86::VK1RegClass;
2488 else if (RegVT == MVT::v8i1)
2489 RC = &X86::VK8RegClass;
2490 else if (RegVT == MVT::v16i1)
2491 RC = &X86::VK16RegClass;
2492 else if (RegVT == MVT::v32i1)
2493 RC = &X86::VK32RegClass;
2494 else if (RegVT == MVT::v64i1)
2495 RC = &X86::VK64RegClass;
2496 else
2497 llvm_unreachable("Unknown argument type!")::llvm::llvm_unreachable_internal("Unknown argument type!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 2497)
;
2498
2499 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2501
2502 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2503 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2504 // right size.
2505 if (VA.getLocInfo() == CCValAssign::SExt)
2506 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2507 DAG.getValueType(VA.getValVT()));
2508 else if (VA.getLocInfo() == CCValAssign::ZExt)
2509 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2510 DAG.getValueType(VA.getValVT()));
2511 else if (VA.getLocInfo() == CCValAssign::BCvt)
2512 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2513
2514 if (VA.isExtInLoc()) {
2515 // Handle MMX values passed in XMM regs.
2516 if (RegVT.isVector())
2517 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2518 else
2519 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2520 }
2521 } else {
2522 assert(VA.isMemLoc());
2523 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2524 }
2525
2526 // If value is passed via pointer - do a load.
2527 if (VA.getLocInfo() == CCValAssign::Indirect)
2528 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2529 MachinePointerInfo(), false, false, false, 0);
2530
2531 InVals.push_back(ArgValue);
2532 }
2533
2534 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2535 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2536 // The x86-64 ABIs require that for returning structs by value we copy
2537 // the sret argument into %rax/%eax (depending on ABI) for the return.
2538 // Win32 requires us to put the sret argument to %eax as well.
2539 // Save the argument into a virtual register so that we can access it
2540 // from the return points.
2541 if (Ins[i].Flags.isSRet()) {
2542 unsigned Reg = FuncInfo->getSRetReturnReg();
2543 if (!Reg) {
2544 MVT PtrTy = getPointerTy();
2545 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2546 FuncInfo->setSRetReturnReg(Reg);
2547 }
2548 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2549 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2550 break;
2551 }
2552 }
2553 }
2554
2555 unsigned StackSize = CCInfo.getNextStackOffset();
2556 // Align stack specially for tail calls.
2557 if (FuncIsMadeTailCallSafe(CallConv,
2558 MF.getTarget().Options.GuaranteedTailCallOpt))
2559 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2560
2561 // If the function takes variable number of arguments, make a frame index for
2562 // the start of the first vararg value... for expansion of llvm.va_start. We
2563 // can skip this if there are no va_start calls.
2564 if (MFI->hasVAStart() &&
2565 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2566 CallConv != CallingConv::X86_ThisCall))) {
2567 FuncInfo->setVarArgsFrameIndex(
2568 MFI->CreateFixedObject(1, StackSize, true));
2569 }
2570
2571 // Figure out if XMM registers are in use.
2572 assert(!(MF.getTarget().Options.UseSoftFloat &&
2573 Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2574 Attribute::NoImplicitFloat)) &&
2575 "SSE register cannot be used when SSE is disabled!");
2576
2577 // 64-bit calling conventions support varargs and register parameters, so we
2578 // have to do extra work to spill them in the prologue.
2579 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2580 // Find the first unallocated argument registers.
2581 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2582 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2583 unsigned NumIntRegs =
2584 CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2585 unsigned NumXMMRegs =
2586 CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2587 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2588 "SSE register cannot be used when SSE is disabled!");
2589
2590 // Gather all the live in physical registers.
2591 SmallVector<SDValue, 6> LiveGPRs;
2592 SmallVector<SDValue, 8> LiveXMMRegs;
2593 SDValue ALVal;
2594 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2595 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2596 LiveGPRs.push_back(
2597 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2598 }
2599 if (!ArgXMMs.empty()) {
2600 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2601 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2602 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2603 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2604 LiveXMMRegs.push_back(
2605 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2606 }
2607 }
2608
2609 if (IsWin64) {
2610 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2611 // Get to the caller-allocated home save location. Add 8 to account
2612 // for the return address.
2613 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2614 FuncInfo->setRegSaveFrameIndex(
2615 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2616 // Fixup to set vararg frame on shadow area (4 x i64).
2617 if (NumIntRegs < 4)
2618 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2619 } else {
2620 // For X86-64, if there are vararg parameters that are passed via
2621 // registers, then we must store them to their spots on the stack so
2622 // they may be loaded by dereferencing the result of va_next.
2623 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2624 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2625 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2626 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2627 }
2628
2629 // Store the integer parameter registers.
2630 SmallVector<SDValue, 8> MemOps;
2631 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2632 getPointerTy());
2633 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2634 for (SDValue Val : LiveGPRs) {
2635 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2636 DAG.getIntPtrConstant(Offset));
2637 SDValue Store =
2638 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2639 MachinePointerInfo::getFixedStack(
2640 FuncInfo->getRegSaveFrameIndex(), Offset),
2641 false, false, 0);
2642 MemOps.push_back(Store);
2643 Offset += 8;
2644 }
2645
2646 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2647 // Now store the XMM (fp + vector) parameter registers.
2648 SmallVector<SDValue, 12> SaveXMMOps;
2649 SaveXMMOps.push_back(Chain);
2650 SaveXMMOps.push_back(ALVal);
2651 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2652 FuncInfo->getRegSaveFrameIndex()));
2653 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2654 FuncInfo->getVarArgsFPOffset()));
2655 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2656 LiveXMMRegs.end());
2657 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2658 MVT::Other, SaveXMMOps));
2659 }
2660
2661 if (!MemOps.empty())
2662 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2663 }
2664
2665 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2666 // Find the largest legal vector type.
2667 MVT VecVT = MVT::Other;
2668 // FIXME: Only some x86_32 calling conventions support AVX512.
2669 if (Subtarget->hasAVX512() &&
2670 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2671 CallConv == CallingConv::Intel_OCL_BI)))
2672 VecVT = MVT::v16f32;
2673 else if (Subtarget->hasAVX())
2674 VecVT = MVT::v8f32;
2675 else if (Subtarget->hasSSE2())
2676 VecVT = MVT::v4f32;
2677
2678 // We forward some GPRs and some vector types.
2679 SmallVector<MVT, 2> RegParmTypes;
2680 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2681 RegParmTypes.push_back(IntVT);
2682 if (VecVT != MVT::Other)
2683 RegParmTypes.push_back(VecVT);
2684
2685 // Compute the set of forwarded registers. The rest are scratch.
2686 SmallVectorImpl<ForwardedRegister> &Forwards =
2687 FuncInfo->getForwardedMustTailRegParms();
2688 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2689
2690 // Conservatively forward AL on x86_64, since it might be used for varargs.
2691 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2692 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2693 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2694 }
2695
2696 // Copy all forwards from physical to virtual registers.
2697 for (ForwardedRegister &F : Forwards) {
2698 // FIXME: Can we use a less constrained schedule?
2699 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2700 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2701 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2702 }
2703 }
2704
2705 // Some CCs need callee pop.
2706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2707 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2708 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2709 } else {
2710 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2711 // If this is an sret function, the return should pop the hidden pointer.
2712 if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2713 !Subtarget->getTargetTriple().isOSMSVCRT() &&
2714 argsAreStructReturn(Ins) == StackStructReturn)
2715 FuncInfo->setBytesToPopOnReturn(4);
2716 }
2717
2718 if (!Is64Bit) {
2719 // RegSaveFrameIndex is X86-64 only.
2720 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2721 if (CallConv == CallingConv::X86_FastCall ||
2722 CallConv == CallingConv::X86_ThisCall)
2723 // fastcc functions can't have varargs.
2724 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2725 }
2726
2727 FuncInfo->setArgumentStackSize(StackSize);
2728
2729 return Chain;
2730}
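
As an aside for readers of the vararg spilling above: the frame objects and offsets it records (VarArgsGPOffset, VarArgsFPOffset, the register save area) correspond to the System V x86-64 va_list layout. A minimal illustrative struct, assuming the SysV ABI; it is not part of this file:

struct SysVVaList {          // mirrors one va_list element on x86-64 SysV (illustrative)
  unsigned gp_offset;        // byte offset of the next GPR slot in reg_save_area (0..48)
  unsigned fp_offset;        // byte offset of the next XMM slot (48..176)
  void *overflow_arg_area;   // stack arguments beyond the register save area
  void *reg_save_area;       // 6 * 8 bytes of GPRs followed by 8 * 16 bytes of XMMs
};

// Example: for a function whose only named argument occupies RDI, the code
// above records gp_offset = 1 * 8 = 8, fp_offset = 6 * 8 + 0 * 16 = 48, and
// creates a 6 * 8 + 8 * 16 = 176-byte register save area.
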
2731
2732SDValue
2733X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2734 SDValue StackPtr, SDValue Arg,
2735 SDLoc dl, SelectionDAG &DAG,
2736 const CCValAssign &VA,
2737 ISD::ArgFlagsTy Flags) const {
2738 unsigned LocMemOffset = VA.getLocMemOffset();
2739 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2740 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2741 if (Flags.isByVal())
2742 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2743
2744 return DAG.getStore(Chain, dl, Arg, PtrOff,
2745 MachinePointerInfo::getStack(LocMemOffset),
2746 false, false, 0);
2747}
2748
2749/// Emit a load of return address if tail call
2750/// optimization is performed and it is required.
2751SDValue
2752X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2753 SDValue &OutRetAddr, SDValue Chain,
2754 bool IsTailCall, bool Is64Bit,
2755 int FPDiff, SDLoc dl) const {
2756 // Adjust the Return address stack slot.
2757 EVT VT = getPointerTy();
2758 OutRetAddr = getReturnAddressFrameIndex(DAG);
2759
2760 // Load the "old" Return address.
2761 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2762 false, false, false, 0);
2763 return SDValue(OutRetAddr.getNode(), 1);
2764}
2765
2766/// Emit a store of the return address if tail call
2767/// optimization is performed and it is required (FPDiff!=0).
2768static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2769 SDValue Chain, SDValue RetAddrFrIdx,
2770 EVT PtrVT, unsigned SlotSize,
2771 int FPDiff, SDLoc dl) {
2772 // Store the return address to the appropriate stack slot.
2773 if (!FPDiff) return Chain;
2774 // Calculate the new stack slot for the return address.
2775 int NewReturnAddrFI =
2776 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2777 false);
2778 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2779 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2780 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2781 false, false, 0);
2782 return Chain;
2783}
2784
2785SDValue
2786X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2787 SmallVectorImpl<SDValue> &InVals) const {
2788 SelectionDAG &DAG = CLI.DAG;
2789 SDLoc &dl = CLI.DL;
2790 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2791 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2792 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2793 SDValue Chain = CLI.Chain;
2794 SDValue Callee = CLI.Callee;
2795 CallingConv::ID CallConv = CLI.CallConv;
2796 bool &isTailCall = CLI.IsTailCall;
2797 bool isVarArg = CLI.IsVarArg;
2798
2799 MachineFunction &MF = DAG.getMachineFunction();
2800 bool Is64Bit = Subtarget->is64Bit();
2801 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2802 StructReturnType SR = callIsStructReturn(Outs);
2803 bool IsSibcall = false;
2804 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2805
2806 if (MF.getTarget().Options.DisableTailCalls)
2807 isTailCall = false;
2808
2809 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2810 if (IsMustTail) {
2811 // Force this to be a tail call. The verifier rules are enough to ensure
2812 // that we can lower this successfully without moving the return address
2813 // around.
2814 isTailCall = true;
2815 } else if (isTailCall) {
2816 // Check if it's really possible to do a tail call.
2817 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2818 isVarArg, SR != NotStructReturn,
2819 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2820 Outs, OutVals, Ins, DAG);
2821
2822 // Sibcalls are automatically detected tailcalls which do not require
2823 // ABI changes.
2824 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2825 IsSibcall = true;
2826
2827 if (isTailCall)
2828 ++NumTailCalls;
2829 }
2830
2831 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2832 "Var args not supported with calling convention fastcc, ghc or hipe");
2833
2834 // Analyze operands of the call, assigning locations to each operand.
2835 SmallVector<CCValAssign, 16> ArgLocs;
2836 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2837
2838 // Allocate shadow area for Win64
2839 if (IsWin64)
2840 CCInfo.AllocateStack(32, 8);
2841
2842 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2843
2844 // Get a count of how many bytes are to be pushed on the stack.
2845 unsigned NumBytes = CCInfo.getNextStackOffset();
2846 if (IsSibcall)
2847 // This is a sibcall. The memory operands are available in caller's
2848 // own caller's stack.
2849 NumBytes = 0;
2850 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2851 IsTailCallConvention(CallConv))
2852 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2853
2854 int FPDiff = 0;
2855 if (isTailCall && !IsSibcall && !IsMustTail) {
2856 // Lower arguments at fp - stackoffset + fpdiff.
2857 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2858
2859 FPDiff = NumBytesCallerPushed - NumBytes;
2860
2861 // Set the delta of movement of the returnaddr stackslot.
2862 // But only set if delta is greater than previous delta.
2863 if (FPDiff < X86Info->getTCReturnAddrDelta())
2864 X86Info->setTCReturnAddrDelta(FPDiff);
2865 }
2866
2867 unsigned NumBytesToPush = NumBytes;
2868 unsigned NumBytesToPop = NumBytes;
2869
2870 // If we have an inalloca argument, all stack space has already been allocated
2871 // for us and is right at the top of the stack. We don't support multiple
2872 // arguments passed in memory when using inalloca.
2873 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2874 NumBytesToPush = 0;
2875 if (!ArgLocs.back().isMemLoc())
2876 report_fatal_error("cannot use inalloca attribute on a register "
2877 "parameter");
2878 if (ArgLocs.back().getLocMemOffset() != 0)
2879 report_fatal_error("any parameter with the inalloca attribute must be "
2880 "the only memory argument");
2881 }
2882
2883 if (!IsSibcall)
2884 Chain = DAG.getCALLSEQ_START(
2885 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2886
2887 SDValue RetAddrFrIdx;
2888 // Load return address for tail calls.
2889 if (isTailCall && FPDiff)
2890 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2891 Is64Bit, FPDiff, dl);
2892
2893 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2894 SmallVector<SDValue, 8> MemOpChains;
2895 SDValue StackPtr;
2896
2897 // Walk the register/memloc assignments, inserting copies/loads. In the case
2898 // of tail call optimization, arguments are handled later.
2899 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2900 DAG.getSubtarget().getRegisterInfo());
2901 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902 // Skip inalloca arguments, they have already been written.
2903 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904 if (Flags.isInAlloca())
2905 continue;
2906
2907 CCValAssign &VA = ArgLocs[i];
2908 EVT RegVT = VA.getLocVT();
2909 SDValue Arg = OutVals[i];
2910 bool isByVal = Flags.isByVal();
2911
2912 // Promote the value if needed.
2913 switch (VA.getLocInfo()) {
2914 default: llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 2914)
;
2915 case CCValAssign::Full: break;
2916 case CCValAssign::SExt:
2917 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918 break;
2919 case CCValAssign::ZExt:
2920 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921 break;
2922 case CCValAssign::AExt:
2923 if (RegVT.is128BitVector()) {
2924 // Special case: passing MMX values in XMM registers.
2925 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928 } else
2929 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930 break;
2931 case CCValAssign::BCvt:
2932 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933 break;
2934 case CCValAssign::Indirect: {
2935 // Store the argument.
2936 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939 MachinePointerInfo::getFixedStack(FI),
2940 false, false, 0);
2941 Arg = SpillSlot;
2942 break;
2943 }
2944 }
2945
2946 if (VA.isRegLoc()) {
2947 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948 if (isVarArg && IsWin64) {
2949 // The Win64 ABI requires an argument XMM reg to be copied to the
2950 // corresponding shadow reg if the callee is a varargs function.
2951 unsigned ShadowReg = 0;
2952 switch (VA.getLocReg()) {
2953 case X86::XMM0: ShadowReg = X86::RCX; break;
2954 case X86::XMM1: ShadowReg = X86::RDX; break;
2955 case X86::XMM2: ShadowReg = X86::R8; break;
2956 case X86::XMM3: ShadowReg = X86::R9; break;
2957 }
2958 if (ShadowReg)
2959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960 }
2961 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2962 assert(VA.isMemLoc());
2963 if (!StackPtr.getNode())
2964 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965 getPointerTy());
2966 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967 dl, DAG, VA, Flags));
2968 }
2969 }
2970
2971 if (!MemOpChains.empty())
2972 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974 if (Subtarget->isPICStyleGOT()) {
2975 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976 // GOT pointer.
2977 if (!isTailCall) {
2978 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980 } else {
2981 // If we are tail calling and generating PIC/GOT style code load the
2982 // address of the callee into ECX. The value in ecx is used as target of
2983 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984 // for tail calls on PIC/GOT architectures. Normally we would just put the
2985 // address of GOT into ebx and then call target@PLT. But for tail calls
2986 // ebx would be restored (since ebx is callee saved) before jumping to the
2987 // target@PLT.
2988
2989 // Note: The actual moving to ECX is done further down.
2990 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992 !G->getGlobal()->hasProtectedVisibility())
2993 Callee = LowerGlobalAddress(Callee, DAG);
2994 else if (isa<ExternalSymbolSDNode>(Callee))
2995 Callee = LowerExternalSymbol(Callee, DAG);
2996 }
2997 }
2998
2999 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000 // From AMD64 ABI document:
3001 // For calls that may call functions that use varargs or stdargs
3002 // (prototype-less calls or calls to functions containing ellipsis (...) in
3003 // the declaration) %al is used as hidden argument to specify the number
3004 // of SSE registers used. The contents of %al do not need to match exactly
3005 // the number of registers, but must be an upper bound on the number of SSE
3006 // registers used and is in the range 0 - 8 inclusive.
3007
3008 // Count the number of XMM registers allocated.
3009 static const MCPhysReg XMMArgRegs[] = {
3010 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012 };
3013 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014 assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015 && "SSE registers cannot be used when SSE is disabled");
3016
3017 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018 DAG.getConstant(NumXMMRegs, MVT::i8)));
3019 }
3020
3021 if (isVarArg && IsMustTail) {
3022 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023 for (const auto &F : Forwards) {
3024 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026 }
3027 }
3028
3029 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3030 // don't need this because the eligibility check rejects calls that require
3031 // shuffling arguments passed in memory.
3032 if (!IsSibcall && isTailCall) {
3033 // Force all the incoming stack arguments to be loaded from the stack
3034 // before any new outgoing arguments are stored to the stack, because the
3035 // outgoing stack slots may alias the incoming argument stack slots, and
3036 // the alias isn't otherwise explicit. This is slightly more conservative
3037 // than necessary, because it means that each store effectively depends
3038 // on every argument instead of just those arguments it would clobber.
3039 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041 SmallVector<SDValue, 8> MemOpChains2;
3042 SDValue FIN;
3043 int FI = 0;
3044 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045 CCValAssign &VA = ArgLocs[i];
3046 if (VA.isRegLoc())
3047 continue;
3048 assert(VA.isMemLoc());
3049 SDValue Arg = OutVals[i];
3050 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051 // Skip inalloca arguments. They don't require any work.
3052 if (Flags.isInAlloca())
3053 continue;
3054 // Create frame index.
3055 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058 FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060 if (Flags.isByVal()) {
3061 // Copy relative to framepointer.
3062 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063 if (!StackPtr.getNode())
3064 StackPtr = DAG.getCopyFromReg(Chain, dl,
3065 RegInfo->getStackRegister(),
3066 getPointerTy());
3067 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070 ArgChain,
3071 Flags, DAG, dl));
3072 } else {
3073 // Store relative to framepointer.
3074 MemOpChains2.push_back(
3075 DAG.getStore(ArgChain, dl, Arg, FIN,
3076 MachinePointerInfo::getFixedStack(FI),
3077 false, false, 0));
3078 }
3079 }
3080
3081 if (!MemOpChains2.empty())
3082 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084 // Store the return address to the appropriate stack slot.
3085 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086 getPointerTy(), RegInfo->getSlotSize(),
3087 FPDiff, dl);
3088 }
3089
3090 // Build a sequence of copy-to-reg nodes chained together with token chain
3091 // and flag operands which copy the outgoing args into registers.
3092 SDValue InFlag;
3093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095 RegsToPass[i].second, InFlag);
3096 InFlag = Chain.getValue(1);
3097 }
3098
3099 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101 // In the 64-bit large code model, we have to make all calls
3102 // through a register, since the call instruction's 32-bit
3103 // pc-relative offset may not be large enough to hold the whole
3104 // address.
3105 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106 // If the callee is a GlobalAddress node (quite common, every direct call
3107 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108 // it.
3109 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111 // We should use extra load for direct calls to dllimported functions in
3112 // non-JIT mode.
3113 const GlobalValue *GV = G->getGlobal();
3114 if (!GV->hasDLLImportStorageClass()) {
3115 unsigned char OpFlags = 0;
3116 bool ExtraLoad = false;
3117 unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120 // external symbols must go through the PLT in PIC mode. If the symbol
3121 // has hidden or protected visibility, or if it is static or local, then
3122 // we don't need to use the PLT - we can directly call it.
3123 if (Subtarget->isTargetELF() &&
3124 DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126 OpFlags = X86II::MO_PLT;
3127 } else if (Subtarget->isPICStyleStubAny() &&
3128 (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129 (!Subtarget->getTargetTriple().isMacOSX() ||
3130 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131 // PC-relative references to external symbols should go through $stub,
3132 // unless we're building with the leopard linker or later, which
3133 // automatically synthesizes these stubs.
3134 OpFlags = X86II::MO_DARWIN_STUB;
3135 } else if (Subtarget->isPICStyleRIPRel() &&
3136 isa<Function>(GV) &&
3137 cast<Function>(GV)->getAttributes().
3138 hasAttribute(AttributeSet::FunctionIndex,
3139 Attribute::NonLazyBind)) {
3140 // If the function is marked as non-lazy, generate an indirect call
3141 // which loads from the GOT directly. This avoids runtime overhead
3142 // at the cost of eager binding (and one extra byte of encoding).
3143 OpFlags = X86II::MO_GOTPCREL;
3144 WrapperKind = X86ISD::WrapperRIP;
3145 ExtraLoad = true;
3146 }
3147
3148 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149 G->getOffset(), OpFlags);
3150
3151 // Add a wrapper if needed.
3152 if (WrapperKind != ISD::DELETED_NODE)
3153 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154 // Add extra indirection if needed.
3155 if (ExtraLoad)
3156 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157 MachinePointerInfo::getGOT(),
3158 false, false, false, 0);
3159 }
3160 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161 unsigned char OpFlags = 0;
3162
3163 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164 // external symbols should go through the PLT.
3165 if (Subtarget->isTargetELF() &&
3166 DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167 OpFlags = X86II::MO_PLT;
3168 } else if (Subtarget->isPICStyleStubAny() &&
3169 (!Subtarget->getTargetTriple().isMacOSX() ||
3170 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171 // PC-relative references to external symbols should go through $stub,
3172 // unless we're building with the leopard linker or later, which
3173 // automatically synthesizes these stubs.
3174 OpFlags = X86II::MO_DARWIN_STUB;
3175 }
3176
3177 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178 OpFlags);
3179 } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
3180 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3181 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3182 }
3183
3184 // Returns a chain & a flag for retval copy to use.
3185 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3186 SmallVector<SDValue, 8> Ops;
3187
3188 if (!IsSibcall && isTailCall) {
3189 Chain = DAG.getCALLSEQ_END(Chain,
3190 DAG.getIntPtrConstant(NumBytesToPop, true),
3191 DAG.getIntPtrConstant(0, true), InFlag, dl);
3192 InFlag = Chain.getValue(1);
3193 }
3194
3195 Ops.push_back(Chain);
3196 Ops.push_back(Callee);
3197
3198 if (isTailCall)
3199 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3200
3201 // Add argument registers to the end of the list so that they are known live
3202 // into the call.
3203 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3204 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3205 RegsToPass[i].second.getValueType()));
3206
3207 // Add a register mask operand representing the call-preserved registers.
3208 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
3209 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3210 assert(Mask && "Missing call preserved mask for calling convention");
3211 Ops.push_back(DAG.getRegisterMask(Mask));
3212
3213 if (InFlag.getNode())
3214 Ops.push_back(InFlag);
3215
3216 if (isTailCall) {
3217 // We used to do:
3218 //// If this is the first return lowered for this function, add the regs
3219 //// to the liveout set for the function.
3220 // This isn't right, although it's probably harmless on x86; liveouts
3221 // should be computed from returns not tail calls. Consider a void
3222 // function making a tail call to a function returning int.
3223 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3224 }
3225
3226 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3227 InFlag = Chain.getValue(1);
3228
3229 // Create the CALLSEQ_END node.
3230 unsigned NumBytesForCalleeToPop;
3231 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3232 DAG.getTarget().Options.GuaranteedTailCallOpt))
3233 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3234 else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3235 !Subtarget->getTargetTriple().isOSMSVCRT() &&
3236 SR == StackStructReturn)
3237 // If this is a call to a struct-return function, the callee
3238 // pops the hidden struct pointer, so we have to push it back.
3239 // This is common for Darwin/X86, Linux & Mingw32 targets.
3240 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3241 NumBytesForCalleeToPop = 4;
3242 else
3243 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3244
3245 // Returns a flag for retval copy to use.
3246 if (!IsSibcall) {
3247 Chain = DAG.getCALLSEQ_END(Chain,
3248 DAG.getIntPtrConstant(NumBytesToPop, true),
3249 DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3250 true),
3251 InFlag, dl);
3252 InFlag = Chain.getValue(1);
3253 }
3254
3255 // Handle result values, copying them out of physregs into vregs that we
3256 // return.
3257 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3258 Ins, dl, DAG, InVals);
3259}
3260
3261//===----------------------------------------------------------------------===//
3262// Fast Calling Convention (tail call) implementation
3263//===----------------------------------------------------------------------===//
3264
3265 // Like stdcall, the callee cleans up the arguments, except that ECX is
3266// reserved for storing the tail called function address. Only 2 registers are
3267// free for argument passing (inreg). Tail call optimization is performed
3268// provided:
3269// * tailcallopt is enabled
3270// * caller/callee are fastcc
3271// On X86_64 architecture with GOT-style position independent code only local
3272// (within module) calls are supported at the moment.
3273 // To keep the stack aligned according to the platform ABI, the function
3274 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3275 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.)
3276 // If a tail-called callee has more arguments than the caller, the
3277// caller needs to make sure that there is room to move the RETADDR to. This is
3278// achieved by reserving an area the size of the argument delta right after the
3279// original RETADDR, but before the saved framepointer or the spilled registers
3280// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3281// stack layout:
3282// arg1
3283// arg2
3284// RETADDR
3285// [ new RETADDR
3286// move area ]
3287// (possible EBP)
3288// ESI
3289// EDI
3290// local1 ..
3291
3292 /// GetAlignedArgumentStackSize - Round the stack size up so that it is, e.g.,
3293 /// of the form 16n + 12 for a 16-byte alignment requirement.
3294unsigned
3295X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3296 SelectionDAG& DAG) const {
3297 MachineFunction &MF = DAG.getMachineFunction();
3298 const TargetMachine &TM = MF.getTarget();
3299 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3300 TM.getSubtargetImpl()->getRegisterInfo());
3301 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3302 unsigned StackAlignment = TFI.getStackAlignment();
3303 uint64_t AlignMask = StackAlignment - 1;
3304 int64_t Offset = StackSize;
3305 unsigned SlotSize = RegInfo->getSlotSize();
3306 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3307 // Number smaller than 12 so just add the difference.
3308 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3309 } else {
3310 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3311 Offset = ((~AlignMask) & Offset) + StackAlignment +
3312 (StackAlignment-SlotSize);
3313 }
3314 return Offset;
3315}
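
A self-contained sketch of the rounding GetAlignedArgumentStackSize performs, with a few worked values; the helper name and the test inputs are illustrative only, not taken from this file:

#include <cassert>
#include <cstdint>

// Same arithmetic as above, restated for illustration.
static uint64_t alignArgStackSize(uint64_t StackSize, uint64_t StackAlignment,
                                  uint64_t SlotSize) {
  uint64_t AlignMask = StackAlignment - 1;
  uint64_t Offset = StackSize;
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
    Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
  else
    Offset = (Offset & ~AlignMask) + StackAlignment + (StackAlignment - SlotSize);
  return Offset;
}

int main() {
  assert(alignArgStackSize(20, 16, 4) == 28); // 32-bit slots: results land on 16n + 12
  assert(alignArgStackSize(30, 16, 4) == 44);
  assert(alignArgStackSize(20, 16, 8) == 24); // 64-bit slots: results land on 16n + 8
  return 0;
}
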
3316
3317/// MatchingStackOffset - Return true if the given stack call argument is
3318/// already available in the same position (relatively) of the caller's
3319/// incoming argument stack.
3320static
3321bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3322 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3323 const X86InstrInfo *TII) {
3324 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3325 int FI = INT_MAX;
3326 if (Arg.getOpcode() == ISD::CopyFromReg) {
3327 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3328 if (!TargetRegisterInfo::isVirtualRegister(VR))
3329 return false;
3330 MachineInstr *Def = MRI->getVRegDef(VR);
3331 if (!Def)
3332 return false;
3333 if (!Flags.isByVal()) {
3334 if (!TII->isLoadFromStackSlot(Def, FI))
3335 return false;
3336 } else {
3337 unsigned Opcode = Def->getOpcode();
3338 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3339 Opcode == X86::LEA64_32r) &&
3340 Def->getOperand(1).isFI()) {
3341 FI = Def->getOperand(1).getIndex();
3342 Bytes = Flags.getByValSize();
3343 } else
3344 return false;
3345 }
3346 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3347 if (Flags.isByVal())
3348 // ByVal argument is passed in as a pointer but it's now being
3349 // dereferenced. e.g.
3350 // define @foo(%struct.X* %A) {
3351 // tail call @bar(%struct.X* byval %A)
3352 // }
3353 return false;
3354 SDValue Ptr = Ld->getBasePtr();
3355 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3356 if (!FINode)
3357 return false;
3358 FI = FINode->getIndex();
3359 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3360 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3361 FI = FINode->getIndex();
3362 Bytes = Flags.getByValSize();
3363 } else
3364 return false;
3365
3366 assert(FI != INT_MAX);
3367 if (!MFI->isFixedObjectIndex(FI))
3368 return false;
3369 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3370}
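
For intuition only (the functions below are hypothetical, not from this file): the pattern MatchingStackOffset is meant to accept is a wrapper that forwards its stack arguments unchanged. Each outgoing argument is then already a load from the caller's fixed incoming slot at the same offset and size, so the call can typically be emitted as a sibcall.

int callee(int a, int b);

// On 32-bit x86 with the default C calling convention, 'a' and 'b' already
// live at the incoming argument offsets the callee expects, so no stack
// shuffling is needed before jumping to callee.
int wrapper(int a, int b) {
  return callee(a, b);
}
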
3371
3372/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3373/// for tail call optimization. Targets which want to do tail call
3374/// optimization should implement this function.
3375bool
3376X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3377 CallingConv::ID CalleeCC,
3378 bool isVarArg,
3379 bool isCalleeStructRet,
3380 bool isCallerStructRet,
3381 Type *RetTy,
3382 const SmallVectorImpl<ISD::OutputArg> &Outs,
3383 const SmallVectorImpl<SDValue> &OutVals,
3384 const SmallVectorImpl<ISD::InputArg> &Ins,
3385 SelectionDAG &DAG) const {
3386 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3387 return false;
3388
3389 // If -tailcallopt is specified, make fastcc functions tail-callable.
3390 const MachineFunction &MF = DAG.getMachineFunction();
3391 const Function *CallerF = MF.getFunction();
3392
3393 // If the function return type is x86_fp80 and the callee return type is not,
3394 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3395 // perform a tailcall optimization here.
3396 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3397 return false;
3398
3399 CallingConv::ID CallerCC = CallerF->getCallingConv();
3400 bool CCMatch = CallerCC == CalleeCC;
3401 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3402 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3403
3404 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3405 if (IsTailCallConvention(CalleeCC) && CCMatch)
3406 return true;
3407 return false;
3408 }
3409
3410 // Look for obvious safe cases to perform tail call optimization that do not
3411 // require ABI changes. This is what gcc calls sibcall.
3412
3413 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3414 // emit a special epilogue.
3415 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3416 DAG.getSubtarget().getRegisterInfo());
3417 if (RegInfo->needsStackRealignment(MF))
3418 return false;
3419
3420 // Also avoid sibcall optimization if either caller or callee uses struct
3421 // return semantics.
3422 if (isCalleeStructRet || isCallerStructRet)
3423 return false;
3424
3425 // An stdcall/thiscall caller is expected to clean up its arguments; the
3426 // callee isn't going to do that.
3427 // FIXME: this is more restrictive than needed. We could produce a tailcall
3428 // when the stack adjustment matches. For example, with a thiscall that takes
3429 // only one argument.
3430 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3431 CallerCC == CallingConv::X86_ThisCall))
3432 return false;
3433
3434 // Do not sibcall optimize vararg calls unless all arguments are passed via
3435 // registers.
3436 if (isVarArg && !Outs.empty()) {
3437
3438 // Optimizing for varargs on Win64 is unlikely to be safe without
3439 // additional testing.
3440 if (IsCalleeWin64 || IsCallerWin64)
3441 return false;
3442
3443 SmallVector<CCValAssign, 16> ArgLocs;
3444 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3445 *DAG.getContext());
3446
3447 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3448 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3449 if (!ArgLocs[i].isRegLoc())
3450 return false;
3451 }
3452
3453 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3454 // stack. Therefore, if it's not used by the call it is not safe to optimize
3455 // this into a sibcall.
3456 bool Unused = false;
3457 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3458 if (!Ins[i].Used) {
3459 Unused = true;
3460 break;
3461 }
3462 }
3463 if (Unused) {
3464 SmallVector<CCValAssign, 16> RVLocs;
3465 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3466 *DAG.getContext());
3467 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3468 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3469 CCValAssign &VA = RVLocs[i];
3470 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3471 return false;
3472 }
3473 }
3474
3475 // If the calling conventions do not match, then we'd better make sure the
3476 // results are returned in the same way as what the caller expects.
3477 if (!CCMatch) {
3478 SmallVector<CCValAssign, 16> RVLocs1;
3479 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3480 *DAG.getContext());
3481 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3482
3483 SmallVector<CCValAssign, 16> RVLocs2;
3484 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3485 *DAG.getContext());
3486 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3487
3488 if (RVLocs1.size() != RVLocs2.size())
3489 return false;
3490 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3491 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3492 return false;
3493 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3494 return false;
3495 if (RVLocs1[i].isRegLoc()) {
3496 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3497 return false;
3498 } else {
3499 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3500 return false;
3501 }
3502 }
3503 }
3504
3505 // If the callee takes no arguments then go on to check the results of the
3506 // call.
3507 if (!Outs.empty()) {
3508 // Check if stack adjustment is needed. For now, do not do this if any
3509 // argument is passed on the stack.
3510 SmallVector<CCValAssign, 16> ArgLocs;
3511 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3512 *DAG.getContext());
3513
3514 // Allocate shadow area for Win64
3515 if (IsCalleeWin64)
3516 CCInfo.AllocateStack(32, 8);
3517
3518 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3519 if (CCInfo.getNextStackOffset()) {
3520 MachineFunction &MF = DAG.getMachineFunction();
3521 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3522 return false;
3523
3524 // Check if the arguments are already laid out in the right way as
3525 // the caller's fixed stack objects.
3526 MachineFrameInfo *MFI = MF.getFrameInfo();
3527 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3528 const X86InstrInfo *TII =
3529 static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
3530 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3531 CCValAssign &VA = ArgLocs[i];
3532 SDValue Arg = OutVals[i];
3533 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3534 if (VA.getLocInfo() == CCValAssign::Indirect)
3535 return false;
3536 if (!VA.isRegLoc()) {
3537 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3538 MFI, MRI, TII))
3539 return false;
3540 }
3541 }
3542 }
3543
3544 // If the tailcall address may be in a register, then make sure it's
3545 // possible to register allocate for it. In 32-bit, the call address can
3546 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3547 // callee-saved registers are restored. These happen to be the same
3548 // registers used to pass 'inreg' arguments so watch out for those.
3549 if (!Subtarget->is64Bit() &&
3550 ((!isa<GlobalAddressSDNode>(Callee) &&
3551 !isa<ExternalSymbolSDNode>(Callee)) ||
3552 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3553 unsigned NumInRegs = 0;
3554 // In PIC we need an extra register to formulate the address computation
3555 // for the callee.
3556 unsigned MaxInRegs =
3557 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3558
3559 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3560 CCValAssign &VA = ArgLocs[i];
3561 if (!VA.isRegLoc())
3562 continue;
3563 unsigned Reg = VA.getLocReg();
3564 switch (Reg) {
3565 default: break;
3566 case X86::EAX: case X86::EDX: case X86::ECX:
3567 if (++NumInRegs == MaxInRegs)
3568 return false;
3569 break;
3570 }
3571 }
3572 }
3573 }
3574
3575 return true;
3576}
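
An illustrative source-level case for the x86_fp80 return-type check near the top of this function, assuming a target where long double is the 80-bit x87 type (function names are hypothetical): the caller's return needs an FP_EXTEND of the callee's result, so the call must not be tail-call optimized.

double produce();

long double consume() {
  return produce(); // result must be extended from f64 to x86_fp80 after the call
}
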
3577
3578FastISel *
3579X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3580 const TargetLibraryInfo *libInfo) const {
3581 return X86::createFastISel(funcInfo, libInfo);
3582}
3583
3584//===----------------------------------------------------------------------===//
3585// Other Lowering Hooks
3586//===----------------------------------------------------------------------===//
3587
3588static bool MayFoldLoad(SDValue Op) {
3589 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3590}
3591
3592static bool MayFoldIntoStore(SDValue Op) {
3593 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3594}
3595
3596static bool isTargetShuffle(unsigned Opcode) {
3597 switch(Opcode) {
3598 default: return false;
3599 case X86ISD::BLENDI:
3600 case X86ISD::PSHUFB:
3601 case X86ISD::PSHUFD:
3602 case X86ISD::PSHUFHW:
3603 case X86ISD::PSHUFLW:
3604 case X86ISD::SHUFP:
3605 case X86ISD::PALIGNR:
3606 case X86ISD::MOVLHPS:
3607 case X86ISD::MOVLHPD:
3608 case X86ISD::MOVHLPS:
3609 case X86ISD::MOVLPS:
3610 case X86ISD::MOVLPD:
3611 case X86ISD::MOVSHDUP:
3612 case X86ISD::MOVSLDUP:
3613 case X86ISD::MOVDDUP:
3614 case X86ISD::MOVSS:
3615 case X86ISD::MOVSD:
3616 case X86ISD::UNPCKL:
3617 case X86ISD::UNPCKH:
3618 case X86ISD::VPERMILPI:
3619 case X86ISD::VPERM2X128:
3620 case X86ISD::VPERMI:
3621 return true;
3622 }
3623}
3624
3625static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3626 SDValue V1, SelectionDAG &DAG) {
3627 switch(Opc) {
3628 default: llvm_unreachable("Unknown x86 shuffle node");
3629 case X86ISD::MOVSHDUP:
3630 case X86ISD::MOVSLDUP:
3631 case X86ISD::MOVDDUP:
3632 return DAG.getNode(Opc, dl, VT, V1);
3633 }
3634}
3635
3636static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3637 SDValue V1, unsigned TargetMask,
3638 SelectionDAG &DAG) {
3639 switch(Opc) {
3640 default: llvm_unreachable("Unknown x86 shuffle node");
3641 case X86ISD::PSHUFD:
3642 case X86ISD::PSHUFHW:
3643 case X86ISD::PSHUFLW:
3644 case X86ISD::VPERMILPI:
3645 case X86ISD::VPERMI:
3646 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3647 }
3648}
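
For the immediate-carrying single-input shuffles handled above (PSHUFD, PSHUFHW, PSHUFLW, VPERMILPI), the 8-bit TargetMask packs one 2-bit source index per destination lane, least significant pair first. A small illustrative helper, not part of this file, showing the encoding:

// Encode a 4-lane shuffle mask into a PSHUFD-style immediate.
static unsigned encodeShuffleImm(unsigned M0, unsigned M1, unsigned M2,
                                 unsigned M3) {
  return (M0 & 3) | ((M1 & 3) << 2) | ((M2 & 3) << 4) | ((M3 & 3) << 6);
}

// Example: the mask <2,3,0,1> (swap the two 64-bit halves of a v4i32) encodes
// to encodeShuffleImm(2, 3, 0, 1) == 0x4E.
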
3649
3650static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3651 SDValue V1, SDValue V2, unsigned TargetMask,
3652 SelectionDAG &DAG) {
3653 switch(Opc) {
3654 default: llvm_unreachable("Unknown x86 shuffle node");
3655 case X86ISD::PALIGNR:
3656 case X86ISD::VALIGN:
3657 case X86ISD::SHUFP:
3658 case X86ISD::VPERM2X128:
3659 return DAG.getNode(Opc, dl, VT, V1, V2,
3660 DAG.getConstant(TargetMask, MVT::i8));
3661 }
3662}
3663
3664static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3665 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3666 switch(Opc) {
3667 default: llvm_unreachable("Unknown x86 shuffle node");
3668 case X86ISD::MOVLHPS:
3669 case X86ISD::MOVLHPD:
3670 case X86ISD::MOVHLPS:
3671 case X86ISD::MOVLPS:
3672 case X86ISD::MOVLPD:
3673 case X86ISD::MOVSS:
3674 case X86ISD::MOVSD:
3675 case X86ISD::UNPCKL:
3676 case X86ISD::UNPCKH:
3677 return DAG.getNode(Opc, dl, VT, V1, V2);
3678 }
3679}
3680
3681SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3682 MachineFunction &MF = DAG.getMachineFunction();
3683 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3684 DAG.getSubtarget().getRegisterInfo());
3685 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3686 int ReturnAddrIndex = FuncInfo->getRAIndex();
3687
3688 if (ReturnAddrIndex == 0) {
3689 // Set up a frame object for the return address.
3690 unsigned SlotSize = RegInfo->getSlotSize();
3691 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3692 -(int64_t)SlotSize,
3693 false);
3694 FuncInfo->setRAIndex(ReturnAddrIndex);
3695 }
3696
3697 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3698}
3699
3700bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3701 bool hasSymbolicDisplacement) {
3702 // Offset should fit into 32 bit immediate field.
3703 if (!isInt<32>(Offset))
3704 return false;
3705
3706 // If we don't have a symbolic displacement - we don't have any extra
3707 // restrictions.
3708 if (!hasSymbolicDisplacement)
3709 return true;
3710
3711 // FIXME: Some tweaks might be needed for medium code model.
3712 if (M != CodeModel::Small && M != CodeModel::Kernel)
3713 return false;
3714
3715 // For the small code model we assume that the last object ends at least 16MB
3716 // before the 2^31 boundary. We may also accept pretty large negative constants,
3717 // knowing that all objects are in the positive half of the address space.
3718 if (M == CodeModel::Small && Offset < 16*1024*1024)
3719 return true;
3720
3721 // For the kernel code model we know that all objects reside in the negative
3722 // half of the 32-bit address space. We may not accept negative offsets, since
3723 // they may be just off, and we may accept pretty large positive ones.
3724 if (M == CodeModel::Kernel && Offset >= 0)
3725 return true;
3726
3727 return false;
3728}
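
A self-contained restatement of the rules above with a few worked values folded into the comments; the enum and helper names are illustrative only and are not the LLVM API:

#include <cstdint>

enum class Model { Small, Kernel, Other };

// Mirrors the logic of X86::isOffsetSuitableForCodeModel, for illustration.
static bool offsetFitsCodeModel(int64_t Offset, Model M, bool HasSymDisp) {
  if (Offset < INT32_MIN || Offset > INT32_MAX)
    return false;                       // must fit a signed 32-bit immediate
  if (!HasSymDisp)
    return true;                        // no symbolic displacement: no extra limits
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;   // e.g. 10 MiB is accepted, 32 MiB is not
  if (M == Model::Kernel)
    return Offset >= 0;                 // e.g. -8 is rejected in the kernel model
  return false;
}
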
3729
3730/// isCalleePop - Determines whether the callee is required to pop its
3731/// own arguments. Callee pop is necessary to support tail calls.
3732bool X86::isCalleePop(CallingConv::ID CallingConv,
3733 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3734 switch (CallingConv) {
3735 default:
3736 return false;
3737 case CallingConv::X86_StdCall:
3738 case CallingConv::X86_FastCall:
3739 case CallingConv::X86_ThisCall:
3740 return !is64Bit;
3741 case CallingConv::Fast:
3742 case CallingConv::GHC:
3743 case CallingConv::HiPE:
3744 if (IsVarArg)
3745 return false;
3746 return TailCallOpt;
3747 }
3748}
3749
3750/// \brief Return true if the condition is an unsigned comparison operation.
3751static bool isX86CCUnsigned(unsigned X86CC) {
3752 switch (X86CC) {
3753 default: llvm_unreachable("Invalid integer condition!");
3754 case X86::COND_E: return true;
3755 case X86::COND_G: return false;
3756 case X86::COND_GE: return false;
3757 case X86::COND_L: return false;
3758 case X86::COND_LE: return false;
3759 case X86::COND_NE: return true;
3760 case X86::COND_B: return true;
3761 case X86::COND_A: return true;
3762 case X86::COND_BE: return true;
3763 case X86::COND_AE: return true;
3764 }
3765 llvm_unreachable("covered switch fell through?!");
3766}
3767
3768/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86-
3769/// specific condition code, returning the condition code and the LHS/RHS of the
3770/// comparison to make.
3771static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3772 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3773 if (!isFP) {
3774 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3775 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3776 // X > -1 -> X == 0, jump !sign.
3777 RHS = DAG.getConstant(0, RHS.getValueType());
3778 return X86::COND_NS;
3779 }
3780 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3781 // X < 0 -> X == 0, jump on sign.
3782 return X86::COND_S;
3783 }
3784 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3785 // X < 1 -> X <= 0
3786 RHS = DAG.getConstant(0, RHS.getValueType());
3787 return X86::COND_LE;
3788 }
3789 }
3790
3791 switch (SetCCOpcode) {
3792 default: llvm_unreachable("Invalid integer condition!");
3793 case ISD::SETEQ: return X86::COND_E;
3794 case ISD::SETGT: return X86::COND_G;
3795 case ISD::SETGE: return X86::COND_GE;
3796 case ISD::SETLT: return X86::COND_L;
3797 case ISD::SETLE: return X86::COND_LE;
3798 case ISD::SETNE: return X86::COND_NE;
3799 case ISD::SETULT: return X86::COND_B;
3800 case ISD::SETUGT: return X86::COND_A;
3801 case ISD::SETULE: return X86::COND_BE;
3802 case ISD::SETUGE: return X86::COND_AE;
3803 }
3804 }
3805
3806 // First determine if it is required or is profitable to flip the operands.
3807
3808 // If LHS is a foldable load, but RHS is not, flip the condition.
3809 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3810 !ISD::isNON_EXTLoad(RHS.getNode())) {
3811 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3812 std::swap(LHS, RHS);
3813 }
3814
3815 switch (SetCCOpcode) {
3816 default: break;
3817 case ISD::SETOLT:
3818 case ISD::SETOLE:
3819 case ISD::SETUGT:
3820 case ISD::SETUGE:
3821 std::swap(LHS, RHS);
3822 break;
3823 }
3824
3825 // On a floating point condition, the flags are set as follows:
3826 // ZF PF CF op
3827 // 0 | 0 | 0 | X > Y
3828 // 0 | 0 | 1 | X < Y
3829 // 1 | 0 | 0 | X == Y
3830 // 1 | 1 | 1 | unordered
3831 switch (SetCCOpcode) {
3832 default: llvm_unreachable("Condcode should be pre-legalized away");
3833 case ISD::SETUEQ:
3834 case ISD::SETEQ: return X86::COND_E;
3835 case ISD::SETOLT: // flipped
3836 case ISD::SETOGT:
3837 case ISD::SETGT: return X86::COND_A;
3838 case ISD::SETOLE: // flipped
3839 case ISD::SETOGE:
3840 case ISD::SETGE: return X86::COND_AE;
3841 case ISD::SETUGT: // flipped
3842 case ISD::SETULT:
3843 case ISD::SETLT: return X86::COND_B;
3844 case ISD::SETUGE: // flipped
3845 case ISD::SETULE:
3846 case ISD::SETLE: return X86::COND_BE;
3847 case ISD::SETONE:
3848 case ISD::SETNE: return X86::COND_NE;
3849 case ISD::SETUO: return X86::COND_P;
3850 case ISD::SETO: return X86::COND_NP;
3851 case ISD::SETOEQ:
3852 case ISD::SETUNE: return X86::COND_INVALID;
3853 }
3854}
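
A reduced sketch of the integer-only path of the translation above, with plain enums standing in for ISD::CondCode and X86::CondCode (illustrative names; only a handful of codes and two of the constant-RHS rewrites are shown):

#include <cassert>

enum CC { SETEQ, SETGT, SETLT, SETULT };
enum X86CC { COND_E, COND_G, COND_L, COND_B, COND_NS, COND_LE };

// Translate an integer comparison; RHS may be rewritten (e.g. "x > -1" -> test the sign bit).
static X86CC translateInt(CC Op, long &RHS) {
  if (Op == SETGT && RHS == -1) { RHS = 0; return COND_NS; } // x > -1  ->  jump if sign clear
  if (Op == SETLT && RHS == 1)  { RHS = 0; return COND_LE; } // x < 1   ->  x <= 0
  switch (Op) {
  case SETEQ:  return COND_E;
  case SETGT:  return COND_G;
  case SETLT:  return COND_L;
  case SETULT: return COND_B;  // unsigned below
  }
  return COND_E;
}

int main() {
  long RHS = -1;
  assert(translateInt(SETGT, RHS) == COND_NS && RHS == 0);
}
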
3855
3856/// hasFPCMov - is there a floating point cmov for the specific X86 condition
3857/// code. The current x86 ISA includes the following FP cmov instructions:
3858/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3859static bool hasFPCMov(unsigned X86CC) {
3860 switch (X86CC) {
3861 default:
3862 return false;
3863 case X86::COND_B:
3864 case X86::COND_BE:
3865 case X86::COND_E:
3866 case X86::COND_P:
3867 case X86::COND_A:
3868 case X86::COND_AE:
3869 case X86::COND_NE:
3870 case X86::COND_NP:
3871 return true;
3872 }
3873}
3874
3875/// isFPImmLegal - Returns true if the target can instruction select the
3876/// specified FP immediate natively. If false, the legalizer will
3877/// materialize the FP immediate as a load from a constant pool.
3878bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3879 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3880 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3881 return true;
3882 }
3883 return false;
3884}
3885
3886bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3887 ISD::LoadExtType ExtTy,
3888 EVT NewVT) const {
3889 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3890 // relocations target a movq or addq instruction: don't let the load shrink.
3891 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3892 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3893 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3894 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3895 return true;
3896}
3897
3898/// \brief Returns true if it is beneficial to convert a load of a constant
3899/// to just the constant itself.
3900bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3901 Type *Ty) const {
3902 assert(Ty->isIntegerTy());
3903
3904 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3905 if (BitSize == 0 || BitSize > 64)
3906 return false;
3907 return true;
3908}
3909
3910bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3911 unsigned Index) const {
3912 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3913 return false;
3914
3915 return (Index == 0 || Index == ResVT.getVectorNumElements());
3916}
3917
3918bool X86TargetLowering::isCheapToSpeculateCttz() const {
3919 // Speculate cttz only if we can directly use TZCNT.
3920 return Subtarget->hasBMI();
3921}
3922
3923bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3924 // Speculate ctlz only if we can directly use LZCNT.
3925 return Subtarget->hasLZCNT();
3926}
3927
3928/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3929/// the specified range [Low, Hi).
3930static bool isUndefOrInRange(int Val, int Low, int Hi) {
3931 return (Val < 0) || (Val >= Low && Val < Hi);
3932}
3933
3934/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3935/// specified value.
3936static bool isUndefOrEqual(int Val, int CmpVal) {
3937 return (Val < 0 || Val == CmpVal);
3938}
3939
3940/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3941/// from position Pos and ending in Pos+Size, falls within the specified
3942/// sequential range [Low, Low+Size), or is undef.
3943static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3944 unsigned Pos, unsigned Size, int Low) {
3945 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3946 if (!isUndefOrEqual(Mask[i], Low))
3947 return false;
3948 return true;
3949}
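
A small self-check of the two helpers above, re-implemented on plain ints purely for illustration (the names mirror the LLVM routines but this is not the LLVM code):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }

static bool seqOrUndef(const std::vector<int> &M, unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!undefOrEq(M[i], Low))
      return false;
  return true;
}

int main() {
  // Elements 4..7 count up from 4, with one undef (-1) hole: still sequential-or-undef.
  assert(seqOrUndef({0, 1, 2, 3, 4, -1, 6, 7}, 4, 4, 4));
  // A swapped pair breaks the sequence.
  assert(!seqOrUndef({0, 1, 2, 3, 5, 4, 6, 7}, 4, 4, 4));
}
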
3950
3951/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3952/// is suitable for input to PSHUFD. That is, it doesn't reference the other
3953/// operand; by default it matches against the first operand.
3954static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3955 bool TestSecondOperand = false) {
3956 if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3957 VT != MVT::v2f64 && VT != MVT::v2i64)
3958 return false;
3959
3960 unsigned NumElems = VT.getVectorNumElements();
3961 unsigned Lo = TestSecondOperand ? NumElems : 0;
3962 unsigned Hi = Lo + NumElems;
3963
3964 for (unsigned i = 0; i < NumElems; ++i)
3965 if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3966 return false;
3967
3968 return true;
3969}
3970
3971/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3972/// is suitable for input to PSHUFHW.
3973static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3974 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3975 return false;
3976
3977 // Lower quadword copied in order or undef.
3978 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3979 return false;
3980
3981 // Upper quadword shuffled.
3982 for (unsigned i = 4; i != 8; ++i)
3983 if (!isUndefOrInRange(Mask[i], 4, 8))
3984 return false;
3985
3986 if (VT == MVT::v16i16) {
3987 // Lower quadword copied in order or undef.
3988 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3989 return false;
3990
3991 // Upper quadword shuffled.
3992 for (unsigned i = 12; i != 16; ++i)
3993 if (!isUndefOrInRange(Mask[i], 12, 16))
3994 return false;
3995 }
3996
3997 return true;
3998}
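
As a concrete illustration of the PSHUFHW shape, here is a standalone check for the v8i16 case only (a simplified re-implementation on plain ints; the v16i16 lane handling above is omitted):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }
static bool undefOrIn(int V, int Lo, int Hi) { return V < 0 || (V >= Lo && V < Hi); }

static bool looksLikePSHUFHW(const std::vector<int> &M) {
  for (int i = 0; i != 4; ++i)          // low quadword must be copied in order (or undef)
    if (!undefOrEq(M[i], i))
      return false;
  for (int i = 4; i != 8; ++i)          // high quadword may only be permuted within itself
    if (!undefOrIn(M[i], 4, 8))
      return false;
  return true;
}

int main() {
  assert(looksLikePSHUFHW({0, 1, 2, 3, 7, 6, 5, 4}));   // reverse the high half: OK
  assert(!looksLikePSHUFHW({0, 1, 2, 3, 0, 1, 2, 3}));  // pulls from the low half: not PSHUFHW
}
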
3999
4000/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
4001/// is suitable for input to PSHUFLW.
4002static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4003 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
4004 return false;
4005
4006 // Upper quadword copied in order.
4007 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4008 return false;
4009
4010 // Lower quadword shuffled.
4011 for (unsigned i = 0; i != 4; ++i)
4012 if (!isUndefOrInRange(Mask[i], 0, 4))
4013 return false;
4014
4015 if (VT == MVT::v16i16) {
4016 // Upper quadword copied in order.
4017 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4018 return false;
4019
4020 // Lower quadword shuffled.
4021 for (unsigned i = 8; i != 12; ++i)
4022 if (!isUndefOrInRange(Mask[i], 8, 12))
4023 return false;
4024 }
4025
4026 return true;
4027}
4028
4029/// \brief Return true if the mask specifies a shuffle of elements that is
4030/// suitable for input to intralane (palignr) or interlane (valign) vector
4031/// right-shift.
4032static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4033 unsigned NumElts = VT.getVectorNumElements();
4034 unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4035 unsigned NumLaneElts = NumElts/NumLanes;
4036
4037 // Do not handle 64-bit element shuffles with palignr.
4038 if (NumLaneElts == 2)
4039 return false;
4040
4041 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4042 unsigned i;
4043 for (i = 0; i != NumLaneElts; ++i) {
4044 if (Mask[i+l] >= 0)
4045 break;
4046 }
4047
4048 // Lane is all undef, go to next lane
4049 if (i == NumLaneElts)
4050 continue;
4051
4052 int Start = Mask[i+l];
4053
4054 // Make sure it's in this lane in one of the sources
4055 if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4056 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4057 return false;
4058
4059 // If not lane 0, then we must match lane 0
4060 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4061 return false;
4062
4063 // Correct second source to be contiguous with first source
4064 if (Start >= (int)NumElts)
4065 Start -= NumElts - NumLaneElts;
4066
4067 // Make sure we're shifting in the right direction.
4068 if (Start <= (int)(i+l))
4069 return false;
4070
4071 Start -= i;
4072
4073 // Check the rest of the elements to see if they are consecutive.
4074 for (++i; i != NumLaneElts; ++i) {
4075 int Idx = Mask[i+l];
4076
4077 // Make sure it's in this lane
4078 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4079 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4080 return false;
4081
4082 // If not lane 0, then we must match lane 0
4083 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4084 return false;
4085
4086 if (Idx >= (int)NumElts)
4087 Idx -= NumElts - NumLaneElts;
4088
4089 if (!isUndefOrEqual(Idx, Start+i))
4090 return false;
4091
4092 }
4093 }
4094
4095 return true;
4096}
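
A minimal sketch of the idea behind the contiguity check above, for a single 128-bit lane of i8 elements: a mask that selects a consecutive run maps directly to a PALIGNR byte immediate (simplified; the real code also handles undef elements, lane boundaries and the two-source wraparound):

#include <array>
#include <cassert>

static int palignrImm(const std::array<int, 16> &Mask) {
  int Start = Mask[0];                 // first selected element
  for (int i = 1; i != 16; ++i)        // every following element must be consecutive
    if (Mask[i] != Start + i)
      return -1;                       // not a plain rotation
  return Start;                        // i8 elements, so the byte immediate equals Start
}

int main() {
  std::array<int, 16> M;
  for (int i = 0; i != 16; ++i)
    M[i] = i + 4;                      // {4, 5, ..., 19}: rotate the concatenation by 4 bytes
  assert(palignrImm(M) == 4);
}
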
4097
4098/// \brief Return true if the node specifies a shuffle of elements that is
4099/// suitable for input to PALIGNR.
4100static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4101 const X86Subtarget *Subtarget) {
4102 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4103 (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4104 VT.is512BitVector())
4105 // FIXME: Add AVX512BW.
4106 return false;
4107
4108 return isAlignrMask(Mask, VT, false);
4109}
4110
4111/// \brief Return true if the node specifies a shuffle of elements that is
4112/// suitable for input to VALIGN.
4113static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4114 const X86Subtarget *Subtarget) {
4115 // FIXME: Add AVX512VL.
4116 if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4117 return false;
4118 return isAlignrMask(Mask, VT, true);
4119}
4120
4121/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4122/// the two vector operands have swapped position.
4123static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4124 unsigned NumElems) {
4125 for (unsigned i = 0; i != NumElems; ++i) {
4126 int idx = Mask[i];
4127 if (idx < 0)
4128 continue;
4129 else if (idx < (int)NumElems)
4130 Mask[i] = idx + NumElems;
4131 else
4132 Mask[i] = idx - NumElems;
4133 }
4134}
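
A quick illustration of the commuting rule above on a 4-element mask: indices that referred to V1 (0..3) move into V2's range (4..7) and vice versa, while undef (-1) entries are left untouched:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> M = {0, 5, -1, 2};
  for (int &Idx : M)
    if (Idx >= 0)
      Idx = (Idx < 4) ? Idx + 4 : Idx - 4;   // swap which operand each index points at
  assert((M == std::vector<int>{4, 1, -1, 6}));
}
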
4135
4136/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4137/// specifies a shuffle of elements that is suitable for input to 128/256-bit
4138/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4139/// reverse of what x86 shuffles want.
4140static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4141
4142 unsigned NumElems = VT.getVectorNumElements();
4143 unsigned NumLanes = VT.getSizeInBits()/128;
4144 unsigned NumLaneElems = NumElems/NumLanes;
4145
4146 if (NumLaneElems != 2 && NumLaneElems != 4)
4147 return false;
4148
4149 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4150 bool symetricMaskRequired =
4151 (VT.getSizeInBits() >= 256) && (EltSize == 32);
4152
4153 // VSHUFPSY divides the resulting vector into 4 chunks.
4154 // The sources are also split into 4 chunks, and each destination
4155 // chunk must come from a different source chunk.
4156 //
4157 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
4158 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
4159 //
4160 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
4161 // Y3..Y0, Y3..Y0, X3..X0, X3..X0
4162 //
4163 // VSHUFPDY divides the resulting vector into 4 chunks.
4164 // The sources are also split into 4 chunks, and each destination
4165 // chunk must come from a different source chunk.
4166 //
4167 // SRC1 => X3 X2 X1 X0
4168 // SRC2 => Y3 Y2 Y1 Y0
4169 //
4170 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
4171 //
4172 SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4173 unsigned HalfLaneElems = NumLaneElems/2;
4174 for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4175 for (unsigned i = 0; i != NumLaneElems; ++i) {
4176 int Idx = Mask[i+l];
4177 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4178 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4179 return false;
4180 // For VSHUFPSY, the mask of the second half must be the same as the
4181 // first but with the appropriate offsets. This works in the same way as
4182 // VPERMILPS works with masks.
4183 if (!symetricMaskRequired || Idx < 0)
4184 continue;
4185 if (MaskVal[i] < 0) {
4186 MaskVal[i] = Idx - l;
4187 continue;
4188 }
4189 if ((signed)(Idx - l) != MaskVal[i])
4190 return false;
4191 }
4192 }
4193
4194 return true;
4195}
4196
4197/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4198/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4199static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4200 if (!VT.is128BitVector())
4201 return false;
4202
4203 unsigned NumElems = VT.getVectorNumElements();
4204
4205 if (NumElems != 4)
4206 return false;
4207
4208 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4209 return isUndefOrEqual(Mask[0], 6) &&
4210 isUndefOrEqual(Mask[1], 7) &&
4211 isUndefOrEqual(Mask[2], 2) &&
4212 isUndefOrEqual(Mask[3], 3);
4213}
4214
4215/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4216/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4217/// <2, 3, 2, 3>
4218static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4219 if (!VT.is128BitVector())
4220 return false;
4221
4222 unsigned NumElems = VT.getVectorNumElements();
4223
4224 if (NumElems != 4)
4225 return false;
4226
4227 return isUndefOrEqual(Mask[0], 2) &&
4228 isUndefOrEqual(Mask[1], 3) &&
4229 isUndefOrEqual(Mask[2], 2) &&
4230 isUndefOrEqual(Mask[3], 3);
4231}
4232
4233/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4234/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4235static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4236 if (!VT.is128BitVector())
4237 return false;
4238
4239 unsigned NumElems = VT.getVectorNumElements();
4240
4241 if (NumElems != 2 && NumElems != 4)
4242 return false;
4243
4244 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4245 if (!isUndefOrEqual(Mask[i], i + NumElems))
4246 return false;
4247
4248 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4249 if (!isUndefOrEqual(Mask[i], i))
4250 return false;
4251
4252 return true;
4253}
4254
4255/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4256/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4257static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4258 if (!VT.is128BitVector())
4259 return false;
4260
4261 unsigned NumElems = VT.getVectorNumElements();
4262
4263 if (NumElems != 2 && NumElems != 4)
4264 return false;
4265
4266 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4267 if (!isUndefOrEqual(Mask[i], i))
4268 return false;
4269
4270 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4271 if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4272 return false;
4273
4274 return true;
4275}
4276
4277/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4278/// specifies a shuffle of elements that is suitable for input to INSERTPS.
4279/// i.e., if all but one element come from the same vector.
4280static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4281 // TODO: Deal with AVX's VINSERTPS
4282 if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4283 return false;
4284
4285 unsigned CorrectPosV1 = 0;
4286 unsigned CorrectPosV2 = 0;
4287 for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4288 if (Mask[i] == -1) {
4289 ++CorrectPosV1;
4290 ++CorrectPosV2;
4291 continue;
4292 }
4293
4294 if (Mask[i] == i)
4295 ++CorrectPosV1;
4296 else if (Mask[i] == i + 4)
4297 ++CorrectPosV2;
4298 }
4299
4300 if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4301 // We have 3 elements (undefs count as elements from any vector) from one
4302 // vector, and one from another.
4303 return true;
4304
4305 return false;
4306}
4307
4308//
4309// Some special combinations that can be optimized.
4310//
4311static
4312SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4313 SelectionDAG &DAG) {
4314 MVT VT = SVOp->getSimpleValueType(0);
4315 SDLoc dl(SVOp);
4316
4317 if (VT != MVT::v8i32 && VT != MVT::v8f32)
4318 return SDValue();
4319
4320 ArrayRef<int> Mask = SVOp->getMask();
4321
4322 // These are the special masks that may be optimized.
4323 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4324 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
4325 bool MatchEvenMask = true;
4326 bool MatchOddMask = true;
4327 for (int i=0; i<8; ++i) {
4328 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4329 MatchEvenMask = false;
4330 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4331 MatchOddMask = false;
4332 }
4333
4334 if (!MatchEvenMask && !MatchOddMask)
4335 return SDValue();
4336
4337 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4338
4339 SDValue Op0 = SVOp->getOperand(0);
4340 SDValue Op1 = SVOp->getOperand(1);
4341
4342 if (MatchEvenMask) {
4343 // Shift the second operand right to 32 bits.
4344 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4345 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4346 } else {
4347 // Shift the first operand left to 32 bits.
4348 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4349 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4350 }
4351 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4352 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4353}
4354
4355/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4356/// specifies a shuffle of elements that is suitable for input to UNPCKL.
4357static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4358 bool HasInt256, bool V2IsSplat = false) {
4359
4360 assert(VT.getSizeInBits() >= 128 &&
4361 "Unsupported vector type for unpckl");
4362
4363 unsigned NumElts = VT.getVectorNumElements();
4364 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4365 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4366 return false;
4367
4368 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4369 "Unsupported vector type for unpckh");
4370
4371 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4372 unsigned NumLanes = VT.getSizeInBits()/128;
4373 unsigned NumLaneElts = NumElts/NumLanes;
4374
4375 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4376 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4377 int BitI = Mask[l+i];
4378 int BitI1 = Mask[l+i+1];
4379 if (!isUndefOrEqual(BitI, j))
4380 return false;
4381 if (V2IsSplat) {
4382 if (!isUndefOrEqual(BitI1, NumElts))
4383 return false;
4384 } else {
4385 if (!isUndefOrEqual(BitI1, j + NumElts))
4386 return false;
4387 }
4388 }
4389 }
4390
4391 return true;
4392}
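
The canonical 128-bit UNPCKL pattern accepted above, shown for the v4i32 case on plain ints (a simplified re-implementation: one lane only, no V2IsSplat handling):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }

// unpcklps interleaves the low halves: result = { A0, B0, A1, B1 },
// where B's elements are mask indices 4..7.
static bool isUnpackLo4(const std::vector<int> &M) {
  for (int i = 0, j = 0; i != 4; i += 2, ++j)
    if (!undefOrEq(M[i], j) || !undefOrEq(M[i + 1], j + 4))
      return false;
  return true;
}

int main() {
  assert(isUnpackLo4({0, 4, 1, 5}));    // unpcklps
  assert(!isUnpackLo4({2, 6, 3, 7}));   // that is unpckhps, not unpcklps
}
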
4393
4394/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4395/// specifies a shuffle of elements that is suitable for input to UNPCKH.
4396static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4397 bool HasInt256, bool V2IsSplat = false) {
4398 assert(VT.getSizeInBits() >= 128 &&
4399 "Unsupported vector type for unpckh");
4400
4401 unsigned NumElts = VT.getVectorNumElements();
4402 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4403 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4404 return false;
4405
4406 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4407 "Unsupported vector type for unpckh");
4408
4409 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4410 unsigned NumLanes = VT.getSizeInBits()/128;
4411 unsigned NumLaneElts = NumElts/NumLanes;
4412
4413 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4414 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4415 int BitI = Mask[l+i];
4416 int BitI1 = Mask[l+i+1];
4417 if (!isUndefOrEqual(BitI, j))
4418 return false;
4419 if (V2IsSplat) {
4420 if (isUndefOrEqual(BitI1, NumElts))
4421 return false;
4422 } else {
4423 if (!isUndefOrEqual(BitI1, j+NumElts))
4424 return false;
4425 }
4426 }
4427 }
4428 return true;
4429}
4430
4431/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4432/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4433/// <0, 0, 1, 1>
4434static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4435 unsigned NumElts = VT.getVectorNumElements();
4436 bool Is256BitVec = VT.is256BitVector();
4437
4438 if (VT.is512BitVector())
4439 return false;
4440 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4441 "Unsupported vector type for unpckh");
4442
4443 if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4444 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4445 return false;
4446
4447 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4448 // FIXME: Need a better way to get rid of this, there's no latency difference
4449 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
4450 // the former later. We should also remove the "_undef" special mask.
4451 if (NumElts == 4 && Is256BitVec)
4452 return false;
4453
4454 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4455 // independently on 128-bit lanes.
4456 unsigned NumLanes = VT.getSizeInBits()/128;
4457 unsigned NumLaneElts = NumElts/NumLanes;
4458
4459 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4460 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4461 int BitI = Mask[l+i];
4462 int BitI1 = Mask[l+i+1];
4463
4464 if (!isUndefOrEqual(BitI, j))
4465 return false;
4466 if (!isUndefOrEqual(BitI1, j))
4467 return false;
4468 }
4469 }
4470
4471 return true;
4472}
4473
4474/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4475/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4476/// <2, 2, 3, 3>
4477static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4478 unsigned NumElts = VT.getVectorNumElements();
4479
4480 if (VT.is512BitVector())
4481 return false;
4482
4483 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4484 "Unsupported vector type for unpckh");
4485
4486 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4487 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4488 return false;
4489
4490 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4491 // independently on 128-bit lanes.
4492 unsigned NumLanes = VT.getSizeInBits()/128;
4493 unsigned NumLaneElts = NumElts/NumLanes;
4494
4495 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4496 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4497 int BitI = Mask[l+i];
4498 int BitI1 = Mask[l+i+1];
4499 if (!isUndefOrEqual(BitI, j))
4500 return false;
4501 if (!isUndefOrEqual(BitI1, j))
4502 return false;
4503 }
4504 }
4505 return true;
4506}
4507
4508// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4509// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4510static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4511 if (!VT.is512BitVector())
4512 return false;
4513
4514 unsigned NumElts = VT.getVectorNumElements();
4515 unsigned HalfSize = NumElts/2;
4516 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4517 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4518 *Imm = 1;
4519 return true;
4520 }
4521 }
4522 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4523 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4524 *Imm = 0;
4525 return true;
4526 }
4527 }
4528 return false;
4529}
4530
4531/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4532/// specifies a shuffle of elements that is suitable for input to MOVSS,
4533/// MOVSD, and MOVD, i.e. setting the lowest element.
4534static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4535 if (VT.getVectorElementType().getSizeInBits() < 32)
4536 return false;
4537 if (!VT.is128BitVector())
4538 return false;
4539
4540 unsigned NumElts = VT.getVectorNumElements();
4541
4542 if (!isUndefOrEqual(Mask[0], NumElts))
4543 return false;
4544
4545 for (unsigned i = 1; i != NumElts; ++i)
4546 if (!isUndefOrEqual(Mask[i], i))
4547 return false;
4548
4549 return true;
4550}
4551
4552/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4553/// as permutations between 128-bit chunks or halves. As an example: this
4554/// shuffle below:
4555/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4556/// The first half comes from the second half of V1 and the second half from
4557/// the second half of V2.
4558static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4559 if (!HasFp256 || !VT.is256BitVector())
4560 return false;
4561
4562 // The shuffle result is divided into half A and half B. In total the two
4563 // sources have 4 halves, namely: C, D, E, F. The final values of A and
4564 // B must come from C, D, E or F.
4565 unsigned HalfSize = VT.getVectorNumElements()/2;
4566 bool MatchA = false, MatchB = false;
4567
4568 // Check if A comes from one of C, D, E, F.
4569 for (unsigned Half = 0; Half != 4; ++Half) {
4570 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4571 MatchA = true;
4572 break;
4573 }
4574 }
4575
4576 // Check if B comes from one of C, D, E, F.
4577 for (unsigned Half = 0; Half != 4; ++Half) {
4578 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4579 MatchB = true;
4580 break;
4581 }
4582 }
4583
4584 return MatchA && MatchB;
4585}
4586
4587/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4588/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
4589static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4590 MVT VT = SVOp->getSimpleValueType(0);
4591
4592 unsigned HalfSize = VT.getVectorNumElements()/2;
4593
4594 unsigned FstHalf = 0, SndHalf = 0;
4595 for (unsigned i = 0; i < HalfSize; ++i) {
4596 if (SVOp->getMaskElt(i) > 0) {
4597 FstHalf = SVOp->getMaskElt(i)/HalfSize;
4598 break;
4599 }
4600 }
4601 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4602 if (SVOp->getMaskElt(i) > 0) {
4603 SndHalf = SVOp->getMaskElt(i)/HalfSize;
4604 break;
4605 }
4606 }
4607
4608 return (FstHalf | (SndHalf << 4));
4609}
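
For the documented example mask <4, 5, 6, 7, 12, 13, 14, 15> (v8i32), the immediate computed above works out as follows (a worked example that skips the undef handling in the real loop):

#include <cassert>

int main() {
  const int Mask[8] = {4, 5, 6, 7, 12, 13, 14, 15};
  const int HalfSize = 4;                    // 8 elements, 2 result halves
  int FstHalf = Mask[0] / HalfSize;          // 1: low result half = high half of V1
  int SndHalf = Mask[HalfSize] / HalfSize;   // 3: high result half = high half of V2
  assert((FstHalf | (SndHalf << 4)) == 0x31);
}
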
4610
4611// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
4612static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4613 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4614 if (EltSize < 32)
4615 return false;
4616
4617 unsigned NumElts = VT.getVectorNumElements();
4618 Imm8 = 0;
4619 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4620 for (unsigned i = 0; i != NumElts; ++i) {
4621 if (Mask[i] < 0)
4622 continue;
4623 Imm8 |= Mask[i] << (i*2);
4624 }
4625 return true;
4626 }
4627
4628 unsigned LaneSize = 4;
4629 SmallVector<int, 4> MaskVal(LaneSize, -1);
4630
4631 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4632 for (unsigned i = 0; i != LaneSize; ++i) {
4633 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4634 return false;
4635 if (Mask[i+l] < 0)
4636 continue;
4637 if (MaskVal[i] < 0) {
4638 MaskVal[i] = Mask[i+l] - l;
4639 Imm8 |= MaskVal[i] << (i*2);
4640 continue;
4641 }
4642 if (Mask[i+l] != (signed)(MaskVal[i]+l))
4643 return false;
4644 }
4645 }
4646 return true;
4647}
4648
4649/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4650/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4651/// Note that VPERMIL mask matching differs depending on whether the underlying
4652/// element type is 32 or 64 bits. For VPERMILPS the high half of the mask should
4653/// select the same elements as the low half, but from the higher half of the source.
4654/// In VPERMILPD the two lanes could be shuffled independently of each other
4655/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4656static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4657 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4658 if (VT.getSizeInBits() < 256 || EltSize < 32)
4659 return false;
4660 bool symetricMaskRequired = (EltSize == 32);
4661 unsigned NumElts = VT.getVectorNumElements();
4662
4663 unsigned NumLanes = VT.getSizeInBits()/128;
4664 unsigned LaneSize = NumElts/NumLanes;
4665 // 2 or 4 elements in one lane
4666
4667 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4668 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4669 for (unsigned i = 0; i != LaneSize; ++i) {
4670 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4671 return false;
4672 if (symetricMaskRequired) {
4673 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4674 ExpectedMaskVal[i] = Mask[i+l] - l;
4675 continue;
4676 }
4677 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4678 return false;
4679 }
4680 }
4681 }
4682 return true;
4683}
4684
4685/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
4686/// x86 movss wants: the lowest element must be the lowest element of vector 2,
4687/// and the other elements must come from vector 1 in order.
4688static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4689 bool V2IsSplat = false, bool V2IsUndef = false) {
4690 if (!VT.is128BitVector())
4691 return false;
4692
4693 unsigned NumOps = VT.getVectorNumElements();
4694 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4695 return false;
4696
4697 if (!isUndefOrEqual(Mask[0], 0))
4698 return false;
4699
4700 for (unsigned i = 1; i != NumOps; ++i)
4701 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4702 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4703 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4704 return false;
4705
4706 return true;
4707}
4708
4709/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4710/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4711/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4712static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4713 const X86Subtarget *Subtarget) {
4714 if (!Subtarget->hasSSE3())
4715 return false;
4716
4717 unsigned NumElems = VT.getVectorNumElements();
4718
4719 if ((VT.is128BitVector() && NumElems != 4) ||
4720 (VT.is256BitVector() && NumElems != 8) ||
4721 (VT.is512BitVector() && NumElems != 16))
4722 return false;
4723
4724 // "i+1" is the value the indexed mask element must have
4725 for (unsigned i = 0; i != NumElems; i += 2)
4726 if (!isUndefOrEqual(Mask[i], i+1) ||
4727 !isUndefOrEqual(Mask[i+1], i+1))
4728 return false;
4729
4730 return true;
4731}
4732
4733/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4734/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4735/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4736static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4737 const X86Subtarget *Subtarget) {
4738 if (!Subtarget->hasSSE3())
4739 return false;
4740
4741 unsigned NumElems = VT.getVectorNumElements();
4742
4743 if ((VT.is128BitVector() && NumElems != 4) ||
4744 (VT.is256BitVector() && NumElems != 8) ||
4745 (VT.is512BitVector() && NumElems != 16))
4746 return false;
4747
4748 // "i" is the value the indexed mask element must have
4749 for (unsigned i = 0; i != NumElems; i += 2)
4750 if (!isUndefOrEqual(Mask[i], i) ||
4751 !isUndefOrEqual(Mask[i+1], i))
4752 return false;
4753
4754 return true;
4755}
4756
4757/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4758/// specifies a shuffle of elements that is suitable for input to 256-bit
4759/// version of MOVDDUP.
4760static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4761 if (!HasFp256 || !VT.is256BitVector())
4762 return false;
4763
4764 unsigned NumElts = VT.getVectorNumElements();
4765 if (NumElts != 4)
4766 return false;
4767
4768 for (unsigned i = 0; i != NumElts/2; ++i)
4769 if (!isUndefOrEqual(Mask[i], 0))
4770 return false;
4771 for (unsigned i = NumElts/2; i != NumElts; ++i)
4772 if (!isUndefOrEqual(Mask[i], NumElts/2))
4773 return false;
4774 return true;
4775}
4776
4777/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4778/// specifies a shuffle of elements that is suitable for input to 128-bit
4779/// version of MOVDDUP.
4780static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4781 if (!VT.is128BitVector())
4782 return false;
4783
4784 unsigned e = VT.getVectorNumElements() / 2;
4785 for (unsigned i = 0; i != e; ++i)
4786 if (!isUndefOrEqual(Mask[i], i))
4787 return false;
4788 for (unsigned i = 0; i != e; ++i)
4789 if (!isUndefOrEqual(Mask[e+i], i))
4790 return false;
4791 return true;
4792}
4793
4794/// isVEXTRACTIndex - Return true if the specified
4795/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4796/// suitable for instructions that extract 128- or 256-bit vectors
4797static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4798 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4799 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4800 return false;
4801
4802 // The index should be aligned on a vecWidth-bit boundary.
4803 uint64_t Index =
4804 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4805
4806 MVT VT = N->getSimpleValueType(0);
4807 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4808 bool Result = (Index * ElSize) % vecWidth == 0;
4809
4810 return Result;
4811}
4812
4813/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4814/// operand specifies a subvector insert that is suitable for input to
4815/// insertion of 128 or 256-bit subvectors
4816static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4817 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4818 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4819 return false;
4820 // The index should be aligned on a vecWidth-bit boundary.
4821 uint64_t Index =
4822 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4823
4824 MVT VT = N->getSimpleValueType(0);
4825 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4826 bool Result = (Index * ElSize) % vecWidth == 0;
4827
4828 return Result;
4829}
4830
4831bool X86::isVINSERT128Index(SDNode *N) {
4832 return isVINSERTIndex(N, 128);
4833}
4834
4835bool X86::isVINSERT256Index(SDNode *N) {
4836 return isVINSERTIndex(N, 256);
4837}
4838
4839bool X86::isVEXTRACT128Index(SDNode *N) {
4840 return isVEXTRACTIndex(N, 128);
4841}
4842
4843bool X86::isVEXTRACT256Index(SDNode *N) {
4844 return isVEXTRACTIndex(N, 256);
4845}
4846
4847/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4848/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4849/// Handles 128-bit and 256-bit.
4850static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4851 MVT VT = N->getSimpleValueType(0);
4852
4853 assert((VT.getSizeInBits() >= 128) &&
4854 "Unsupported vector type for PSHUF/SHUFP");
4855
4856 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4857 // independently on 128-bit lanes.
4858 unsigned NumElts = VT.getVectorNumElements();
4859 unsigned NumLanes = VT.getSizeInBits()/128;
4860 unsigned NumLaneElts = NumElts/NumLanes;
4861
4862 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4863 "Only supports 2, 4 or 8 elements per lane");
4864
4865 unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4866 unsigned Mask = 0;
4867 for (unsigned i = 0; i != NumElts; ++i) {
4868 int Elt = N->getMaskElt(i);
4869 if (Elt < 0) continue;
4870 Elt &= NumLaneElts - 1;
4871 unsigned ShAmt = (i << Shift) % 8;
4872 Mask |= Elt << ShAmt;
4873 }
4874
4875 return Mask;
4876}
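
A worked example of the immediate packing above for a 128-bit, 4-element mask <3, 1, 2, 0>: two bits per element, element 0 in the lowest bits, giving the familiar shufps/pshufd immediate 0x27:

#include <cassert>

int main() {
  const int Mask[4] = {3, 1, 2, 0};
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (Mask[i] & 0x3) << (i * 2);   // 2 bits per element
  assert(Imm == 0x27);                   // binary 00 10 01 11
}
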
4877
4878/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4879/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4880static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4881 MVT VT = N->getSimpleValueType(0);
4882
4883 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4884 "Unsupported vector type for PSHUFHW");
4885
4886 unsigned NumElts = VT.getVectorNumElements();
4887
4888 unsigned Mask = 0;
4889 for (unsigned l = 0; l != NumElts; l += 8) {
4890 // 8 nodes per lane, but we only care about the last 4.
4891 for (unsigned i = 0; i < 4; ++i) {
4892 int Elt = N->getMaskElt(l+i+4);
4893 if (Elt < 0) continue;
4894 Elt &= 0x3; // only 2-bits.
4895 Mask |= Elt << (i * 2);
4896 }
4897 }
4898
4899 return Mask;
4900}
4901
4902/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4903/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4904static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4905 MVT VT = N->getSimpleValueType(0);
4906
4907 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4908 "Unsupported vector type for PSHUFHW");
4909
4910 unsigned NumElts = VT.getVectorNumElements();
4911
4912 unsigned Mask = 0;
4913 for (unsigned l = 0; l != NumElts; l += 8) {
4914 // 8 nodes per lane, but we only care about the first 4.
4915 for (unsigned i = 0; i < 4; ++i) {
4916 int Elt = N->getMaskElt(l+i);
4917 if (Elt < 0) continue;
4918 Elt &= 0x3; // only 2-bits
4919 Mask |= Elt << (i * 2);
4920 }
4921 }
4922
4923 return Mask;
4924}
4925
4926/// \brief Return the appropriate immediate to shuffle the specified
4927/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4928/// VALIGN (if Interlane is true) instructions.
4929static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4930 bool InterLane) {
4931 MVT VT = SVOp->getSimpleValueType(0);
4932 unsigned EltSize = InterLane ? 1 :
4933 VT.getVectorElementType().getSizeInBits() >> 3;
4934
4935 unsigned NumElts = VT.getVectorNumElements();
4936 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4937 unsigned NumLaneElts = NumElts/NumLanes;
4938
4939 int Val = 0;
4940 unsigned i;
4941 for (i = 0; i != NumElts; ++i) {
4942 Val = SVOp->getMaskElt(i);
4943 if (Val >= 0)
4944 break;
4945 }
4946 if (Val >= (int)NumElts)
4947 Val -= NumElts - NumLaneElts;
4948
4949 assert(Val - i > 0 && "PALIGNR imm should be positive");
4950 return (Val - i) * EltSize;
4951}
4952
4953/// \brief Return the appropriate immediate to shuffle the specified
4954/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4955static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4956 return getShuffleAlignrImmediate(SVOp, false);
4957}
4958
4959/// \brief Return the appropriate immediate to shuffle the specified
4960/// VECTOR_SHUFFLE mask with the VALIGN instruction.
4961static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4962 return getShuffleAlignrImmediate(SVOp, true);
4963}
4964
4965
4966static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4967 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4968 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4969 llvm_unreachable("Illegal extract subvector for VEXTRACT");
4970
4971 uint64_t Index =
4972 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4973
4974 MVT VecVT = N->getOperand(0).getSimpleValueType();
4975 MVT ElVT = VecVT.getVectorElementType();
4976
4977 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4978 return Index / NumElemsPerChunk;
4979}
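
An arithmetic example of the chunk-index computation above: extracting elements [4..7] of a v8f32 as a 128-bit subvector yields a VEXTRACTF128 immediate of 1:

#include <cassert>

int main() {
  const unsigned Index = 4;                      // first element of the extracted subvector
  const unsigned EltBits = 32;                   // f32 elements
  const unsigned NumElemsPerChunk = 128 / EltBits;
  assert(Index / NumElemsPerChunk == 1);         // the upper 128-bit half
}
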
4980
4981static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4982 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4983 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4984 llvm_unreachable("Illegal insert subvector for VINSERT");
4985
4986 uint64_t Index =
4987 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4988
4989 MVT VecVT = N->getSimpleValueType(0);
4990 MVT ElVT = VecVT.getVectorElementType();
4991
4992 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4993 return Index / NumElemsPerChunk;
4994}
4995
4996/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4997/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4998/// and VINSERTI128 instructions.
4999unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
5000 return getExtractVEXTRACTImmediate(N, 128);
5001}
5002
5003/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
5004/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5005/// and VINSERTI64x4 instructions.
5006unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5007 return getExtractVEXTRACTImmediate(N, 256);
5008}
5009
5010/// getInsertVINSERT128Immediate - Return the appropriate immediate
5011/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5012/// and VINSERTI128 instructions.
5013unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5014 return getInsertVINSERTImmediate(N, 128);
5015}
5016
5017/// getInsertVINSERT256Immediate - Return the appropriate immediate
5018/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
5019/// and VINSERTI64x4 instructions.
5020unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5021 return getInsertVINSERTImmediate(N, 256);
5022}
5023
5024/// isZero - Returns true if V is a constant integer zero.
5025static bool isZero(SDValue V) {
5026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5027 return C && C->isNullValue();
5028}
5029
5030/// isZeroNode - Returns true if Elt is a constant zero or a floating point
5031/// constant +0.0.
5032bool X86::isZeroNode(SDValue Elt) {
5033 if (isZero(Elt))
5034 return true;
5035 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5036 return CFP->getValueAPF().isPosZero();
5037 return false;
5038}
5039
5040/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5041/// match movhlps. The lower half elements should come from upper half of
5042/// V1 (and in order), and the upper half elements should come from the upper
5043/// half of V2 (and in order).
5044static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5045 if (!VT.is128BitVector())
5046 return false;
5047 if (VT.getVectorNumElements() != 4)
5048 return false;
5049 for (unsigned i = 0, e = 2; i != e; ++i)
5050 if (!isUndefOrEqual(Mask[i], i+2))
5051 return false;
5052 for (unsigned i = 2; i != 4; ++i)
5053 if (!isUndefOrEqual(Mask[i], i+4))
5054 return false;
5055 return true;
5056}
5057
5058/// isScalarLoadToVector - Returns true if the node is a scalar load that
5059/// is promoted to a vector. It also returns the LoadSDNode by reference if
5060/// required.
5061static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5062 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5063 return false;
5064 N = N->getOperand(0).getNode();
5065 if (!ISD::isNON_EXTLoad(N))
5066 return false;
5067 if (LD)
5068 *LD = cast<LoadSDNode>(N);
5069 return true;
5070}
5071
5072// Test whether the given value is a vector value which will be legalized
5073// into a load.
5074static bool WillBeConstantPoolLoad(SDNode *N) {
5075 if (N->getOpcode() != ISD::BUILD_VECTOR)
5076 return false;
5077
5078 // Check for any non-constant elements.
5079 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5080 switch (N->getOperand(i).getNode()->getOpcode()) {
5081 case ISD::UNDEF:
5082 case ISD::ConstantFP:
5083 case ISD::Constant:
5084 break;
5085 default:
5086 return false;
5087 }
5088
5089 // Vectors of all-zeros and all-ones are materialized with special
5090 // instructions rather than being loaded.
5091 return !ISD::isBuildVectorAllZeros(N) &&
5092 !ISD::isBuildVectorAllOnes(N);
5093}
5094
5095/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5096/// match movlp{s|d}. The lower half elements should come from lower half of
5097/// V1 (and in order), and the upper half elements should come from the upper
5098/// half of V2 (and in order). And since V1 will become the source of the
5099/// MOVLP, it must be either a vector load or a scalar load to vector.
5100static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5101 ArrayRef<int> Mask, MVT VT) {
5102 if (!VT.is128BitVector())
5103 return false;
5104
5105 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5106 return false;
5107 // If V2 is a vector load, don't do this transformation. We will try to use
5108 // a load-folding shufps op instead.
5109 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5110 return false;
5111
5112 unsigned NumElems = VT.getVectorNumElements();
5113
5114 if (NumElems != 2 && NumElems != 4)
5115 return false;
5116 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5117 if (!isUndefOrEqual(Mask[i], i))
5118 return false;
5119 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5120 if (!isUndefOrEqual(Mask[i], i+NumElems))
5121 return false;
5122 return true;
5123}
5124
5125/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5126/// to a zero vector.
5127/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5128static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5129 SDValue V1 = N->getOperand(0);
5130 SDValue V2 = N->getOperand(1);
5131 unsigned NumElems = N->getValueType(0).getVectorNumElements();
5132 for (unsigned i = 0; i != NumElems; ++i) {
5133 int Idx = N->getMaskElt(i);
5134 if (Idx >= (int)NumElems) {
5135 unsigned Opc = V2.getOpcode();
5136 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5137 continue;
5138 if (Opc != ISD::BUILD_VECTOR ||
5139 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5140 return false;
5141 } else if (Idx >= 0) {
5142 unsigned Opc = V1.getOpcode();
5143 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5144 continue;
5145 if (Opc != ISD::BUILD_VECTOR ||
5146 !X86::isZeroNode(V1.getOperand(Idx)))
5147 return false;
5148 }
5149 }
5150 return true;
5151}
5152
5153/// getZeroVector - Returns a vector of specified type with all zero elements.
5154///
5155static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5156 SelectionDAG &DAG, SDLoc dl) {
5157 assert(VT.isVector() && "Expected a vector type");
5158
5159 // Always build SSE zero vectors as <4 x i32> bitcasted
5160 // to their dest type. This ensures they get CSE'd.
5161 SDValue Vec;
5162 if (VT.is128BitVector()) { // SSE
5163 if (Subtarget->hasSSE2()) { // SSE2
5164 SDValue Cst = DAG.getConstant(0, MVT::i32);
5165 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5166 } else { // SSE1
5167 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5168 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5169 }
5170 } else if (VT.is256BitVector()) { // AVX
5171 if (Subtarget->hasInt256()) { // AVX2
5172 SDValue Cst = DAG.getConstant(0, MVT::i32);
5173 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5175 } else {
5176 // 256-bit logic and arithmetic instructions in AVX are all
5177 // floating-point, no support for integer ops. Emit fp zeroed vectors.
5178 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5179 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5180 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5181 }
5182 } else if (VT.is512BitVector()) { // AVX-512
5183 SDValue Cst = DAG.getConstant(0, MVT::i32);
5184 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5185 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5186 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5187 } else if (VT.getScalarType() == MVT::i1) {
5188 assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5189 SDValue Cst = DAG.getConstant(0, MVT::i1);
5190 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5191 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5192 } else
5193 llvm_unreachable("Unexpected vector type")::llvm::llvm_unreachable_internal("Unexpected vector type", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5193)
;
5194
5195 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5196}
5197
5198/// getOnesVector - Returns a vector of specified type with all bits set.
5199/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5200/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5201/// Then bitcast to their original type, ensuring they get CSE'd.
5202static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5203 SDLoc dl) {
5204 assert(VT.isVector() && "Expected a vector type");
5205
5206 SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5207 SDValue Vec;
5208 if (VT.is256BitVector()) {
5209 if (HasInt256) { // AVX2
5210 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5211 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5212 } else { // AVX
5213 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5214 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5215 }
5216 } else if (VT.is128BitVector()) {
5217 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5218 } else
5219 llvm_unreachable("Unexpected vector type")::llvm::llvm_unreachable_internal("Unexpected vector type", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5219)
;
5220
5221 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5222}
5223
5224/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5225/// that point to V2 point to its first element.
5226static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5227 for (unsigned i = 0; i != NumElems; ++i) {
5228 if (Mask[i] > (int)NumElems) {
5229 Mask[i] = NumElems;
5230 }
5231 }
5232}
5233
5234/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
5235/// operation of the specified width.
5236static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5237 SDValue V2) {
5238 unsigned NumElems = VT.getVectorNumElements();
5239 SmallVector<int, 8> Mask;
5240 Mask.push_back(NumElems);
5241 for (unsigned i = 1; i != NumElems; ++i)
5242 Mask.push_back(i);
5243 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5244}
5245
5246/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5247static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5248 SDValue V2) {
5249 unsigned NumElems = VT.getVectorNumElements();
5250 SmallVector<int, 8> Mask;
5251 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5252 Mask.push_back(i);
5253 Mask.push_back(i + NumElems);
5254 }
5255 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5256}
5257
5258/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5259static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5260 SDValue V2) {
5261 unsigned NumElems = VT.getVectorNumElements();
5262 SmallVector<int, 8> Mask;
5263 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5264 Mask.push_back(i + Half);
5265 Mask.push_back(i + NumElems + Half);
5266 }
5267 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5268}
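
As an aside (not part of the source): for an 8-element type, the two helpers above build the interleaving masks {0,8,1,9,2,10,3,11} (unpackl) and {4,12,5,13,6,14,7,15} (unpackh). A minimal standalone sketch of the same mask construction, using plain std::vector instead of LLVM's SmallVector:

#include <cstdio>
#include <vector>

// Build the unpack-low / unpack-high interleaving masks for NumElems lanes,
// mirroring the loops in getUnpackl/getUnpackh above (the second operand's
// lanes are addressed as NumElems..2*NumElems-1).
static std::vector<int> unpackMask(unsigned NumElems, bool High) {
  std::vector<int> Mask;
  unsigned Half = NumElems / 2;
  unsigned Base = High ? Half : 0;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(Base + i);            // lane from V1
    Mask.push_back(Base + i + NumElems); // matching lane from V2
  }
  return Mask;
}

int main() {
  for (int M : unpackMask(8, /*High=*/false)) std::printf("%d ", M); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  for (int M : unpackMask(8, /*High=*/true)) std::printf("%d ", M);  // 4 12 5 13 6 14 7 15
  std::printf("\n");
}
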
5269
5270// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
5271// a generic shuffle instruction because the target has no such instructions.
5272// Generate shuffles which repeat the i16 and i8 elements several times until
5273// they can be represented by v4f32, then manipulate them with target supported shuffles.
5274static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5275 MVT VT = V.getSimpleValueType();
5276 int NumElems = VT.getVectorNumElements();
5277 SDLoc dl(V);
5278
5279 while (NumElems > 4) {
5280 if (EltNo < NumElems/2) {
5281 V = getUnpackl(DAG, dl, VT, V, V);
5282 } else {
5283 V = getUnpackh(DAG, dl, VT, V, V);
5284 EltNo -= NumElems/2;
5285 }
5286 NumElems >>= 1;
5287 }
5288 return V;
5289}
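
For illustration only (not part of the source): the loop above repeatedly halves the effective element count while tracking where the splat element lands. For example, splatting element 11 of a v16i8 takes one unpackh (index becomes 3 of 8) and one unpackl (index stays 3 of 4). A small standalone trace of just the index arithmetic:

#include <cstdio>

// Trace how PromoteSplati8i16's loop rewrites (NumElems, EltNo) until the
// splat index fits in a 4-element (v4f32-sized) view. This is only the index
// bookkeeping; the real code also emits the unpackl/unpackh shuffles.
int main() {
  int NumElems = 16, EltNo = 11; // e.g. splat of lane 11 in a v16i8
  while (NumElems > 4) {
    if (EltNo < NumElems / 2) {
      std::printf("unpackl: ");
    } else {
      std::printf("unpackh: ");
      EltNo -= NumElems / 2;
    }
    NumElems >>= 1;
    std::printf("NumElems=%d EltNo=%d\n", NumElems, EltNo);
  }
  // Prints: unpackh: NumElems=8 EltNo=3
  //         unpackl: NumElems=4 EltNo=3
}
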
5290
5291/// getLegalSplat - Generate a legal splat with supported x86 shuffles
5292static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5293 MVT VT = V.getSimpleValueType();
5294 SDLoc dl(V);
5295
5296 if (VT.is128BitVector()) {
5297 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5298 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5299 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5300 &SplatMask[0]);
5301 } else if (VT.is256BitVector()) {
5302 // To use VPERMILPS to splat scalars, the second half of indices must
5303 // refer to the higher part, which is a duplication of the lower one,
5304 // because VPERMILPS can only handle in-lane permutations.
5305 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5306 EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5307
5308 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5309 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5310 &SplatMask[0]);
5311 } else
5312 llvm_unreachable("Vector size not supported")::llvm::llvm_unreachable_internal("Vector size not supported"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5312)
;
5313
5314 return DAG.getNode(ISD::BITCAST, dl, VT, V);
5315}
5316
5317/// PromoteSplat - Splat is promoted to target supported vector shuffles.
5318static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5319 MVT SrcVT = SV->getSimpleValueType(0);
5320 SDValue V1 = SV->getOperand(0);
5321 SDLoc dl(SV);
5322
5323 int EltNo = SV->getSplatIndex();
5324 int NumElems = SrcVT.getVectorNumElements();
5325 bool Is256BitVec = SrcVT.is256BitVector();
5326
5327 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5328 "Unknown how to promote splat for type");
5329
5330 // Extract the 128-bit part containing the splat element and update
5331 // the splat element index when it refers to the higher register.
5332 if (Is256BitVec) {
5333 V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5334 if (EltNo >= NumElems/2)
5335 EltNo -= NumElems/2;
5336 }
5337
5338 // i16 and i8 vector types can't be used directly by a generic shuffle
5339 // instruction because the target has no such instruction. Generate shuffles
5340 // which repeat i16 and i8 several times until they fit in i32, and then can
5341 // be manipulated by target supported shuffles.
5342 MVT EltVT = SrcVT.getVectorElementType();
5343 if (EltVT == MVT::i8 || EltVT == MVT::i16)
5344 V1 = PromoteSplati8i16(V1, DAG, EltNo);
5345
5346 // Recreate the 256-bit vector and place the same 128-bit vector
5347 // into the low and high part. This is necessary because we want
5348 // to use VPERM* to shuffle the vectors
5349 if (Is256BitVec) {
5350 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5351 }
5352
5353 return getLegalSplat(DAG, V1, EltNo);
5354}
5355
5356/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5357/// vector and a zero or undef vector. This produces a shuffle where the low
5358/// element of V2 is swizzled into the zero/undef vector, landing at element
5359/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5360static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5361 bool IsZero,
5362 const X86Subtarget *Subtarget,
5363 SelectionDAG &DAG) {
5364 MVT VT = V2.getSimpleValueType();
5365 SDValue V1 = IsZero
5366 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5367 unsigned NumElems = VT.getVectorNumElements();
5368 SmallVector<int, 16> MaskVec;
5369 for (unsigned i = 0; i != NumElems; ++i)
5370 // If this is the insertion idx, put the low elt of V2 here.
5371 MaskVec.push_back(i == Idx ? NumElems : i);
5372 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5373}
5374
5375/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5376/// target specific opcode. Returns true if the Mask could be calculated. Sets
5377/// IsUnary to true if the node only uses one source. Note that this will set IsUnary for
5378/// shuffles which use a single input multiple times, and in those cases it will
5379/// adjust the mask to only have indices within that single input.
5380static bool getTargetShuffleMask(SDNode *N, MVT VT,
5381 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5382 unsigned NumElems = VT.getVectorNumElements();
5383 SDValue ImmN;
5384
5385 IsUnary = false;
5386 bool IsFakeUnary = false;
5387 switch(N->getOpcode()) {
5388 case X86ISD::BLENDI:
5389 ImmN = N->getOperand(N->getNumOperands()-1);
5390 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5391 break;
5392 case X86ISD::SHUFP:
5393 ImmN = N->getOperand(N->getNumOperands()-1);
5394 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5395 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5396 break;
5397 case X86ISD::UNPCKH:
5398 DecodeUNPCKHMask(VT, Mask);
5399 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5400 break;
5401 case X86ISD::UNPCKL:
5402 DecodeUNPCKLMask(VT, Mask);
5403 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5404 break;
5405 case X86ISD::MOVHLPS:
5406 DecodeMOVHLPSMask(NumElems, Mask);
5407 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5408 break;
5409 case X86ISD::MOVLHPS:
5410 DecodeMOVLHPSMask(NumElems, Mask);
5411 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5412 break;
5413 case X86ISD::PALIGNR:
5414 ImmN = N->getOperand(N->getNumOperands()-1);
5415 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5416 break;
5417 case X86ISD::PSHUFD:
5418 case X86ISD::VPERMILPI:
5419 ImmN = N->getOperand(N->getNumOperands()-1);
5420 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5421 IsUnary = true;
5422 break;
5423 case X86ISD::PSHUFHW:
5424 ImmN = N->getOperand(N->getNumOperands()-1);
5425 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5426 IsUnary = true;
5427 break;
5428 case X86ISD::PSHUFLW:
5429 ImmN = N->getOperand(N->getNumOperands()-1);
5430 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5431 IsUnary = true;
5432 break;
5433 case X86ISD::PSHUFB: {
5434 IsUnary = true;
5435 SDValue MaskNode = N->getOperand(1);
5436 while (MaskNode->getOpcode() == ISD::BITCAST)
5437 MaskNode = MaskNode->getOperand(0);
5438
5439 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5440 // If we have a build-vector, then things are easy.
5441 EVT VT = MaskNode.getValueType();
5442 assert(VT.isVector() &&
5443 "Can't produce a non-vector with a build_vector!");
5444 if (!VT.isInteger())
5445 return false;
5446
5447 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5448
5449 SmallVector<uint64_t, 32> RawMask;
5450 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5451 SDValue Op = MaskNode->getOperand(i);
5452 if (Op->getOpcode() == ISD::UNDEF) {
5453 RawMask.push_back((uint64_t)SM_SentinelUndef);
5454 continue;
5455 }
5456 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5457 if (!CN)
5458 return false;
5459 APInt MaskElement = CN->getAPIntValue();
5460
5461 // We now have to decode the element which could be any integer size and
5462 // extract each byte of it.
5463 for (int j = 0; j < NumBytesPerElement; ++j) {
5464 // Note that this is x86 and so always little endian: the low byte is
5465 // the first byte of the mask.
5466 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5467 MaskElement = MaskElement.lshr(8);
5468 }
5469 }
5470 DecodePSHUFBMask(RawMask, Mask);
5471 break;
5472 }
5473
5474 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5475 if (!MaskLoad)
5476 return false;
5477
5478 SDValue Ptr = MaskLoad->getBasePtr();
5479 if (Ptr->getOpcode() == X86ISD::Wrapper)
5480 Ptr = Ptr->getOperand(0);
5481
5482 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5483 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5484 return false;
5485
5486 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5487 DecodePSHUFBMask(C, Mask);
5488 break;
5489 }
5490
5491 return false;
5492 }
5493 case X86ISD::VPERMI:
5494 ImmN = N->getOperand(N->getNumOperands()-1);
5495 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5496 IsUnary = true;
5497 break;
5498 case X86ISD::MOVSS:
5499 case X86ISD::MOVSD:
5500 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5501 break;
5502 case X86ISD::VPERM2X128:
5503 ImmN = N->getOperand(N->getNumOperands()-1);
5504 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5505 if (Mask.empty()) return false;
5506 break;
5507 case X86ISD::MOVSLDUP:
5508 DecodeMOVSLDUPMask(VT, Mask);
5509 IsUnary = true;
5510 break;
5511 case X86ISD::MOVSHDUP:
5512 DecodeMOVSHDUPMask(VT, Mask);
5513 IsUnary = true;
5514 break;
5515 case X86ISD::MOVDDUP:
5516 DecodeMOVDDUPMask(VT, Mask);
5517 IsUnary = true;
5518 break;
5519 case X86ISD::MOVLHPD:
5520 case X86ISD::MOVLPD:
5521 case X86ISD::MOVLPS:
5522 // Not yet implemented
5523 return false;
5524 default: llvm_unreachable("unknown target shuffle node");
5525 }
5526
5527 // If we have a fake unary shuffle, the shuffle mask is spread across two
5528 // inputs that are actually the same node. Re-map the mask to always point
5529 // into the first input.
5530 if (IsFakeUnary)
5531 for (int &M : Mask)
5532 if (M >= (int)Mask.size())
5533 M -= Mask.size();
5534
5535 return true;
5536}
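
As a standalone illustration of one of the decode steps used above (hedged: this mirrors the PSHUFD/VPERMILPI immediate format for a single 128-bit lane of 4 elements; the real DecodePSHUFMask also handles wider types lane by lane): each 2-bit field of the immediate selects the source element for one result lane.

#include <cstdio>

// Decode a 4-lane PSHUFD-style immediate: result lane i takes source lane
// (Imm >> (2*i)) & 3. Example immediate 0x1B = 0b00011011 reverses the lanes.
static void decodePSHUF4(unsigned Imm, int Mask[4]) {
  for (int i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
}

int main() {
  int Mask[4];
  decodePSHUF4(0x1B, Mask);
  std::printf("%d %d %d %d\n", Mask[0], Mask[1], Mask[2], Mask[3]); // 3 2 1 0
}
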
5537
5538/// getShuffleScalarElt - Returns the scalar element that will make up the ith
5539/// element of the result of the vector shuffle.
5540static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5541 unsigned Depth) {
5542 if (Depth == 6)
5543 return SDValue(); // Limit search depth.
5544
5545 SDValue V = SDValue(N, 0);
5546 EVT VT = V.getValueType();
5547 unsigned Opcode = V.getOpcode();
5548
5549 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5550 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5551 int Elt = SV->getMaskElt(Index);
5552
5553 if (Elt < 0)
5554 return DAG.getUNDEF(VT.getVectorElementType());
5555
5556 unsigned NumElems = VT.getVectorNumElements();
5557 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5558 : SV->getOperand(1);
5559 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5560 }
5561
5562 // Recurse into target specific vector shuffles to find scalars.
5563 if (isTargetShuffle(Opcode)) {
5564 MVT ShufVT = V.getSimpleValueType();
5565 unsigned NumElems = ShufVT.getVectorNumElements();
5566 SmallVector<int, 16> ShuffleMask;
5567 bool IsUnary;
5568
5569 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5570 return SDValue();
5571
5572 int Elt = ShuffleMask[Index];
5573 if (Elt < 0)
5574 return DAG.getUNDEF(ShufVT.getVectorElementType());
5575
5576 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5577 : N->getOperand(1);
5578 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5579 Depth+1);
5580 }
5581
5582 // Actual nodes that may contain scalar elements
5583 if (Opcode == ISD::BITCAST) {
5584 V = V.getOperand(0);
5585 EVT SrcVT = V.getValueType();
5586 unsigned NumElems = VT.getVectorNumElements();
5587
5588 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5589 return SDValue();
5590 }
5591
5592 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5593 return (Index == 0) ? V.getOperand(0)
5594 : DAG.getUNDEF(VT.getVectorElementType());
5595
5596 if (V.getOpcode() == ISD::BUILD_VECTOR)
5597 return V.getOperand(Index);
5598
5599 return SDValue();
5600}
5601
5602/// getNumOfConsecutiveZeros - Return the number of consecutive zero elements
5603/// in the result of a vector shuffle operation. The
5604/// search can start in two different directions, from left or right.
5605/// We count undefs as zeros until PreferredNum is reached.
5606static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5607 unsigned NumElems, bool ZerosFromLeft,
5608 SelectionDAG &DAG,
5609 unsigned PreferredNum = -1U) {
5610 unsigned NumZeros = 0;
5611 for (unsigned i = 0; i != NumElems; ++i) {
5612 unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5613 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5614 if (!Elt.getNode())
5615 break;
5616
5617 if (X86::isZeroNode(Elt))
5618 ++NumZeros;
5619 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5620 NumZeros = std::min(NumZeros + 1, PreferredNum);
5621 else
5622 break;
5623 }
5624
5625 return NumZeros;
5626}
5627
5628/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5629/// correspond consecutively to elements from one of the vector operands,
5630/// starting from its index OpIdx. Also tell OpNum which source vector operand.
5631static
5632bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5633 unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5634 unsigned NumElems, unsigned &OpNum) {
5635 bool SeenV1 = false;
5636 bool SeenV2 = false;
5637
5638 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5639 int Idx = SVOp->getMaskElt(i);
5640 // Ignore undef indices
5641 if (Idx < 0)
5642 continue;
5643
5644 if (Idx < (int)NumElems)
5645 SeenV1 = true;
5646 else
5647 SeenV2 = true;
5648
5649 // Only accept consecutive elements from the same vector
5650 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5651 return false;
5652 }
5653
5654 OpNum = SeenV1 ? 0 : 1;
5655 return true;
5656}
5657
5658/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5659/// logical right shift of a vector.
5660static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5661 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5662 unsigned NumElems =
5663 SVOp->getSimpleValueType(0).getVectorNumElements();
5664 unsigned NumZeros = getNumOfConsecutiveZeros(
5665 SVOp, NumElems, false /* check zeros from right */, DAG,
5666 SVOp->getMaskElt(0));
5667 unsigned OpSrc;
5668
5669 if (!NumZeros)
5670 return false;
5671
5672 // Considering the elements in the mask that are not consecutive zeros,
5673 // check if they consecutively come from only one of the source vectors.
5674 //
5675 // V1 = {X, A, B, C} 0
5676 // \ \ \ /
5677 // vector_shuffle V1, V2 <1, 2, 3, X>
5678 //
5679 if (!isShuffleMaskConsecutive(SVOp,
5680 0, // Mask Start Index
5681 NumElems-NumZeros, // Mask End Index(exclusive)
5682 NumZeros, // Where to start looking in the src vector
5683 NumElems, // Number of elements in vector
5684 OpSrc)) // Which source operand ?
5685 return false;
5686
5687 isLeft = false;
5688 ShAmt = NumZeros;
5689 ShVal = SVOp->getOperand(OpSrc);
5690 return true;
5691}
5692
5693/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5694/// logical left shift of a vector.
5695static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5696 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5697 unsigned NumElems =
5698 SVOp->getSimpleValueType(0).getVectorNumElements();
5699 unsigned NumZeros = getNumOfConsecutiveZeros(
5700 SVOp, NumElems, true /* check zeros from left */, DAG,
5701 NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5702 unsigned OpSrc;
5703
5704 if (!NumZeros)
5705 return false;
5706
5707 // Considering the elements in the mask that are not consecutive zeros,
5708 // check if they consecutively come from only one of the source vectors.
5709 //
5710 // 0 { A, B, X, X } = V2
5711 // / \ / /
5712 // vector_shuffle V1, V2 <X, X, 4, 5>
5713 //
5714 if (!isShuffleMaskConsecutive(SVOp,
5715 NumZeros, // Mask Start Index
5716 NumElems, // Mask End Index(exclusive)
5717 0, // Where to start looking in the src vector
5718 NumElems, // Number of elements in vector
5719 OpSrc)) // Which source operand ?
5720 return false;
5721
5722 isLeft = true;
5723 ShAmt = NumZeros;
5724 ShVal = SVOp->getOperand(OpSrc);
5725 return true;
5726}
5727
5728/// isVectorShift - Returns true if the shuffle can be implemented as a
5729/// logical left or right shift of a vector.
5730static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5731 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5732 // Although the logic below supports any bitwidth size, there are no
5733 // shift instructions which handle more than 128-bit vectors.
5734 if (!SVOp->getSimpleValueType(0).is128BitVector())
5735 return false;
5736
5737 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5738 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5739 return true;
5740
5741 return false;
5742}
5743
5744/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5745///
5746static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5747 unsigned NumNonZero, unsigned NumZero,
5748 SelectionDAG &DAG,
5749 const X86Subtarget* Subtarget,
5750 const TargetLowering &TLI) {
5751 if (NumNonZero > 8)
5752 return SDValue();
5753
5754 SDLoc dl(Op);
5755 SDValue V;
5756 bool First = true;
5757 for (unsigned i = 0; i < 16; ++i) {
5758 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5759 if (ThisIsNonZero && First) {
5760 if (NumZero)
5761 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5762 else
5763 V = DAG.getUNDEF(MVT::v8i16);
5764 First = false;
5765 }
5766
5767 if ((i & 1) != 0) {
5768 SDValue ThisElt, LastElt;
5769 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5770 if (LastIsNonZero) {
5771 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5772 MVT::i16, Op.getOperand(i-1));
5773 }
5774 if (ThisIsNonZero) {
5775 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5776 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5777 ThisElt, DAG.getConstant(8, MVT::i8));
5778 if (LastIsNonZero)
5779 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5780 } else
5781 ThisElt = LastElt;
5782
5783 if (ThisElt.getNode())
5784 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5785 DAG.getIntPtrConstant(i/2));
5786 }
5787 }
5788
5789 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5790}
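
A hedged standalone sketch of the packing trick used above: adjacent byte elements are zero-extended and combined into one 16-bit lane as (hi << 8) | lo, which on little-endian x86 reproduces the original byte order when the v8i16 is bitcast back to v16i8. The helper name below is illustrative only, and the sketch ignores the zero/undef special cases the lowering handles:

#include <cstdint>
#include <cstdio>

// Pack 16 byte elements into 8 sixteen-bit lanes the way the lowering above
// does: lane i/2 = (byte[i] << 8) | byte[i-1] for each odd i.
static void packBytePairs(const uint8_t Bytes[16], uint16_t Lanes[8]) {
  for (int i = 1; i < 16; i += 2)
    Lanes[i / 2] = (uint16_t)((Bytes[i] << 8) | Bytes[i - 1]);
}

int main() {
  uint8_t Bytes[16];
  for (int i = 0; i != 16; ++i) Bytes[i] = (uint8_t)i;
  uint16_t Lanes[8];
  packBytePairs(Bytes, Lanes);
  std::printf("0x%04x 0x%04x\n", Lanes[0], Lanes[1]); // 0x0100 0x0302
}
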
5791
5792/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5793///
5794static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5795 unsigned NumNonZero, unsigned NumZero,
5796 SelectionDAG &DAG,
5797 const X86Subtarget* Subtarget,
5798 const TargetLowering &TLI) {
5799 if (NumNonZero > 4)
5800 return SDValue();
5801
5802 SDLoc dl(Op);
5803 SDValue V;
5804 bool First = true;
5805 for (unsigned i = 0; i < 8; ++i) {
5806 bool isNonZero = (NonZeros & (1 << i)) != 0;
5807 if (isNonZero) {
5808 if (First) {
5809 if (NumZero)
5810 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5811 else
5812 V = DAG.getUNDEF(MVT::v8i16);
5813 First = false;
5814 }
5815 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5816 MVT::v8i16, V, Op.getOperand(i),
5817 DAG.getIntPtrConstant(i));
5818 }
5819 }
5820
5821 return V;
5822}
5823
5824/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5825static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5826 const X86Subtarget *Subtarget,
5827 const TargetLowering &TLI) {
5828 // Find all zeroable elements.
5829 bool Zeroable[4];
5830 for (int i=0; i < 4; ++i) {
5831 SDValue Elt = Op->getOperand(i);
5832 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5833 }
5834 assert(std::count_if(&Zeroable[0], &Zeroable[4],
5835 [](bool M) { return !M; }) > 1 &&
5836 "We expect at least two non-zero elements!");
5837
5838 // We only know how to deal with build_vector nodes where elements are either
5839 // zeroable or extract_vector_elt with constant index.
5840 SDValue FirstNonZero;
5841 unsigned FirstNonZeroIdx;
5842 for (unsigned i=0; i < 4; ++i) {
5843 if (Zeroable[i])
5844 continue;
5845 SDValue Elt = Op->getOperand(i);
5846 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5847 !isa<ConstantSDNode>(Elt.getOperand(1)))
5848 return SDValue();
5849 // Make sure that this node is extracting from a 128-bit vector.
5850 MVT VT = Elt.getOperand(0).getSimpleValueType();
5851 if (!VT.is128BitVector())
5852 return SDValue();
5853 if (!FirstNonZero.getNode()) {
5854 FirstNonZero = Elt;
5855 FirstNonZeroIdx = i;
5856 }
5857 }
5858
5859 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5860 SDValue V1 = FirstNonZero.getOperand(0);
5861 MVT VT = V1.getSimpleValueType();
5862
5863 // See if this build_vector can be lowered as a blend with zero.
5864 SDValue Elt;
5865 unsigned EltMaskIdx, EltIdx;
5866 int Mask[4];
5867 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5868 if (Zeroable[EltIdx]) {
5869 // The zero vector will be on the right hand side.
5870 Mask[EltIdx] = EltIdx+4;
5871 continue;
5872 }
5873
5874 Elt = Op->getOperand(EltIdx);
5875 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5876 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5877 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5878 break;
5879 Mask[EltIdx] = EltIdx;
5880 }
5881
5882 if (EltIdx == 4) {
5883 // Let the shuffle legalizer deal with blend operations.
5884 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5885 if (V1.getSimpleValueType() != VT)
5886 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5887 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5888 }
5889
5890 // See if we can lower this build_vector to a INSERTPS.
5891 if (!Subtarget->hasSSE41())
5892 return SDValue();
5893
5894 SDValue V2 = Elt.getOperand(0);
5895 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5896 V1 = SDValue();
5897
5898 bool CanFold = true;
5899 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5900 if (Zeroable[i])
5901 continue;
5902
5903 SDValue Current = Op->getOperand(i);
5904 SDValue SrcVector = Current->getOperand(0);
5905 if (!V1.getNode())
5906 V1 = SrcVector;
5907 CanFold = SrcVector == V1 &&
5908 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5909 }
5910
5911 if (!CanFold)
5912 return SDValue();
5913
5914 assert(V1.getNode() && "Expected at least two non-zero elements!");
5915 if (V1.getSimpleValueType() != MVT::v4f32)
5916 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5917 if (V2.getSimpleValueType() != MVT::v4f32)
5918 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5919
5920 // Ok, we can emit an INSERTPS instruction.
5921 unsigned ZMask = 0;
5922 for (int i = 0; i < 4; ++i)
5923 if (Zeroable[i])
5924 ZMask |= 1 << i;
5925
5926 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5927 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5928 SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5929 DAG.getIntPtrConstant(InsertPSMask));
5930 return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5931}
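
A short standalone note on the INSERTPS immediate built above (this matches the instruction's documented encoding; the sketch just reconstructs the byte): bits 7:6 select the source element of the second operand, bits 5:4 the destination lane, and bits 3:0 are a zero mask.

#include <cstdio>

// Build an INSERTPS immediate: take source element SrcIdx, write it into
// destination lane DstIdx, and zero every lane whose bit is set in ZMask.
static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}

int main() {
  // Insert source element 2 into lane 1 and zero lane 3: 0b10'01'1000 = 0x98.
  std::printf("0x%02x\n", insertPSImm(2, 1, 0x8));
}
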
5932
5933/// Return a vector logical shift node.
5934static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5935 unsigned NumBits, SelectionDAG &DAG,
5936 const TargetLowering &TLI, SDLoc dl) {
5937 assert(VT.is128BitVector() && "Unknown type for VShift");
5938 MVT ShVT = MVT::v2i64;
5939 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5940 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5941 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5942 SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5943 return DAG.getNode(ISD::BITCAST, dl, VT,
5944 DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5945}
5946
5947static SDValue
5948LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5949
5950 // Check if the scalar load can be widened into a vector load. And if
5951 // the address is "base + cst" see if the cst can be "absorbed" into
5952 // the shuffle mask.
5953 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5954 SDValue Ptr = LD->getBasePtr();
5955 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5956 return SDValue();
5957 EVT PVT = LD->getValueType(0);
5958 if (PVT != MVT::i32 && PVT != MVT::f32)
5959 return SDValue();
5960
5961 int FI = -1;
5962 int64_t Offset = 0;
5963 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5964 FI = FINode->getIndex();
5965 Offset = 0;
5966 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5967 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5968 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5969 Offset = Ptr.getConstantOperandVal(1);
5970 Ptr = Ptr.getOperand(0);
5971 } else {
5972 return SDValue();
5973 }
5974
5975 // FIXME: 256-bit vector instructions don't require a strict alignment,
5976 // improve this code to support it better.
5977 unsigned RequiredAlign = VT.getSizeInBits()/8;
5978 SDValue Chain = LD->getChain();
5979 // Make sure the stack object alignment is at least 16 or 32.
5980 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5981 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5982 if (MFI->isFixedObjectIndex(FI)) {
5983 // Can't change the alignment. FIXME: It's possible to compute
5984 // the exact stack offset and reference FI + adjust offset instead.
5985 // If someone *really* cares about this. That's the way to implement it.
5986 return SDValue();
5987 } else {
5988 MFI->setObjectAlignment(FI, RequiredAlign);
5989 }
5990 }
5991
5992 // (Offset % 16 or 32) must be a multiple of 4. The address is then
5993 // Ptr + (Offset & ~15).
5994 if (Offset < 0)
5995 return SDValue();
5996 if ((Offset % RequiredAlign) & 3)
5997 return SDValue();
5998 int64_t StartOffset = Offset & ~(RequiredAlign-1);
5999 if (StartOffset)
6000 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
6001 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
6002
6003 int EltNo = (Offset - StartOffset) >> 2;
6004 unsigned NumElems = VT.getVectorNumElements();
6005
6006 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6007 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6008 LD->getPointerInfo().getWithOffset(StartOffset),
6009 false, false, false, 0);
6010
6011 SmallVector<int, 8> Mask;
6012 for (unsigned i = 0; i != NumElems; ++i)
6013 Mask.push_back(EltNo);
6014
6015 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6016 }
6017
6018 return SDValue();
6019}
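
For clarity (illustrative only, not part of the source): the offset-absorption arithmetic above rounds the load address down to the vector alignment and turns the remainder into a splat element index. For a 16-byte vector of 32-bit elements and a scalar at offset 20, the widened load starts at offset 16 and the splat uses element 1:

#include <cstdint>
#include <cstdio>

int main() {
  // Mirrors the arithmetic in LowerAsSplatVectorLoad for a 128-bit vector.
  unsigned RequiredAlign = 16;                      // VT.getSizeInBits() / 8
  int64_t Offset = 20;                              // scalar lives at base + 20
  if ((Offset % RequiredAlign) & 3) return 1;       // must be 4-byte aligned
  int64_t StartOffset = Offset & ~(int64_t)(RequiredAlign - 1);
  int EltNo = (int)((Offset - StartOffset) >> 2);   // 32-bit elements
  std::printf("StartOffset=%lld EltNo=%d\n", (long long)StartOffset, EltNo);
  // Prints: StartOffset=16 EltNo=1
}
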
6020
6021/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6022/// vector of type 'VT', see if the elements can be replaced by a single large
6023/// load which has the same value as a build_vector whose operands are 'elts'.
6024///
6025/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6026///
6027/// FIXME: we'd also like to handle the case where the last elements are zero
6028/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6029/// There's even a handy isZeroNode for that purpose.
6030static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6031 SDLoc &DL, SelectionDAG &DAG,
6032 bool isAfterLegalize) {
6033 EVT EltVT = VT.getVectorElementType();
6034 unsigned NumElems = Elts.size();
6035
6036 LoadSDNode *LDBase = nullptr;
6
'LDBase' initialized to a null pointer value
6037 unsigned LastLoadedElt = -1U;
6038
6039 // For each element in the initializer, see if we've found a load or an undef.
6040 // If we don't find an initial load element, or later load elements are
6041 // non-consecutive, bail out.
6042 for (unsigned i = 0; i < NumElems; ++i) {
7
Assuming 'i' is >= 'NumElems'
8
Loop condition is false. Execution continues on line 6067
6043 SDValue Elt = Elts[i];
6044
6045 if (!Elt.getNode() ||
6046 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6047 return SDValue();
6048 if (!LDBase) {
6049 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6050 return SDValue();
6051 LDBase = cast<LoadSDNode>(Elt.getNode());
6052 LastLoadedElt = i;
6053 continue;
6054 }
6055 if (Elt.getOpcode() == ISD::UNDEF)
6056 continue;
6057
6058 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6059 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6060 return SDValue();
6061 LastLoadedElt = i;
6062 }
6063
6064 // If we have found an entire vector of loads and undefs, then return a large
6065 // load of the entire vector width starting at the base pointer. If we found
6066 // consecutive loads for the low half, generate a vzext_load node.
6067 if (LastLoadedElt == NumElems - 1) {
9
Taking true branch
6068
6069 if (isAfterLegalize &&
10
Taking false branch
6070 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6071 return SDValue();
6072
6073 SDValue NewLd = SDValue();
6074
6075 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
11
Called C++ object pointer is null
6076 LDBase->getPointerInfo(), LDBase->isVolatile(),
6077 LDBase->isNonTemporal(), LDBase->isInvariant(),
6078 LDBase->getAlignment());
6079
6080 if (LDBase->hasAnyUseOfValue(1)) {
6081 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6082 SDValue(LDBase, 1),
6083 SDValue(NewLd.getNode(), 1));
6084 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6085 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6086 SDValue(NewLd.getNode(), 1));
6087 }
6088
6089 return NewLd;
6090 }
6091
6092 // TODO: The code below fires only for loading the low v2i32 / v2f32
6093 // of a v4i32 / v4f32. It's probably worth generalizing.
6094 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6095 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6096 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6097 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6098 SDValue ResNode =
6099 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6100 LDBase->getPointerInfo(),
6101 LDBase->getAlignment(),
6102 false/*isVolatile*/, true/*ReadMem*/,
6103 false/*WriteMem*/);
6104
6105 // Make sure the newly-created LOAD is in the same position as LDBase in
6106 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6107 // update uses of LDBase's output chain to use the TokenFactor.
6108 if (LDBase->hasAnyUseOfValue(1)) {
6109 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6110 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6111 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6112 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6113 SDValue(ResNode.getNode(), 1));
6114 }
6115
6116 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6117 }
6118 return SDValue();
6119}
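
Hedged standalone sketch: the "consecutive" requirement checked through DAG.isConsecutiveLoad above amounts to element i being loaded from BaseAddr + i * ElementSize. The helper below only models that offset check on raw offsets; it is not the analyzer's reported path and is not LLVM API:

#include <cstdint>
#include <cstdio>

// Returns true if every known offset equals Base + Index * EltSizeBytes,
// the property EltsFromConsecutiveLoads needs before it merges the element
// loads into one wide load. Offsets < 0 stand in for undef elements.
static bool offsetsAreConsecutive(const int64_t *Offsets, unsigned NumElems,
                                  int64_t Base, unsigned EltSizeBytes) {
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Offsets[i] < 0)
      continue; // undef element, no constraint
    if (Offsets[i] != Base + (int64_t)i * EltSizeBytes)
      return false;
  }
  return true;
}

int main() {
  int64_t Offsets[4] = {0, 4, -1, 12}; // i32 elements, one undef
  std::printf("%d\n", offsetsAreConsecutive(Offsets, 4, 0, 4)); // 1
}
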
6120
6121/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6122/// to generate a splat value for the following cases:
6123/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6124/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6125/// a scalar load, or a constant.
6126/// The VBROADCAST node is returned when a pattern is found,
6127/// or SDValue() otherwise.
6128static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6129 SelectionDAG &DAG) {
6130 // VBROADCAST requires AVX.
6131 // TODO: Splats could be generated for non-AVX CPUs using SSE
6132 // instructions, but there's less potential gain for only 128-bit vectors.
6133 if (!Subtarget->hasAVX())
6134 return SDValue();
6135
6136 MVT VT = Op.getSimpleValueType();
6137 SDLoc dl(Op);
6138
6139 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6140 "Unsupported vector type for broadcast.");
6141
6142 SDValue Ld;
6143 bool ConstSplatVal;
6144
6145 switch (Op.getOpcode()) {
6146 default:
6147 // Unknown pattern found.
6148 return SDValue();
6149
6150 case ISD::BUILD_VECTOR: {
6151 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6152 BitVector UndefElements;
6153 SDValue Splat = BVOp->getSplatValue(&UndefElements);
6154
6155 // We need a splat of a single value to use broadcast, and it doesn't
6156 // make any sense if the value is only in one element of the vector.
6157 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6158 return SDValue();
6159
6160 Ld = Splat;
6161 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6162 Ld.getOpcode() == ISD::ConstantFP);
6163
6164 // Make sure that all of the users of a non-constant load are from the
6165 // BUILD_VECTOR node.
6166 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6167 return SDValue();
6168 break;
6169 }
6170
6171 case ISD::VECTOR_SHUFFLE: {
6172 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6173
6174 // Shuffles must have a splat mask where the first element is
6175 // broadcasted.
6176 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6177 return SDValue();
6178
6179 SDValue Sc = Op.getOperand(0);
6180 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6181 Sc.getOpcode() != ISD::BUILD_VECTOR) {
6182
6183 if (!Subtarget->hasInt256())
6184 return SDValue();
6185
6186 // Use the register form of the broadcast instruction available on AVX2.
6187 if (VT.getSizeInBits() >= 256)
6188 Sc = Extract128BitVector(Sc, 0, DAG, dl);
6189 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6190 }
6191
6192 Ld = Sc.getOperand(0);
6193 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6194 Ld.getOpcode() == ISD::ConstantFP);
6195
6196 // The scalar_to_vector node and the suspected
6197 // load node must have exactly one user.
6198 // Constants may have multiple users.
6199
6200 // AVX-512 has a register version of the broadcast.
6201 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6202 Ld.getValueType().getSizeInBits() >= 32;
6203 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6204 !hasRegVer))
6205 return SDValue();
6206 break;
6207 }
6208 }
6209
6210 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6211 bool IsGE256 = (VT.getSizeInBits() >= 256);
6212
6213 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6214 // instruction to save 8 or more bytes of constant pool data.
6215 // TODO: If multiple splats are generated to load the same constant,
6216 // it may be detrimental to overall size. There needs to be a way to detect
6217 // that condition to know if this is truly a size win.
6218 const Function *F = DAG.getMachineFunction().getFunction();
6219 bool OptForSize = F->getAttributes().
6220 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6221
6222 // Handle broadcasting a single constant scalar from the constant pool
6223 // into a vector.
6224 // On Sandybridge (no AVX2), it is still better to load a constant vector
6225 // from the constant pool and not to broadcast it from a scalar.
6226 // But override that restriction when optimizing for size.
6227 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6228 if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6229 EVT CVT = Ld.getValueType();
6230 assert(!CVT.isVector() && "Must not broadcast a vector type");
6231
6232 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6233 // For size optimization, also splat v2f64 and v2i64, and for size opt
6234 // with AVX2, also splat i8 and i16.
6235 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6236 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6237 (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6238 const Constant *C = nullptr;
6239 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6240 C = CI->getConstantIntValue();
6241 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6242 C = CF->getConstantFPValue();
6243
6244 assert(C && "Invalid constant type")((C && "Invalid constant type") ? static_cast<void
> (0) : __assert_fail ("C && \"Invalid constant type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 6244, __PRETTY_FUNCTION__))
;
6245
6246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6247 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6248 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6249 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6250 MachinePointerInfo::getConstantPool(),
6251 false, false, false, Alignment);
6252
6253 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6254 }
6255 }
6256
6257 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6258
6259 // Handle AVX2 in-register broadcasts.
6260 if (!IsLoad && Subtarget->hasInt256() &&
6261 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6262 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6263
6264 // The scalar source must be a normal load.
6265 if (!IsLoad)
6266 return SDValue();
6267
6268 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6269 (Subtarget->hasVLX() && ScalarSize == 64))
6270 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6271
6272 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
6273 // match double, since there is no vbroadcastsd xmm.
6274 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6275 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6276 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6277 }
6278
6279 // Unsupported broadcast.
6280 return SDValue();
6281}
6282
6283/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6284/// underlying vector and index.
6285///
6286/// Modifies \p ExtractedFromVec to the real vector and returns the real
6287/// index.
6288static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6289 SDValue ExtIdx) {
6290 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6291 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6292 return Idx;
6293
6294 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6295 // lowered this:
6296 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6297 // to:
6298 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6299 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6300 // undef)
6301 // Constant<0>)
6302 // In this case the vector is the extract_subvector expression and the index
6303 // is 2, as specified by the shuffle.
6304 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6305 SDValue ShuffleVec = SVOp->getOperand(0);
6306 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6307 assert(ShuffleVecVT.getVectorElementType() ==
6308 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6309
6310 int ShuffleIdx = SVOp->getMaskElt(Idx);
6311 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6312 ExtractedFromVec = ShuffleVec;
6313 return ShuffleIdx;
6314 }
6315 return Idx;
6316}
6317
6318static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6319 MVT VT = Op.getSimpleValueType();
6320
6321 // Skip if insert_vec_elt is not supported.
6322 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6323 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6324 return SDValue();
6325
6326 SDLoc DL(Op);
6327 unsigned NumElems = Op.getNumOperands();
6328
6329 SDValue VecIn1;
6330 SDValue VecIn2;
6331 SmallVector<unsigned, 4> InsertIndices;
6332 SmallVector<int, 8> Mask(NumElems, -1);
6333
6334 for (unsigned i = 0; i != NumElems; ++i) {
6335 unsigned Opc = Op.getOperand(i).getOpcode();
6336
6337 if (Opc == ISD::UNDEF)
6338 continue;
6339
6340 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6342 // Quit if more than 1 element needs inserting.
6342 if (InsertIndices.size() > 1)
6343 return SDValue();
6344
6345 InsertIndices.push_back(i);
6346 continue;
6347 }
6348
6349 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6350 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6351 // Quit if non-constant index.
6352 if (!isa<ConstantSDNode>(ExtIdx))
6353 return SDValue();
6354 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6355
6356 // Quit if extracted from vector of different type.
6357 if (ExtractedFromVec.getValueType() != VT)
6358 return SDValue();
6359
6360 if (!VecIn1.getNode())
6361 VecIn1 = ExtractedFromVec;
6362 else if (VecIn1 != ExtractedFromVec) {
6363 if (!VecIn2.getNode())
6364 VecIn2 = ExtractedFromVec;
6365 else if (VecIn2 != ExtractedFromVec)
6366 // Quit if more than 2 vectors to shuffle
6367 return SDValue();
6368 }
6369
6370 if (ExtractedFromVec == VecIn1)
6371 Mask[i] = Idx;
6372 else if (ExtractedFromVec == VecIn2)
6373 Mask[i] = Idx + NumElems;
6374 }
6375
6376 if (!VecIn1.getNode())
6377 return SDValue();
6378
6379 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6380 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6381 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6382 unsigned Idx = InsertIndices[i];
6383 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6384 DAG.getIntPtrConstant(Idx));
6385 }
6386
6387 return NV;
6388}
6389
6390// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6391SDValue
6392X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6393
6394 MVT VT = Op.getSimpleValueType();
6395 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6396 "Unexpected type in LowerBUILD_VECTORvXi1!");
6397
6398 SDLoc dl(Op);
6399 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6400 SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6401 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6402 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6403 }
6404
6405 if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6406 SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6407 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6408 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6409 }
6410
6411 bool AllContants = true;
6412 uint64_t Immediate = 0;
6413 int NonConstIdx = -1;
6414 bool IsSplat = true;
6415 unsigned NumNonConsts = 0;
6416 unsigned NumConsts = 0;
6417 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6418 SDValue In = Op.getOperand(idx);
6419 if (In.getOpcode() == ISD::UNDEF)
6420 continue;
6421 if (!isa<ConstantSDNode>(In)) {
6422 AllContants = false;
6423 NonConstIdx = idx;
6424 NumNonConsts++;
6425 } else {
6426 NumConsts++;
6427 if (cast<ConstantSDNode>(In)->getZExtValue())
6428 Immediate |= (1ULL << idx);
6429 }
6430 if (In != Op.getOperand(0))
6431 IsSplat = false;
6432 }
6433
6434 if (AllContants) {
6435 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6436 DAG.getConstant(Immediate, MVT::i16));
6437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6438 DAG.getIntPtrConstant(0));
6439 }
6440
6441 if (NumNonConsts == 1 && NonConstIdx != 0) {
6442 SDValue DstVec;
6443 if (NumConsts) {
6444 SDValue VecAsImm = DAG.getConstant(Immediate,
6445 MVT::getIntegerVT(VT.getSizeInBits()));
6446 DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6447 }
6448 else
6449 DstVec = DAG.getUNDEF(VT);
6450 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6451 Op.getOperand(NonConstIdx),
6452 DAG.getIntPtrConstant(NonConstIdx));
6453 }
6454 if (!IsSplat && (NonConstIdx != 0))
6455 llvm_unreachable("Unsupported BUILD_VECTOR operation")::llvm::llvm_unreachable_internal("Unsupported BUILD_VECTOR operation"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 6455)
;
6456 MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6457 SDValue Select;
6458 if (IsSplat)
6459 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6460 DAG.getConstant(-1, SelectVT),
6461 DAG.getConstant(0, SelectVT));
6462 else
6463 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6464 DAG.getConstant((Immediate | 1), SelectVT),
6465 DAG.getConstant(Immediate, SelectVT));
6466 return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6467}
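
The all-constant path above packs the i1 operands into a single integer immediate before bitcasting it to a v16i1 mask. A minimal standalone sketch of that bit-packing step, with invented names and kept outside of X86ISelLowering.cpp, would look like this:

#include <cstdint>
#include <vector>

// Bit Idx of the immediate is set exactly when operand Idx is a non-zero
// constant, mirroring the loop over Op's operands above.
static uint64_t packMaskImmediate(const std::vector<bool> &Bits) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0, E = Bits.size(); Idx != E; ++Idx)
    if (Bits[Idx])
      Immediate |= (1ULL << Idx);
  return Immediate;
}

// packMaskImmediate({1, 0, 1, 1}) == 0b1101 == 13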
6468
6469/// \brief Return true if \p N implements a horizontal binop and place the
6470/// operands for the horizontal binop into V0 and V1.
6471///
6472/// This is a helper function of PerformBUILD_VECTORCombine.
6473/// This function checks whether the input build_vector \p N implements a
6474/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6475/// operation to match.
6476/// For example, if \p Opcode is equal to ISD::ADD, then this function
6477/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6478/// is equal to ISD::SUB, then this function checks if this is a horizontal
6479/// arithmetic sub.
6480///
6481/// This function only analyzes elements of \p N whose indices are
6482/// in range [BaseIdx, LastIdx).
6483static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6484 SelectionDAG &DAG,
6485 unsigned BaseIdx, unsigned LastIdx,
6486 SDValue &V0, SDValue &V1) {
6487 EVT VT = N->getValueType(0);
6488
6489 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6490 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6491 "Invalid Vector in input!");
6492
6493 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6494 bool CanFold = true;
6495 unsigned ExpectedVExtractIdx = BaseIdx;
6496 unsigned NumElts = LastIdx - BaseIdx;
6497 V0 = DAG.getUNDEF(VT);
6498 V1 = DAG.getUNDEF(VT);
6499
6500 // Check if N implements a horizontal binop.
6501 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6502 SDValue Op = N->getOperand(i + BaseIdx);
6503
6504 // Skip UNDEFs.
6505 if (Op->getOpcode() == ISD::UNDEF) {
6506 // Update the expected vector extract index.
6507 if (i * 2 == NumElts)
6508 ExpectedVExtractIdx = BaseIdx;
6509 ExpectedVExtractIdx += 2;
6510 continue;
6511 }
6512
6513 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6514
6515 if (!CanFold)
6516 break;
6517
6518 SDValue Op0 = Op.getOperand(0);
6519 SDValue Op1 = Op.getOperand(1);
6520
6521 // Try to match the following pattern:
6522 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6523 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6524 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6525 Op0.getOperand(0) == Op1.getOperand(0) &&
6526 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6527 isa<ConstantSDNode>(Op1.getOperand(1)));
6528 if (!CanFold)
6529 break;
6530
6531 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6532 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6533
6534 if (i * 2 < NumElts) {
6535 if (V0.getOpcode() == ISD::UNDEF)
6536 V0 = Op0.getOperand(0);
6537 } else {
6538 if (V1.getOpcode() == ISD::UNDEF)
6539 V1 = Op0.getOperand(0);
6540 if (i * 2 == NumElts)
6541 ExpectedVExtractIdx = BaseIdx;
6542 }
6543
6544 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6545 if (I0 == ExpectedVExtractIdx)
6546 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6547 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6548 // Try to match the following dag sequence:
6549 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6550 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6551 } else
6552 CanFold = false;
6553
6554 ExpectedVExtractIdx += 2;
6555 }
6556
6557 return CanFold;
6558}
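
As a scalar reference for what this matcher is looking for, here is an illustrative sketch (not part of the file) of a 128-bit horizontal add: each result element sums two adjacent elements, the low half drawn from the first input and the high half from the second, which is what HADDPS (and PHADDD for integers) computes.

#include <array>
#include <cstddef>

// Scalar model: R[i] = A[2i] + A[2i+1] for the low half and
// R[Half + i] = B[2i] + B[2i+1] for the high half.
template <std::size_t N>
std::array<float, N> horizontalAdd(const std::array<float, N> &A,
                                   const std::array<float, N> &B) {
  std::array<float, N> R{};
  const std::size_t Half = N / 2;
  for (std::size_t i = 0; i != Half; ++i) {
    R[i] = A[2 * i] + A[2 * i + 1];        // low half comes from A
    R[Half + i] = B[2 * i] + B[2 * i + 1]; // high half comes from B
  }
  return R;
}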
6559
6560/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6561/// a concat_vector.
6562///
6563/// This is a helper function of PerformBUILD_VECTORCombine.
6564/// This function expects two 256-bit vectors called V0 and V1.
6565/// At first, each vector is split into two separate 128-bit vectors.
6566/// Then, the resulting 128-bit vectors are used to implement two
6567/// horizontal binary operations.
6568///
6569/// The kind of horizontal binary operation is defined by \p X86Opcode.
6570///
6571/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6572/// the two new horizontal binops.
6573/// When Mode is set, the first horizontal binop dag node takes as input
6574/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6575/// horizontal binop dag node takes as input the lower 128-bit of V1
6576/// and the upper 128-bit of V1.
6577/// Example:
6578/// HADD V0_LO, V0_HI
6579/// HADD V1_LO, V1_HI
6580///
6581/// Otherwise, the first horizontal binop dag node takes as input the lower
6582/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6583/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6584/// Example:
6585/// HADD V0_LO, V1_LO
6586/// HADD V0_HI, V1_HI
6587///
6588/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6589/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6590/// the upper 128-bits of the result.
6591static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6592 SDLoc DL, SelectionDAG &DAG,
6593 unsigned X86Opcode, bool Mode,
6594 bool isUndefLO, bool isUndefHI) {
6595 EVT VT = V0.getValueType();
6596 assert(VT.is256BitVector() && VT == V1.getValueType() &&
6597 "Invalid nodes in input!");
6598
6599 unsigned NumElts = VT.getVectorNumElements();
6600 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6601 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6602 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6603 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6604 EVT NewVT = V0_LO.getValueType();
6605
6606 SDValue LO = DAG.getUNDEF(NewVT);
6607 SDValue HI = DAG.getUNDEF(NewVT);
6608
6609 if (Mode) {
6610 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6611 if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6612 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6613 if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6614 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6615 } else {
6616 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6617 if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6618 V1_LO->getOpcode() != ISD::UNDEF))
6619 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6620
6621 if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6622 V1_HI->getOpcode() != ISD::UNDEF))
6623 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6624 }
6625
6626 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6627}
6628
6629/// \brief Try to fold a build_vector that performs an 'addsub' into the
6630/// sequence of 'vadd + vsub + blendi'.
6631static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6632 const X86Subtarget *Subtarget) {
6633 SDLoc DL(BV);
6634 EVT VT = BV->getValueType(0);
6635 unsigned NumElts = VT.getVectorNumElements();
6636 SDValue InVec0 = DAG.getUNDEF(VT);
6637 SDValue InVec1 = DAG.getUNDEF(VT);
6638
6639 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6640 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6641
6642 // Odd-numbered elements in the input build vector are obtained from
6643 // adding two integer/float elements.
6644 // Even-numbered elements in the input build vector are obtained from
6645 // subtracting two integer/float elements.
6646 unsigned ExpectedOpcode = ISD::FSUB;
6647 unsigned NextExpectedOpcode = ISD::FADD;
6648 bool AddFound = false;
6649 bool SubFound = false;
6650
6651 for (unsigned i = 0, e = NumElts; i != e; i++) {
6652 SDValue Op = BV->getOperand(i);
6653
6654 // Skip 'undef' values.
6655 unsigned Opcode = Op.getOpcode();
6656 if (Opcode == ISD::UNDEF) {
6657 std::swap(ExpectedOpcode, NextExpectedOpcode);
6658 continue;
6659 }
6660
6661 // Early exit if we found an unexpected opcode.
6662 if (Opcode != ExpectedOpcode)
6663 return SDValue();
6664
6665 SDValue Op0 = Op.getOperand(0);
6666 SDValue Op1 = Op.getOperand(1);
6667
6668 // Try to match the following pattern:
6669 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6670 // Early exit if we cannot match that sequence.
6671 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6672 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6673 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6674 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6675 Op0.getOperand(1) != Op1.getOperand(1))
6676 return SDValue();
6677
6678 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6679 if (I0 != i)
6680 return SDValue();
6681
6682 // We found a valid add/sub node. Update the information accordingly.
6683 if (i & 1)
6684 AddFound = true;
6685 else
6686 SubFound = true;
6687
6688 // Update InVec0 and InVec1.
6689 if (InVec0.getOpcode() == ISD::UNDEF)
6690 InVec0 = Op0.getOperand(0);
6691 if (InVec1.getOpcode() == ISD::UNDEF)
6692 InVec1 = Op1.getOperand(0);
6693
6694 // Make sure that the operands of each add/sub node always
6695 // come from the same pair of vectors.
6696 if (InVec0 != Op0.getOperand(0)) {
6697 if (ExpectedOpcode == ISD::FSUB)
6698 return SDValue();
6699
6700 // FADD is commutable. Try to commute the operands
6701 // and then test again.
6702 std::swap(Op0, Op1);
6703 if (InVec0 != Op0.getOperand(0))
6704 return SDValue();
6705 }
6706
6707 if (InVec1 != Op1.getOperand(0))
6708 return SDValue();
6709
6710 // Update the pair of expected opcodes.
6711 std::swap(ExpectedOpcode, NextExpectedOpcode);
6712 }
6713
6714 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6715 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6716 InVec1.getOpcode() != ISD::UNDEF)
6717 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6718
6719 return SDValue();
6720}
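
For reference, the scalar pattern being matched here looks as follows (an illustrative sketch, not part of the file): even-indexed elements subtract and odd-indexed elements add, which is what ADDSUBPS/ADDSUBPD implement.

#include <cstddef>
#include <vector>

// Scalar model of X86ISD::ADDSUB: even lanes subtract, odd lanes add.
static std::vector<double> addSub(const std::vector<double> &A,
                                  const std::vector<double> &B) {
  std::vector<double> R(A.size());
  for (std::size_t i = 0; i != A.size(); ++i)
    R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
  return R;
}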
6721
6722static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6723 const X86Subtarget *Subtarget) {
6724 SDLoc DL(N);
6725 EVT VT = N->getValueType(0);
6726 unsigned NumElts = VT.getVectorNumElements();
6727 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6728 SDValue InVec0, InVec1;
6729
6730 // Try to match an ADDSUB.
6731 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6732 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6733 SDValue Value = matchAddSub(BV, DAG, Subtarget);
6734 if (Value.getNode())
6735 return Value;
6736 }
6737
6738 // Try to match horizontal ADD/SUB.
6739 unsigned NumUndefsLO = 0;
6740 unsigned NumUndefsHI = 0;
6741 unsigned Half = NumElts/2;
6742
6743 // Count the number of UNDEF operands in the input build_vector.
6744 for (unsigned i = 0, e = Half; i != e; ++i)
6745 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6746 NumUndefsLO++;
6747
6748 for (unsigned i = Half, e = NumElts; i != e; ++i)
6749 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6750 NumUndefsHI++;
6751
6752 // Early exit if this is either a build_vector of all UNDEFs or all the
6753 // operands but one are UNDEF.
6754 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6755 return SDValue();
6756
6757 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6758 // Try to match an SSE3 float HADD/HSUB.
6759 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6760 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6761
6762 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6763 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6764 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6765 // Try to match an SSSE3 integer HADD/HSUB.
6766 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6767 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6768
6769 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6770 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6771 }
6772
6773 if (!Subtarget->hasAVX())
6774 return SDValue();
6775
6776 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6777 // Try to match an AVX horizontal add/sub of packed single/double
6778 // precision floating point values from 256-bit vectors.
6779 SDValue InVec2, InVec3;
6780 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6781 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6782 ((InVec0.getOpcode() == ISD::UNDEF ||
6783 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6784 ((InVec1.getOpcode() == ISD::UNDEF ||
6785 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6786 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6787
6788 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6789 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6790 ((InVec0.getOpcode() == ISD::UNDEF ||
6791 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6792 ((InVec1.getOpcode() == ISD::UNDEF ||
6793 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6794 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6795 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6796 // Try to match an AVX2 horizontal add/sub of signed integers.
6797 SDValue InVec2, InVec3;
6798 unsigned X86Opcode;
6799 bool CanFold = true;
6800
6801 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6802 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6803 ((InVec0.getOpcode() == ISD::UNDEF ||
6804 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6805 ((InVec1.getOpcode() == ISD::UNDEF ||
6806 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6807 X86Opcode = X86ISD::HADD;
6808 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6809 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6810 ((InVec0.getOpcode() == ISD::UNDEF ||
6811 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6812 ((InVec1.getOpcode() == ISD::UNDEF ||
6813 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6814 X86Opcode = X86ISD::HSUB;
6815 else
6816 CanFold = false;
6817
6818 if (CanFold) {
6819 // Fold this build_vector into a single horizontal add/sub.
6820 // Do this only if the target has AVX2.
6821 if (Subtarget->hasAVX2())
6822 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6823
6824 // Do not try to expand this build_vector into a pair of horizontal
6825 // add/sub if we can emit a pair of scalar add/sub.
6826 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6827 return SDValue();
6828
6829 // Convert this build_vector into a pair of horizontal binop followed by
6830 // a concat vector.
6831 bool isUndefLO = NumUndefsLO == Half;
6832 bool isUndefHI = NumUndefsHI == Half;
6833 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6834 isUndefLO, isUndefHI);
6835 }
6836 }
6837
6838 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6839 VT == MVT::v16i16) && Subtarget->hasAVX()) {
6840 unsigned X86Opcode;
6841 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6842 X86Opcode = X86ISD::HADD;
6843 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6844 X86Opcode = X86ISD::HSUB;
6845 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6846 X86Opcode = X86ISD::FHADD;
6847 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6848 X86Opcode = X86ISD::FHSUB;
6849 else
6850 return SDValue();
6851
6852 // Don't try to expand this build_vector into a pair of horizontal add/sub
6853 // if we can simply emit a pair of scalar add/sub.
6854 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6855 return SDValue();
6856
6857 // Convert this build_vector into two horizontal add/sub followed by
6858 // a concat vector.
6859 bool isUndefLO = NumUndefsLO == Half;
6860 bool isUndefHI = NumUndefsHI == Half;
6861 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6862 isUndefLO, isUndefHI);
6863 }
6864
6865 return SDValue();
6866}
6867
6868SDValue
6869X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6870 SDLoc dl(Op);
6871
6872 MVT VT = Op.getSimpleValueType();
6873 MVT ExtVT = VT.getVectorElementType();
6874 unsigned NumElems = Op.getNumOperands();
6875
6876 // Generate vectors for predicate vectors.
6877 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6878 return LowerBUILD_VECTORvXi1(Op, DAG);
6879
6880 // Vectors containing all zeros can be matched by pxor and xorps later
6881 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6882 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6883 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6884 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6885 return Op;
6886
6887 return getZeroVector(VT, Subtarget, DAG, dl);
6888 }
6889
6890 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6891 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6892 // vpcmpeqd on 256-bit vectors.
6893 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6894 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6895 return Op;
6896
6897 if (!VT.is512BitVector())
6898 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6899 }
6900
6901 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6902 if (Broadcast.getNode())
6903 return Broadcast;
6904
6905 unsigned EVTBits = ExtVT.getSizeInBits();
6906
6907 unsigned NumZero = 0;
6908 unsigned NumNonZero = 0;
6909 unsigned NonZeros = 0;
6910 bool IsAllConstants = true;
6911 SmallSet<SDValue, 8> Values;
6912 for (unsigned i = 0; i < NumElems; ++i) {
6913 SDValue Elt = Op.getOperand(i);
6914 if (Elt.getOpcode() == ISD::UNDEF)
6915 continue;
6916 Values.insert(Elt);
6917 if (Elt.getOpcode() != ISD::Constant &&
6918 Elt.getOpcode() != ISD::ConstantFP)
6919 IsAllConstants = false;
6920 if (X86::isZeroNode(Elt))
6921 NumZero++;
6922 else {
6923 NonZeros |= (1 << i);
6924 NumNonZero++;
6925 }
6926 }
6927
6928 // All undef vector. Return an UNDEF. All zero vectors were handled above.
6929 if (NumNonZero == 0)
6930 return DAG.getUNDEF(VT);
6931
6932 // Special case for single non-zero, non-undef, element.
6933 if (NumNonZero == 1) {
6934 unsigned Idx = countTrailingZeros(NonZeros);
6935 SDValue Item = Op.getOperand(Idx);
6936
6937 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6938 // the value are obviously zero, truncate the value to i32 and do the
6939 // insertion that way. Only do this if the value is non-constant or if the
6940 // value is a constant being inserted into element 0. It is cheaper to do
6941 // a constant pool load than it is to do a movd + shuffle.
6942 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6943 (!IsAllConstants || Idx == 0)) {
6944 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6945 // Handle SSE only.
6946 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6947 EVT VecVT = MVT::v4i32;
6948 unsigned VecElts = 4;
6949
6950 // Truncate the value (which may itself be a constant) to i32, and
6951 // convert it to a vector with movd (S2V+shuffle to zero extend).
6952 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6953 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6954
6955 // If using the new shuffle lowering, just directly insert this.
6956 if (ExperimentalVectorShuffleLowering)
6957 return DAG.getNode(
6958 ISD::BITCAST, dl, VT,
6959 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6960
6961 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6962
6963 // Now we have our 32-bit value zero extended in the low element of
6964 // a vector. If Idx != 0, swizzle it into place.
6965 if (Idx != 0) {
6966 SmallVector<int, 4> Mask;
6967 Mask.push_back(Idx);
6968 for (unsigned i = 1; i != VecElts; ++i)
6969 Mask.push_back(i);
6970 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6971 &Mask[0]);
6972 }
6973 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6974 }
6975 }
6976
6977 // If we have a constant or non-constant insertion into the low element of
6978 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6979 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6980 // depending on what the source datatype is.
6981 if (Idx == 0) {
6982 if (NumZero == 0)
6983 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6984
6985 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6986 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6987 if (VT.is256BitVector() || VT.is512BitVector()) {
6988 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6989 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6990 Item, DAG.getIntPtrConstant(0));
6991 }
6992 assert(VT.is128BitVector() && "Expected an SSE value type!");
6993 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6994 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6995 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6996 }
6997
6998 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6999 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7000 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7001 if (VT.is256BitVector()) {
7002 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7003 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7004 } else {
7005 assert(VT.is128BitVector() && "Expected an SSE value type!");
7006 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7007 }
7008 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7009 }
7010 }
7011
7012 // Is it a vector logical left shift?
7013 if (NumElems == 2 && Idx == 1 &&
7014 X86::isZeroNode(Op.getOperand(0)) &&
7015 !X86::isZeroNode(Op.getOperand(1))) {
7016 unsigned NumBits = VT.getSizeInBits();
7017 return getVShift(true, VT,
7018 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7019 VT, Op.getOperand(1)),
7020 NumBits/2, DAG, *this, dl);
7021 }
7022
7023 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7024 return SDValue();
7025
7026 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7027 // is a non-constant being inserted into an element other than the low one,
7028 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7029 // movd/movss) to move this into the low element, then shuffle it into
7030 // place.
7031 if (EVTBits == 32) {
7032 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7033
7034 // If using the new shuffle lowering, just directly insert this.
7035 if (ExperimentalVectorShuffleLowering)
7036 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7037
7038 // Turn it into a shuffle of zero and zero-extended scalar to vector.
7039 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7040 SmallVector<int, 8> MaskVec;
7041 for (unsigned i = 0; i != NumElems; ++i)
7042 MaskVec.push_back(i == Idx ? 0 : 1);
7043 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7044 }
7045 }
7046
7047 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7048 if (Values.size() == 1) {
7049 if (EVTBits == 32) {
7050 // Instead of a shuffle like this:
7051 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7052 // Check if it's possible to issue this instead.
7053 // shuffle (vload ptr), undef, <1, 1, 1, 1>
7054 unsigned Idx = countTrailingZeros(NonZeros);
7055 SDValue Item = Op.getOperand(Idx);
7056 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7057 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7058 }
7059 return SDValue();
7060 }
7061
7062 // A vector full of immediates; various special cases are already
7063 // handled, so this is best done with a single constant-pool load.
7064 if (IsAllConstants)
7065 return SDValue();
7066
7067 // For AVX-length vectors, see if we can use a vector load to get all of the
7068 // elements, otherwise build the individual 128-bit pieces and use
7069 // shuffles to put them in place.
7070 if (VT.is256BitVector() || VT.is512BitVector()) {
7071 SmallVector<SDValue, 64> V;
7072 for (unsigned i = 0; i != NumElems; ++i)
7073 V.push_back(Op.getOperand(i));
7074
7075 // Check for a build vector of consecutive loads.
7076 if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7077 return LD;
7078
7079 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7080
7081 // Build both the lower and upper subvector.
7082 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7083 makeArrayRef(&V[0], NumElems/2));
7084 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7085 makeArrayRef(&V[NumElems / 2], NumElems/2));
7086
7087 // Recreate the wider vector with the lower and upper part.
7088 if (VT.is256BitVector())
7089 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7090 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7091 }
7092
7093 // Let legalizer expand 2-wide build_vectors.
7094 if (EVTBits == 64) {
7095 if (NumNonZero == 1) {
7096 // One half is zero or undef.
7097 unsigned Idx = countTrailingZeros(NonZeros);
7098 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7099 Op.getOperand(Idx));
7100 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7101 }
7102 return SDValue();
7103 }
7104
7105 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7106 if (EVTBits == 8 && NumElems == 16) {
7107 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7108 Subtarget, *this);
7109 if (V.getNode()) return V;
7110 }
7111
7112 if (EVTBits == 16 && NumElems == 8) {
7113 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7114 Subtarget, *this);
7115 if (V.getNode()) return V;
7116 }
7117
7118 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7119 if (EVTBits == 32 && NumElems == 4) {
7120 SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7121 if (V.getNode())
7122 return V;
7123 }
7124
7125 // If element VT is == 32 bits, turn it into a number of shuffles.
7126 SmallVector<SDValue, 8> V(NumElems);
7127 if (NumElems == 4 && NumZero > 0) {
7128 for (unsigned i = 0; i < 4; ++i) {
7129 bool isZero = !(NonZeros & (1 << i));
7130 if (isZero)
7131 V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7132 else
7133 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7134 }
7135
7136 for (unsigned i = 0; i < 2; ++i) {
7137 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7138 default: break;
7139 case 0:
7140 V[i] = V[i*2]; // Must be a zero vector.
7141 break;
7142 case 1:
7143 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7144 break;
7145 case 2:
7146 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7147 break;
7148 case 3:
7149 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7150 break;
7151 }
7152 }
7153
7154 bool Reverse1 = (NonZeros & 0x3) == 2;
7155 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7156 int MaskVec[] = {
7157 Reverse1 ? 1 : 0,
7158 Reverse1 ? 0 : 1,
7159 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7160 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7161 };
7162 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7163 }
7164
7165 if (Values.size() > 1 && VT.is128BitVector()) {
7166 // Check for a build vector of consecutive loads.
7167 for (unsigned i = 0; i < NumElems; ++i)
7168 V[i] = Op.getOperand(i);
7169
7170 // Check for elements which are consecutive loads.
7171 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7172 if (LD.getNode())
7173 return LD;
7174
7175 // Check for a build vector from mostly shuffle plus few inserting.
7176 SDValue Sh = buildFromShuffleMostly(Op, DAG);
7177 if (Sh.getNode())
7178 return Sh;
7179
7180 // For SSE 4.1, use insertps to put the high elements into the low element.
7181 if (getSubtarget()->hasSSE41()) {
7182 SDValue Result;
7183 if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7184 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7185 else
7186 Result = DAG.getUNDEF(VT);
7187
7188 for (unsigned i = 1; i < NumElems; ++i) {
7189 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7190 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7191 Op.getOperand(i), DAG.getIntPtrConstant(i));
7192 }
7193 return Result;
7194 }
7195
7196 // Otherwise, expand into a number of unpckl*, start by extending each of
7197 // our (non-undef) elements to the full vector width with the element in the
7198 // bottom slot of the vector (which generates no code for SSE).
7199 for (unsigned i = 0; i < NumElems; ++i) {
7200 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7201 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7202 else
7203 V[i] = DAG.getUNDEF(VT);
7204 }
7205
7206 // Next, we iteratively mix elements, e.g. for v4f32:
7207 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7208 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7209 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7210 unsigned EltStride = NumElems >> 1;
7211 while (EltStride != 0) {
7212 for (unsigned i = 0; i < EltStride; ++i) {
7213 // If V[i+EltStride] is undef and this is the first round of mixing,
7214 // then it is safe to just drop this shuffle: V[i] is already in the
7215 // right place, the one element (since it's the first round) being
7216 // inserted as undef can be dropped. This isn't safe for successive
7217 // rounds because they will permute elements within both vectors.
7218 if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7219 EltStride == NumElems/2)
7220 continue;
7221
7222 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7223 }
7224 EltStride >>= 1;
7225 }
7226 return V[0];
7227 }
7228 return SDValue();
7229}
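
The closing unpckl loop above merges the scalars pairwise over log2(NumElems) rounds. A standalone scalar model of that merge, with invented helper names and kept deliberately separate from the DAG code, is sketched below; unpackLow mimics what getUnpackl does to the low halves.

#include <cstddef>
#include <vector>

// Interleave the low halves of A and B: {A0, B0, A1, B1, ...}.
static std::vector<int> unpackLow(const std::vector<int> &A,
                                  const std::vector<int> &B) {
  std::vector<int> R(A.size());
  for (std::size_t i = 0; i != A.size() / 2; ++i) {
    R[2 * i] = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

// Each V[i] starts with scalar i in slot 0; after the rounds below, V[0]
// holds {elt0, elt1, ..., eltN-1}, matching the Step 1/Step 2 comment.
static std::vector<int> buildByUnpack(std::vector<std::vector<int>> V) {
  for (std::size_t Stride = V.size() / 2; Stride != 0; Stride /= 2)
    for (std::size_t i = 0; i != Stride; ++i)
      V[i] = unpackLow(V[i], V[i + Stride]);
  return V[0];
}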
7230
7231// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7232// to create 256-bit vectors from two other 128-bit ones.
7233static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7234 SDLoc dl(Op);
7235 MVT ResVT = Op.getSimpleValueType();
7236
7237 assert((ResVT.is256BitVector() ||
7238 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7239
7240 SDValue V1 = Op.getOperand(0);
7241 SDValue V2 = Op.getOperand(1);
7242 unsigned NumElems = ResVT.getVectorNumElements();
7243 if(ResVT.is256BitVector())
7244 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7245
7246 if (Op.getNumOperands() == 4) {
7247 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7248 ResVT.getVectorNumElements()/2);
7249 SDValue V3 = Op.getOperand(2);
7250 SDValue V4 = Op.getOperand(3);
7251 return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7252 Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7253 }
7254 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7255}
7256
7257static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7258 MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7259 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7260 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7261 Op.getNumOperands() == 4)));
7262
7263 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7264 // from two other 128-bit ones.
7265
7266 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7267 return LowerAVXCONCAT_VECTORS(Op, DAG);
7268}
7269
7270
7271//===----------------------------------------------------------------------===//
7272// Vector shuffle lowering
7273//
7274// This is an experimental code path for lowering vector shuffles on x86. It is
7275// designed to handle arbitrary vector shuffles and blends, gracefully
7276// degrading performance as necessary. It works hard to recognize idiomatic
7277// shuffles and lower them to optimal instruction patterns without leaving
7278// a framework that allows reasonably efficient handling of all vector shuffle
7279// patterns.
7280//===----------------------------------------------------------------------===//
7281
7282/// \brief Tiny helper function to identify a no-op mask.
7283///
7284/// This is a somewhat boring predicate function. It checks whether the mask
7285/// array input, which is assumed to be a single-input shuffle mask of the kind
7286/// used by the X86 shuffle instructions (not a fully general
7287/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7288/// in-place shuffle are 'no-op's.
7289static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7290 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7291 if (Mask[i] != -1 && Mask[i] != i)
7292 return false;
7293 return true;
7294}
7295
7296/// \brief Helper function to classify a mask as a single-input mask.
7297///
7298/// This isn't a generic single-input test because in the vector shuffle
7299/// lowering we canonicalize single inputs to be the first input operand. This
7300/// means we can more quickly test for a single input by only checking whether
7301/// an input from the second operand exists. We also assume that the size of
7302/// mask corresponds to the size of the input vectors which isn't true in the
7303/// fully general case.
7304static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7305 for (int M : Mask)
7306 if (M >= (int)Mask.size())
7307 return false;
7308 return true;
7309}
7310
7311/// \brief Test whether there are elements crossing 128-bit lanes in this
7312/// shuffle mask.
7313///
7314/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7315/// and we routinely test for these.
7316static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7317 int LaneSize = 128 / VT.getScalarSizeInBits();
7318 int Size = Mask.size();
7319 for (int i = 0; i < Size; ++i)
7320 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7321 return true;
7322 return false;
7323}
7324
7325/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7326///
7327/// This checks a shuffle mask to see if it is performing the same
7328/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7329/// that it is also not lane-crossing. It may however involve a blend from the
7330/// same lane of a second vector.
7331///
7332/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7333/// non-trivial to compute in the face of undef lanes. The representation is
7334/// *not* suitable for use with existing 128-bit shuffles as it will contain
7335/// entries from both V1 and V2 inputs to the wider mask.
7336static bool
7337is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7338 SmallVectorImpl<int> &RepeatedMask) {
7339 int LaneSize = 128 / VT.getScalarSizeInBits();
7340 RepeatedMask.resize(LaneSize, -1);
7341 int Size = Mask.size();
7342 for (int i = 0; i < Size; ++i) {
7343 if (Mask[i] < 0)
7344 continue;
7345 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7346 // This entry crosses lanes, so there is no way to model this shuffle.
7347 return false;
7348
7349 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7350 if (RepeatedMask[i % LaneSize] == -1)
7351 // This is the first non-undef entry in this slot of a 128-bit lane.
7352 RepeatedMask[i % LaneSize] =
7353 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7354 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7355 // Found a mismatch with the repeated mask.
7356 return false;
7357 }
7358 return true;
7359}
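
A hand-computed example of the representation described above (illustrative only): for a v8f32 two-input mask whose lanes repeat, the helper succeeds and the V2 entries stay offset by the full mask width, which is why the repeated mask cannot be fed directly to an existing 128-bit shuffle.

//   Wide mask (v8f32, LaneSize 4):  < 0, 8, 1, 9, 4, 12, 5, 13 >
//   RepeatedMask produced:          < 0, 8, 1, 9 >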
7360
7361// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7362// 2013 will allow us to use it as a non-type template parameter.
7363namespace {
7364
7365/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7366///
7367/// See its documentation for details.
7368bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7369 if (Mask.size() != Args.size())
7370 return false;
7371 for (int i = 0, e = Mask.size(); i < e; ++i) {
7372 assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7373 if (Mask[i] != -1 && Mask[i] != *Args[i])
7374 return false;
7375 }
7376 return true;
7377}
7378
7379} // namespace
7380
7381/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7382/// arguments.
7383///
7384/// This is a fast way to test a shuffle mask against a fixed pattern:
7385///
7386/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7387///
7388/// It returns true if the mask is exactly as wide as the argument list, and
7389/// each element of the mask is either -1 (signifying undef) or the value given
7390/// in the argument.
7391static const VariadicFunction1<
7392 bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7393
7394/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7395///
7396/// This helper function produces an 8-bit shuffle immediate corresponding to
7397/// the ubiquitous shuffle encoding scheme used in x86 instructions for
7398/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7399/// example.
7400///
7401/// NB: We rely heavily on "undef" masks preserving the input lane.
7402static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7403 SelectionDAG &DAG) {
7404 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7405 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7406 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7407 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7408 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7409
7410 unsigned Imm = 0;
7411 Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7412 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7413 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7414 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7415 return DAG.getConstant(Imm, MVT::i8);
7416}
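
A compact standalone sketch of the same encoding (shuffleImm is an invented name, not part of the file): each lane contributes two bits, lane 0 in the low bits, so the four-lane reversal <3, 2, 1, 0> encodes to 0x1B, the familiar shufps/pshufd reverse immediate.

#include <cstdint>

// Two bits per lane, lane 0 in the least-significant bits.
constexpr uint8_t shuffleImm(unsigned M0, unsigned M1, unsigned M2,
                             unsigned M3) {
  return uint8_t(M0 | (M1 << 2) | (M2 << 4) | (M3 << 6));
}

static_assert(shuffleImm(3, 2, 1, 0) == 0x1B, "reverse-lane immediate");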
7417
7418/// \brief Try to emit a blend instruction for a shuffle.
7419///
7420/// This doesn't do any checks for the availability of instructions for blending
7421/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7422/// be matched in the backend with the type given. What it does check for is
7423/// that the shuffle mask is in fact a blend.
7424static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7425 SDValue V2, ArrayRef<int> Mask,
7426 const X86Subtarget *Subtarget,
7427 SelectionDAG &DAG) {
7428
7429 unsigned BlendMask = 0;
7430 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7431 if (Mask[i] >= Size) {
7432 if (Mask[i] != i + Size)
7433 return SDValue(); // Shuffled V2 input!
7434 BlendMask |= 1u << i;
7435 continue;
7436 }
7437 if (Mask[i] >= 0 && Mask[i] != i)
7438 return SDValue(); // Shuffled V1 input!
7439 }
7440 switch (VT.SimpleTy) {
7441 case MVT::v2f64:
7442 case MVT::v4f32:
7443 case MVT::v4f64:
7444 case MVT::v8f32:
7445 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7446 DAG.getConstant(BlendMask, MVT::i8));
7447
7448 case MVT::v4i64:
7449 case MVT::v8i32:
7450 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7451 // FALLTHROUGH
7452 case MVT::v2i64:
7453 case MVT::v4i32:
7454 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7455 // that instruction.
7456 if (Subtarget->hasAVX2()) {
7457 // Scale the blend by the number of 32-bit dwords per element.
7458 int Scale = VT.getScalarSizeInBits() / 32;
7459 BlendMask = 0;
7460 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7461 if (Mask[i] >= Size)
7462 for (int j = 0; j < Scale; ++j)
7463 BlendMask |= 1u << (i * Scale + j);
7464
7465 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7466 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7467 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7468 return DAG.getNode(ISD::BITCAST, DL, VT,
7469 DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7470 DAG.getConstant(BlendMask, MVT::i8)));
7471 }
7472 // FALLTHROUGH
7473 case MVT::v8i16: {
7474 // For integer shuffles we need to expand the mask and cast the inputs to
7475 // v8i16s prior to blending.
7476 int Scale = 8 / VT.getVectorNumElements();
7477 BlendMask = 0;
7478 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7479 if (Mask[i] >= Size)
7480 for (int j = 0; j < Scale; ++j)
7481 BlendMask |= 1u << (i * Scale + j);
7482
7483 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7484 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7485 return DAG.getNode(ISD::BITCAST, DL, VT,
7486 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7487 DAG.getConstant(BlendMask, MVT::i8)));
7488 }
7489
7490 case MVT::v16i16: {
7491 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7492 SmallVector<int, 8> RepeatedMask;
7493 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7494 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7495 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7496 BlendMask = 0;
7497 for (int i = 0; i < 8; ++i)
7498 if (RepeatedMask[i] >= 16)
7499 BlendMask |= 1u << i;
7500 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7501 DAG.getConstant(BlendMask, MVT::i8));
7502 }
7503 }
7504 // FALLTHROUGH
7505 case MVT::v32i8: {
7506 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7507 // Scale the blend by the number of bytes per element.
7508 int Scale = VT.getScalarSizeInBits() / 8;
7509 assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7510
7511 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7512 // mix of LLVM's code generator and the x86 backend. We tell the code
7513 // generator that boolean values in the elements of an x86 vector register
7514 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7515 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7516 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7517 // of the element (the remaining are ignored) and 0 in that high bit would
7518 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7519 // the LLVM model for boolean values in vector elements gets the relevant
7520 // bit set, it is set backwards and over constrained relative to x86's
7521 // actual model.
7522 SDValue VSELECTMask[32];
7523 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7524 for (int j = 0; j < Scale; ++j)
7525 VSELECTMask[Scale * i + j] =
7526 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7527 : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7528
7529 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7530 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7531 return DAG.getNode(
7532 ISD::BITCAST, DL, VT,
7533 DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7534 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7535 V1, V2));
7536 }
7537
7538 default:
7539 llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 7539)
;
7540 }
7541}
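
The BlendMask computed at the top of this function simply records, per element, whether the value is taken from V2. A standalone sketch with a hand-checked example (illustrative only, not part of the file):

#include <cstdint>
#include <vector>

// Bit i of the blend immediate is set when mask element i addresses V2,
// i.e. Mask[i] >= Size. For the v4i32 mask {0, 5, 2, 7} this yields 0b1010.
static uint32_t computeBlendMask(const std::vector<int> &Mask) {
  const int Size = static_cast<int>(Mask.size());
  uint32_t BlendMask = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)
      BlendMask |= 1u << i;
  return BlendMask;
}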
7542
7543/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7544/// unblended shuffles followed by an unshuffled blend.
7545///
7546/// This matches the extremely common pattern for handling combined
7547/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7548/// operations.
7549static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7550 SDValue V1,
7551 SDValue V2,
7552 ArrayRef<int> Mask,
7553 SelectionDAG &DAG) {
7554 // Shuffle the input elements into the desired positions in V1 and V2 and
7555 // blend them together.
7556 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7557 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7558 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7559 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7560 if (Mask[i] >= 0 && Mask[i] < Size) {
7561 V1Mask[i] = Mask[i];
7562 BlendMask[i] = i;
7563 } else if (Mask[i] >= Size) {
7564 V2Mask[i] = Mask[i] - Size;
7565 BlendMask[i] = i + Size;
7566 }
7567
7568 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7569 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7570 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7571}
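
A hand-computed trace of the decomposition (illustrative only): the v4i32 mask <2, 5, 0, 7> splits into two single-input shuffles plus a mask that is a pure blend, which the blend lowering above can then handle.

//   Mask:      <  2,  5,  0,  7 >
//   V1Mask:    <  2, -1,  0, -1 >
//   V2Mask:    < -1,  1, -1,  3 >
//   BlendMask: <  0,  5,  2,  7 >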
7572
7573/// \brief Try to lower a vector shuffle as a byte rotation.
7574///
7575/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7576/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7577/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7578/// try to generically lower a vector shuffle through such a pattern. It
7579/// does not check for the profitability of lowering either as PALIGNR or
7580/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7581/// This matches shuffle vectors that look like:
7582///
7583/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7584///
7585/// Essentially it concatenates V1 and V2, shifts right by some number of
7586/// elements, and takes the low elements as the result. Note that while this is
7587/// specified as a *right shift* because x86 is little-endian, it is a *left
7588/// rotate* of the vector lanes.
7589///
7590/// Note that this only handles 128-bit vector widths currently.
7591static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7592 SDValue V2,
7593 ArrayRef<int> Mask,
7594 const X86Subtarget *Subtarget,
7595 SelectionDAG &DAG) {
7596 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7597
7598 // We need to detect various ways of spelling a rotation:
7599 // [11, 12, 13, 14, 15, 0, 1, 2]
7600 // [-1, 12, 13, 14, -1, -1, 1, -1]
7601 // [-1, -1, -1, -1, -1, -1, 1, 2]
7602 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7603 // [-1, 4, 5, 6, -1, -1, 9, -1]
7604 // [-1, 4, 5, 6, -1, -1, -1, -1]
7605 int Rotation = 0;
7606 SDValue Lo, Hi;
7607 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7608 if (Mask[i] == -1)
7609 continue;
7610 assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7611
7612 // Based on the mod-Size value of this mask element determine where
7613 // a rotated vector would have started.
7614 int StartIdx = i - (Mask[i] % Size);
7615 if (StartIdx == 0)
7616 // The identity rotation isn't interesting, stop.
7617 return SDValue();
7618
7619 // If we found the tail of a vector, the rotation must be the missing
7620 // front. If we found the head of a vector, the rotation is the length of that head.
7621 int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7622
7623 if (Rotation == 0)
7624 Rotation = CandidateRotation;
7625 else if (Rotation != CandidateRotation)
7626 // The rotations don't match, so we can't match this mask.
7627 return SDValue();
7628
7629 // Compute which value this mask is pointing at.
7630 SDValue MaskV = Mask[i] < Size ? V1 : V2;
7631
7632 // Compute which of the two target values this index should be assigned to.
7633 // This reflects whether the high elements are remaining or the low elements
7634 // are remaining.
7635 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7636
7637 // Either set up this value if we've not encountered it before, or check
7638 // that it remains consistent.
7639 if (!TargetV)
7640 TargetV = MaskV;
7641 else if (TargetV != MaskV)
7642 // This may be a rotation, but it pulls from the inputs in some
7643 // unsupported interleaving.
7644 return SDValue();
7645 }
7646
7647 // Check that we successfully analyzed the mask, and normalize the results.
7648 assert(Rotation != 0 && "Failed to locate a viable rotation!");
7649 assert((Lo || Hi) && "Failed to find a rotated input vector!");
7650 if (!Lo)
7651 Lo = Hi;
7652 else if (!Hi)
7653 Hi = Lo;
7654
7655 assert(VT.getSizeInBits() == 128 &&
7656        "Rotate-based lowering only supports 128-bit lowering!");
7657 assert(Mask.size() <= 16 &&
7658        "Can shuffle at most 16 bytes in a 128-bit vector!");
7659
7660 // The actual rotate instruction rotates bytes, so we need to scale the
7661 // rotation based on how many bytes are in the vector.
7662 int Scale = 16 / Mask.size();
7663
7664 // SSSE3 targets can use the PALIGNR instruction.
7665 if (Subtarget->hasSSSE3()) {
7666 // Cast the inputs to v16i8 to match PALIGNR.
7667 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7668 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7669
7670 return DAG.getNode(ISD::BITCAST, DL, VT,
7671 DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7672 DAG.getConstant(Rotation * Scale, MVT::i8)));
7673 }
7674
7675 // Default SSE2 implementation
7676 int LoByteShift = 16 - Rotation * Scale;
7677 int HiByteShift = Rotation * Scale;
7678
7679 // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7680 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7681 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7682
7683 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7684 DAG.getConstant(8 * LoByteShift, MVT::i8));
7685 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7686 DAG.getConstant(8 * HiByteShift, MVT::i8));
7687 return DAG.getNode(ISD::BITCAST, DL, VT,
7688 DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7689}
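
A minimal sketch of the rotation detection above, reduced to plain integer mask math; the helper name matchByteRotation is an assumption for illustration, not from the LLVM tree.

  #include <vector>

  // Mirrors the detection loop in lowerVectorShuffleAsByteRotate: returns the
  // element rotation amount, or -1 when the mask is not one consistent
  // rotation (including the uninteresting identity rotation).
  static int matchByteRotation(const std::vector<int> &Mask) {
    int Size = (int)Mask.size();
    int Rotation = 0;
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;
      int StartIdx = i - (Mask[i] % Size);
      if (StartIdx == 0)
        return -1; // identity rotation
      int Candidate = StartIdx < 0 ? -StartIdx : Size - StartIdx;
      if (Rotation == 0)
        Rotation = Candidate;
      else if (Rotation != Candidate)
        return -1; // inconsistent rotation amounts
    }
    return Rotation;
  }

  // Both v8i16 example masks above, [11, 12, 13, 14, 15, 0, 1, 2] and
  // [3, 4, 5, 6, 7, 8, 9, 10], yield Rotation = 3; with Scale = 16 / 8 = 2
  // bytes per element, the byte amount fed to PALIGNR (or to the
  // PSRLDQ/PSLLDQ pair) is Rotation * Scale = 6.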
7690
7691/// \brief Compute whether each element of a shuffle is zeroable.
7692///
7693/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7694/// Either it is an undef element in the shuffle mask, the element of the input
7695/// referenced is undef, or the element of the input referenced is known to be
7696/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7697/// as many lanes with this technique as possible to simplify the remaining
7698/// shuffle.
7699static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7700 SDValue V1, SDValue V2) {
7701 SmallBitVector Zeroable(Mask.size(), false);
7702
7703 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7704 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7705
7706 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7707 int M = Mask[i];
7708 // Handle the easy cases.
7709 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7710 Zeroable[i] = true;
7711 continue;
7712 }
7713
7714 // If this is an index into a build_vector node, dig out the input value and
7715 // use it.
7716 SDValue V = M < Size ? V1 : V2;
7717 if (V.getOpcode() != ISD::BUILD_VECTOR)
7718 continue;
7719
7720 SDValue Input = V.getOperand(M % Size);
7721 // The UNDEF opcode check really should be dead code here, but not quite
7722 // worth asserting on (it isn't invalid, just unexpected).
7723 if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7724 Zeroable[i] = true;
7725 }
7726
7727 return Zeroable;
7728}
7729
7730/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7731///
7732/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7733/// byte-shift instructions. The mask must consist of a shifted sequential
7734/// shuffle from one of the input vectors and zeroable elements for the
7735/// remaining 'shifted in' elements.
7736///
7737/// Note that this only handles 128-bit vector widths currently.
7738static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7739 SDValue V2, ArrayRef<int> Mask,
7740 SelectionDAG &DAG) {
7741 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7742
7743 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7744
7745 int Size = Mask.size();
7746 int Scale = 16 / Size;
7747
7748 for (int Shift = 1; Shift < Size; Shift++) {
7749 int ByteShift = Shift * Scale;
7750
7751 // PSRLDQ : (little-endian) right byte shift
7752 // [ 5, 6, 7, zz, zz, zz, zz, zz]
7753 // [ -1, 5, 6, 7, zz, zz, zz, zz]
7754 // [ 1, 2, -1, -1, -1, -1, zz, zz]
7755 bool ZeroableRight = true;
7756 for (int i = Size - Shift; i < Size; i++) {
7757 ZeroableRight &= Zeroable[i];
7758 }
7759
7760 if (ZeroableRight) {
7761 bool ValidShiftRight1 =
7762 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7763 bool ValidShiftRight2 =
7764 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7765
7766 if (ValidShiftRight1 || ValidShiftRight2) {
7767 // Cast the inputs to v2i64 to match PSRLDQ.
7768 SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7769 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7770 SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7771 DAG.getConstant(ByteShift * 8, MVT::i8));
7772 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7773 }
7774 }
7775
7776 // PSLLDQ : (little-endian) left byte shift
7777 // [ zz, 0, 1, 2, 3, 4, 5, 6]
7778 // [ zz, zz, -1, -1, 2, 3, 4, -1]
7779 // [ zz, zz, zz, zz, zz, zz, -1, 1]
7780 bool ZeroableLeft = true;
7781 for (int i = 0; i < Shift; i++) {
7782 ZeroableLeft &= Zeroable[i];
7783 }
7784
7785 if (ZeroableLeft) {
7786 bool ValidShiftLeft1 =
7787 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7788 bool ValidShiftLeft2 =
7789 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7790
7791 if (ValidShiftLeft1 || ValidShiftLeft2) {
7792 // Cast the inputs to v2i64 to match PSLLDQ.
7793 SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7794 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7795 SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7796 DAG.getConstant(ByteShift * 8, MVT::i8));
7797 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7798 }
7799 }
7800 }
7801
7802 return SDValue();
7803}
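
A minimal sketch of the PSRLDQ half of the match above on plain integers. The helper names are assumptions for illustration, and seqOrUndef reimplements only what the use of isSequentialOrUndefInRange here suggests (that routine is defined earlier in this file).

  #include <vector>

  // Elements Pos..Pos+Len-1 must be undef (-1) or the sequence Low, Low+1, ...
  static bool seqOrUndef(const std::vector<int> &Mask, int Pos, int Len, int Low) {
    for (int i = 0; i < Len; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
        return false;
    return true;
  }

  // Returns the PSRLDQ byte amount for a right byte shift out of V1, or -1.
  static int matchRightByteShiftFromV1(const std::vector<int> &Mask,
                                       const std::vector<bool> &Zeroable) {
    int Size = (int)Mask.size();
    int Scale = 16 / Size; // bytes per element in a 128-bit vector
    for (int Shift = 1; Shift < Size; ++Shift) {
      bool ZeroableRight = true;
      for (int i = Size - Shift; i < Size; ++i)
        ZeroableRight &= Zeroable[i];
      if (ZeroableRight && seqOrUndef(Mask, 0, Size - Shift, Shift))
        return Shift * Scale;
    }
    return -1;
  }

  // For the v8i16 mask [4, 5, 6, 7, zz, zz, zz, zz] (zz = zeroable) this
  // matches at Shift = 4, i.e. an 8-byte PSRLDQ: the high four lanes of V1
  // move down and zeros shift in from the top.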
7804
7805/// \brief Lower a vector shuffle as a zero or any extension.
7806///
7807/// Given a specific number of elements, element bit width, and extension
7808/// stride, produce either a zero or any extension based on the available
7809/// features of the subtarget.
7810static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7811 SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7812 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7813 assert(Scale > 1 && "Need a scale to extend.");
7814 int EltBits = VT.getSizeInBits() / NumElements;
7815 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7816        "Only 8, 16, and 32 bit elements can be extended.");
7817 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7818
7819 // Found a valid zext mask! Try various lowering strategies based on the
7820 // input type and available ISA extensions.
7821 if (Subtarget->hasSSE41()) {
7822 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7823 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7824 NumElements / Scale);
7825 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7826 return DAG.getNode(ISD::BITCAST, DL, VT,
7827 DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7828 }
7829
7830 // For any-extends we can cheat for larger element sizes and use shuffle
7831 // instructions that can fold with a load and/or copy.
7832 if (AnyExt && EltBits == 32) {
7833 int PSHUFDMask[4] = {0, -1, 1, -1};
7834 return DAG.getNode(
7835 ISD::BITCAST, DL, VT,
7836 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7837 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7838 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7839 }
7840 if (AnyExt && EltBits == 16 && Scale > 2) {
7841 int PSHUFDMask[4] = {0, -1, 0, -1};
7842 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7843 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7844 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7845 int PSHUFHWMask[4] = {1, -1, -1, -1};
7846 return DAG.getNode(
7847 ISD::BITCAST, DL, VT,
7848 DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7849 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7850 getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7851 }
7852
7853 // If this would require more than 2 unpack instructions to expand, use
7854 // pshufb when available. We can only use more than 2 unpack instructions
7855 // when zero extending i8 elements which also makes it easier to use pshufb.
7856 if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7857 assert(NumElements == 16 && "Unexpected byte vector width!");
7858 SDValue PSHUFBMask[16];
7859 for (int i = 0; i < 16; ++i)
7860 PSHUFBMask[i] =
7861 DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7862 InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7863 return DAG.getNode(ISD::BITCAST, DL, VT,
7864 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7865 DAG.getNode(ISD::BUILD_VECTOR, DL,
7866 MVT::v16i8, PSHUFBMask)));
7867 }
7868
7869 // Otherwise emit a sequence of unpacks.
7870 do {
7871 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7872 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7873 : getZeroVector(InputVT, Subtarget, DAG, DL);
7874 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7875 InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7876 Scale /= 2;
7877 EltBits *= 2;
7878 NumElements /= 2;
7879 } while (Scale > 1);
7880 return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7881}
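
A worked trace of the unpack fallback above (numbers only; the DAG construction is unchanged, and the helper name numUnpackSteps is an assumption for illustration): each pass through the loop halves Scale, doubles EltBits, and halves NumElements, so the fallback emits log2(Scale) UNPCKL nodes before the final bitcast.

  // e.g. extending i8 lanes to i32 lanes (Scale = 4) in a v16i8 takes two
  // unpack steps: (8-bit, 16 lanes) -> (16-bit, 8 lanes) -> (32-bit, 4 lanes).
  static int numUnpackSteps(int Scale) {
    int Steps = 0;
    while (Scale > 1) {
      Scale /= 2;
      ++Steps;
    }
    return Steps;
  }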
7882
7883/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7884///
7885/// This routine will try to do everything in its power to cleverly lower
7886/// a shuffle which happens to match the pattern of a zero extend. It doesn't
7887/// check for the profitability of this lowering; it tries to aggressively
7888/// match this pattern. It will use all of the micro-architectural details it
7889/// can to emit an efficient lowering. It handles both blends with all-zero
7890/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7891/// masking out later).
7892///
7893/// The reason we have dedicated lowering for zext-style shuffles is that they
7894/// are both incredibly common and often quite performance sensitive.
7895static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7896 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7897 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7898 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7899
7900 int Bits = VT.getSizeInBits();
7901 int NumElements = Mask.size();
7902
7903 // Define a helper function to check a particular ext-scale and lower to it if
7904 // valid.
7905 auto Lower = [&](int Scale) -> SDValue {
7906 SDValue InputV;
7907 bool AnyExt = true;
7908 for (int i = 0; i < NumElements; ++i) {
7909 if (Mask[i] == -1)
7910 continue; // Valid anywhere but doesn't tell us anything.
7911 if (i % Scale != 0) {
7912 // Each of the extended elements needs to be zeroable.
7913 if (!Zeroable[i])
7914 return SDValue();
7915
7916 // We no longer are in the anyext case.
7917 AnyExt = false;
7918 continue;
7919 }
7920
7921 // Each of the base elements needs to be consecutive indices into the
7922 // same input vector.
7923 SDValue V = Mask[i] < NumElements ? V1 : V2;
7924 if (!InputV)
7925 InputV = V;
7926 else if (InputV != V)
7927 return SDValue(); // Flip-flopping inputs.
7928
7929 if (Mask[i] % NumElements != i / Scale)
7930 return SDValue(); // Non-consecutive strided elements.
7931 }
7932
7933 // If we fail to find an input, we have a zero-shuffle which should always
7934 // have already been handled.
7935 // FIXME: Maybe handle this here in case during blending we end up with one?
7936 if (!InputV)
7937 return SDValue();
7938
7939 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7940 DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7941 };
7942
7943 // The widest scale possible for extending is to a 64-bit integer.
7944 assert(Bits % 64 == 0 &&
7945        "The number of bits in a vector must be divisible by 64 on x86!");
7946 int NumExtElements = Bits / 64;
7947
7948 // Each iteration, try extending the elements half as much, but into twice as
7949 // many elements.
7950 for (; NumExtElements < NumElements; NumExtElements *= 2) {
7951 assert(NumElements % NumExtElements == 0 &&
7952        "The input vector size must be divisible by the extended size.");
7953 if (SDValue V = Lower(NumElements / NumExtElements))
7954 return V;
7955 }
7956
7957 // No viable ext lowering found.
7958 return SDValue();
7959}
7960
7961/// \brief Try to get a scalar value for a specific element of a vector.
7962///
7963/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7964static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7965 SelectionDAG &DAG) {
7966 MVT VT = V.getSimpleValueType();
7967 MVT EltVT = VT.getVectorElementType();
7968 while (V.getOpcode() == ISD::BITCAST)
7969 V = V.getOperand(0);
7970 // If the bitcasts shift the element size, we can't extract an equivalent
7971 // element from it.
7972 MVT NewVT = V.getSimpleValueType();
7973 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7974 return SDValue();
7975
7976 if (V.getOpcode() == ISD::BUILD_VECTOR ||
7977 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
7978 return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
7979
7980 return SDValue();
7981}
7982
7983/// \brief Helper to test for a load that can be folded with x86 shuffles.
7984///
7985/// This is particularly important because the set of instructions varies
7986/// significantly based on whether the operand is a load or not.
7987static bool isShuffleFoldableLoad(SDValue V) {
7988 while (V.getOpcode() == ISD::BITCAST)
7989 V = V.getOperand(0);
7990
7991 return ISD::isNON_EXTLoad(V.getNode());
7992}
7993
7994/// \brief Try to lower insertion of a single element into a zero vector.
7995///
7996/// This is a common pattern that we have especially efficient ways to lower
7997/// across all subtarget feature sets.
7998static SDValue lowerVectorShuffleAsElementInsertion(
7999 MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8000 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8001 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8002 MVT ExtVT = VT;
8003 MVT EltVT = VT.getVectorElementType();
8004
8005 int V2Index = std::find_if(Mask.begin(), Mask.end(),
8006 [&Mask](int M) { return M >= (int)Mask.size(); }) -
8007 Mask.begin();
8008 bool IsV1Zeroable = true;
8009 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8010 if (i != V2Index && !Zeroable[i]) {
8011 IsV1Zeroable = false;
8012 break;
8013 }
8014
8015 // Check for a single input from a SCALAR_TO_VECTOR node.
8016 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8017 // all the smarts here sunk into that routine. However, the current
8018 // lowering of BUILD_VECTOR makes that nearly impossible until the old
8019 // vector shuffle lowering is dead.
8020 if (SDValue V2S = getScalarValueForVectorElement(
8021 V2, Mask[V2Index] - Mask.size(), DAG)) {
8022 // We need to zext the scalar if it is smaller than an i32.
8023 V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8024 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8025 // Using zext to expand a narrow element won't work for non-zero
8026 // insertions.
8027 if (!IsV1Zeroable)
8028 return SDValue();
8029
8030 // Zero-extend directly to i32.
8031 ExtVT = MVT::v4i32;
8032 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8033 }
8034 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8035 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8036 EltVT == MVT::i16) {
8037 // Either not inserting from the low element of the input or the input
8038 // element size is too small to use VZEXT_MOVL to clear the high bits.
8039 return SDValue();
8040 }
8041
8042 if (!IsV1Zeroable) {
8043 // If V1 can't be treated as a zero vector we have fewer options to lower
8044 // this. We can't support integer vectors or non-zero targets cheaply, and
8045 // the V1 elements can't be permuted in any way.
8046 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8047 if (!VT.isFloatingPoint() || V2Index != 0)
8048 return SDValue();
8049 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8050 V1Mask[V2Index] = -1;
8051 if (!isNoopShuffleMask(V1Mask))
8052 return SDValue();
8053 // This is essentially a special case blend operation, but if we have
8054 // general purpose blend operations, they are always faster. Bail and let
8055 // the rest of the lowering handle these as blends.
8056 if (Subtarget->hasSSE41())
8057 return SDValue();
8058
8059 // Otherwise, use MOVSD or MOVSS.
8060 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8061        "Only two types of floating point element types to handle!");
8062 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8063 ExtVT, V1, V2);
8064 }
8065
8066 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8067 if (ExtVT != VT)
8068 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8069
8070 if (V2Index != 0) {
8071 // If we have 4 or fewer lanes we can cheaply shuffle the element into
8072 // the desired position. Otherwise it is more efficient to do a vector
8073 // shift left. We know that we can do a vector shift left because all
8074 // the inputs are zero.
8075 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8076 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8077 V2Shuffle[V2Index] = 0;
8078 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8079 } else {
8080 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8081 V2 = DAG.getNode(
8082 X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8083 DAG.getConstant(
8084 V2Index * EltVT.getSizeInBits(),
8085 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8086 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8087 }
8088 }
8089 return V2;
8090}
8091
8092/// \brief Try to lower broadcast of a single element.
8093///
8094/// For convenience, this code also bundles all of the subtarget feature set
8095/// filtering. While a little annoying to re-dispatch on type here, there isn't
8096/// a convenient way to factor it out.
8097static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8098 ArrayRef<int> Mask,
8099 const X86Subtarget *Subtarget,
8100 SelectionDAG &DAG) {
8101 if (!Subtarget->hasAVX())
8102 return SDValue();
8103 if (VT.isInteger() && !Subtarget->hasAVX2())
8104 return SDValue();
8105
8106 // Check that the mask is a broadcast.
8107 int BroadcastIdx = -1;
8108 for (int M : Mask)
8109 if (M >= 0 && BroadcastIdx == -1)
8110 BroadcastIdx = M;
8111 else if (M >= 0 && M != BroadcastIdx)
8112 return SDValue();
8113
8114 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8115                                            "a sorted mask where the broadcast "
8116                                            "comes from V1.");
8117
8118 // Go up the chain of (vector) values to try and find a scalar load that
8119 // we can combine with the broadcast.
8120 for (;;) {
8121 switch (V.getOpcode()) {
8122 case ISD::CONCAT_VECTORS: {
8123 int OperandSize = Mask.size() / V.getNumOperands();
8124 V = V.getOperand(BroadcastIdx / OperandSize);
8125 BroadcastIdx %= OperandSize;
8126 continue;
8127 }
8128
8129 case ISD::INSERT_SUBVECTOR: {
8130 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8131 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8132 if (!ConstantIdx)
8133 break;
8134
8135 int BeginIdx = (int)ConstantIdx->getZExtValue();
8136 int EndIdx =
8137 BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8138 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8139 BroadcastIdx -= BeginIdx;
8140 V = VInner;
8141 } else {
8142 V = VOuter;
8143 }
8144 continue;
8145 }
8146 }
8147 break;
8148 }
8149
8150 // Check if this is a broadcast of a scalar. We special case lowering
8151 // for scalars so that we can more effectively fold with loads.
8152 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8153 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8154 V = V.getOperand(BroadcastIdx);
8155
8156 // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8157 // AVX2.
8158 if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8159 return SDValue();
8160 } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8161 // We can't broadcast from a vector register w/o AVX2, and we can only
8162 // broadcast from the zero-element of a vector register.
8163 return SDValue();
8164 }
8165
8166 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8167}
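
A short worked example of the index walk above, with names that are illustrative only: for a CONCAT_VECTORS node, the operand to descend into is BroadcastIdx / OperandSize and the index within it is BroadcastIdx % OperandSize; the INSERT_SUBVECTOR case similarly rebases the index when it falls inside the inserted subvector and otherwise keeps walking the outer vector.

  // Mirrors the CONCAT_VECTORS step above.
  struct ConcatStep { int Operand; int NewBroadcastIdx; };

  static ConcatStep stepThroughConcat(int BroadcastIdx, int MaskSize,
                                      int NumOperands) {
    int OperandSize = MaskSize / NumOperands;
    return { BroadcastIdx / OperandSize, BroadcastIdx % OperandSize };
  }

  // Broadcasting element 6 of a v8i32 built as concat_vectors(A, B) descends
  // into operand 1 (B) with the broadcast index rebased to 2.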
8168
8169// Check for whether we can use INSERTPS to perform the shuffle. We only use
8170// INSERTPS when the V1 elements are already in the correct locations
8171// because otherwise we can just always use two SHUFPS instructions which
8172// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8173// perform INSERTPS if a single V1 element is out of place and all V2
8174// elements are zeroable.
8175static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8176 ArrayRef<int> Mask,
8177 SelectionDAG &DAG) {
8178 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8179 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8180 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8181 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8182
8183 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8184
8185 unsigned ZMask = 0;
8186 int V1DstIndex = -1;
8187 int V2DstIndex = -1;
8188 bool V1UsedInPlace = false;
8189
8190 for (int i = 0; i < 4; i++) {
8191 // Synthesize a zero mask from the zeroable elements (includes undefs).
8192 if (Zeroable[i]) {
8193 ZMask |= 1 << i;
8194 continue;
8195 }
8196
8197 // Flag if we use any V1 inputs in place.
8198 if (i == Mask[i]) {
8199 V1UsedInPlace = true;
8200 continue;
8201 }
8202
8203 // We can only insert a single non-zeroable element.
8204 if (V1DstIndex != -1 || V2DstIndex != -1)
8205 return SDValue();
8206
8207 if (Mask[i] < 4) {
8208 // V1 input out of place for insertion.
8209 V1DstIndex = i;
8210 } else {
8211 // V2 input for insertion.
8212 V2DstIndex = i;
8213 }
8214 }
8215
8216 // Don't bother if we have no (non-zeroable) element for insertion.
8217 if (V1DstIndex == -1 && V2DstIndex == -1)
8218 return SDValue();
8219
8220 // Determine element insertion src/dst indices. The src index is from the
8221 // start of the inserted vector, not the start of the concatenated vector.
8222 unsigned V2SrcIndex = 0;
8223 if (V1DstIndex != -1) {
8224 // If we have a V1 input out of place, we use V1 as the V2 element insertion
8225 // and don't use the original V2 at all.
8226 V2SrcIndex = Mask[V1DstIndex];
8227 V2DstIndex = V1DstIndex;
8228 V2 = V1;
8229 } else {
8230 V2SrcIndex = Mask[V2DstIndex] - 4;
8231 }
8232
8233 // If no V1 inputs are used in place, then the result is created only from
8234 // the zero mask and the V2 insertion - so remove V1 dependency.
8235 if (!V1UsedInPlace)
8236 V1 = DAG.getUNDEF(MVT::v4f32);
8237
8238 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8239 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8240
8241 // Insert the V2 element into the desired position.
8242 SDLoc DL(Op);
8243 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8244 DAG.getConstant(InsertPSMask, MVT::i8));
8245}
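
A minimal sketch of the INSERTPS immediate layout used above; the helper name encodeInsertPSImm is an assumption for illustration. Bits [7:6] select the source element of the inserted operand, bits [5:4] the destination lane, and bits [3:0] the lanes forced to zero.

  static unsigned encodeInsertPSImm(unsigned V2SrcIndex, unsigned V2DstIndex,
                                    unsigned ZMask) {
    return (V2SrcIndex << 6) | (V2DstIndex << 4) | (ZMask & 0xF);
  }

  // For a v4f32 mask [0, 5, 2, 3] with nothing zeroable, the loop above finds
  // V1 lanes 0, 2 and 3 in place, V2DstIndex = 1, V2SrcIndex = 5 - 4 = 1 and
  // ZMask = 0, so it emits INSERTPS with encodeInsertPSImm(1, 1, 0) = 0x50.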
8246
8247/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8248///
8249/// This is the basis function for the 2-lane 64-bit shuffles as we have full
8250/// support for floating point shuffles but not integer shuffles. These
8251/// instructions will incur a domain crossing penalty on some chips though so
8252/// it is better to avoid lowering through this for integer vectors where
8253/// possible.
8254static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8255 const X86Subtarget *Subtarget,
8256 SelectionDAG &DAG) {
8257 SDLoc DL(Op);
8258 assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8259 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8260 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8261 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8262 ArrayRef<int> Mask = SVOp->getMask();
8263 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8264
8265 if (isSingleInputShuffleMask(Mask)) {
8266 // Use low duplicate instructions for masks that match their pattern.
8267 if (Subtarget->hasSSE3())
8268 if (isShuffleEquivalent(Mask, 0, 0))
8269 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8270
8271 // Straight shuffle of a single input vector. Simulate this by using the
8272 // single input as both of the "inputs" to this instruction.
8273 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8274
8275 if (Subtarget->hasAVX()) {
8276 // If we have AVX, we can use VPERMILPD which will allow folding a load
8277 // into the shuffle.
8278 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8279 DAG.getConstant(SHUFPDMask, MVT::i8));
8280 }
8281
8282 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8283 DAG.getConstant(SHUFPDMask, MVT::i8));
8284 }
8285 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8286 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8287
8288 // Use dedicated unpack instructions for masks that match their pattern.
8289 if (isShuffleEquivalent(Mask, 0, 2))
8290 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8291 if (isShuffleEquivalent(Mask, 1, 3))
8292 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8293
8294 // If we have a single input, insert that into V1 if we can do so cheaply.
8295 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8296 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8297 MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8298 return Insertion;
8299 // Try inverting the insertion since for v2 masks it is easy to do and we
8300 // can't reliably sort the mask one way or the other.
8301 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8302 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8303 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8304 MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8305 return Insertion;
8306 }
8307
8308 // Try to use one of the special instruction patterns to handle two common
8309 // blend patterns if a zero-blend above didn't work.
8310 if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8311 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8312 // We can either use a special instruction to load over the low double or
8313 // to move just the low double.
8314 return DAG.getNode(
8315 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8316 DL, MVT::v2f64, V2,
8317 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8318
8319 if (Subtarget->hasSSE41())
8320 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8321 Subtarget, DAG))
8322 return Blend;
8323
8324 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8325 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8326 DAG.getConstant(SHUFPDMask, MVT::i8));
8327}
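
A short worked example of the SHUFPD fallback above; the helper name is illustrative only. The immediate's bit 0 picks the element taken from the first operand and bit 1 the element taken from the second.

  // Mirrors the immediate computation above for a canonicalized v2f64 blend
  // mask (Mask[0] in [0,1], Mask[1] in [2,3]).
  static unsigned shufpdImmForBlend(int M0, int M1) {
    return (unsigned)(M0 == 1) | ((unsigned)((M1 - 2) == 1) << 1);
  }

  // Mask [0, 3] gives immediate 2: SHUFPD takes element 0 of V1 and element 1
  // of V2. Mask [1, 2] gives immediate 1: element 1 of V1 and element 0 of V2.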
8328
8329/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8330///
8331/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8332/// the integer unit to minimize domain crossing penalties. However, for blends
8333/// it falls back to the floating point shuffle operation with appropriate bit
8334/// casting.
8335static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8336 const X86Subtarget *Subtarget,
8337 SelectionDAG &DAG) {
8338 SDLoc DL(Op);
8339 assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8340 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8341 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8342 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8343 ArrayRef<int> Mask = SVOp->getMask();
8344 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8345
8346 if (isSingleInputShuffleMask(Mask)) {
8347 // Check for being able to broadcast a single element.
8348 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8349 Mask, Subtarget, DAG))
8350 return Broadcast;
8351
8352 // Straight shuffle of a single input vector. For everything from SSE2
8353 // onward this has a single fast instruction with no scary immediates.
8354 // We have to map the mask as it is actually a v4i32 shuffle instruction.
8355 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8356 int WidenedMask[4] = {
8357 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8358 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8359 return DAG.getNode(
8360 ISD::BITCAST, DL, MVT::v2i64,
8361 DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8362 getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8363 }
8364
8365 // Try to use byte shift instructions.
8366 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8367 DL, MVT::v2i64, V1, V2, Mask, DAG))
8368 return Shift;
8369
8370 // If we have a single input from V2 insert that into V1 if we can do so
8371 // cheaply.
8372 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8373 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8374 MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8375 return Insertion;
8376 // Try inverting the insertion since for v2 masks it is easy to do and we
8377 // can't reliably sort the mask one way or the other.
8378 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8379 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8380 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8381 MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8382 return Insertion;
8383 }
8384
8385 // Use dedicated unpack instructions for masks that match their pattern.
8386 if (isShuffleEquivalent(Mask, 0, 2))
8387 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8388 if (isShuffleEquivalent(Mask, 1, 3))
8389 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8390
8391 if (Subtarget->hasSSE41())
8392 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8393 Subtarget, DAG))
8394 return Blend;
8395
8396 // Try to use byte rotation instructions.
8397 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8398 if (Subtarget->hasSSSE3())
8399 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8400 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8401 return Rotate;
8402
8403 // We implement this with SHUFPD which is pretty lame because it will likely
8404 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8405 // However, all the alternatives are still more cycles and newer chips don't
8406 // have this problem. It would be really nice if x86 had better shuffles here.
8407 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8408 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8409 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8410 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8411}
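
A short worked example of the mask widening in the single-input v2i64 path above: each 64-bit lane index m becomes the pair of 32-bit indices 2m and 2m+1 (with undef clamped to lane 0), so the shuffle can be issued as a v4i32 PSHUFD. The helper name is an assumption for illustration.

  #include <algorithm>

  // Mirrors the WidenedMask computation above.
  static void widenV2I64Mask(const int Mask[2], int Widened[4]) {
    Widened[0] = std::max(Mask[0], 0) * 2;
    Widened[1] = std::max(Mask[0], 0) * 2 + 1;
    Widened[2] = std::max(Mask[1], 0) * 2;
    Widened[3] = std::max(Mask[1], 0) * 2 + 1;
  }

  // A v2i64 mask [1, 0] widens to the v4i32 mask [2, 3, 0, 1], and an
  // undef-heavy mask [-1, 1] widens to [0, 1, 2, 3].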
8412
8413/// \brief Lower a vector shuffle using the SHUFPS instruction.
8414///
8415/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8416/// It makes no assumptions about whether this is the *best* lowering; it simply
8417/// uses it.
8418static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8419 ArrayRef<int> Mask, SDValue V1,
8420 SDValue V2, SelectionDAG &DAG) {
8421 SDValue LowV = V1, HighV = V2;
8422 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8423
8424 int NumV2Elements =
8425 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8426
8427 if (NumV2Elements == 1) {
8428 int V2Index =
8429 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8430 Mask.begin();
8431
8432 // Compute the index adjacent to V2Index and in the same half by toggling
8433 // the low bit.
8434 int V2AdjIndex = V2Index ^ 1;
8435
8436 if (Mask[V2AdjIndex] == -1) {
8437 // Handles all the cases where we have a single V2 element and an undef.
8438 // This will only ever happen in the high lanes because we commute the
8439 // vector otherwise.
8440 if (V2Index < 2)
8441 std::swap(LowV, HighV);
8442 NewMask[V2Index] -= 4;
8443 } else {
8444 // Handle the case where the V2 element ends up adjacent to a V1 element.
8445 // To make this work, blend them together as the first step.
8446 int V1Index = V2AdjIndex;
8447 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8448 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8449 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8450
8451 // Now proceed to reconstruct the final blend as we have the necessary
8452 // high or low half formed.
8453 if (V2Index < 2) {
8454 LowV = V2;
8455 HighV = V1;
8456 } else {
8457 HighV = V2;
8458 }
8459 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8460 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8461 }
8462 } else if (NumV2Elements == 2) {
8463 if (Mask[0] < 4 && Mask[1] < 4) {
8464 // Handle the easy case where we have V1 in the low lanes and V2 in the
8465 // high lanes.
8466 NewMask[2] -= 4;
8467 NewMask[3] -= 4;
8468 } else if (Mask[2] < 4 && Mask[3] < 4) {
8469 // We also handle the reversed case because this utility may get called
8470 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8471 // arrange things in the right direction.
8472 NewMask[0] -= 4;
8473 NewMask[1] -= 4;
8474 HighV = V1;
8475 LowV = V2;
8476 } else {
8477 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8478 // trying to place elements directly, just blend them and set up the final
8479 // shuffle to place them.
8480
8481 // The first two blend mask elements are for V1, the second two are for
8482 // V2.
8483 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8484 Mask[2] < 4 ? Mask[2] : Mask[3],
8485 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8486 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8487 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8488 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8489
8490 // Now we do a normal shuffle of V1 by giving V1 as both operands to
8491 // a blend.
8492 LowV = HighV = V1;
8493 NewMask[0] = Mask[0] < 4 ? 0 : 2;
8494 NewMask[1] = Mask[0] < 4 ? 2 : 0;
8495 NewMask[2] = Mask[2] < 4 ? 1 : 3;
8496 NewMask[3] = Mask[2] < 4 ? 3 : 1;
8497 }
8498 }
8499 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8500 getV4X86ShuffleImm8ForMask(NewMask, DAG));
8501}
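
As background for why the helper above only ever has to fix up a four-element NewMask: SHUFPS writes its low two result lanes from the first operand and its high two lanes from the second, each selected by a 2-bit immediate field, so the code only needs to decide which input becomes LowV/HighV and which in-operand index each field holds. A minimal sketch of that selection semantics, illustrative only and using plain floats instead of DAG nodes:

  // Result lanes 0-1 come from A, lanes 2-3 from B; each 2-bit field of Imm
  // selects an element within the corresponding operand.
  static void shufps(const float A[4], const float B[4], unsigned Imm,
                     float Out[4]) {
    Out[0] = A[(Imm >> 0) & 3];
    Out[1] = A[(Imm >> 2) & 3];
    Out[2] = B[(Imm >> 4) & 3];
    Out[3] = B[(Imm >> 6) & 3];
  }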
8502
8503/// \brief Lower 4-lane 32-bit floating point shuffles.
8504///
8505/// Uses instructions exclusively from the floating point unit to minimize
8506/// domain crossing penalties, as these are sufficient to implement all v4f32
8507/// shuffles.
8508static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8509 const X86Subtarget *Subtarget,
8510 SelectionDAG &DAG) {
8511 SDLoc DL(Op);
8512 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8513 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8514 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8515 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8516 ArrayRef<int> Mask = SVOp->getMask();
8517 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8518
8519 int NumV2Elements =
8520 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8521
8522 if (NumV2Elements == 0) {
8523 // Check for being able to broadcast a single element.
8524 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8525 Mask, Subtarget, DAG))
8526 return Broadcast;
8527
8528 // Use even/odd duplicate instructions for masks that match their pattern.
8529 if (Subtarget->hasSSE3()) {
8530 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8531 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8532 if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8533 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8534 }
8535
8536 if (Subtarget->hasAVX()) {
8537 // If we have AVX, we can use VPERMILPS which will allow folding a load
8538 // into the shuffle.
8539 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8540 getV4X86ShuffleImm8ForMask(Mask, DAG));
8541 }
8542
8543 // Otherwise, use a straight shuffle of a single input vector. We pass the
8544 // input vector to both operands to simulate this with a SHUFPS.
8545 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8546 getV4X86ShuffleImm8ForMask(Mask, DAG));
8547 }
8548
8549 // Use dedicated unpack instructions for masks that match their pattern.
8550 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8551 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8552 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8553 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8554
8555 // There are special ways we can lower some single-element blends. However, we
8556 // have custom ways we can lower more complex single-element blends below that
8557 // we defer to if both this and BLENDPS fail to match, so restrict this to
8558 // when the V2 input is targeting element 0 of the mask -- that is the fast
8559 // case here.
8560 if (NumV2Elements == 1 && Mask[0] >= 4)
8561 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8562 Mask, Subtarget, DAG))
8563 return V;
8564
8565 if (Subtarget->hasSSE41()) {
8566 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8567 Subtarget, DAG))
8568 return Blend;
8569
8570 // Use INSERTPS if we can complete the shuffle efficiently.
8571 if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8572 return V;
8573 }
8574
8575 // Otherwise fall back to a SHUFPS lowering strategy.
8576 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8577}
8578
8579/// \brief Lower 4-lane i32 vector shuffles.
8580///
8581/// We try to handle these with integer-domain shuffles where we can, but for
8582/// blends we use the floating point domain blend instructions.
8583static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8584 const X86Subtarget *Subtarget,
8585 SelectionDAG &DAG) {
8586 SDLoc DL(Op);
8587 assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8588 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8589 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8590 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8591 ArrayRef<int> Mask = SVOp->getMask();
8592 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8593
8594 // Whenever we can lower this as a zext, that instruction is strictly faster
8595 // than any alternative. It also allows us to fold memory operands into the
8596 // shuffle in many cases.
8597 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8598 Mask, Subtarget, DAG))
8599 return ZExt;
8600
8601 int NumV2Elements =
8602 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8603
8604 if (NumV2Elements == 0) {
8605 // Check for being able to broadcast a single element.
8606 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8607 Mask, Subtarget, DAG))
8608 return Broadcast;
8609
8610 // Straight shuffle of a single input vector. For everything from SSE2
8611 // onward this has a single fast instruction with no scary immediates.
8612 // We coerce the shuffle pattern to be compatible with UNPCK instructions
8613 // but we aren't actually going to use the UNPCK instruction because doing
8614 // so prevents folding a load into this instruction or making a copy.
8615 const int UnpackLoMask[] = {0, 0, 1, 1};
8616 const int UnpackHiMask[] = {2, 2, 3, 3};
8617 if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8618 Mask = UnpackLoMask;
8619 else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8620 Mask = UnpackHiMask;
8621
8622 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8623 getV4X86ShuffleImm8ForMask(Mask, DAG));
8624 }
8625
8626 // Try to use byte shift instructions.
8627 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8628 DL, MVT::v4i32, V1, V2, Mask, DAG))
8629 return Shift;
8630
8631 // There are special ways we can lower some single-element blends.
8632 if (NumV2Elements == 1)
8633 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8634 Mask, Subtarget, DAG))
8635 return V;
8636
8637 // Use dedicated unpack instructions for masks that match their pattern.
8638 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8639 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8640 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8641 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8642
8643 if (Subtarget->hasSSE41())
8644 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8645 Subtarget, DAG))
8646 return Blend;
8647
8648 // Try to use byte rotation instructions.
8649 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8650 if (Subtarget->hasSSSE3())
8651 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8652 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8653 return Rotate;
8654
8655 // We implement this with SHUFPS because it can blend from two vectors.
8656 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8657 // up the inputs, bypassing domain shift penalties that we would incur if we
8658 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8659 // relevant.
8660 return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8661 DAG.getVectorShuffle(
8662 MVT::v4f32, DL,
8663 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8664 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8665}
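
Both PSHUFD emissions above feed the four-lane mask through getV4X86ShuffleImm8ForMask, which is declared earlier in the file but not shown in this excerpt. As a minimal sketch of the conventional PSHUFD/SHUFPS immediate encoding (two bits per destination lane, lane 0 in the low bits), a hypothetical packing helper could look like this; it is an illustration under that assumption, not the routine used above:

#include <cassert>
#include <cstdint>

static uint8_t packV4ShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // an undef lane may pick any source
    assert(M < 4 && "expected a single-input four-lane mask");
    Imm |= static_cast<uint8_t>(M) << (2 * i); // two bits per destination lane
  }
  return Imm; // e.g. the UnpackLoMask {0, 0, 1, 1} above packs to 0x50
}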
8666
8667/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8668/// shuffle lowering, and the most complex part.
8669///
8670/// The lowering strategy is to try to form pairs of input lanes which are
8671/// targeted at the same half of the final vector, and then use a dword shuffle
8672/// to place them onto the right half, and finally unpack the paired lanes into
8673/// their final position.
8674///
8675/// The exact breakdown of how to form these dword pairs and align them on the
8676/// correct sides is really tricky. See the comments within the function for
8677/// more of the details.
8678static SDValue lowerV8I16SingleInputVectorShuffle(
8679 SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8680 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8681 assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8682 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8683 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8684
8685 SmallVector<int, 4> LoInputs;
8686 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8687 [](int M) { return M >= 0; });
8688 std::sort(LoInputs.begin(), LoInputs.end());
8689 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8690 SmallVector<int, 4> HiInputs;
8691 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8692 [](int M) { return M >= 0; });
8693 std::sort(HiInputs.begin(), HiInputs.end());
8694 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8695 int NumLToL =
8696 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8697 int NumHToL = LoInputs.size() - NumLToL;
8698 int NumLToH =
8699 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8700 int NumHToH = HiInputs.size() - NumLToH;
8701 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8702 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8703 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8704 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8705
8706 // Check for being able to broadcast a single element.
8707 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8708 Mask, Subtarget, DAG))
8709 return Broadcast;
8710
8711 // Try to use byte shift instructions.
8712 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8713 DL, MVT::v8i16, V, V, Mask, DAG))
8714 return Shift;
8715
8716 // Use dedicated unpack instructions for masks that match their pattern.
8717 if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8718 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8719 if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8720 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8721
8722 // Try to use byte rotation instructions.
8723 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8724 DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8725 return Rotate;
8726
8727 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8728 // such inputs we can swap two of the dwords across the half mark and end up
8729 // with <=2 inputs to each half in each half. Once there, we can fall through
8730 // to the generic code below. For example:
8731 //
8732 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8733 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8734 //
8735 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8736 // and an existing 2-into-2 on the other half. In this case we may have to
8737 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8738 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8739 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8740 // because any other situation (including a 3-into-1 or 1-into-3 in the other
8741 // half than the one we target for fixing) will be fixed when we re-enter this
8742 // path. We will also combine away any resulting sequence of PSHUFD
8743 // instructions into a single instruction. Here is an example of the tricky case:
8744 //
8745 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8746 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8747 //
8748 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8749 //
8750 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8751 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8752 //
8753 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8754 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8755 //
8756 // The result is fine to be handled by the generic logic.
8757 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8758 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8759 int AOffset, int BOffset) {
8760 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8761 "Must call this with A having 3 or 1 inputs from the A half.");
8762 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8763 "Must call this with B having 1 or 3 inputs from the B half.");
8764 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8765 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8766
8767 // Compute the index of the dword with only one word among the three inputs in
8768 // a half by taking the sum of the half with three inputs and subtracting
8769 // the sum of the actual three inputs. The difference is the remaining
8770 // slot.
8771 int ADWord, BDWord;
8772 int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8773 int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8774 int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8775 ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8776 int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8777 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8778 int TripleNonInputIdx =
8779 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8780 TripleDWord = TripleNonInputIdx / 2;
8781
8782 // We use xor with one to compute the adjacent DWord to whichever one the
8783 // OneInput is in.
8784 OneInputDWord = (OneInput / 2) ^ 1;
8785
8786 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8787 // and BToA inputs. If there is also such a problem with the BToB and AToB
8788 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8789 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8790 // is essential that we don't *create* a 3<-1 as then we might oscillate.
8791 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8792 // Compute how many inputs will be flipped by swapping these DWords. We
8793 // need
8794 // to balance this to ensure we don't form a 3-1 shuffle in the other
8795 // half.
8796 int NumFlippedAToBInputs =
8797 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8798 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8799 int NumFlippedBToBInputs =
8800 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8801 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8802 if ((NumFlippedAToBInputs == 1 &&
8803 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8804 (NumFlippedBToBInputs == 1 &&
8805 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8806 // We choose whether to fix the A half or B half based on whether that
8807 // half has zero flipped inputs. At zero, we may not be able to fix it
8808 // with that half. We also bias towards fixing the B half because that
8809 // will more commonly be the high half, and we have to bias one way.
8810 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8811 ArrayRef<int> Inputs) {
8812 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8813 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8814 PinnedIdx ^ 1) != Inputs.end();
8815 // Determine whether the free index is in the flipped dword or the
8816 // unflipped dword based on where the pinned index is. We use this bit
8817 // in an xor to conditionally select the adjacent dword.
8818 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8819 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8820 FixFreeIdx) != Inputs.end();
8821 if (IsFixIdxInput == IsFixFreeIdxInput)
8822 FixFreeIdx += 1;
8823 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8824 FixFreeIdx) != Inputs.end();
8825 assert(IsFixIdxInput != IsFixFreeIdxInput &&
8826 "We need to be changing the number of flipped inputs!");
8827 int PSHUFHalfMask[] = {0, 1, 2, 3};
8828 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8829 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8830 MVT::v8i16, V,
8831 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8832
8833 for (int &M : Mask)
8834 if (M != -1 && M == FixIdx)
8835 M = FixFreeIdx;
8836 else if (M != -1 && M == FixFreeIdx)
8837 M = FixIdx;
8838 };
8839 if (NumFlippedBToBInputs != 0) {
8840 int BPinnedIdx =
8841 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8842 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8843 } else {
8844 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8845 int APinnedIdx =
8846 AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8847 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8848 }
8849 }
8850 }
8851
8852 int PSHUFDMask[] = {0, 1, 2, 3};
8853 PSHUFDMask[ADWord] = BDWord;
8854 PSHUFDMask[BDWord] = ADWord;
8855 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8856 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8857 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8858 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8859
8860 // Adjust the mask to match the new locations of A and B.
8861 for (int &M : Mask)
8862 if (M != -1 && M/2 == ADWord)
8863 M = 2 * BDWord + M % 2;
8864 else if (M != -1 && M/2 == BDWord)
8865 M = 2 * ADWord + M % 2;
8866
8867 // Recurse back into this routine to re-compute state now that this isn't
8868 // a 3 and 1 problem.
8869 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8870 Mask);
8871 };
8872 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8873 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8874 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8875 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8876
8877 // At this point there are at most two inputs to the low and high halves from
8878 // each half. That means the inputs can always be grouped into dwords and
8879 // those dwords can then be moved to the correct half with a dword shuffle.
8880 // We use at most one low and one high word shuffle to collect these paired
8881 // inputs into dwords, and finally a dword shuffle to place them.
8882 int PSHUFLMask[4] = {-1, -1, -1, -1};
8883 int PSHUFHMask[4] = {-1, -1, -1, -1};
8884 int PSHUFDMask[4] = {-1, -1, -1, -1};
8885
8886 // First fix the masks for all the inputs that are staying in their
8887 // original halves. This will then dictate the targets of the cross-half
8888 // shuffles.
8889 auto fixInPlaceInputs =
8890 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8891 MutableArrayRef<int> SourceHalfMask,
8892 MutableArrayRef<int> HalfMask, int HalfOffset) {
8893 if (InPlaceInputs.empty())
8894 return;
8895 if (InPlaceInputs.size() == 1) {
8896 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8897 InPlaceInputs[0] - HalfOffset;
8898 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8899 return;
8900 }
8901 if (IncomingInputs.empty()) {
8902 // Just fix all of the in place inputs.
8903 for (int Input : InPlaceInputs) {
8904 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8905 PSHUFDMask[Input / 2] = Input / 2;
8906 }
8907 return;
8908 }
8909
8910 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8911 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8912 InPlaceInputs[0] - HalfOffset;
8913 // Put the second input next to the first so that they are packed into
8914 // a dword. We find the adjacent index by toggling the low bit.
8915 int AdjIndex = InPlaceInputs[0] ^ 1;
8916 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8917 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8918 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8919 };
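// Worked example for the fix-up above (hypothetical numbers, not from the
// original file): with HalfOffset == 0, InPlaceInputs == {1, 2} and a
// non-empty IncomingInputs, the code pins SourceHalfMask[1] = 1, picks
// AdjIndex = 1 ^ 1 = 0, packs the second input next to the first via
// SourceHalfMask[0] = 2, rewrites every use of 2 in HalfMask to 0, and
// marks PSHUFDMask[0] = 0 so the packed dword stays in place.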
8920 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8921 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8922
8923 // Now gather the cross-half inputs and place them into a free dword of
8924 // their target half.
8925 // FIXME: This operation could almost certainly be simplified dramatically to
8926 // look more like the 3-1 fixing operation.
8927 auto moveInputsToRightHalf = [&PSHUFDMask](
8928 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8929 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8930 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8931 int DestOffset) {
8932 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8933 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8934 };
8935 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8936 int Word) {
8937 int LowWord = Word & ~1;
8938 int HighWord = Word | 1;
8939 return isWordClobbered(SourceHalfMask, LowWord) ||
8940 isWordClobbered(SourceHalfMask, HighWord);
8941 };
8942
8943 if (IncomingInputs.empty())
8944 return;
8945
8946 if (ExistingInputs.empty()) {
8947 // Map any dwords with inputs from them into the right half.
8948 for (int Input : IncomingInputs) {
8949 // If the source half mask maps over the inputs, turn those into
8950 // swaps and use the swapped lane.
8951 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8952 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8953 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8954 Input - SourceOffset;
8955 // We have to swap the uses in our half mask in one sweep.
8956 for (int &M : HalfMask)
8957 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8958 M = Input;
8959 else if (M == Input)
8960 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8961 } else {
8962 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8963 Input - SourceOffset &&
8964 "Previous placement doesn't match!");
8965 }
8966 // Note that this correctly re-maps both when we do a swap and when
8967 // we observe the other side of the swap above. We rely on that to
8968 // avoid swapping the members of the input list directly.
8969 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8970 }
8971
8972 // Map the input's dword into the correct half.
8973 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8974 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8975 else
8976 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8977 Input / 2 &&
8978 "Previous placement doesn't match!");
8979 }
8980
8981 // And just directly shift any other-half mask elements to be same-half
8982 // as we will have mirrored the dword containing the element into the
8983 // same position within that half.
8984 for (int &M : HalfMask)
8985 if (M >= SourceOffset && M < SourceOffset + 4) {
8986 M = M - SourceOffset + DestOffset;
8987 assert(M >= 0 && "This should never wrap below zero!");
8988 }
8989 return;
8990 }
8991
8992 // Ensure we have the input in a viable dword of its current half. This
8993 // is particularly tricky because the original position may be clobbered
8994 // by inputs being moved and *staying* in that half.
8995 if (IncomingInputs.size() == 1) {
8996 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8997 int InputFixed = std::find(std::begin(SourceHalfMask),
8998 std::end(SourceHalfMask), -1) -
8999 std::begin(SourceHalfMask) + SourceOffset;
9000 SourceHalfMask[InputFixed - SourceOffset] =
9001 IncomingInputs[0] - SourceOffset;
9002 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9003 InputFixed);
9004 IncomingInputs[0] = InputFixed;
9005 }
9006 } else if (IncomingInputs.size() == 2) {
9007 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9008 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9009 // We have two non-adjacent or clobbered inputs we need to extract from
9010 // the source half. To do this, we need to map them into some adjacent
9011 // dword slot in the source mask.
9012 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9013 IncomingInputs[1] - SourceOffset};
9014
9015 // If there is a free slot in the source half mask adjacent to one of
9016 // the inputs, place the other input in it. We use (Index XOR 1) to
9017 // compute an adjacent index.
9018 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9019 SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9020 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9021 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9022 InputsFixed[1] = InputsFixed[0] ^ 1;
9023 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9024 SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9025 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9026 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9027 InputsFixed[0] = InputsFixed[1] ^ 1;
9028 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9029 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9030 // The two inputs are in the same DWord but it is clobbered and the
9031 // adjacent DWord isn't used at all. Move both inputs to the free
9032 // slot.
9033 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9034 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9035 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9036 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9037 } else {
9038 // The only way we hit this point is if there is no clobbering
9039 // (because there are no off-half inputs to this half) and there is no
9040 // free slot adjacent to one of the inputs. In this case, we have to
9041 // swap an input with a non-input.
9042 for (int i = 0; i < 4; ++i)
9043 assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9044 "We can't handle any clobbers here!");
9045 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9046 "Cannot have adjacent inputs here!");
9047
9048 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9049 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9050
9051 // We also have to update the final source mask in this case because
9052 // it may need to undo the above swap.
9053 for (int &M : FinalSourceHalfMask)
9054 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9055 M = InputsFixed[1] + SourceOffset;
9056 else if (M == InputsFixed[1] + SourceOffset)
9057 M = (InputsFixed[0] ^ 1) + SourceOffset;
9058
9059 InputsFixed[1] = InputsFixed[0] ^ 1;
9060 }
9061
9062 // Point everything at the fixed inputs.
9063 for (int &M : HalfMask)
9064 if (M == IncomingInputs[0])
9065 M = InputsFixed[0] + SourceOffset;
9066 else if (M == IncomingInputs[1])
9067 M = InputsFixed[1] + SourceOffset;
9068
9069 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9070 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9071 }
9072 } else {
9073 llvm_unreachable("Unhandled input size!");
9074 }
9075
9076 // Now hoist the DWord down to the right half.
9077 int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9078 assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9079 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9080 for (int &M : HalfMask)
9081 for (int Input : IncomingInputs)
9082 if (M == Input)
9083 M = FreeDWord * 2 + Input % 2;
9084 };
9085 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9086 /*SourceOffset*/ 4, /*DestOffset*/ 0);
9087 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9088 /*SourceOffset*/ 0, /*DestOffset*/ 4);
9089
9090 // Now enact all the shuffles we've computed to move the inputs into their
9091 // target half.
9092 if (!isNoopShuffleMask(PSHUFLMask))
9093 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9094 getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9095 if (!isNoopShuffleMask(PSHUFHMask))
9096 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9097 getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9098 if (!isNoopShuffleMask(PSHUFDMask))
9099 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9100 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9101 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9102 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9103
9104 // At this point, each half should contain all its inputs, and we can then
9105 // just shuffle them into their final position.
9106 assert(std::count_if(LoMask.begin(), LoMask.end(),
9107 [](int M) { return M >= 4; }) == 0 &&
9108 "Failed to lift all the high half inputs to the low mask!");
9109 assert(std::count_if(HiMask.begin(), HiMask.end(),
9110 [](int M) { return M >= 0 && M < 4; }) == 0 &&
9111 "Failed to lift all the low half inputs to the high mask!");
9112
9113 // Do a half shuffle for the low mask.
9114 if (!isNoopShuffleMask(LoMask))
9115 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9116 getV4X86ShuffleImm8ForMask(LoMask, DAG));
9117
9118 // Do a half shuffle with the high mask after shifting its values down.
9119 for (int &M : HiMask)
9120 if (M >= 0)
9121 M -= 4;
9122 if (!isNoopShuffleMask(HiMask))
9123 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9124 getV4X86ShuffleImm8ForMask(HiMask, DAG));
9125
9126 return V;
9127}
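
As a rough worked example of the generic path above (hand-traced numbers, so treat them as illustrative): the single-input v8i16 mask {0, 1, 4, 5, 2, 3, 6, 7} has two cross-half inputs in each direction, so no 3:1 balancing is needed, and the bookkeeping plays out as follows:

// LToL = {0, 1}, HToL = {4, 5}, LToH = {2, 3}, HToH = {6, 7}
// fixInPlaceInputs leaves PSHUFLMask = {0, 1, -1, -1} and PSHUFHMask = {-1, -1, 2, 3}
// moveInputsToRightHalf fills PSHUFDMask = {0, 2, 1, 3}
// PSHUFLW/PSHUFHW end up as no-ops, so the whole shuffle becomes a single
// PSHUFD that swaps the two middle dwords.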
9128
9129/// \brief Detect whether the mask pattern should be lowered through
9130/// interleaving.
9131///
9132/// This essentially tests whether viewing the mask as an interleaving of two
9133/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9134/// lowering it through interleaving is a significantly better strategy.
9135static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9136 int NumEvenInputs[2] = {0, 0};
9137 int NumOddInputs[2] = {0, 0};
9138 int NumLoInputs[2] = {0, 0};
9139 int NumHiInputs[2] = {0, 0};
9140 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9141 if (Mask[i] < 0)
9142 continue;
9143
9144 int InputIdx = Mask[i] >= Size;
9145
9146 if (i < Size / 2)
9147 ++NumLoInputs[InputIdx];
9148 else
9149 ++NumHiInputs[InputIdx];
9150
9151 if ((i % 2) == 0)
9152 ++NumEvenInputs[InputIdx];
9153 else
9154 ++NumOddInputs[InputIdx];
9155 }
9156
9157 // The minimum number of cross-input results for both the interleaved and
9158 // split cases. If interleaving results in fewer cross-input results, return
9159 // true.
9160 int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9161 NumEvenInputs[0] + NumOddInputs[1]);
9162 int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9163 NumLoInputs[0] + NumHiInputs[1]);
9164 return InterleavedCrosses < SplitCrosses;
9165}
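
For a concrete instance of the heuristic (illustrative only): the v8i16 mask {0, 9, 1, 8, 2, 11, 3, 10} draws every even result lane from V1 and every odd lane from V2, so NumEvenInputs = {4, 0} and NumOddInputs = {0, 4} give InterleavedCrosses = 0, while NumLoInputs = {2, 2} and NumHiInputs = {2, 2} give SplitCrosses = 4. Since 0 < 4, the routine reports that interleaving is the cheaper strategy for this mask.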
9166
9167/// \brief Blend two v8i16 vectors using a naive unpack strategy.
9168///
9169/// This strategy only works when the inputs from each vector fit into a single
9170/// half of that vector, and generally there are not so many inputs as to leave
9171/// the in-place shuffles required highly constrained (and thus expensive). It
9172/// shifts all the inputs into a single side of both input vectors and then
9173/// uses an unpack to interleave these inputs in a single vector. At that
9174/// point, we will fall back on the generic single input shuffle lowering.
9175static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9176 SDValue V2,
9177 MutableArrayRef<int> Mask,
9178 const X86Subtarget *Subtarget,
9179 SelectionDAG &DAG) {
9180 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9181 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9182 SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9183 for (int i = 0; i < 8; ++i)
9184 if (Mask[i] >= 0 && Mask[i] < 4)
9185 LoV1Inputs.push_back(i);
9186 else if (Mask[i] >= 4 && Mask[i] < 8)
9187 HiV1Inputs.push_back(i);
9188 else if (Mask[i] >= 8 && Mask[i] < 12)
9189 LoV2Inputs.push_back(i);
9190 else if (Mask[i] >= 12)
9191 HiV2Inputs.push_back(i);
9192
9193 int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9194 int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9195 (void)NumV1Inputs;
9196 (void)NumV2Inputs;
9197 assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9198 assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9199 assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9200
9201 bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9202 HiV1Inputs.size() + HiV2Inputs.size();
9203
9204 auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9205 ArrayRef<int> HiInputs, bool MoveToLo,
9206 int MaskOffset) {
9207 ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9208 ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9209 if (BadInputs.empty())
9210 return V;
9211
9212 int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9213 int MoveOffset = MoveToLo ? 0 : 4;
9214
9215 if (GoodInputs.empty()) {
9216 for (int BadInput : BadInputs) {
9217 MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9218 Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9219 }
9220 } else {
9221 if (GoodInputs.size() == 2) {
9222 // If the low inputs are spread across two dwords, pack them into
9223 // a single dword.
9224 MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9225 MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9226 Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9227 Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9228 } else {
9229 // Otherwise pin the good inputs.
9230 for (int GoodInput : GoodInputs)
9231 MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9232 }
9233
9234 if (BadInputs.size() == 2) {
9235 // If we have two bad inputs then there may be either one or two good
9236 // inputs fixed in place. Find a fixed input, and then find the *other*
9237 // two adjacent indices by using modular arithmetic.
9238 int GoodMaskIdx =
9239 std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9240 [](int M) { return M >= 0; }) -
9241 std::begin(MoveMask);
9242 int MoveMaskIdx =
9243 ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9244 assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9245 assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9246 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9247 MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9248 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9249 Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9250 } else {
9251 assert(BadInputs.size() == 1 && "All sizes handled");
9252 int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9253 std::end(MoveMask), -1) -
9254 std::begin(MoveMask);
9255 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9256 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9257 }
9258 }
9259
9260 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9261 MoveMask);
9262 };
9263 V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9264 /*MaskOffset*/ 0);
9265 V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9266 /*MaskOffset*/ 8);
9267
9268 // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9269 // cross-half traffic in the final shuffle.
9270
9271 // Munge the mask to be a single-input mask after the unpack merges the
9272 // results.
9273 for (int &M : Mask)
9274 if (M != -1)
9275 M = 2 * (M % 4) + (M / 8);
9276
9277 return DAG.getVectorShuffle(
9278 MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9279 DL, MVT::v8i16, V1, V2),
9280 DAG.getUNDEF(MVT::v8i16), Mask);
9281}
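
The final mask munge above follows from how the unpack interleaves words once each input's relevant words sit in the merge half: UNPCKL yields {V1[0], V2[0], V1[1], V2[1], ...} (UNPCKH does the same with the high words), so a remaining mask entry M lands in lane 2 * (M % 4) + (M / 8). As purely illustrative numbers, M == 9 (word 1 of V2) maps to lane 3 and M == 2 (word 2 of V1) maps to lane 4.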
9282
9283/// \brief Generic lowering of 8-lane i16 shuffles.
9284///
9285/// This handles both single-input shuffles and combined shuffle/blends with
9286/// two inputs. The single input shuffles are immediately delegated to
9287/// a dedicated lowering routine.
9288///
9289/// The blends are lowered in one of three fundamental ways. If there are few
9290/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9291/// of the input is significantly cheaper when lowered as an interleaving of
9292/// the two inputs, try to interleave them. Otherwise, blend the low and high
9293/// halves of the inputs separately (making them have relatively few inputs)
9294/// and then concatenate them.
9295static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9296 const X86Subtarget *Subtarget,
9297 SelectionDAG &DAG) {
9298 SDLoc DL(Op);
9299 assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9300 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9301 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9302 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9303 ArrayRef<int> OrigMask = SVOp->getMask();
9304 int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9305 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9306 MutableArrayRef<int> Mask(MaskStorage);
9307
9308 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9309
9310 // Whenever we can lower this as a zext, that instruction is strictly faster
9311 // than any alternative.
9312 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9313 DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9314 return ZExt;
9315
9316 auto isV1 = [](int M) { return M >= 0 && M < 8; };
9317 auto isV2 = [](int M) { return M >= 8; };
9318
9319 int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9320 int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9321
9322 if (NumV2Inputs == 0)
9323 return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9324
9325 assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9326 "to be V1-input shuffles.");
9327
9328 // Try to use byte shift instructions.
9329 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9330 DL, MVT::v8i16, V1, V2, Mask, DAG))
9331 return Shift;
9332
9333 // There are special ways we can lower some single-element blends.
9334 if (NumV2Inputs == 1)
9335 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9336 Mask, Subtarget, DAG))
9337 return V;
9338
9339 // Use dedicated unpack instructions for masks that match their pattern.
9340 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9341 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9342 if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9343 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9344
9345 if (Subtarget->hasSSE41())
9346 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9347 Subtarget, DAG))
9348 return Blend;
9349
9350 // Try to use byte rotation instructions.
9351 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9352 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9353 return Rotate;
9354
9355 if (NumV1Inputs + NumV2Inputs <= 4)
9356 return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9357
9358 // Check whether an interleaving lowering is likely to be more efficient.
9359 // This isn't perfect but it is a strong heuristic that tends to work well on
9360 // the kinds of shuffles that show up in practice.
9361 //
9362 // FIXME: Handle 1x, 2x, and 4x interleaving.
9363 if (shouldLowerAsInterleaving(Mask)) {
9364 // FIXME: Figure out whether we should pack these into the low or high
9365 // halves.
9366
9367 int EMask[8], OMask[8];
9368 for (int i = 0; i < 4; ++i) {
9369 EMask[i] = Mask[2*i];
9370 OMask[i] = Mask[2*i + 1];
9371 EMask[i + 4] = -1;
9372 OMask[i + 4] = -1;
9373 }
9374
9375 SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9376 SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9377
9378 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9379 }
9380
9381 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9382 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9383
9384 for (int i = 0; i < 4; ++i) {
9385 LoBlendMask[i] = Mask[i];
9386 HiBlendMask[i] = Mask[i + 4];
9387 }
9388
9389 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9390 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9391 LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9392 HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9393
9394 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9395 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9396}
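
To illustrate the final fallback (hand-traced, so indicative only): the two-input mask {0, 1, 8, 9, 4, 5, 12, 13} has four inputs from each operand and balanced even/odd and low/high counts, so none of the earlier paths fire for it. It splits into LoBlendMask = {0, 1, 8, 9, -1, -1, -1, -1} and HiBlendMask = {4, 5, 12, 13, -1, -1, -1, -1}, each a shuffle with at most four inputs whose results land in the low 64 bits of its vector; bitcasting to v2i64 and UNPCKL-ing the two low quadwords then concatenates them into the requested vector.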
9397
9398/// \brief Check whether a compaction lowering can be done by dropping even
9399/// elements and compute how many times even elements must be dropped.
9400///
9401/// This handles shuffles which take every Nth element where N is a power of
9402/// two. Example shuffle masks:
9403///
9404/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
9405/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9406/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
9407/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
9408/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
9409/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
9410///
9411/// Any of these lanes can of course be undef.
9412///
9413/// This routine only supports N <= 3.
9414/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9415/// for larger N.
9416///
9417/// \returns N above, or the number of times even elements must be dropped if
9418/// there is such a number. Otherwise returns zero.
9419static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9420 // Figure out whether we're looping over two inputs or just one.
9421 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9422
9423 // The modulus for the shuffle vector entries is based on whether this is
9424 // a single input or not.
9425 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9426 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9427 "We should only be called with masks with a power-of-2 size!");
9428
9429 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9430
9431 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9432 // and 2^3 simultaneously. This is because we may have ambiguity with
9433 // partially undef inputs.
9434 bool ViableForN[3] = {true, true, true};
9435
9436 for (int i = 0, e = Mask.size(); i < e; ++i) {
9437 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9438 // want.
9439 if (Mask[i] == -1)
9440 continue;
9441
9442 bool IsAnyViable = false;
9443 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9444 if (ViableForN[j]) {
9445 uint64_t N = j + 1;
9446
9447 // The shuffle mask must be equal to (i * 2^N) % M.
9448 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9449 IsAnyViable = true;
9450 else
9451 ViableForN[j] = false;
9452 }
9453 // Early exit if we exhaust the possible powers of two.
9454 if (!IsAnyViable)
9455 break;
9456 }
9457
9458 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9459 if (ViableForN[j])
9460 return j + 1;
9461
9462 // Return 0 as there is no viable power of two.
9463 return 0;
9464}
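
A standalone sketch of the same stride test, restricted to the single-input case for brevity (a hypothetical helper, not part of this file; it mirrors the Mask[i] == (i << N) & (M - 1) check above):

#include <cstddef>
#include <cstdint>
#include <vector>

static int dropEvenElementsFactor(const std::vector<int> &Mask) {
  uint64_t ModMask = (uint64_t)Mask.size() - 1; // single-input modulus
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (std::size_t i = 0; i < Mask.size(); ++i)
      if (Mask[i] != -1 && (uint64_t)Mask[i] != (((uint64_t)i << N) & ModMask))
        Viable = false;
    if (Viable)
      return N; // the smallest viable power wins, as above
  }
  return 0; // no viable power of two
}
// e.g. {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14} yields N == 1.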
9465
9466/// \brief Generic lowering of v16i8 shuffles.
9467///
9468/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9469/// detect any complexity reducing interleaving. If that doesn't help, it uses
9470/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9471/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9472/// back together.
9473static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9474 const X86Subtarget *Subtarget,
9475 SelectionDAG &DAG) {
9476 SDLoc DL(Op);
9477 assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9478 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9479 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9480 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9481 ArrayRef<int> OrigMask = SVOp->getMask();
9482 assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9483
9484 // Try to use byte shift instructions.
9485 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9486 DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9487 return Shift;
9488
9489 // Try to use byte rotation instructions.
9490 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9491 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9492 return Rotate;
9493
9494 // Try to use a zext lowering.
9495 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9496 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9497 return ZExt;
9498
9499 int MaskStorage[16] = {
9500 OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9501 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
9502 OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
9503 OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9504 MutableArrayRef<int> Mask(MaskStorage);
9505 MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9506 MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9507
9508 int NumV2Elements =
9509 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9510
9511 // For single-input shuffles, there are some nicer lowering tricks we can use.
9512 if (NumV2Elements == 0) {
9513 // Check for being able to broadcast a single element.
9514 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9515 Mask, Subtarget, DAG))
9516 return Broadcast;
9517
9518 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9519 // Notably, this handles splat and partial-splat shuffles more efficiently.
9520 // However, it only makes sense if the pre-duplication shuffle simplifies
9521 // things significantly. Currently, this means we need to be able to
9522 // express the pre-duplication shuffle as an i16 shuffle.
9523 //
9524 // FIXME: We should check for other patterns which can be widened into an
9525 // i16 shuffle as well.
9526 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9527 for (int i = 0; i < 16; i += 2)
9528 if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9529 return false;
9530
9531 return true;
9532 };
9533 auto tryToWidenViaDuplication = [&]() -> SDValue {
9534 if (!canWidenViaDuplication(Mask))
9535 return SDValue();
9536 SmallVector<int, 4> LoInputs;
9537 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9538 [](int M) { return M >= 0 && M < 8; });
9539 std::sort(LoInputs.begin(), LoInputs.end());
9540 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9541 LoInputs.end());
9542 SmallVector<int, 4> HiInputs;
9543 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9544 [](int M) { return M >= 8; });
9545 std::sort(HiInputs.begin(), HiInputs.end());
9546 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9547 HiInputs.end());
9548
9549 bool TargetLo = LoInputs.size() >= HiInputs.size();
9550 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9551 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9552
9553 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9554 SmallDenseMap<int, int, 8> LaneMap;
9555 for (int I : InPlaceInputs) {
9556 PreDupI16Shuffle[I/2] = I/2;
9557 LaneMap[I] = I;
9558 }
9559 int j = TargetLo ? 0 : 4, je = j + 4;
9560 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9561 // Check if j is already a shuffle of this input. This happens when
9562 // there are two adjacent bytes after we move the low one.
9563 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9564 // If we haven't yet mapped the input, search for a slot into which
9565 // we can map it.
9566 while (j < je && PreDupI16Shuffle[j] != -1)
9567 ++j;
9568
9569 if (j == je)
9570 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9571 return SDValue();
9572
9573 // Map this input with the i16 shuffle.
9574 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9575 }
9576
9577 // Update the lane map based on the mapping we ended up with.
9578 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9579 }
9580 V1 = DAG.getNode(
9581 ISD::BITCAST, DL, MVT::v16i8,
9582 DAG.getVectorShuffle(MVT::v8i16, DL,
9583 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9584 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9585
9586 // Unpack the bytes to form the i16s that will be shuffled into place.
9587 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9588 MVT::v16i8, V1, V1);
9589
9590 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9591 for (int i = 0; i < 16; ++i)
9592 if (Mask[i] != -1) {
9593 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9594 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9595 if (PostDupI16Shuffle[i / 2] == -1)
9596 PostDupI16Shuffle[i / 2] = MappedMask;
9597 else
9598 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9599 "Conflicting entries in the original shuffle!");
9600 }
9601 return DAG.getNode(
9602 ISD::BITCAST, DL, MVT::v16i8,
9603 DAG.getVectorShuffle(MVT::v8i16, DL,
9604 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9605 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9606 };
9607 if (SDValue V = tryToWidenViaDuplication())
9608 return V;
9609 }
9610
9611 // Check whether an interleaving lowering is likely to be more efficient.
9612 // This isn't perfect but it is a strong heuristic that tends to work well on
9613 // the kinds of shuffles that show up in practice.
9614 //
9615 // FIXME: We need to handle other interleaving widths (i16, i32, ...).
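// Illustrative note (not part of the original source): for a byte interleave
// such as {0, 16, 1, 17, ..., 7, 23}, the even outputs {0..7} and the odd
// outputs {16..23} each reduce to a trivial shuffle of a single input, and one
// UNPCKL of those two results produces the interleaved vector.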
9616 if (shouldLowerAsInterleaving(Mask)) {
9617 int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9618 return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9619 });
9620 int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9621 return (M >= 8 && M < 16) || M >= 24;
9622 });
9623 int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9624 -1, -1, -1, -1, -1, -1, -1, -1};
9625 int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9626 -1, -1, -1, -1, -1, -1, -1, -1};
9627 bool UnpackLo = NumLoHalf >= NumHiHalf;
9628 MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9629 MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9630 for (int i = 0; i < 8; ++i) {
9631 TargetEMask[i] = Mask[2 * i];
9632 TargetOMask[i] = Mask[2 * i + 1];
9633 }
9634
9635 SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9636 SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9637
9638 return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9639 MVT::v16i8, Evens, Odds);
9640 }
9641
9642 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9643 // with PSHUFB. It is important to do this before we attempt to generate any
9644 // blends but after all of the single-input lowerings. If the single input
9645 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9646 // want to preserve that and we can DAG combine any longer sequences into
9647 // a PSHUFB in the end. But once we start blending from multiple inputs,
9648 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9649 // and there are *very* few patterns that would actually be faster than the
9650 // PSHUFB approach because of its ability to zero lanes.
9651 //
9652 // FIXME: The only exceptions to the above are blends which are exact
9653 // interleavings with direct instructions supporting them. We currently don't
9654 // handle those well here.
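// Illustrative note (not part of the original source): the code below builds
// one PSHUFB control vector per input. For an output byte with Mask[i] == 3,
// the V1 control byte is 3 and the V2 control byte is 0x80 (zero the lane);
// for Mask[i] == 18, the V1 control byte is 0x80 and the V2 control byte is 2.
// ORing the two shuffled results merges the contributions of V1 and V2.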
9655 if (Subtarget->hasSSSE3()) {
9656 SDValue V1Mask[16];
9657 SDValue V2Mask[16];
9658 bool V1InUse = false;
9659 bool V2InUse = false;
9660 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9661
9662 for (int i = 0; i < 16; ++i) {
9663 if (Mask[i] == -1) {
9664 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9665 } else {
9666 const int ZeroMask = 0x80;
9667 int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9668 int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9669 if (Zeroable[i])
9670 V1Idx = V2Idx = ZeroMask;
9671 V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9672 V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9673 V1InUse |= (ZeroMask != V1Idx);
9674 V2InUse |= (ZeroMask != V2Idx);
9675 }
9676 }
9677
9678 if (V1InUse)
9679 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9680 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9681 if (V2InUse)
9682 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9683 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9684
9685 // If we need shuffled inputs from both, blend the two.
9686 if (V1InUse && V2InUse)
9687 return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9688 if (V1InUse)
9689 return V1; // Single inputs are easy.
9690 if (V2InUse)
9691 return V2; // Single inputs are easy.
9692 // Shuffling to a zeroable vector.
9693 return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9694 }
9695
9696 // There are special ways we can lower some single-element blends.
9697 if (NumV2Elements == 1)
9698 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9699 Mask, Subtarget, DAG))
9700 return V;
9701
9702 // Check whether a compaction lowering can be done. This handles shuffles
9703 // which take every Nth element for some even N. See the helper function for
9704 // details.
9705 //
9706 // We special case these as they can be particularly efficiently handled with
9707 // the PACKUSWB instruction on x86 and they show up in common patterns of
9708 // rearranging bytes to truncate wide elements.
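// Illustrative note (not part of the original source): a mask that keeps every
// second byte, e.g. {0, 2, 4, ..., 30}, has NumEvenDrops == 1. The AND below
// clears the high byte of each i16 element, and a single PACKUSWB then packs
// the surviving low bytes of V1 and V2 into one v16i8 result.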
9709 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9710 // NumEvenDrops is the power of two stride of the elements. Another way of
9711 // thinking about it is that we need to drop the even elements this many
9712 // times to get the original input.
9713 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9714
9715 // First we need to zero all the dropped bytes.
9716 assert(NumEvenDrops <= 3 &&
9717 "No support for dropping even elements more than 3 times.");
9718 // We use the mask type to pick which bytes are preserved based on how many
9719 // elements are dropped.
9720 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9721 SDValue ByteClearMask =
9722 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9723 DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9724 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9725 if (!IsSingleInput)
9726 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9727
9728 // Now pack things back together.
9729 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9730 V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9731 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9732 for (int i = 1; i < NumEvenDrops; ++i) {
9733 Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9734 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9735 }
9736
9737 return Result;
9738 }
9739
9740 int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9741 int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9742 int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9743 int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9744
9745 auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9746 MutableArrayRef<int> V1HalfBlendMask,
9747 MutableArrayRef<int> V2HalfBlendMask) {
9748 for (int i = 0; i < 8; ++i)
9749 if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9750 V1HalfBlendMask[i] = HalfMask[i];
9751 HalfMask[i] = i;
9752 } else if (HalfMask[i] >= 16) {
9753 V2HalfBlendMask[i] = HalfMask[i] - 16;
9754 HalfMask[i] = i + 8;
9755 }
9756 };
9757 buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9758 buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9759
9760 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9761
9762 auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9763 MutableArrayRef<int> HiBlendMask) {
9764 SDValue V1, V2;
9765 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9766 // them out and avoid using UNPCK{L,H} to extract the elements of V as
9767 // i16s.
9768 if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9769 [](int M) { return M >= 0 && M % 2 == 1; }) &&
9770 std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9771 [](int M) { return M >= 0 && M % 2 == 1; })) {
9772 // Use a mask to drop the high bytes.
9773 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9774 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9775 DAG.getConstant(0x00FF, MVT::v8i16));
9776
9777 // This will be a single vector shuffle instead of a blend so nuke V2.
9778 V2 = DAG.getUNDEF(MVT::v8i16);
9779
9780 // Squash the masks to point directly into V1.
9781 for (int &M : LoBlendMask)
9782 if (M >= 0)
9783 M /= 2;
9784 for (int &M : HiBlendMask)
9785 if (M >= 0)
9786 M /= 2;
9787 } else {
9788 // Otherwise just unpack the low half of V into V1 and the high half into
9789 // V2 so that we can blend them as i16s.
9790 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9791 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9792 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9793 DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9794 }
9795
9796 SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9797 SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9798 return std::make_pair(BlendedLo, BlendedHi);
9799 };
9800 SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9801 std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9802 std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9803
9804 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9805 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9806
9807 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9808}
9809
9810/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9811///
9812/// This routine breaks down the specific type of 128-bit shuffle and
9813/// dispatches to the lowering routines accordingly.
9814static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9815 MVT VT, const X86Subtarget *Subtarget,
9816 SelectionDAG &DAG) {
9817 switch (VT.SimpleTy) {
9818 case MVT::v2i64:
9819 return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9820 case MVT::v2f64:
9821 return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9822 case MVT::v4i32:
9823 return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9824 case MVT::v4f32:
9825 return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9826 case MVT::v8i16:
9827 return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9828 case MVT::v16i8:
9829 return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9830
9831 default:
9832 llvm_unreachable("Unimplemented!");
9833 }
9834}
9835
9836/// \brief Helper function to test whether a shuffle mask could be
9837/// simplified by widening the elements being shuffled.
9838///
9839/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9840/// leaves it in an unspecified state.
9841///
9842/// NOTE: This must handle normal vector shuffle masks and *target* vector
9843/// shuffle masks. The latter have the special property of a '-2' representing
9844/// a zeroed lane of a vector.
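///
/// Illustrative example (not from the original source): the v4 mask
/// {0, 1, 6, 7} widens to the v2 mask {0, 3} because each pair is adjacent and
/// aligned, whereas {1, 2, -1, -1} cannot be widened since its first pair
/// straddles two wide elements.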
9845static bool canWidenShuffleElements(ArrayRef<int> Mask,
9846 SmallVectorImpl<int> &WidenedMask) {
9847 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9848 // If both elements are undef, it's trivial.
9849 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9850 WidenedMask.push_back(SM_SentinelUndef);
9851 continue;
9852 }
9853
9854 // Check for an undef mask and a mask value properly aligned to fit with
9855 // a pair of values. If we find such a case, use the non-undef mask's value.
9856 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9857 WidenedMask.push_back(Mask[i + 1] / 2);
9858 continue;
9859 }
9860 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9861 WidenedMask.push_back(Mask[i] / 2);
9862 continue;
9863 }
9864
9865 // When zeroing, we need to spread the zeroing across both lanes to widen.
9866 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9867 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9868 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9869 WidenedMask.push_back(SM_SentinelZero);
9870 continue;
9871 }
9872 return false;
9873 }
9874
9875 // Finally check if the two mask values are adjacent and aligned with
9876 // a pair.
9877 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9878 WidenedMask.push_back(Mask[i] / 2);
9879 continue;
9880 }
9881
9882 // Otherwise we can't safely widen the elements used in this shuffle.
9883 return false;
9884 }
9885 assert(WidenedMask.size() == Mask.size() / 2 &&
9886 "Incorrect size of mask after widening the elements!");
9887
9888 return true;
9889}
9890
9891/// \brief Generic routine to split a vector shuffle into half-sized shuffles.
9892///
9893/// This routine just extracts two subvectors, shuffles them independently, and
9894/// then concatenates them back together. This should work effectively with all
9895/// AVX vector shuffle types.
9896static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9897 SDValue V2, ArrayRef<int> Mask,
9898 SelectionDAG &DAG) {
9899 assert(VT.getSizeInBits() >= 256 &&
9900 "Only for 256-bit or wider vector shuffles!");
9901 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9902 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9903
9904 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9905 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9906
9907 int NumElements = VT.getVectorNumElements();
9908 int SplitNumElements = NumElements / 2;
9909 MVT ScalarVT = VT.getScalarType();
9910 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9911
9912 SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9913 DAG.getIntPtrConstant(0));
9914 SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9915 DAG.getIntPtrConstant(SplitNumElements));
9916 SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9917 DAG.getIntPtrConstant(0));
9918 SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9919 DAG.getIntPtrConstant(SplitNumElements));
9920
9921 // Now create two 4-way blends of these half-width vectors.
9922 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9923 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9924 SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9925 for (int i = 0; i < SplitNumElements; ++i) {
9926 int M = HalfMask[i];
9927 if (M >= NumElements) {
9928 if (M >= NumElements + SplitNumElements)
9929 UseHiV2 = true;
9930 else
9931 UseLoV2 = true;
9932 V2BlendMask.push_back(M - NumElements);
9933 V1BlendMask.push_back(-1);
9934 BlendMask.push_back(SplitNumElements + i);
9935 } else if (M >= 0) {
9936 if (M >= SplitNumElements)
9937 UseHiV1 = true;
9938 else
9939 UseLoV1 = true;
9940 V2BlendMask.push_back(-1);
9941 V1BlendMask.push_back(M);
9942 BlendMask.push_back(i);
9943 } else {
9944 V2BlendMask.push_back(-1);
9945 V1BlendMask.push_back(-1);
9946 BlendMask.push_back(-1);
9947 }
9948 }
9949
9950 // Because the lowering happens after all combining takes place, we need to
9951 // manually combine these blend masks as much as possible so that we create
9952 // a minimal number of high-level vector shuffle nodes.
9953
9954 // First try just blending the halves of V1 or V2.
9955 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9956 return DAG.getUNDEF(SplitVT);
9957 if (!UseLoV2 && !UseHiV2)
9958 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9959 if (!UseLoV1 && !UseHiV1)
9960 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9961
9962 SDValue V1Blend, V2Blend;
9963 if (UseLoV1 && UseHiV1) {
9964 V1Blend =
9965 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9966 } else {
9967 // We only use half of V1 so map the usage down into the final blend mask.
9968 V1Blend = UseLoV1 ? LoV1 : HiV1;
9969 for (int i = 0; i < SplitNumElements; ++i)
9970 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9971 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9972 }
9973 if (UseLoV2 && UseHiV2) {
9974 V2Blend =
9975 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9976 } else {
9977 // We only use half of V2 so map the usage down into the final blend mask.
9978 V2Blend = UseLoV2 ? LoV2 : HiV2;
9979 for (int i = 0; i < SplitNumElements; ++i)
9980 if (BlendMask[i] >= SplitNumElements)
9981 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9982 }
9983 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9984 };
9985 SDValue Lo = HalfBlend(LoMask);
9986 SDValue Hi = HalfBlend(HiMask);
9987 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9988}
9989
9990/// \brief Either split a vector in halves or decompose the shuffles and the
9991/// blend.
9992///
9993/// This is provided as a good fallback for many lowerings of non-single-input
9994/// shuffles with more than one 128-bit lane. In those cases, we want to select
9995/// between splitting the shuffle into 128-bit components and stitching those
9996/// back together vs. extracting the single-input shuffles and blending those
9997/// results.
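///
/// Illustrative example (not from the original source): a v8f32 mask such as
/// {0, 1, 2, 3, 8, 9, 10, 11} draws on only one 128-bit lane of each input, so
/// it is split, whereas a mask that broadcasts one element from each input,
/// e.g. {0, 8, 0, 8, 0, 8, 0, 8}, is decomposed into two broadcasts and a
/// blend.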
9998static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
9999 SDValue V2, ArrayRef<int> Mask,
10000 SelectionDAG &DAG) {
10001 assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10002 "lower single-input shuffles as it "
10003 "could then recurse on itself.");
10004 int Size = Mask.size();
10005
10006 // If this can be modeled as a broadcast of two elements followed by a blend,
10007 // prefer that lowering. This is especially important because broadcasts can
10008 // often fold with memory operands.
10009 auto DoBothBroadcast = [&] {
10010 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10011 for (int M : Mask)
10012 if (M >= Size) {
10013 if (V2BroadcastIdx == -1)
10014 V2BroadcastIdx = M - Size;
10015 else if (M - Size != V2BroadcastIdx)
10016 return false;
10017 } else if (M >= 0) {
10018 if (V1BroadcastIdx == -1)
10019 V1BroadcastIdx = M;
10020 else if (M != V1BroadcastIdx)
10021 return false;
10022 }
10023 return true;
10024 };
10025 if (DoBothBroadcast())
10026 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10027 DAG);
10028
10029 // If the inputs all stem from a single 128-bit lane of each input, then we
10030 // split them rather than blending because the split will decompose to
10031 // unusually few instructions.
10032 int LaneCount = VT.getSizeInBits() / 128;
10033 int LaneSize = Size / LaneCount;
10034 SmallBitVector LaneInputs[2];
10035 LaneInputs[0].resize(LaneCount, false);
10036 LaneInputs[1].resize(LaneCount, false);
10037 for (int i = 0; i < Size; ++i)
10038 if (Mask[i] >= 0)
10039 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10040 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10041 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10042
10043 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10044 // that the decomposed single-input shuffles don't end up here.
10045 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10046}
10047
10048/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10049/// a permutation and blend of those lanes.
10050///
10051/// This essentially blends the out-of-lane inputs to each lane into the lane
10052/// from a permuted copy of the vector. This lowering strategy results in four
10053/// instructions in the worst case for a single-input cross lane shuffle which
10054/// is lower than any other fully general cross-lane shuffle strategy I'm aware
10055/// of. Special cases for each particular shuffle pattern should be handled
10056/// prior to trying this lowering.
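///
/// Illustrative example (not from the original source): for the single-input
/// v4f64 mask {2, 3, 0, 1}, both lanes need elements from the other lane. The
/// code below builds a lane-flipped copy with VPERM2X128 and then applies the
/// in-lane blend mask {4, 5, 6, 7}, taking every element from the flipped
/// copy.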
10057static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10058 SDValue V1, SDValue V2,
10059 ArrayRef<int> Mask,
10060 SelectionDAG &DAG) {
10061 // FIXME: This should probably be generalized for 512-bit vectors as well.
10062 assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10063 int LaneSize = Mask.size() / 2;
10064
10065 // If there are only inputs from one 128-bit lane, splitting will in fact be
10066 // less expensive. The flags track whether the given lane contains an element
10067 // that crosses to another lane.
10068 bool LaneCrossing[2] = {false, false};
10069 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10070 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10071 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10072 if (!LaneCrossing[0] || !LaneCrossing[1])
10073 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10074
10075 if (isSingleInputShuffleMask(Mask)) {
10076 SmallVector<int, 32> FlippedBlendMask;
10077 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10078 FlippedBlendMask.push_back(
10079 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10080 ? Mask[i]
10081 : Mask[i] % LaneSize +
10082 (i / LaneSize) * LaneSize + Size));
10083
10084 // Flip the vector, and blend the results which should now be in-lane. The
10085 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10086 // 5 for the high source. The value 3 selects the high half of source 2 and
10087 // the value 2 selects the low half of source 2. We only use source 2 to
10088 // allow folding it into a memory operand.
10089 unsigned PERMMask = 3 | 2 << 4;
10090 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10091 V1, DAG.getConstant(PERMMask, MVT::i8));
10092 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10093 }
10094
10095 // This now reduces to two single-input shuffles of V1 and V2 which at worst
10096 // will be handled by the above logic and a blend of the results, much like
10097 // other patterns in AVX.
10098 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10099}
10100
10101/// \brief Handle lowering 2-lane 128-bit shuffles.
10102static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10103 SDValue V2, ArrayRef<int> Mask,
10104 const X86Subtarget *Subtarget,
10105 SelectionDAG &DAG) {
10106 // Blends are faster and handle all the non-lane-crossing cases.
10107 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10108 Subtarget, DAG))
10109 return Blend;
10110
10111 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10112 VT.getVectorNumElements() / 2);
10113 // Check for patterns which can be matched with a single insert of a 128-bit
10114 // subvector.
10115 if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10116 isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10117 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10118 DAG.getIntPtrConstant(0));
10119 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10120 Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10121 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10122 }
10123 if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10124 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10125 DAG.getIntPtrConstant(0));
10126 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10127 DAG.getIntPtrConstant(2));
10128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10129 }
10130
10131 // Otherwise form a 128-bit permutation.
10132 // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
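// Illustrative note (not part of the original source): the 128-bit halves of
// the concatenated (V1, V2) pair are numbered 0-3, so a widened mask such as
// {2, 3, 4, 5} gives PermMask = 1 | 2 << 4 = 0x21, selecting the high half of
// V1 for the low lane and the low half of V2 for the high lane.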
10133 unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10134 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10135 DAG.getConstant(PermMask, MVT::i8));
10136}
10137
10138/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10139/// shuffling each lane.
10140///
10141/// This will only succeed when the result of fixing the 128-bit lanes results
10142/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10143/// each 128-bit lane. This handles many cases where we can quickly blend away
10144/// the lane crosses early and then use simpler shuffles within each lane.
10145///
10146/// FIXME: It might be worthwhile at some point to support this without
10147/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10148/// in x86 only floating point has interesting non-repeating shuffles, and even
10149/// those are still *marginally* more expensive.
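///
/// Illustrative example (not from the original source): the v8f32 mask
/// {6, 7, 4, 5, 14, 15, 12, 13} first becomes the 64-bit lane shuffle
/// {2, 3, 6, 7} (selecting the high lanes of V1 and V2) and then the
/// repeating in-lane shuffle {2, 3, 0, 1, 6, 7, 4, 5}.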
10150static SDValue lowerVectorShuffleByMerging128BitLanes(
10151 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10152 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10153 assert(!isSingleInputShuffleMask(Mask) &&
10154 "This is only useful with multiple inputs.");
10155
10156 int Size = Mask.size();
10157 int LaneSize = 128 / VT.getScalarSizeInBits();
10158 int NumLanes = Size / LaneSize;
10159 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10160
10161 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10162 // check whether the in-128-bit lane shuffles share a repeating pattern.
10163 SmallVector<int, 4> Lanes;
10164 Lanes.resize(NumLanes, -1);
10165 SmallVector<int, 4> InLaneMask;
10166 InLaneMask.resize(LaneSize, -1);
10167 for (int i = 0; i < Size; ++i) {
10168 if (Mask[i] < 0)
10169 continue;
10170
10171 int j = i / LaneSize;
10172
10173 if (Lanes[j] < 0) {
10174 // First entry we've seen for this lane.
10175 Lanes[j] = Mask[i] / LaneSize;
10176 } else if (Lanes[j] != Mask[i] / LaneSize) {
10177 // This doesn't match the lane selected previously!
10178 return SDValue();
10179 }
10180
10181 // Check that within each lane we have a consistent shuffle mask.
10182 int k = i % LaneSize;
10183 if (InLaneMask[k] < 0) {
10184 InLaneMask[k] = Mask[i] % LaneSize;
10185 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10186 // This doesn't fit a repeating in-lane mask.
10187 return SDValue();
10188 }
10189 }
10190
10191 // First shuffle the lanes into place.
10192 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10193 VT.getSizeInBits() / 64);
10194 SmallVector<int, 8> LaneMask;
10195 LaneMask.resize(NumLanes * 2, -1);
10196 for (int i = 0; i < NumLanes; ++i)
10197 if (Lanes[i] >= 0) {
10198 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10199 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10200 }
10201
10202 V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10203 V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10204 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10205
10206 // Cast it back to the type we actually want.
10207 LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10208
10209 // Now do a simple shuffle that isn't lane crossing.
10210 SmallVector<int, 8> NewMask;
10211 NewMask.resize(Size, -1);
10212 for (int i = 0; i < Size; ++i)
10213 if (Mask[i] >= 0)
10214 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10215 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10216 "Must not introduce lane crosses at this point!");
10217
10218 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10219}
10220
10221/// \brief Test whether the specified input (0 or 1) is in-place blended by the
10222/// given mask.
10223///
10224/// This returns true if the elements from a particular input are already in the
10225/// slot required by the given mask and require no permutation.
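///
/// Illustrative example (not from the original source): for a two-input v4
/// shuffle, the mask {0, 4, 2, 6} leaves input 0 in place (elements 0 and 2
/// already sit at positions 0 and 2) but not input 1, whose element 0 (mask
/// value 4) would have to move to position 1.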
10226static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10227 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10228 int Size = Mask.size();
10229 for (int i = 0; i < Size; ++i)
10230 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10231 return false;
10232
10233 return true;
10234}
10235
10236/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10237///
10238/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10239/// isn't available.
10240static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10241 const X86Subtarget *Subtarget,
10242 SelectionDAG &DAG) {
10243 SDLoc DL(Op);
10244 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10245 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10246 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10247 ArrayRef<int> Mask = SVOp->getMask();
10248 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10249
10250 SmallVector<int, 4> WidenedMask;
10251 if (canWidenShuffleElements(Mask, WidenedMask))
10252 return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10253 DAG);
10254
10255 if (isSingleInputShuffleMask(Mask)) {
10256 // Check for being able to broadcast a single element.
10257 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10258 Mask, Subtarget, DAG))
10259 return Broadcast;
10260
10261 // Use low duplicate instructions for masks that match their pattern.
10262 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10263 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10264
10265 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10266 // Non-half-crossing single input shuffles can be lowered with an
10267 // interleaved permutation.
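// Illustrative note (not part of the original source): each immediate bit
// selects the high or low element within its 128-bit lane, so the in-lane
// swap mask {1, 0, 3, 2} yields VPERMILPMask == 0b0101 == 5.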
10268 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10269 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10270 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10271 DAG.getConstant(VPERMILPMask, MVT::i8));
10272 }
10273
10274 // With AVX2 we have direct support for this permutation.
10275 if (Subtarget->hasAVX2())
10276 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10277 getV4X86ShuffleImm8ForMask(Mask, DAG));
10278
10279 // Otherwise, fall back.
10280 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10281 DAG);
10282 }
10283
10284 // X86 has dedicated unpack instructions that can handle specific blend
10285 // operations: UNPCKH and UNPCKL.
10286 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10287 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10288 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10289 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10290
10291 // If we have a single input to the zero element, insert that into V1 if we
10292 // can do so cheaply.
10293 int NumV2Elements =
10294 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10295 if (NumV2Elements == 1 && Mask[0] >= 4)
10296 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10297 MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10298 return Insertion;
10299
10300 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10301 Subtarget, DAG))
10302 return Blend;
10303
10304 // Check if the blend happens to exactly fit that of SHUFPD.
10305 if ((Mask[0] == -1 || Mask[0] < 2) &&
10306 (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10307 (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10308 (Mask[3] == -1 || Mask[3] >= 6)) {
10309 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10310 ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10311 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10312 DAG.getConstant(SHUFPDMask, MVT::i8));
10313 }
10314 if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10315 (Mask[1] == -1 || Mask[1] < 2) &&
10316 (Mask[2] == -1 || Mask[2] >= 6) &&
10317 (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10318 unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10319 ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10320 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10321 DAG.getConstant(SHUFPDMask, MVT::i8));
10322 }
10323
10324 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10325 // shuffle. However, if we have AVX2 and either input is already in place,
10326 // we will be able to shuffle the other input even across lanes in a single
10327 // instruction, so skip this pattern.
10328 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10329 isShuffleMaskInputInPlace(1, Mask))))
10330 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10331 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10332 return Result;
10333
10334 // If we have AVX2 then we always want to lower with a blend because at v4 we
10335 // can fully permute the elements.
10336 if (Subtarget->hasAVX2())
10337 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10338 Mask, DAG);
10339
10340 // Otherwise fall back on generic lowering.
10341 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10342}
10343
10344/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10345///
10346/// This routine is only called when we have AVX2 and thus a reasonable
10347/// instruction set for v4i64 shuffling.
10348static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10349 const X86Subtarget *Subtarget,
10350 SelectionDAG &DAG) {
10351 SDLoc DL(Op);
10352 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10353 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10354 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10355 ArrayRef<int> Mask = SVOp->getMask();
10356 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10357 assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10358
10359 SmallVector<int, 4> WidenedMask;
10360 if (canWidenShuffleElements(Mask, WidenedMask))
10361 return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10362 DAG);
10363
10364 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10365 Subtarget, DAG))
10366 return Blend;
10367
10368 // Check for being able to broadcast a single element.
10369 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10370 Mask, Subtarget, DAG))
10371 return Broadcast;
10372
10373 // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10374 // use lower latency instructions that will operate on both 128-bit lanes.
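// Illustrative note (not part of the original source): a per-lane swap such as
// the v4i64 mask {1, 0, 3, 2} repeats as {1, 0} in both lanes, so the code
// below expresses it as the v8i32 PSHUFD mask {2, 3, 0, 1} applied to each
// 128-bit lane.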
10375 SmallVector<int, 2> RepeatedMask;
10376 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10377 if (isSingleInputShuffleMask(Mask)) {
10378 int PSHUFDMask[] = {-1, -1, -1, -1};
10379 for (int i = 0; i < 2; ++i)
10380 if (RepeatedMask[i] >= 0) {
10381 PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10382 PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10383 }
10384 return DAG.getNode(
10385 ISD::BITCAST, DL, MVT::v4i64,
10386 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10387 DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10388 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10389 }
10390
10391 // Use dedicated unpack instructions for masks that match their pattern.
10392 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10393 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10394 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10395 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10396 }
10397
10398 // AVX2 provides a direct instruction for permuting a single input across
10399 // lanes.
10400 if (isSingleInputShuffleMask(Mask))
10401 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10402 getV4X86ShuffleImm8ForMask(Mask, DAG));
10403
10404 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10405 // shuffle. However, if we have AVX2 and either input is already in place,
10406 // we will be able to shuffle the other input even across lanes in a single
10407 // instruction, so skip this pattern.
10408 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10409 isShuffleMaskInputInPlace(1, Mask))))
10410 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10411 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10412 return Result;
10413
10414 // Otherwise fall back on generic blend lowering.
10415 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10416 Mask, DAG);
10417}
10418
10419/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10420///
10421/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10422/// isn't available.
10423static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10424 const X86Subtarget *Subtarget,
10425 SelectionDAG &DAG) {
10426 SDLoc DL(Op);
10427 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10428 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10429 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10430 ArrayRef<int> Mask = SVOp->getMask();
10431 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10432
10433 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10434 Subtarget, DAG))
10435 return Blend;
10436
10437 // Check for being able to broadcast a single element.
10438 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10439 Mask, Subtarget, DAG))
10440 return Broadcast;
10441
10442 // If the shuffle mask is repeated in each 128-bit lane, we have many more
10443 // options to efficiently lower the shuffle.
10444 SmallVector<int, 4> RepeatedMask;
10445 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10446 assert(RepeatedMask.size() == 4 &&
10447 "Repeated masks must be half the mask width!");
10448
10449 // Use even/odd duplicate instructions for masks that match their pattern.
10450 if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10451 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10452 if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10453 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10454
10455 if (isSingleInputShuffleMask(Mask))
10456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10457 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10458
10459 // Use dedicated unpack instructions for masks that match their pattern.
10460 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10461 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10462 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10463 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10464
10465 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10466 // have already handled any direct blends. We also need to squash the
10467 // repeated mask into a simulated v4f32 mask.
10468 for (int i = 0; i < 4; ++i)
10469 if (RepeatedMask[i] >= 8)
10470 RepeatedMask[i] -= 4;
10471 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10472 }
10473
10474 // If we have a single input shuffle with different shuffle patterns in the
10475 // two 128-bit lanes, use the variable mask form of VPERMILPS.
10476 if (isSingleInputShuffleMask(Mask)) {
10477 SDValue VPermMask[8];
10478 for (int i = 0; i < 8; ++i)
10479 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10480 : DAG.getConstant(Mask[i], MVT::i32);
10481 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10482 return DAG.getNode(
10483 X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10484 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10485
10486 if (Subtarget->hasAVX2())
10487 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10488 DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10489 DAG.getNode(ISD::BUILD_VECTOR, DL,
10490 MVT::v8i32, VPermMask)),
10491 V1);
10492
10493 // Otherwise, fall back.
10494 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10495 DAG);
10496 }
10497
10498 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10499 // shuffle.
10500 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10501 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10502 return Result;
10503
10504 // If we have AVX2 then we always want to lower with a blend because at v8 we
10505 // can fully permute the elements.
10506 if (Subtarget->hasAVX2())
10507 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10508 Mask, DAG);
10509
10510 // Otherwise fall back on generic lowering.
10511 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10512}
10513
10514/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10515///
10516/// This routine is only called when we have AVX2 and thus a reasonable
10517/// instruction set for v8i32 shuffling.
10518static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10519 const X86Subtarget *Subtarget,
10520 SelectionDAG &DAG) {
10521 SDLoc DL(Op);
10522 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10523 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10524 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10525 ArrayRef<int> Mask = SVOp->getMask();
10526 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10527 assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10528
10529 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10530 Subtarget, DAG))
10531 return Blend;
10532
10533 // Check for being able to broadcast a single element.
10534 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10535 Mask, Subtarget, DAG))
10536 return Broadcast;
10537
10538 // If the shuffle mask is repeated in each 128-bit lane we can use more
10539 // efficient instructions that mirror the shuffles across the two 128-bit
10540 // lanes.
10541 SmallVector<int, 4> RepeatedMask;
10542 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10543 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10544 if (isSingleInputShuffleMask(Mask))
10545 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10546 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10547
10548 // Use dedicated unpack instructions for masks that match their pattern.
10549 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10550 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10551 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10553 }
10554
10555 // If the shuffle patterns aren't repeated but it is a single input, directly
10556 // generate a cross-lane VPERMD instruction.
10557 if (isSingleInputShuffleMask(Mask)) {
10558 SDValue VPermMask[8];
10559 for (int i = 0; i < 8; ++i)
10560 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10561 : DAG.getConstant(Mask[i], MVT::i32);
10562 return DAG.getNode(
10563 X86ISD::VPERMV, DL, MVT::v8i32,
10564 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10565 }
10566
10567 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10568 // shuffle.
10569 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10570 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10571 return Result;
10572
10573 // Otherwise fall back on generic blend lowering.
10574 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10575 Mask, DAG);
10576}
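The PSHUFD path above packs the 4-element repeated mask into an 8-bit immediate, two bits per result element. A minimal standalone sketch of that packing (hypothetical helper name, not the LLVM routine itself; undef entries are assumed free to pick slot 0):

#include <array>
#include <cstdio>

// Hypothetical model of packing a repeated 4-element mask into a PSHUFD/VPSHUFD
// style immediate: result element i takes source element (Imm >> (2*i)) & 3.
static unsigned packShuffleImm8(const std::array<int, 4> &RepeatedMask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = RepeatedMask[i] < 0 ? 0 : RepeatedMask[i]; // undef: any slot will do
    Imm |= static_cast<unsigned>(M & 3) << (2 * i);
  }
  return Imm;
}

int main() {
  // The repeated mask <3,2,1,0> (reverse each 128-bit lane) packs to 0x1B.
  std::printf("0x%02X\n", packShuffleImm8({3, 2, 1, 0}));
  return 0;
}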
10577
10578/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10579///
10580/// This routine is only called when we have AVX2 and thus a reasonable
10581/// instruction set for v16i16 shuffling.
10582static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10583 const X86Subtarget *Subtarget,
10584 SelectionDAG &DAG) {
10585 SDLoc DL(Op);
10586 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10587 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10588 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10589 ArrayRef<int> Mask = SVOp->getMask();
10590 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10591 assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10592
10593 // Check for being able to broadcast a single element.
10594 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10595 Mask, Subtarget, DAG))
10596 return Broadcast;
10597
10598 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10599 Subtarget, DAG))
10600 return Blend;
10601
10602 // Use dedicated unpack instructions for masks that match their pattern.
10603 if (isShuffleEquivalent(Mask,
10604 // First 128-bit lane:
10605 0, 16, 1, 17, 2, 18, 3, 19,
10606 // Second 128-bit lane:
10607 8, 24, 9, 25, 10, 26, 11, 27))
10608 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10609 if (isShuffleEquivalent(Mask,
10610 // First 128-bit lane:
10611 4, 20, 5, 21, 6, 22, 7, 23,
10612 // Second 128-bit lane:
10613 12, 28, 13, 29, 14, 30, 15, 31))
10614 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10615
10616 if (isSingleInputShuffleMask(Mask)) {
10617 // There are no generalized cross-lane shuffle operations available on i16
10618 // element types.
10619 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10620 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10621 Mask, DAG);
10622
10623 SDValue PSHUFBMask[32];
10624 for (int i = 0; i < 16; ++i) {
10625 if (Mask[i] == -1) {
10626 PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10627 continue;
10628 }
10629
10630 int M = i < 8 ? Mask[i] : Mask[i] - 8;
10631 assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10632 PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10633 PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10634 }
10635 return DAG.getNode(
10636 ISD::BITCAST, DL, MVT::v16i16,
10637 DAG.getNode(
10638 X86ISD::PSHUFB, DL, MVT::v32i8,
10639 DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10640 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10641 }
10642
10643 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10644 // shuffle.
10645 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10646 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10647 return Result;
10648
10649 // Otherwise fall back on generic lowering.
10650 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10651}
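The single-input v16i16 path above widens each word index M into the byte pair (2*M, 2*M+1) so the shuffle can be performed by PSHUFB on v32i8. A standalone model of that expansion, assuming plain C++ containers and a hypothetical helper name:

#include <cstdio>
#include <vector>

// Hypothetical model: expand a per-word shuffle mask into a per-byte PSHUFB mask.
// Each in-lane word index M becomes the byte pair (2*M, 2*M + 1); undef (-1)
// words stay undef at byte granularity.
static std::vector<int> expandWordMaskToByteMask(const std::vector<int> &WordMask) {
  std::vector<int> ByteMask;
  for (int M : WordMask) {
    if (M < 0) {
      ByteMask.push_back(-1);
      ByteMask.push_back(-1);
      continue;
    }
    int InLane = M % 8;                 // PSHUFB only indexes within a 128-bit lane
    ByteMask.push_back(2 * InLane);
    ByteMask.push_back(2 * InLane + 1);
  }
  return ByteMask;
}

int main() {
  // Reverse the words of one 128-bit lane: <7,6,5,4,3,2,1,0>.
  for (int B : expandWordMaskToByteMask({7, 6, 5, 4, 3, 2, 1, 0}))
    std::printf("%d ", B);              // 14 15 12 13 10 11 8 9 6 7 4 5 2 3 0 1
  std::printf("\n");
  return 0;
}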
10652
10653/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10654///
10655/// This routine is only called when we have AVX2 and thus a reasonable
10656/// instruction set for v32i8 shuffling.
10657static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10658 const X86Subtarget *Subtarget,
10659 SelectionDAG &DAG) {
10660 SDLoc DL(Op);
10661 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10662 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10663 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10664 ArrayRef<int> Mask = SVOp->getMask();
10665 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10666 assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10667
10668 // Check for being able to broadcast a single element.
10669 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10670 Mask, Subtarget, DAG))
10671 return Broadcast;
10672
10673 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10674 Subtarget, DAG))
10675 return Blend;
10676
10677 // Use dedicated unpack instructions for masks that match their pattern.
10678 // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10679 // 256-bit lanes.
10680 if (isShuffleEquivalent(
10681 Mask,
10682 // First 128-bit lane:
10683 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10684 // Second 128-bit lane:
10685 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10686 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10687 if (isShuffleEquivalent(
10688 Mask,
10689 // First 128-bit lane:
10690 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10691 // Second 128-bit lane:
10692 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10693 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10694
10695 if (isSingleInputShuffleMask(Mask)) {
10696 // There are no generalized cross-lane shuffle operations available on i8
10697 // element types.
10698 if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10699 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10700 Mask, DAG);
10701
10702 SDValue PSHUFBMask[32];
10703 for (int i = 0; i < 32; ++i)
10704 PSHUFBMask[i] =
10705 Mask[i] < 0
10706 ? DAG.getUNDEF(MVT::i8)
10707 : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10708
10709 return DAG.getNode(
10710 X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10711 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10712 }
10713
10714 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10715 // shuffle.
10716 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10717 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10718 return Result;
10719
10720 // Otherwise fall back on generic lowering.
10721 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10722}
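The unpack matching above compares the mask against a fixed candidate pattern, treating undef entries as wildcards. A small standalone sketch of that kind of equivalence test (hypothetical name; not the isShuffleEquivalent helper itself):

#include <cstdio>
#include <vector>

// Hypothetical model: a mask "matches" a candidate pattern if every defined
// entry equals the candidate at that position; undef (-1) entries match anything.
static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Candidate) {
  if (Mask.size() != Candidate.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Candidate[i])
      return false;
  return true;
}

int main() {
  // A v4 low-unpack pattern interleaves the low halves of both inputs.
  std::vector<int> Unpckl = {0, 4, 1, 5};
  std::printf("%d\n", masksEquivalent({0, -1, 1, 5}, Unpckl)); // 1: undef is a wildcard
  std::printf("%d\n", masksEquivalent({0, 4, 2, 5}, Unpckl));  // 0: element 2 mismatches
  return 0;
}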
10723
10724/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10725///
10726/// This routine either breaks down the specific type of a 256-bit x86 vector
10727/// shuffle or splits it into two 128-bit shuffles and fuses the results back
10728/// together based on the available instructions.
10729static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10730 MVT VT, const X86Subtarget *Subtarget,
10731 SelectionDAG &DAG) {
10732 SDLoc DL(Op);
10733 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10734 ArrayRef<int> Mask = SVOp->getMask();
10735
10736 // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10737 // check for those subtargets here and avoid much of the subtarget querying in
10738 // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10739 // ability to manipulate a 256-bit vector with integer types. Since we'll use
10740 // floating point types there eventually, just immediately cast everything to
10741 // a float and operate entirely in that domain.
10742 if (VT.isInteger() && !Subtarget->hasAVX2()) {
10743 int ElementBits = VT.getScalarSizeInBits();
10744 if (ElementBits < 32)
10745 // No floating point type available, decompose into 128-bit vectors.
10746 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10747
10748 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10749 VT.getVectorNumElements());
10750 V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10751 V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10752 return DAG.getNode(ISD::BITCAST, DL, VT,
10753 DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10754 }
10755
10756 switch (VT.SimpleTy) {
10757 case MVT::v4f64:
10758 return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10759 case MVT::v4i64:
10760 return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10761 case MVT::v8f32:
10762 return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10763 case MVT::v8i32:
10764 return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10765 case MVT::v16i16:
10766 return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10767 case MVT::v32i8:
10768 return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10769
10770 default:
10771 llvm_unreachable("Not a valid 256-bit x86 vector type!");
10772 }
10773}
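The AVX1 branch above bitcasts 256-bit integer vectors into the floating-point domain, shuffles there, and bitcasts back. A standalone illustration of the same trick with compiler intrinsics, assuming an AVX-capable host and -mavx; this is hand-written demo code, not output of this lowering:

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) int In[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  __m256i VI = _mm256_load_si256(reinterpret_cast<const __m256i *>(In));
  __m256 VF = _mm256_castsi256_ps(VI);        // free bitcast, no data change
  // Replicate element 0 of each 128-bit lane: mask <0,0,0,0, 4,4,4,4>.
  __m256 SF = _mm256_shuffle_ps(VF, VF, 0x00);
  __m256i SI = _mm256_castps_si256(SF);       // bitcast back to the integer domain
  alignas(32) int Out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(Out), SI);
  for (int V : Out)
    std::printf("%d ", V);                    // expected: 0 0 0 0 4 4 4 4
  std::printf("\n");
  return 0;
}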
10774
10775/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10776static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10777 const X86Subtarget *Subtarget,
10778 SelectionDAG &DAG) {
10779 SDLoc DL(Op);
10780 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10781 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10782 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10783 ArrayRef<int> Mask = SVOp->getMask();
10784 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10785
10786 // X86 has dedicated unpack instructions that can handle specific blend
10787 // operations: UNPCKH and UNPCKL.
10788 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10789 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10790 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10791 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10792
10793 // FIXME: Implement direct support for this type!
10794 return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10795}
10796
10797/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10798static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10799 const X86Subtarget *Subtarget,
10800 SelectionDAG &DAG) {
10801 SDLoc DL(Op);
10802 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10803 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10804 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10805 ArrayRef<int> Mask = SVOp->getMask();
10806 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10807
10808 // Use dedicated unpack instructions for masks that match their pattern.
10809 if (isShuffleEquivalent(Mask,
10810 0, 16, 1, 17, 4, 20, 5, 21,
10811 8, 24, 9, 25, 12, 28, 13, 29))
10812 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10813 if (isShuffleEquivalent(Mask,
10814 2, 18, 3, 19, 6, 22, 7, 23,
10815 10, 26, 11, 27, 14, 30, 15, 31))
10816 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10817
10818 // FIXME: Implement direct support for this type!
10819 return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10820}
10821
10822/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10823static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10824 const X86Subtarget *Subtarget,
10825 SelectionDAG &DAG) {
10826 SDLoc DL(Op);
10827 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10828 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10829 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10830 ArrayRef<int> Mask = SVOp->getMask();
10831 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10832
10833 // X86 has dedicated unpack instructions that can handle specific blend
10834 // operations: UNPCKH and UNPCKL.
10835 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10836 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10837 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10838 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10839
10840 // FIXME: Implement direct support for this type!
10841 return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10842}
10843
10844/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10845static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10846 const X86Subtarget *Subtarget,
10847 SelectionDAG &DAG) {
10848 SDLoc DL(Op);
10849 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10850 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10851 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10852 ArrayRef<int> Mask = SVOp->getMask();
10853 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10854
10855 // Use dedicated unpack instructions for masks that match their pattern.
10856 if (isShuffleEquivalent(Mask,
10857 0, 16, 1, 17, 4, 20, 5, 21,
10858 8, 24, 9, 25, 12, 28, 13, 29))
10859 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10860 if (isShuffleEquivalent(Mask,
10861 2, 18, 3, 19, 6, 22, 7, 23,
10862 10, 26, 11, 27, 14, 30, 15, 31))
10863 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10864
10865 // FIXME: Implement direct support for this type!
10866 return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10867}
10868
10869/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10870static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10871 const X86Subtarget *Subtarget,
10872 SelectionDAG &DAG) {
10873 SDLoc DL(Op);
10874 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10875 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10876 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10877 ArrayRef<int> Mask = SVOp->getMask();
10878 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10879 assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10880
10881 // FIXME: Implement direct support for this type!
10882 return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10883}
10884
10885/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10886static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10887 const X86Subtarget *Subtarget,
10888 SelectionDAG &DAG) {
10889 SDLoc DL(Op);
10890 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10891 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10892 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10893 ArrayRef<int> Mask = SVOp->getMask();
10894 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10895 assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10896
10897 // FIXME: Implement direct support for this type!
10898 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10899}
10900
10901/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10902///
10903/// This routine either breaks down the specific type of a 512-bit x86 vector
10904/// shuffle or splits it into two 256-bit shuffles and fuses the results back
10905/// together based on the available instructions.
10906static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10907 MVT VT, const X86Subtarget *Subtarget,
10908 SelectionDAG &DAG) {
10909 SDLoc DL(Op);
10910 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10911 ArrayRef<int> Mask = SVOp->getMask();
10912 assert(Subtarget->hasAVX512() &&
10913 "Cannot lower 512-bit vectors w/ basic ISA!");
10914
10915 // Check for being able to broadcast a single element.
10916 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10917 Mask, Subtarget, DAG))
10918 return Broadcast;
10919
10920 // Dispatch to each element type for lowering. If we don't have support for
10921 // specific element type shuffles at 512 bits, immediately split them and
10922 // lower them. Each lowering routine of a given type is allowed to assume that
10923 // the requisite ISA extensions for that element type are available.
10924 switch (VT.SimpleTy) {
10925 case MVT::v8f64:
10926 return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10927 case MVT::v16f32:
10928 return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10929 case MVT::v8i64:
10930 return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10931 case MVT::v16i32:
10932 return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10933 case MVT::v32i16:
10934 if (Subtarget->hasBWI())
10935 return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10936 break;
10937 case MVT::v64i8:
10938 if (Subtarget->hasBWI())
10939 return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10940 break;
10941
10942 default:
10943 llvm_unreachable("Not a valid 512-bit x86 vector type!");
10944 }
10945
10946 // Otherwise fall back on splitting.
10947 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10948}
10949
10950/// \brief Top-level lowering for x86 vector shuffles.
10951///
10952/// This handles decomposition, canonicalization, and lowering of all x86
10953/// vector shuffles. Most of the specific lowering strategies are encapsulated
10954/// above in helper routines. The canonicalization attempts to widen shuffles
10955/// to involve fewer lanes of wider elements, consolidate symmetric patterns
10956/// s.t. only one of the two inputs needs to be tested, etc.
10957static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10958 SelectionDAG &DAG) {
10959 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10960 ArrayRef<int> Mask = SVOp->getMask();
10961 SDValue V1 = Op.getOperand(0);
10962 SDValue V2 = Op.getOperand(1);
10963 MVT VT = Op.getSimpleValueType();
10964 int NumElements = VT.getVectorNumElements();
10965 SDLoc dl(Op);
10966
10967 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10968
10969 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10970 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10971 if (V1IsUndef && V2IsUndef)
10972 return DAG.getUNDEF(VT);
10973
10974 // When we create a shuffle node we put the UNDEF node as the second operand,
10975 // but in some cases the first operand may be transformed to UNDEF.
10976 // In this case we should just commute the node.
10977 if (V1IsUndef)
10978 return DAG.getCommutedVectorShuffle(*SVOp);
10979
10980 // Check for non-undef masks pointing at an undef vector and make the masks
10981 // undef as well. This makes it easier to match the shuffle based solely on
10982 // the mask.
10983 if (V2IsUndef)
10984 for (int M : Mask)
10985 if (M >= NumElements) {
10986 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10987 for (int &M : NewMask)
10988 if (M >= NumElements)
10989 M = -1;
10990 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
10991 }
10992
10993 // Try to collapse shuffles into using a vector type with fewer elements but
10994 // wider element types. We cap this to not form integers or floating point
10995 // elements wider than 64 bits, but it might be interesting to form i128
10996 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
10997 SmallVector<int, 16> WidenedMask;
10998 if (VT.getScalarSizeInBits() < 64 &&
10999 canWidenShuffleElements(Mask, WidenedMask)) {
11000 MVT NewEltVT = VT.isFloatingPoint()
11001 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11002 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11003 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11004 // Make sure that the new vector type is legal. For example, v2f64 isn't
11005 // legal on SSE1.
11006 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11007 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11008 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11009 return DAG.getNode(ISD::BITCAST, dl, VT,
11010 DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11011 }
11012 }
11013
11014 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11015 for (int M : SVOp->getMask())
11016 if (M < 0)
11017 ++NumUndefElements;
11018 else if (M < NumElements)
11019 ++NumV1Elements;
11020 else
11021 ++NumV2Elements;
11022
11023 // Commute the shuffle as needed such that more elements come from V1 than
11024 // V2. This allows us to match the shuffle pattern strictly on how many
11025 // elements come from V1 without handling the symmetric cases.
11026 if (NumV2Elements > NumV1Elements)
11027 return DAG.getCommutedVectorShuffle(*SVOp);
11028
11029 // When the number of V1 and V2 elements are the same, try to minimize the
11030 // number of uses of V2 in the low half of the vector. When that is tied,
11031 // ensure that the sum of indices for V1 is equal to or lower than the sum
11032 // of indices for V2. When those are equal, try to ensure that the number of odd
11033 // indices for V1 is lower than the number of odd indices for V2.
11034 if (NumV1Elements == NumV2Elements) {
11035 int LowV1Elements = 0, LowV2Elements = 0;
11036 for (int M : SVOp->getMask().slice(0, NumElements / 2))
11037 if (M >= NumElements)
11038 ++LowV2Elements;
11039 else if (M >= 0)
11040 ++LowV1Elements;
11041 if (LowV2Elements > LowV1Elements) {
11042 return DAG.getCommutedVectorShuffle(*SVOp);
11043 } else if (LowV2Elements == LowV1Elements) {
11044 int SumV1Indices = 0, SumV2Indices = 0;
11045 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11046 if (SVOp->getMask()[i] >= NumElements)
11047 SumV2Indices += i;
11048 else if (SVOp->getMask()[i] >= 0)
11049 SumV1Indices += i;
11050 if (SumV2Indices < SumV1Indices) {
11051 return DAG.getCommutedVectorShuffle(*SVOp);
11052 } else if (SumV2Indices == SumV1Indices) {
11053 int NumV1OddIndices = 0, NumV2OddIndices = 0;
11054 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11055 if (SVOp->getMask()[i] >= NumElements)
11056 NumV2OddIndices += i % 2;
11057 else if (SVOp->getMask()[i] >= 0)
11058 NumV1OddIndices += i % 2;
11059 if (NumV2OddIndices < NumV1OddIndices)
11060 return DAG.getCommutedVectorShuffle(*SVOp);
11061 }
11062 }
11063 }
11064
11065 // For each vector width, delegate to a specialized lowering routine.
11066 if (VT.getSizeInBits() == 128)
11067 return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11068
11069 if (VT.getSizeInBits() == 256)
11070 return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11071
11072 // Force AVX-512 vectors to be scalarized for now.
11073 // FIXME: Implement AVX-512 support!
11074 if (VT.getSizeInBits() == 512)
11075 return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11076
11077 llvm_unreachable("Unimplemented!");
11078}
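The widening step above tries to collapse each pair of mask entries into one entry for a vector of half as many, twice-as-wide elements. A simplified standalone model of the pairing rule (hypothetical name; undef-tolerant, aligned pairs only, not the exact LLVM helper):

#include <cstdio>
#include <vector>

// Simplified model: a pair (Mask[2i], Mask[2i+1]) can be widened when both are
// undef, or when the defined entries are consistent with one aligned pair
// (even index M followed by M + 1) in the source vectors.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Widened.push_back(-1);
    } else if (Lo < 0 && (Hi % 2) == 1) {
      Widened.push_back(Hi / 2);
    } else if (Hi < 0 && (Lo % 2) == 0) {
      Widened.push_back(Lo / 2);
    } else if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1) {
      Widened.push_back(Lo / 2);
    } else {
      return false; // the pair straddles wider elements; keep the narrow shuffle
    }
  }
  return true;
}

int main() {
  std::vector<int> Widened;
  // A v8i16 mask that moves whole 32-bit chunks widens to a v4i32 mask.
  if (widenShuffleMask({0, 1, 10, 11, 4, 5, -1, -1}, Widened))
    for (int M : Widened)
      std::printf("%d ", M);            // 0 5 2 -1
  std::printf("\n");
  return 0;
}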
11079
11080
11081//===----------------------------------------------------------------------===//
11082// Legacy vector shuffle lowering
11083//
11084// This code is the legacy code handling vector shuffles until the above
11085 // replaces it in both functionality and performance.
11086//===----------------------------------------------------------------------===//
11087
11088static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11089 bool hasInt256, unsigned *MaskOut = nullptr) {
11090 MVT EltVT = VT.getVectorElementType();
11091
11092 // There is no blend with immediate in AVX-512.
11093 if (VT.is512BitVector())
11094 return false;
11095
11096 if (!hasSSE41 || EltVT == MVT::i8)
11097 return false;
11098 if (!hasInt256 && VT == MVT::v16i16)
11099 return false;
11100
11101 unsigned MaskValue = 0;
11102 unsigned NumElems = VT.getVectorNumElements();
11103 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11104 unsigned NumLanes = (NumElems - 1) / 8 + 1;
11105 unsigned NumElemsInLane = NumElems / NumLanes;
11106
11107 // Blend for v16i16 should be symmetric for both lanes.
11108 for (unsigned i = 0; i < NumElemsInLane; ++i) {
11109
11110 int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11111 int EltIdx = MaskVals[i];
11112
11113 if ((EltIdx < 0 || EltIdx == (int)i) &&
11114 (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11115 continue;
11116
11117 if (((unsigned)EltIdx == (i + NumElems)) &&
11118 (SndLaneEltIdx < 0 ||
11119 (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11120 MaskValue |= (1 << i);
11121 else
11122 return false;
11123 }
11124
11125 if (MaskOut)
11126 *MaskOut = MaskValue;
11127 return true;
11128}
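MaskValue computed above is a blend immediate in which bit i selects element i of the second source, mirrored across lanes for v16i16. A scalar model of how such an immediate picks elements (hypothetical names, not an encoding of BLENDI itself):

#include <cstdio>
#include <vector>

// Scalar model of a blend-with-immediate: result[i] comes from B when the
// (i % LaneElems)-th bit of the immediate is set, otherwise from A. Reusing the
// bit across lanes models the symmetric v16i16 requirement checked above.
static std::vector<int> blendByImmediate(const std::vector<int> &A,
                                         const std::vector<int> &B,
                                         unsigned Imm, unsigned LaneElems) {
  std::vector<int> R(A.size());
  for (size_t i = 0; i < A.size(); ++i)
    R[i] = (Imm >> (i % LaneElems)) & 1 ? B[i] : A[i];
  return R;
}

int main() {
  std::vector<int> A = {0, 1, 2, 3}, B = {10, 11, 12, 13};
  // Immediate 0b0110 takes elements 1 and 2 from B.
  for (int V : blendByImmediate(A, B, 0x6, 4))
    std::printf("%d ", V);              // 0 11 12 3
  std::printf("\n");
  return 0;
}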
11129
11130// Try to lower a shuffle node into a simple blend instruction.
11131// This function assumes isBlendMask returns true for this
11132 // ShuffleVectorSDNode.
11133static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11134 unsigned MaskValue,
11135 const X86Subtarget *Subtarget,
11136 SelectionDAG &DAG) {
11137 MVT VT = SVOp->getSimpleValueType(0);
11138 MVT EltVT = VT.getVectorElementType();
11139 assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11140 Subtarget->hasInt256() && "Trying to lower a "
11141 "VECTOR_SHUFFLE to a Blend but "
11142 "with the wrong mask"));
11143 SDValue V1 = SVOp->getOperand(0);
11144 SDValue V2 = SVOp->getOperand(1);
11145 SDLoc dl(SVOp);
11146 unsigned NumElems = VT.getVectorNumElements();
11147
11148 // Convert i32 vectors to floating point if it is not AVX2.
11149 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11150 MVT BlendVT = VT;
11151 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11152 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11153 NumElems);
11154 V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11155 V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11156 }
11157
11158 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11159 DAG.getConstant(MaskValue, MVT::i32));
11160 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11161}
11162
11163/// In vector type \p VT, return true if the element at index \p InputIdx
11164/// falls on a different 128-bit lane than \p OutputIdx.
11165static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11166 unsigned OutputIdx) {
11167 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11168 return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11169}
11170
11171/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
11172/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
11173/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
11174/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11175/// zero.
11176static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11177 SelectionDAG &DAG) {
11178 MVT VT = V1.getSimpleValueType();
11179 assert(VT.is128BitVector() || VT.is256BitVector());
11180
11181 MVT EltVT = VT.getVectorElementType();
11182 unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11183 unsigned NumElts = VT.getVectorNumElements();
11184
11185 SmallVector<SDValue, 32> PshufbMask;
11186 for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11187 int InputIdx = MaskVals[OutputIdx];
11188 unsigned InputByteIdx;
11189
11190 if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11191 InputByteIdx = 0x80;
11192 else {
11193 // Cross lane is not allowed.
11194 if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11195 return SDValue();
11196 InputByteIdx = InputIdx * EltSizeInBytes;
11197 // Index is a byte offset within the 128-bit lane.
11198 InputByteIdx &= 0xf;
11199 }
11200
11201 for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11202 PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11203 if (InputByteIdx != 0x80)
11204 ++InputByteIdx;
11205 }
11206 }
11207
11208 MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11209 if (ShufVT != VT)
11210 V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11211 return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11212 DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11213}
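getPSHUFB emits 0x80 to zero a result byte and otherwise an in-lane byte offset. A scalar model of that PSHUFB behaviour under the assumption of independent 16-byte lanes (hypothetical names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of PSHUFB: within each 16-byte lane, a mask byte with the high
// bit set writes zero; otherwise its low four bits select a byte from the same
// lane of the source.
static std::vector<uint8_t> pshufbModel(const std::vector<uint8_t> &Src,
                                        const std::vector<uint8_t> &Mask) {
  std::vector<uint8_t> R(Src.size(), 0);
  for (size_t i = 0; i < Src.size(); ++i) {
    size_t LaneBase = (i / 16) * 16;
    if (Mask[i] & 0x80)
      R[i] = 0;                              // 0x80 zeroes the slot
    else
      R[i] = Src[LaneBase + (Mask[i] & 0x0f)];
  }
  return R;
}

int main() {
  std::vector<uint8_t> Src(16), Mask(16);
  for (int i = 0; i < 16; ++i) {
    Src[i] = static_cast<uint8_t>(100 + i);
    Mask[i] = static_cast<uint8_t>(15 - i);  // reverse the lane
  }
  Mask[0] = 0x80;                            // zero out the first result byte
  for (uint8_t B : pshufbModel(Src, Mask))
    std::printf("%d ", B);                   // 0 114 113 ... 101 100
  std::printf("\n");
  return 0;
}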
11214
11215// v8i16 shuffles - Prefer shuffles in the following order:
11216// 1. [all] pshuflw, pshufhw, optional move
11217// 2. [ssse3] 1 x pshufb
11218// 3. [ssse3] 2 x pshufb + 1 x por
11219// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11220static SDValue
11221LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11222 SelectionDAG &DAG) {
11223 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11224 SDValue V1 = SVOp->getOperand(0);
11225 SDValue V2 = SVOp->getOperand(1);
11226 SDLoc dl(SVOp);
11227 SmallVector<int, 8> MaskVals;
11228
11229 // Determine if more than 1 of the words in each of the low and high quadwords
11230 // of the result come from the same quadword of one of the two inputs. Undef
11231 // mask values count as coming from any quadword, for better codegen.
11232 //
11233 // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11234 // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11235 unsigned LoQuad[] = { 0, 0, 0, 0 };
11236 unsigned HiQuad[] = { 0, 0, 0, 0 };
11237 // Indices of quads used.
11238 std::bitset<4> InputQuads;
11239 for (unsigned i = 0; i < 8; ++i) {
11240 unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11241 int EltIdx = SVOp->getMaskElt(i);
11242 MaskVals.push_back(EltIdx);
11243 if (EltIdx < 0) {
11244 ++Quad[0];
11245 ++Quad[1];
11246 ++Quad[2];
11247 ++Quad[3];
11248 continue;
11249 }
11250 ++Quad[EltIdx / 4];
11251 InputQuads.set(EltIdx / 4);
11252 }
11253
11254 int BestLoQuad = -1;
11255 unsigned MaxQuad = 1;
11256 for (unsigned i = 0; i < 4; ++i) {
11257 if (LoQuad[i] > MaxQuad) {
11258 BestLoQuad = i;
11259 MaxQuad = LoQuad[i];
11260 }
11261 }
11262
11263 int BestHiQuad = -1;
11264 MaxQuad = 1;
11265 for (unsigned i = 0; i < 4; ++i) {
11266 if (HiQuad[i] > MaxQuad) {
11267 BestHiQuad = i;
11268 MaxQuad = HiQuad[i];
11269 }
11270 }
11271
11272 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
11273 // of the two input vectors, shuffle them into one input vector so only a
11274 // single pshufb instruction is necessary. If there are more than 2 input
11275 // quads, disable the next transformation since it does not help SSSE3.
11276 bool V1Used = InputQuads[0] || InputQuads[1];
11277 bool V2Used = InputQuads[2] || InputQuads[3];
11278 if (Subtarget->hasSSSE3()) {
11279 if (InputQuads.count() == 2 && V1Used && V2Used) {
11280 BestLoQuad = InputQuads[0] ? 0 : 1;
11281 BestHiQuad = InputQuads[2] ? 2 : 3;
11282 }
11283 if (InputQuads.count() > 2) {
11284 BestLoQuad = -1;
11285 BestHiQuad = -1;
11286 }
11287 }
11288
11289 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11290 // the shuffle mask. If a quad is scored as -1, that means that it contains
11291 // words from all 4 input quadwords.
11292 SDValue NewV;
11293 if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11294 int MaskV[] = {
11295 BestLoQuad < 0 ? 0 : BestLoQuad,
11296 BestHiQuad < 0 ? 1 : BestHiQuad
11297 };
11298 NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11299 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11300 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11301 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11302
11303 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11304 // source words for the shuffle, to aid later transformations.
11305 bool AllWordsInNewV = true;
11306 bool InOrder[2] = { true, true };
11307 for (unsigned i = 0; i != 8; ++i) {
11308 int idx = MaskVals[i];
11309 if (idx != (int)i)
11310 InOrder[i/4] = false;
11311 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11312 continue;
11313 AllWordsInNewV = false;
11314 break;
11315 }
11316
11317 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11318 if (AllWordsInNewV) {
11319 for (int i = 0; i != 8; ++i) {
11320 int idx = MaskVals[i];
11321 if (idx < 0)
11322 continue;
11323 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11324 if ((idx != i) && idx < 4)
11325 pshufhw = false;
11326 if ((idx != i) && idx > 3)
11327 pshuflw = false;
11328 }
11329 V1 = NewV;
11330 V2Used = false;
11331 BestLoQuad = 0;
11332 BestHiQuad = 1;
11333 }
11334
11335 // If we've eliminated the use of V2, and the new mask is a pshuflw or
11336 // pshufhw, that's as cheap as it gets. Return the new shuffle.
11337 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11338 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11339 unsigned TargetMask = 0;
11340 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11341 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11342 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11343 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11344 getShufflePSHUFLWImmediate(SVOp);
11345 V1 = NewV.getOperand(0);
11346 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11347 }
11348 }
11349
11350 // Promote splats to a larger type which usually leads to more efficient code.
11351 // FIXME: Is this true if pshufb is available?
11352 if (SVOp->isSplat())
11353 return PromoteSplat(SVOp, DAG);
11354
11355 // If we have SSSE3, and all words of the result are from 1 input vector,
11356 // case 2 is generated, otherwise case 3 is generated. If no SSSE3
11357 // is present, fall back to case 4.
11358 if (Subtarget->hasSSSE3()) {
11359 SmallVector<SDValue,16> pshufbMask;
11360
11361 // If we have elements from both input vectors, set the high bit of the
11362 // shuffle mask element to zero out elements that come from V2 in the V1
11363 // mask, and elements that come from V1 in the V2 mask, so that the two
11364 // results can be OR'd together.
11365 bool TwoInputs = V1Used && V2Used;
11366 V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11367 if (!TwoInputs)
11368 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11369
11370 // Calculate the shuffle mask for the second input, shuffle it, and
11371 // OR it with the first shuffled input.
11372 CommuteVectorShuffleMask(MaskVals, 8);
11373 V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11374 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11375 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11376 }
11377
11378 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11379 // and update MaskVals with new element order.
11380 std::bitset<8> InOrder;
11381 if (BestLoQuad >= 0) {
11382 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11383 for (int i = 0; i != 4; ++i) {
11384 int idx = MaskVals[i];
11385 if (idx < 0) {
11386 InOrder.set(i);
11387 } else if ((idx / 4) == BestLoQuad) {
11388 MaskV[i] = idx & 3;
11389 InOrder.set(i);
11390 }
11391 }
11392 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11393 &MaskV[0]);
11394
11395 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11396 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11397 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11398 NewV.getOperand(0),
11399 getShufflePSHUFLWImmediate(SVOp), DAG);
11400 }
11401 }
11402
11403 // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11404 // and update MaskVals with the new element order.
11405 if (BestHiQuad >= 0) {
11406 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11407 for (unsigned i = 4; i != 8; ++i) {
11408 int idx = MaskVals[i];
11409 if (idx < 0) {
11410 InOrder.set(i);
11411 } else if ((idx / 4) == BestHiQuad) {
11412 MaskV[i] = (idx & 3) + 4;
11413 InOrder.set(i);
11414 }
11415 }
11416 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11417 &MaskV[0]);
11418
11419 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11420 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11421 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11422 NewV.getOperand(0),
11423 getShufflePSHUFHWImmediate(SVOp), DAG);
11424 }
11425 }
11426
11427 // In case BestHi & BestLo were both -1, which means each quadword has a word
11428 // from each of the four input quadwords, calculate the InOrder bitvector now
11429 // before falling through to the insert/extract cleanup.
11430 if (BestLoQuad == -1 && BestHiQuad == -1) {
11431 NewV = V1;
11432 for (int i = 0; i != 8; ++i)
11433 if (MaskVals[i] < 0 || MaskVals[i] == i)
11434 InOrder.set(i);
11435 }
11436
11437 // The other elements are put in the right place using pextrw and pinsrw.
11438 for (unsigned i = 0; i != 8; ++i) {
11439 if (InOrder[i])
11440 continue;
11441 int EltIdx = MaskVals[i];
11442 if (EltIdx < 0)
11443 continue;
11444 SDValue ExtOp = (EltIdx < 8) ?
11445 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11446 DAG.getIntPtrConstant(EltIdx)) :
11447 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11448 DAG.getIntPtrConstant(EltIdx - 8));
11449 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11450 DAG.getIntPtrConstant(i));
11451 }
11452 return NewV;
11453}
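The v8i16 routine above scores each input quadword by how many result words it feeds, crediting undef words to every quad, and only keeps a quad that supplies more than one word. A standalone sketch of that scoring for the low half of the result (hypothetical name):

#include <array>
#include <cstdio>

// Hypothetical model of the quad scoring: for the four low result words, count
// how many come from each of the four input quadwords (0-1 in V1, 2-3 in V2);
// undef entries count toward every quad. The "best" quad must feed > 1 word.
static int bestLowQuad(const std::array<int, 8> &Mask) {
  std::array<unsigned, 4> LoQuad = {0, 0, 0, 0};
  for (int i = 0; i < 4; ++i) {
    int EltIdx = Mask[i];
    if (EltIdx < 0) {
      for (unsigned &Q : LoQuad)
        ++Q;
      continue;
    }
    ++LoQuad[EltIdx / 4];
  }
  int Best = -1;
  unsigned MaxQuad = 1;
  for (int q = 0; q < 4; ++q)
    if (LoQuad[q] > MaxQuad) {
      Best = q;
      MaxQuad = LoQuad[q];
    }
  return Best;
}

int main() {
  // Low words <9, 8, -1, 11, ...> mostly come from quad 2 (words 8-11, i.e. V2's low quad).
  std::printf("%d\n", bestLowQuad({9, 8, -1, 11, 0, 1, 2, 3})); // prints 2
  return 0;
}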
11454
11455/// \brief v16i16 shuffles
11456///
11457/// FIXME: We only support generation of a single pshufb currently. We can
11458/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11459/// well (e.g. 2 x pshufb + 1 x por).
11460static SDValue
11461LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11462 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11463 SDValue V1 = SVOp->getOperand(0);
11464 SDValue V2 = SVOp->getOperand(1);
11465 SDLoc dl(SVOp);
11466
11467 if (V2.getOpcode() != ISD::UNDEF)
11468 return SDValue();
11469
11470 SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11471 return getPSHUFB(MaskVals, V1, dl, DAG);
11472}
11473
11474// v16i8 shuffles - Prefer shuffles in the following order:
11475// 1. [ssse3] 1 x pshufb
11476// 2. [ssse3] 2 x pshufb + 1 x por
11477// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
11478static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11479 const X86Subtarget* Subtarget,
11480 SelectionDAG &DAG) {
11481 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11482 SDValue V1 = SVOp->getOperand(0);
11483 SDValue V2 = SVOp->getOperand(1);
11484 SDLoc dl(SVOp);
11485 ArrayRef<int> MaskVals = SVOp->getMask();
11486
11487 // Promote splats to a larger type which usually leads to more efficient code.
11488 // FIXME: Is this true if pshufb is available?
11489 if (SVOp->isSplat())
11490 return PromoteSplat(SVOp, DAG);
11491
11492 // If we have SSSE3, case 1 is generated when all result bytes come from
11493 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
11494 // present, fall back to case 3.
11495
11496 // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11497 if (Subtarget->hasSSSE3()) {
11498 SmallVector<SDValue,16> pshufbMask;
11499
11500 // If all result elements are from one input vector, then only translate
11501 // undef mask values to 0x80 (zero out result) in the pshufb mask.
11502 //
11503 // Otherwise, we have elements from both input vectors, and must zero out
11504 // elements that come from V2 in the first mask, and V1 in the second mask
11505 // so that we can OR them together.
11506 for (unsigned i = 0; i != 16; ++i) {
11507 int EltIdx = MaskVals[i];
11508 if (EltIdx < 0 || EltIdx >= 16)
11509 EltIdx = 0x80;
11510 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11511 }
11512 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11513 DAG.getNode(ISD::BUILD_VECTOR, dl,
11514 MVT::v16i8, pshufbMask));
11515
11516 // As PSHUFB will zero elements with negative indices, it's safe to ignore
11517 // the 2nd operand if it's undefined or zero.
11518 if (V2.getOpcode() == ISD::UNDEF ||
11519 ISD::isBuildVectorAllZeros(V2.getNode()))
11520 return V1;
11521
11522 // Calculate the shuffle mask for the second input, shuffle it, and
11523 // OR it with the first shuffled input.
11524 pshufbMask.clear();
11525 for (unsigned i = 0; i != 16; ++i) {
11526 int EltIdx = MaskVals[i];
11527 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11528 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11529 }
11530 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11531 DAG.getNode(ISD::BUILD_VECTOR, dl,
11532 MVT::v16i8, pshufbMask));
11533 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11534 }
11535
11536 // No SSSE3 - Calculate in-place words and then fix all out-of-place words
11537 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
11538 // the 16 different words that comprise the two doublequadword input vectors.
11539 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11540 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11541 SDValue NewV = V1;
11542 for (int i = 0; i != 8; ++i) {
11543 int Elt0 = MaskVals[i*2];
11544 int Elt1 = MaskVals[i*2+1];
11545
11546 // This word of the result is all undef, skip it.
11547 if (Elt0 < 0 && Elt1 < 0)
11548 continue;
11549
11550 // This word of the result is already in the correct place, skip it.
11551 if ((Elt0 == i*2) && (Elt1 == i*2+1))
11552 continue;
11553
11554 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11555 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11556 SDValue InsElt;
11557
11558 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
11559 // together using a single extract, load it and store it.
11560 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11561 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11562 DAG.getIntPtrConstant(Elt1 / 2));
11563 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11564 DAG.getIntPtrConstant(i));
11565 continue;
11566 }
11567
11568 // If Elt1 is defined, extract it from the appropriate source. If the
11569 // source byte is not also odd, shift the extracted word left 8 bits;
11570 // otherwise clear the bottom 8 bits if we need to do an OR.
11571 if (Elt1 >= 0) {
11572 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11573 DAG.getIntPtrConstant(Elt1 / 2));
11574 if ((Elt1 & 1) == 0)
11575 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11576 DAG.getConstant(8,
11577 TLI.getShiftAmountTy(InsElt.getValueType())));
11578 else if (Elt0 >= 0)
11579 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11580 DAG.getConstant(0xFF00, MVT::i16));
11581 }
11582 // If Elt0 is defined, extract it from the appropriate source. If the
11583 // source byte is not also even, shift the extracted word right 8 bits. If
11584 // Elt1 was also defined, OR the extracted values together before
11585 // inserting them in the result.
11586 if (Elt0 >= 0) {
11587 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11588 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11589 if ((Elt0 & 1) != 0)
11590 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11591 DAG.getConstant(8,
11592 TLI.getShiftAmountTy(InsElt0.getValueType())));
11593 else if (Elt1 >= 0)
11594 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11595 DAG.getConstant(0x00FF, MVT::i16));
11596 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11597 : InsElt0;
11598 }
11599 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11600 DAG.getIntPtrConstant(i));
11601 }
11602 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11603}
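When result bytes come from both inputs, the SSSE3 path above shuffles each input with a mask that zeroes the other input's slots (byte index 0x80) and then ORs the two results. A scalar model of that two-shuffle-plus-OR merge (hypothetical names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the two-pshufb-plus-por merge for one 16-byte lane: bytes
// sourced from the other input are zeroed (the 0x80 behaviour), so OR-ing the
// two shuffled halves reassembles the full result.
static std::vector<uint8_t> shuffleOrMerge(const std::vector<uint8_t> &V1,
                                           const std::vector<uint8_t> &V2,
                                           const std::vector<int> &Mask) {
  std::vector<uint8_t> R(Mask.size(), 0);
  for (size_t i = 0; i < Mask.size(); ++i) {
    int M = Mask[i];
    uint8_t FromV1 = (M >= 0 && M < 16) ? V1[M] : 0;        // V1 pass
    uint8_t FromV2 = (M >= 16 && M < 32) ? V2[M - 16] : 0;  // V2 pass
    R[i] = FromV1 | FromV2;                                  // the final POR
  }
  return R;
}

int main() {
  std::vector<uint8_t> V1(16), V2(16);
  for (int i = 0; i < 16; ++i) {
    V1[i] = static_cast<uint8_t>(i);
    V2[i] = static_cast<uint8_t>(100 + i);
  }
  // Interleave the first bytes of V1 and V2: <0, 16, 1, 17, ...>.
  std::vector<int> Mask = {0, 16, 1, 17, 2, 18, 3, 19};
  for (uint8_t B : shuffleOrMerge(V1, V2, Mask))
    std::printf("%d ", B);              // 0 100 1 101 2 102 3 103
  std::printf("\n");
  return 0;
}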
11604
11605// v32i8 shuffles - Translate to VPSHUFB if possible.
11606static
11607SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11608 const X86Subtarget *Subtarget,
11609 SelectionDAG &DAG) {
11610 MVT VT = SVOp->getSimpleValueType(0);
11611 SDValue V1 = SVOp->getOperand(0);
11612 SDValue V2 = SVOp->getOperand(1);
11613 SDLoc dl(SVOp);
11614 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11615
11616 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11617 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11618 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11619
11620 // VPSHUFB may be generated if
11621 // (1) one of the input vectors is undefined or zeroinitializer.
11622 // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11623 // And (2) the mask indexes don't cross a 128-bit lane boundary.
11624 if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11625 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11626 return SDValue();
11627
11628 if (V1IsAllZero && !V2IsAllZero) {
11629 CommuteVectorShuffleMask(MaskVals, 32);
11630 V1 = V2;
11631 }
11632 return getPSHUFB(MaskVals, V1, dl, DAG);
11633}
11634
11635/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11636/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11637/// done when every pair / quad of shuffle mask elements points to elements in
11638/// the right sequence. e.g.
11639/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11640static
11641SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11642 SelectionDAG &DAG) {
11643 MVT VT = SVOp->getSimpleValueType(0);
11644 SDLoc dl(SVOp);
11645 unsigned NumElems = VT.getVectorNumElements();
11646 MVT NewVT;
11647 unsigned Scale;
11648 switch (VT.SimpleTy) {
11649 default: llvm_unreachable("Unexpected!");
11650 case MVT::v2i64:
11651 case MVT::v2f64:
11652 return SDValue(SVOp, 0);
11653 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
11654 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
11655 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
11656 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
11657 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11658 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
11659 }
11660
11661 SmallVector<int, 8> MaskVec;
11662 for (unsigned i = 0; i != NumElems; i += Scale) {
11663 int StartIdx = -1;
11664 for (unsigned j = 0; j != Scale; ++j) {
11665 int EltIdx = SVOp->getMaskElt(i+j);
11666 if (EltIdx < 0)
11667 continue;
11668 if (StartIdx < 0)
11669 StartIdx = (EltIdx / Scale);
11670 if (EltIdx != (int)(StartIdx*Scale + j))
11671 return SDValue();
11672 }
11673 MaskVec.push_back(StartIdx);
11674 }
11675
11676 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11677 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11678 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11679}
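// Illustrative sketch (not part of the original source). It mirrors the mask
// check performed by RewriteAsNarrowerShuffle above: each group of 'Scale'
// consecutive mask elements must address 'Scale' consecutive source elements
// starting at a multiple of 'Scale', and the whole group then collapses to a
// single wide-element index. The helper name narrowShuffleMask is hypothetical.
#include <vector>

static bool narrowShuffleMask(const std::vector<int> &Mask, unsigned Scale,
                              std::vector<int> &NarrowMask) {
  NarrowMask.clear();
  for (size_t i = 0; i < Mask.size(); i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j < Scale; ++j) {
      int EltIdx = Mask[i + j];
      if (EltIdx < 0)
        continue;                        // undef elements match anything
      if (StartIdx < 0)
        StartIdx = EltIdx / Scale;       // first defined element picks the group
      if (EltIdx != StartIdx * static_cast<int>(Scale) + static_cast<int>(j))
        return false;                    // elements are not in the right sequence
    }
    NarrowMask.push_back(StartIdx);      // -1 if the whole group was undef
  }
  return true;
}
// Example: the v8i16 mask <2,3, 10,11, 0,1, 14,15> with Scale = 2 narrows to
// the v4i32 mask <1, 5, 0, 7>.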
11680
11681/// getVZextMovL - Return a zero-extending vector move low node.
11682///
11683static SDValue getVZextMovL(MVT VT, MVT OpVT,
11684 SDValue SrcOp, SelectionDAG &DAG,
11685 const X86Subtarget *Subtarget, SDLoc dl) {
11686 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11687 LoadSDNode *LD = nullptr;
11688 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11689 LD = dyn_cast<LoadSDNode>(SrcOp);
11690 if (!LD) {
11691 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11692 // instead.
11693 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11694 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11695 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11696 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11697 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11698 // PR2108
11699 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11700 return DAG.getNode(ISD::BITCAST, dl, VT,
11701 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11702 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11703 OpVT,
11704 SrcOp.getOperand(0)
11705 .getOperand(0))));
11706 }
11707 }
11708 }
11709
11710 return DAG.getNode(ISD::BITCAST, dl, VT,
11711 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11712 DAG.getNode(ISD::BITCAST, dl,
11713 OpVT, SrcOp)));
11714}
11715
11716 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
11717 /// which could not be matched by any known target specific shuffle
11718static SDValue
11719LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11720
11721 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11722 if (NewOp.getNode())
11723 return NewOp;
11724
11725 MVT VT = SVOp->getSimpleValueType(0);
11726
11727 unsigned NumElems = VT.getVectorNumElements();
11728 unsigned NumLaneElems = NumElems / 2;
11729
11730 SDLoc dl(SVOp);
11731 MVT EltVT = VT.getVectorElementType();
11732 MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11733 SDValue Output[2];
11734
11735 SmallVector<int, 16> Mask;
11736 for (unsigned l = 0; l < 2; ++l) {
11737 // Build a shuffle mask for the output, discovering on the fly which
11738 // input vectors to use as shuffle operands (recorded in InputUsed).
11739 // If building a suitable shuffle vector proves too hard, then bail
11740 // out with UseBuildVector set.
11741 bool UseBuildVector = false;
11742 int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11743 unsigned LaneStart = l * NumLaneElems;
11744 for (unsigned i = 0; i != NumLaneElems; ++i) {
11745 // The mask element. This indexes into the input.
11746 int Idx = SVOp->getMaskElt(i+LaneStart);
11747 if (Idx < 0) {
11748 // the mask element does not index into any input vector.
11749 Mask.push_back(-1);
11750 continue;
11751 }
11752
11753 // The input vector this mask element indexes into.
11754 int Input = Idx / NumLaneElems;
11755
11756 // Turn the index into an offset from the start of the input vector.
11757 Idx -= Input * NumLaneElems;
11758
11759 // Find or create a shuffle vector operand to hold this input.
11760 unsigned OpNo;
11761 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11762 if (InputUsed[OpNo] == Input)
11763 // This input vector is already an operand.
11764 break;
11765 if (InputUsed[OpNo] < 0) {
11766 // Create a new operand for this input vector.
11767 InputUsed[OpNo] = Input;
11768 break;
11769 }
11770 }
11771
11772 if (OpNo >= array_lengthof(InputUsed)) {
11773 // More than two input vectors used! Give up on trying to create a
11774 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
11775 UseBuildVector = true;
11776 break;
11777 }
11778
11779 // Add the mask index for the new shuffle vector.
11780 Mask.push_back(Idx + OpNo * NumLaneElems);
11781 }
11782
11783 if (UseBuildVector) {
11784 SmallVector<SDValue, 16> SVOps;
11785 for (unsigned i = 0; i != NumLaneElems; ++i) {
11786 // The mask element. This indexes into the input.
11787 int Idx = SVOp->getMaskElt(i+LaneStart);
11788 if (Idx < 0) {
11789 SVOps.push_back(DAG.getUNDEF(EltVT));
11790 continue;
11791 }
11792
11793 // The input vector this mask element indexes into.
11794 int Input = Idx / NumElems;
11795
11796 // Turn the index into an offset from the start of the input vector.
11797 Idx -= Input * NumElems;
11798
11799 // Extract the vector element by hand.
11800 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11801 SVOp->getOperand(Input),
11802 DAG.getIntPtrConstant(Idx)));
11803 }
11804
11805 // Construct the output using a BUILD_VECTOR.
11806 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11807 } else if (InputUsed[0] < 0) {
11808 // No input vectors were used! The result is undefined.
11809 Output[l] = DAG.getUNDEF(NVT);
11810 } else {
11811 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11812 (InputUsed[0] % 2) * NumLaneElems,
11813 DAG, dl);
11814 // If only one input was used, use an undefined vector for the other.
11815 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11816 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11817 (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11818 // At least one input vector was used. Create a new shuffle vector.
11819 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11820 }
11821
11822 Mask.clear();
11823 }
11824
11825 // Concatenate the per-half results back into one 256-bit vector.
11826 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11827}
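// Illustrative sketch (not part of the original source). It models the
// per-half decomposition done above: for each 128-bit half of the result we
// record which narrow source halves are used (at most two) and rewrite the
// mask indices relative to those operands; needing more than two sources
// forces the BUILD_VECTOR fallback. The names HalfShuffle and planHalf are
// hypothetical.
#include <array>
#include <vector>

struct HalfShuffle {
  std::array<int, 2> InputUsed{{-1, -1}}; // narrow source halves, -1 = unused
  std::vector<int> Mask;                  // mask relative to InputUsed operands
  bool NeedsBuildVector = false;          // more than two sources were needed
};

static HalfShuffle planHalf(const std::vector<int> &WideMask,
                            unsigned NumLaneElems, unsigned Half) {
  HalfShuffle Plan;
  for (unsigned i = 0; i != NumLaneElems; ++i) {
    int Idx = WideMask[Half * NumLaneElems + i];
    if (Idx < 0) { Plan.Mask.push_back(-1); continue; }
    int Input = Idx / static_cast<int>(NumLaneElems); // which narrow source half
    Idx -= Input * static_cast<int>(NumLaneElems);    // offset within that half
    unsigned OpNo = 0;
    for (; OpNo != 2; ++OpNo) {
      if (Plan.InputUsed[OpNo] == Input) break;       // already an operand
      if (Plan.InputUsed[OpNo] < 0) { Plan.InputUsed[OpNo] = Input; break; }
    }
    if (OpNo == 2) { Plan.NeedsBuildVector = true; break; }
    Plan.Mask.push_back(Idx + static_cast<int>(OpNo * NumLaneElems));
  }
  return Plan;
}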
11828
11829/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11830/// 4 elements, and match them with several different shuffle types.
11831static SDValue
11832LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11833 SDValue V1 = SVOp->getOperand(0);
11834 SDValue V2 = SVOp->getOperand(1);
11835 SDLoc dl(SVOp);
11836 MVT VT = SVOp->getSimpleValueType(0);
11837
11838 assert(VT.is128BitVector() && "Unsupported vector size");
11839
11840 std::pair<int, int> Locs[4];
11841 int Mask1[] = { -1, -1, -1, -1 };
11842 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11843
11844 unsigned NumHi = 0;
11845 unsigned NumLo = 0;
11846 for (unsigned i = 0; i != 4; ++i) {
11847 int Idx = PermMask[i];
11848 if (Idx < 0) {
11849 Locs[i] = std::make_pair(-1, -1);
11850 } else {
11851 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11852 if (Idx < 4) {
11853 Locs[i] = std::make_pair(0, NumLo);
11854 Mask1[NumLo] = Idx;
11855 NumLo++;
11856 } else {
11857 Locs[i] = std::make_pair(1, NumHi);
11858 if (2+NumHi < 4)
11859 Mask1[2+NumHi] = Idx;
11860 NumHi++;
11861 }
11862 }
11863 }
11864
11865 if (NumLo <= 2 && NumHi <= 2) {
11866 // No more than two elements come from either vector. This can be
11867 // implemented with two shuffles. The first shuffle gathers the elements.
11868 // The second shuffle, which takes the first shuffle as both of its
11869 // vector operands, puts the elements into the right order.
11870 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11871
11872 int Mask2[] = { -1, -1, -1, -1 };
11873
11874 for (unsigned i = 0; i != 4; ++i)
11875 if (Locs[i].first != -1) {
11876 unsigned Idx = (i < 2) ? 0 : 4;
11877 Idx += Locs[i].first * 2 + Locs[i].second;
11878 Mask2[i] = Idx;
11879 }
11880
11881 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11882 }
11883
11884 if (NumLo == 3 || NumHi == 3) {
11885 // Otherwise, we must have three elements from one vector, call it X, and
11886 // one element from the other, call it Y. First, use a shufps to build an
11887 // intermediate vector with the one element from Y and the element from X
11888 // that will be in the same half in the final destination (the indexes don't
11889 // matter). Then, use a shufps to build the final vector, taking the half
11890 // containing the element from Y from the intermediate, and the other half
11891 // from X.
11892 if (NumHi == 3) {
11893 // Normalize it so the 3 elements come from V1.
11894 CommuteVectorShuffleMask(PermMask, 4);
11895 std::swap(V1, V2);
11896 }
11897
11898 // Find the element from V2.
11899 unsigned HiIndex;
11900 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11901 int Val = PermMask[HiIndex];
11902 if (Val < 0)
11903 continue;
11904 if (Val >= 4)
11905 break;
11906 }
11907
11908 Mask1[0] = PermMask[HiIndex];
11909 Mask1[1] = -1;
11910 Mask1[2] = PermMask[HiIndex^1];
11911 Mask1[3] = -1;
11912 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11913
11914 if (HiIndex >= 2) {
11915 Mask1[0] = PermMask[0];
11916 Mask1[1] = PermMask[1];
11917 Mask1[2] = HiIndex & 1 ? 6 : 4;
11918 Mask1[3] = HiIndex & 1 ? 4 : 6;
11919 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11920 }
11921
11922 Mask1[0] = HiIndex & 1 ? 2 : 0;
11923 Mask1[1] = HiIndex & 1 ? 0 : 2;
11924 Mask1[2] = PermMask[2];
11925 Mask1[3] = PermMask[3];
11926 if (Mask1[2] >= 0)
11927 Mask1[2] += 4;
11928 if (Mask1[3] >= 0)
11929 Mask1[3] += 4;
11930 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11931 }
11932
11933 // Break it into (shuffle shuffle_hi, shuffle_lo).
11934 int LoMask[] = { -1, -1, -1, -1 };
11935 int HiMask[] = { -1, -1, -1, -1 };
11936
11937 int *MaskPtr = LoMask;
11938 unsigned MaskIdx = 0;
11939 unsigned LoIdx = 0;
11940 unsigned HiIdx = 2;
11941 for (unsigned i = 0; i != 4; ++i) {
11942 if (i == 2) {
11943 MaskPtr = HiMask;
11944 MaskIdx = 1;
11945 LoIdx = 0;
11946 HiIdx = 2;
11947 }
11948 int Idx = PermMask[i];
11949 if (Idx < 0) {
11950 Locs[i] = std::make_pair(-1, -1);
11951 } else if (Idx < 4) {
11952 Locs[i] = std::make_pair(MaskIdx, LoIdx);
11953 MaskPtr[LoIdx] = Idx;
11954 LoIdx++;
11955 } else {
11956 Locs[i] = std::make_pair(MaskIdx, HiIdx);
11957 MaskPtr[HiIdx] = Idx;
11958 HiIdx++;
11959 }
11960 }
11961
11962 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11963 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11964 int MaskOps[] = { -1, -1, -1, -1 };
11965 for (unsigned i = 0; i != 4; ++i)
11966 if (Locs[i].first != -1)
11967 MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11968 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11969}
11970
11971static bool MayFoldVectorLoad(SDValue V) {
11972 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11973 V = V.getOperand(0);
11974
11975 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
11976 V = V.getOperand(0);
11977 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
11978 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
11979 // BUILD_VECTOR (load), undef
11980 V = V.getOperand(0);
11981
11982 return MayFoldLoad(V);
11983}
11984
11985static
11986SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
11987 MVT VT = Op.getSimpleValueType();
11988
11989 // Canonicalize to v2f64.
11990 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
11991 return DAG.getNode(ISD::BITCAST, dl, VT,
11992 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
11993 V1, DAG));
11994}
11995
11996static
11997SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
11998 bool HasSSE2) {
11999 SDValue V1 = Op.getOperand(0);
12000 SDValue V2 = Op.getOperand(1);
12001 MVT VT = Op.getSimpleValueType();
12002
12003 assert(VT != MVT::v2i64 && "unsupported shuffle type");
12004
12005 if (HasSSE2 && VT == MVT::v2f64)
12006 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12007
12008 // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
12009 return DAG.getNode(ISD::BITCAST, dl, VT,
12010 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12011 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12012 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12013}
12014
12015static
12016SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12017 SDValue V1 = Op.getOperand(0);
12018 SDValue V2 = Op.getOperand(1);
12019 MVT VT = Op.getSimpleValueType();
12020
12021 assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12022        "unsupported shuffle type");
12023
12024 if (V2.getOpcode() == ISD::UNDEF)
12025 V2 = V1;
12026
12027 // v4i32 or v4f32
12028 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12029}
12030
12031static
12032SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12033 SDValue V1 = Op.getOperand(0);
12034 SDValue V2 = Op.getOperand(1);
12035 MVT VT = Op.getSimpleValueType();
12036 unsigned NumElems = VT.getVectorNumElements();
12037
12038 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12039 // operand of these instructions is only memory, so check if there's a
12040 // potential load folding here, otherwise use SHUFPS or MOVSD to match the
12041 // same masks.
12042 bool CanFoldLoad = false;
12043
12044 // Trivial case, when V2 comes from a load.
12045 if (MayFoldVectorLoad(V2))
12046 CanFoldLoad = true;
12047
12048 // When V1 is a load, it can be folded later into a store in isel, example:
12049 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12050 // turns into:
12051 // (MOVLPSmr addr:$src1, VR128:$src2)
12052 // So, recognize this potential and also use MOVLPS or MOVLPD
12053 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12054 CanFoldLoad = true;
12055
12056 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12057 if (CanFoldLoad) {
12058 if (HasSSE2 && NumElems == 2)
12059 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12060
12061 if (NumElems == 4)
12062 // If we don't care about the second element, proceed to use movss.
12063 if (SVOp->getMaskElt(1) != -1)
12064 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12065 }
12066
12067 // movl and movlp will both match v2i64, but v2i64 is never matched by
12068 // movl earlier because we make it strict to avoid messing with the movlp load
12069 // folding logic (see the code above the getMOVLP call). So match it here;
12070 // this is horrible, but it will stay like this until we move all shuffle
12071 // matching to x86 specific nodes. Note that for the 1st condition all
12072 // types are matched with movsd.
12073 if (HasSSE2) {
12074 // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12075 // so as to remove this logic from here, as much as possible
12076 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12077 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12078 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12079 }
12080
12081 assert(VT != MVT::v4i32 && "unsupported shuffle type");
12082
12083 // Invert the operand order and use SHUFPS to match it.
12084 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12085 getShuffleSHUFImmediate(SVOp), DAG);
12086}
12087
12088static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12089 SelectionDAG &DAG) {
12090 SDLoc dl(Load);
12091 MVT VT = Load->getSimpleValueType(0);
12092 MVT EVT = VT.getVectorElementType();
12093 SDValue Addr = Load->getOperand(1);
12094 SDValue NewAddr = DAG.getNode(
12095 ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12096 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12097
12098 SDValue NewLoad =
12099 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12100 DAG.getMachineFunction().getMachineMemOperand(
12101 Load->getMemOperand(), 0, EVT.getStoreSize()));
12102 return NewLoad;
12103}
12104
12105// It is only safe to call this function if isINSERTPSMask is true for
12106// this shufflevector mask.
12107static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12108 SelectionDAG &DAG) {
12109 // Generate an insertps instruction when inserting an f32 from memory onto a
12110 // v4f32 or when copying a member from one v4f32 to another.
12111 // We also use it for transferring i32 from one register to another,
12112 // since it simply copies the same bits.
12113 // If we're transferring an i32 from memory to a specific element in a
12114 // register, we output a generic DAG that will match the PINSRD
12115 // instruction.
12116 MVT VT = SVOp->getSimpleValueType(0);
12117 MVT EVT = VT.getVectorElementType();
12118 SDValue V1 = SVOp->getOperand(0);
12119 SDValue V2 = SVOp->getOperand(1);
12120 auto Mask = SVOp->getMask();
12121 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12122        "unsupported vector type for insertps/pinsrd");
12123
12124 auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12125 auto FromV2Predicate = [](const int &i) { return i >= 4; };
12126 int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12127
12128 SDValue From;
12129 SDValue To;
12130 unsigned DestIndex;
12131 if (FromV1 == 1) {
12132 From = V1;
12133 To = V2;
12134 DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12135 Mask.begin();
12136
12137 // If we have 1 element from each vector, we have to check if we're
12138 // changing V1's element's place. If so, we're done. Otherwise, we
12139 // should assume we're changing V2's element's place and behave
12140 // accordingly.
12141 int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12142 assert(DestIndex <= INT32_MAX && "truncated destination index");
12143 if (FromV1 == FromV2 &&
12144 static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12145 From = V2;
12146 To = V1;
12147 DestIndex =
12148 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12149 }
12150 } else {
12151 assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12152        "More than one element from V1 and from V2, or no elements from one "
12153        "of the vectors. This case should not have returned true from "
12154        "isINSERTPSMask");
12155 From = V2;
12156 To = V1;
12157 DestIndex =
12158 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12159 }
12160
12161 // Get an index into the source vector in the range [0,4) (the mask is
12162 // in the range [0,8) because it can address V1 and V2)
12163 unsigned SrcIndex = Mask[DestIndex] % 4;
12164 if (MayFoldLoad(From)) {
12165 // Trivial case, when From comes from a load and is only used by the
12166 // shuffle. Make it use insertps from the vector that we need from that
12167 // load.
12168 SDValue NewLoad =
12169 NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12170 if (!NewLoad.getNode())
12171 return SDValue();
12172
12173 if (EVT == MVT::f32) {
12174 // Create this as a scalar to vector to match the instruction pattern.
12175 SDValue LoadScalarToVector =
12176 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12177 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12178 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12179 InsertpsMask);
12180 } else { // EVT == MVT::i32
12181 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12182 // instruction, to match the PINSRD instruction, which loads an i32 to a
12183 // certain vector element.
12184 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12185 DAG.getConstant(DestIndex, MVT::i32));
12186 }
12187 }
12188
12189 // Vector-element-to-vector
12190 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12191 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12192}
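// Illustrative sketch (not part of the original source). INSERTPS takes an
// 8-bit immediate: bits [7:6] select the source element (only meaningful for
// the register form), bits [5:4] select the destination element, and bits
// [3:0] are a zero mask. The two constants built above correspond to these
// helpers; the function names are hypothetical.
#include <cstdint>

static uint8_t insertpsImmFromRegister(unsigned SrcIndex, unsigned DestIndex) {
  return static_cast<uint8_t>((SrcIndex << 6) | (DestIndex << 4)); // no zeroing
}

static uint8_t insertpsImmFromMemory(unsigned DestIndex) {
  // For the memory form the loaded scalar is always "element 0", so only the
  // destination field matters.
  return static_cast<uint8_t>(DestIndex << 4);
}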
12193
12194// Reduce a vector shuffle to zext.
12195static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12196 SelectionDAG &DAG) {
12197 // PMOVZX is only available from SSE41.
12198 if (!Subtarget->hasSSE41())
12199 return SDValue();
12200
12201 MVT VT = Op.getSimpleValueType();
12202
12203 // Only AVX2 supports 256-bit vector integer extending.
12204 if (!Subtarget->hasInt256() && VT.is256BitVector())
12205 return SDValue();
12206
12207 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12208 SDLoc DL(Op);
12209 SDValue V1 = Op.getOperand(0);
12210 SDValue V2 = Op.getOperand(1);
12211 unsigned NumElems = VT.getVectorNumElements();
12212
12213 // Extending is a unary operation and the element type of the source vector
12214 // must not be equal to or larger than i64.
12215 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12216 VT.getVectorElementType() == MVT::i64)
12217 return SDValue();
12218
12219 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12220 unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12221 while ((1U << Shift) < NumElems) {
12222 if (SVOp->getMaskElt(1U << Shift) == 1)
12223 break;
12224 Shift += 1;
12225 // The maximal ratio is 8, i.e. from i8 to i64.
12226 if (Shift > 3)
12227 return SDValue();
12228 }
12229
12230 // Check the shuffle mask.
12231 unsigned Mask = (1U << Shift) - 1;
12232 for (unsigned i = 0; i != NumElems; ++i) {
12233 int EltIdx = SVOp->getMaskElt(i);
12234 if ((i & Mask) != 0 && EltIdx != -1)
12235 return SDValue();
12236 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12237 return SDValue();
12238 }
12239
12240 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12241 MVT NeVT = MVT::getIntegerVT(NBits);
12242 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12243
12244 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12245 return SDValue();
12246
12247 return DAG.getNode(ISD::BITCAST, DL, VT,
12248 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12249}
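// Illustrative sketch (not part of the original source). It mirrors the mask
// test above: a shuffle reads as a zero-extension with ratio (1 << Shift)
// when mask slot k*(1 << Shift) is k and every other slot is undef (those
// slots are filled with zeros by the extension). The helper name
// isZeroExtendMask is hypothetical.
#include <vector>

static bool isZeroExtendMask(const std::vector<int> &Mask, unsigned Shift) {
  unsigned Ratio = 1u << Shift;
  for (size_t i = 0; i < Mask.size(); ++i) {
    int EltIdx = Mask[i];
    if (i % Ratio != 0) {
      if (EltIdx != -1)                       // in-between slots must be undef
        return false;
    } else if (EltIdx != static_cast<int>(i / Ratio)) {
      return false;                           // slot k*Ratio must read element k
    }
  }
  return true;
}
// Example: the v8i16 mask <0,-1, 1,-1, 2,-1, 3,-1> passes with Shift == 1 and
// corresponds to zero-extending four i16 elements to i32.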
12250
12251static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12252 SelectionDAG &DAG) {
12253 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12254 MVT VT = Op.getSimpleValueType();
12255 SDLoc dl(Op);
12256 SDValue V1 = Op.getOperand(0);
12257 SDValue V2 = Op.getOperand(1);
12258
12259 if (isZeroShuffle(SVOp))
12260 return getZeroVector(VT, Subtarget, DAG, dl);
12261
12262 // Handle splat operations
12263 if (SVOp->isSplat()) {
12264 // Use vbroadcast whenever the splat comes from a foldable load
12265 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12266 if (Broadcast.getNode())
12267 return Broadcast;
12268 }
12269
12270 // Check integer expanding shuffles.
12271 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12272 if (NewOp.getNode())
12273 return NewOp;
12274
12275 // If the shuffle can be profitably rewritten as a narrower shuffle, then
12276 // do it!
12277 if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12278 VT == MVT::v32i8) {
12279 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12280 if (NewOp.getNode())
12281 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12282 } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12283 // FIXME: Figure out a cleaner way to do this.
12284 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12285 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12286 if (NewOp.getNode()) {
12287 MVT NewVT = NewOp.getSimpleValueType();
12288 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12289 NewVT, true, false))
12290 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12291 dl);
12292 }
12293 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12294 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12295 if (NewOp.getNode()) {
12296 MVT NewVT = NewOp.getSimpleValueType();
12297 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12298 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12299 dl);
12300 }
12301 }
12302 }
12303 return SDValue();
12304}
12305
12306SDValue
12307X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12308 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12309 SDValue V1 = Op.getOperand(0);
12310 SDValue V2 = Op.getOperand(1);
12311 MVT VT = Op.getSimpleValueType();
12312 SDLoc dl(Op);
12313 unsigned NumElems = VT.getVectorNumElements();
12314 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12315 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12316 bool V1IsSplat = false;
12317 bool V2IsSplat = false;
12318 bool HasSSE2 = Subtarget->hasSSE2();
12319 bool HasFp256 = Subtarget->hasFp256();
12320 bool HasInt256 = Subtarget->hasInt256();
12321 MachineFunction &MF = DAG.getMachineFunction();
12322 bool OptForSize = MF.getFunction()->getAttributes().
12323 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12324
12325 // Check if we should use the experimental vector shuffle lowering. If so,
12326 // delegate completely to that code path.
12327 if (ExperimentalVectorShuffleLowering)
12328 return lowerVectorShuffle(Op, Subtarget, DAG);
12329
12330 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12331
12332 if (V1IsUndef && V2IsUndef)
12333 return DAG.getUNDEF(VT);
12334
12335 // When we create a shuffle node we put the UNDEF node as the second operand,
12336 // but in some cases the first operand may be transformed to UNDEF.
12337 // In this case we should just commute the node.
12338 if (V1IsUndef)
12339 return DAG.getCommutedVectorShuffle(*SVOp);
12340
12341 // Vector shuffle lowering takes 3 steps:
12342 //
12343 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12344 // narrowing and commutation of operands should be handled.
12345 // 2) Matching of shuffles with known shuffle masks to x86 target specific
12346 // shuffle nodes.
12347 // 3) Rewriting of unmatched masks into new generic shuffle operations,
12348 // so the shuffle can be broken into other shuffles and the legalizer can
12349 // try the lowering again.
12350 //
12351 // The general idea is that no vector_shuffle operation should be left to
12352 // be matched during isel, all of them must be converted to a target specific
12353 // node here.
12354
12355 // Normalize the input vectors. Here splats, zeroed vectors, profitable
12356 // narrowing and commutation of operands should be handled. The actual code
12357 // doesn't include all of those, work in progress...
12358 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12359 if (NewOp.getNode())
12360 return NewOp;
12361
12362 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12363
12364 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12365 // unpckh_undef). Only use pshufd if speed is more important than size.
12366 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12367 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12368 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12369 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12370
12371 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12372 V2IsUndef && MayFoldVectorLoad(V1))
12373 return getMOVDDup(Op, dl, V1, DAG);
12374
12375 if (isMOVHLPS_v_undef_Mask(M, VT))
12376 return getMOVHighToLow(Op, dl, DAG);
12377
12378 // Used to match splats
12379 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12380 (VT == MVT::v2f64 || VT == MVT::v2i64))
12381 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12382
12383 if (isPSHUFDMask(M, VT)) {
12384 // The actual implementation will match the mask in the if above and then
12385 // during isel it can match several different instructions, not only pshufd
12386 // as its name says, sad but true, emulate the behavior for now...
12387 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12388 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12389
12390 unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12391
12392 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12393 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12394
12395 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12396 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12397 DAG);
12398
12399 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12400 TargetMask, DAG);
12401 }
12402
12403 if (isPALIGNRMask(M, VT, Subtarget))
12404 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12405 getShufflePALIGNRImmediate(SVOp),
12406 DAG);
12407
12408 if (isVALIGNMask(M, VT, Subtarget))
12409 return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12410 getShuffleVALIGNImmediate(SVOp),
12411 DAG);
12412
12413 // Check if this can be converted into a logical shift.
12414 bool isLeft = false;
12415 unsigned ShAmt = 0;
12416 SDValue ShVal;
12417 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12418 if (isShift && ShVal.hasOneUse()) {
12419 // If the shifted value has multiple uses, it may be cheaper to use
12420 // v_set0 + movlhps or movhlps, etc.
12421 MVT EltVT = VT.getVectorElementType();
12422 ShAmt *= EltVT.getSizeInBits();
12423 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12424 }
12425
12426 if (isMOVLMask(M, VT)) {
12427 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12428 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12429 if (!isMOVLPMask(M, VT)) {
12430 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12431 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12432
12433 if (VT == MVT::v4i32 || VT == MVT::v4f32)
12434 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12435 }
12436 }
12437
12438 // FIXME: fold these into legal mask.
12439 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12440 return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12441
12442 if (isMOVHLPSMask(M, VT))
12443 return getMOVHighToLow(Op, dl, DAG);
12444
12445 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12446 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12447
12448 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12449 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12450
12451 if (isMOVLPMask(M, VT))
12452 return getMOVLP(Op, dl, DAG, HasSSE2);
12453
12454 if (ShouldXformToMOVHLPS(M, VT) ||
12455 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12456 return DAG.getCommutedVectorShuffle(*SVOp);
12457
12458 if (isShift) {
12459 // No better options. Use a vshldq / vsrldq.
12460 MVT EltVT = VT.getVectorElementType();
12461 ShAmt *= EltVT.getSizeInBits();
12462 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12463 }
12464
12465 bool Commuted = false;
12466 // FIXME: This should also accept a bitcast of a splat? Be careful, not
12467 // 1,1,1,1 -> v8i16 though.
12468 BitVector UndefElements;
12469 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12470 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12471 V1IsSplat = true;
12472 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12473 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12474 V2IsSplat = true;
12475
12476 // Canonicalize the splat or undef, if present, to be on the RHS.
12477 if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12478 CommuteVectorShuffleMask(M, NumElems);
12479 std::swap(V1, V2);
12480 std::swap(V1IsSplat, V2IsSplat);
12481 Commuted = true;
12482 }
12483
12484 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12485 // Shuffling low element of v1 into undef, just return v1.
12486 if (V2IsUndef)
12487 return V1;
12488 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12489 // the instruction selector will not match, so get a canonical MOVL with
12490 // swapped operands to undo the commute.
12491 return getMOVL(DAG, dl, VT, V2, V1);
12492 }
12493
12494 if (isUNPCKLMask(M, VT, HasInt256))
12495 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12496
12497 if (isUNPCKHMask(M, VT, HasInt256))
12498 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12499
12500 if (V2IsSplat) {
12501 // Normalize the mask so all entries that point to V2 point to its first
12502 // element, then try to match unpck{h|l} again. If it matches, return a
12503 // new vector_shuffle with the corrected mask.
12504 SmallVector<int, 8> NewMask(M.begin(), M.end());
12505 NormalizeMask(NewMask, NumElems);
12506 if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12507 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12508 if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12509 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12510 }
12511
12512 if (Commuted) {
12513 // Commute it back and try unpck* again.
12514 // FIXME: this seems wrong.
12515 CommuteVectorShuffleMask(M, NumElems);
12516 std::swap(V1, V2);
12517 std::swap(V1IsSplat, V2IsSplat);
12518
12519 if (isUNPCKLMask(M, VT, HasInt256))
12520 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12521
12522 if (isUNPCKHMask(M, VT, HasInt256))
12523 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12524 }
12525
12526 // Normalize the node to match x86 shuffle ops if needed
12527 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12528 return DAG.getCommutedVectorShuffle(*SVOp);
12529
12530 // The checks below are all present in isShuffleMaskLegal, but they are
12531 // inlined here right now to enable us to directly emit target specific
12532 // nodes, and remove one by one until they don't return Op anymore.
12533
12534 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12535 SVOp->getSplatIndex() == 0 && V2IsUndef) {
12536 if (VT == MVT::v2f64 || VT == MVT::v2i64)
12537 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12538 }
12539
12540 if (isPSHUFHWMask(M, VT, HasInt256))
12541 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12542 getShufflePSHUFHWImmediate(SVOp),
12543 DAG);
12544
12545 if (isPSHUFLWMask(M, VT, HasInt256))
12546 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12547 getShufflePSHUFLWImmediate(SVOp),
12548 DAG);
12549
12550 unsigned MaskValue;
12551 if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12552 &MaskValue))
12553 return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12554
12555 if (isSHUFPMask(M, VT))
12556 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12557 getShuffleSHUFImmediate(SVOp), DAG);
12558
12559 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12560 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12561 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12562 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12563
12564 //===--------------------------------------------------------------------===//
12565 // Generate target specific nodes for 128 or 256-bit shuffles only
12566 // supported in the AVX instruction set.
12567 //
12568
12569 // Handle VMOVDDUPY permutations
12570 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12571 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12572
12573 // Handle VPERMILPS/D* permutations
12574 if (isVPERMILPMask(M, VT)) {
12575 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12576 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12577 getShuffleSHUFImmediate(SVOp), DAG);
12578 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12579 getShuffleSHUFImmediate(SVOp), DAG);
12580 }
12581
12582 unsigned Idx;
12583 if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12584 return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12585 Idx*(NumElems/2), DAG, dl);
12586
12587 // Handle VPERM2F128/VPERM2I128 permutations
12588 if (isVPERM2X128Mask(M, VT, HasFp256))
12589 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12590 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12591
12592 if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12593 return getINSERTPS(SVOp, dl, DAG);
12594
12595 unsigned Imm8;
12596 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12597 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12598
12599 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12600 VT.is512BitVector()) {
12601 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12602 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12603 SmallVector<SDValue, 16> permclMask;
12604 for (unsigned i = 0; i != NumElems; ++i) {
12605 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12606 }
12607
12608 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12609 if (V2IsUndef)
12610 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12611 return DAG.getNode(X86ISD::VPERMV, dl, VT,
12612 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12613 return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12614 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12615 }
12616
12617 //===--------------------------------------------------------------------===//
12618 // Since no target specific shuffle was selected for this generic one,
12619 // lower it into other known shuffles. FIXME: this isn't true yet, but
12620 // this is the plan.
12621 //
12622
12623 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12624 if (VT == MVT::v8i16) {
12625 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12626 if (NewOp.getNode())
12627 return NewOp;
12628 }
12629
12630 if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12631 SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12632 if (NewOp.getNode())
12633 return NewOp;
12634 }
12635
12636 if (VT == MVT::v16i8) {
12637 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12638 if (NewOp.getNode())
12639 return NewOp;
12640 }
12641
12642 if (VT == MVT::v32i8) {
12643 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12644 if (NewOp.getNode())
12645 return NewOp;
12646 }
12647
12648 // Handle all 128-bit wide vectors with 4 elements, and match them with
12649 // several different shuffle types.
12650 if (NumElems == 4 && VT.is128BitVector())
12651 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12652
12653 // Handle general 256-bit shuffles
12654 if (VT.is256BitVector())
12655 return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12656
12657 return SDValue();
12658}
12659
12660// This function assumes its argument is a BUILD_VECTOR of constants or
12661 // undef SDNodes, i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12662// true.
12663static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12664 unsigned &MaskValue) {
12665 MaskValue = 0;
12666 unsigned NumElems = BuildVector->getNumOperands();
12667 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12668 unsigned NumLanes = (NumElems - 1) / 8 + 1;
12669 unsigned NumElemsInLane = NumElems / NumLanes;
12670
12671 // Blend for v16i16 should be symmetric for both lanes.
12672 for (unsigned i = 0; i < NumElemsInLane; ++i) {
12673 SDValue EltCond = BuildVector->getOperand(i);
12674 SDValue SndLaneEltCond =
12675 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12676
12677 int Lane1Cond = -1, Lane2Cond = -1;
12678 if (isa<ConstantSDNode>(EltCond))
12679 Lane1Cond = !isZero(EltCond);
12680 if (isa<ConstantSDNode>(SndLaneEltCond))
12681 Lane2Cond = !isZero(SndLaneEltCond);
12682
12683 if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12684 // Lane1Cond != 0, means we want the first argument.
12685 // Lane1Cond == 0, means we want the second argument.
12686 // The encoding of this argument is 0 for the first argument, 1
12687 // for the second. Therefore, invert the condition.
12688 MaskValue |= !Lane1Cond << i;
12689 else if (Lane1Cond < 0)
12690 MaskValue |= !Lane2Cond << i;
12691 else
12692 return false;
12693 }
12694 return true;
12695}
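// Illustrative sketch (not part of the original source). It reproduces the
// immediate construction above for a plain scalar condition vector where
// every element is a known constant: a nonzero condition selects the first
// blend operand, which is encoded as a 0 bit, so the stored bit is the
// inverted condition. Two-lane types (e.g. v16i16) reuse one 8-bit immediate
// for both 128-bit lanes, hence the symmetry requirement. The helper name
// blendMaskFromConditions is hypothetical.
#include <vector>

static bool blendMaskFromConditions(const std::vector<int> &Cond,
                                    unsigned &MaskValue) {
  MaskValue = 0;
  unsigned NumElems = static_cast<unsigned>(Cond.size());
  unsigned NumLanes = (NumElems - 1) / 8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    int Lane1Cond = Cond[i] != 0;
    int Lane2Cond =
        (NumLanes == 2) ? (Cond[i + NumElemsInLane] != 0) : Lane1Cond;
    if (Lane1Cond != Lane2Cond)
      return false;                     // lanes disagree: no single immediate
    MaskValue |= static_cast<unsigned>(!Lane1Cond) << i;
  }
  return true;
}
// Example: a v4i32 condition <~0, 0, ~0, 0> produces MaskValue 0xA (binary
// 1010), i.e. elements 1 and 3 come from the second operand.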
12696
12697/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12698/// instruction.
12699static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12700 SelectionDAG &DAG) {
12701 SDValue Cond = Op.getOperand(0);
12702 SDValue LHS = Op.getOperand(1);
12703 SDValue RHS = Op.getOperand(2);
12704 SDLoc dl(Op);
12705 MVT VT = Op.getSimpleValueType();
12706 MVT EltVT = VT.getVectorElementType();
12707 unsigned NumElems = VT.getVectorNumElements();
12708
12709 // There is no blend with immediate in AVX-512.
12710 if (VT.is512BitVector())
12711 return SDValue();
12712
12713 if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12714 return SDValue();
12715 if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12716 return SDValue();
12717
12718 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12719 return SDValue();
12720
12721 // Check the mask for BLEND and build the value.
12722 unsigned MaskValue = 0;
12723 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12724 return SDValue();
12725
12726 // Convert i32 vectors to floating point if it is not AVX2.
12727 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12728 MVT BlendVT = VT;
12729 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12730 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12731 NumElems);
12732 LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12733 RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12734 }
12735
12736 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12737 DAG.getConstant(MaskValue, MVT::i32));
12738 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12739}
12740
12741SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12742 // A vselect where all conditions and data are constants can be optimized into
12743 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12744 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12745 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12746 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12747 return SDValue();
12748
12749 SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12750 if (BlendOp.getNode())
12751 return BlendOp;
12752
12753 // Some types for vselect were previously set to Expand, not Legal or
12754 // Custom. Return an empty SDValue so we fall through to Expand, after
12755 // the Custom lowering phase.
12756 MVT VT = Op.getSimpleValueType();
12757 switch (VT.SimpleTy) {
12758 default:
12759 break;
12760 case MVT::v8i16:
12761 case MVT::v16i16:
12762 if (Subtarget->hasBWI() && Subtarget->hasVLX())
12763 break;
12764 return SDValue();
12765 }
12766
12767 // We couldn't create a "Blend with immediate" node.
12768 // This node should still be legal, but we'll have to emit a blendv*
12769 // instruction.
12770 return Op;
12771}
12772
12773static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12774 MVT VT = Op.getSimpleValueType();
12775 SDLoc dl(Op);
12776
12777 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12778 return SDValue();
12779
12780 if (VT.getSizeInBits() == 8) {
12781 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12782 Op.getOperand(0), Op.getOperand(1));
12783 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12784 DAG.getValueType(VT));
12785 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12786 }
12787
12788 if (VT.getSizeInBits() == 16) {
12789 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12790 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12791 if (Idx == 0)
12792 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12794 DAG.getNode(ISD::BITCAST, dl,
12795 MVT::v4i32,
12796 Op.getOperand(0)),
12797 Op.getOperand(1)));
12798 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12799 Op.getOperand(0), Op.getOperand(1));
12800 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12801 DAG.getValueType(VT));
12802 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12803 }
12804
12805 if (VT == MVT::f32) {
12806 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12807 // the result back to FR32 register. It's only worth matching if the
12808 // result has a single use which is a store or a bitcast to i32. And in
12809 // the case of a store, it's not worth it if the index is a constant 0,
12810 // because a MOVSSmr can be used instead, which is smaller and faster.
12811 if (!Op.hasOneUse())
12812 return SDValue();
12813 SDNode *User = *Op.getNode()->use_begin();
12814 if ((User->getOpcode() != ISD::STORE ||
12815 (isa<ConstantSDNode>(Op.getOperand(1)) &&
12816 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12817 (User->getOpcode() != ISD::BITCAST ||
12818 User->getValueType(0) != MVT::i32))
12819 return SDValue();
12820 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12821 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12822 Op.getOperand(0)),
12823 Op.getOperand(1));
12824 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12825 }
12826
12827 if (VT == MVT::i32 || VT == MVT::i64) {
12828 // EXTRACTPS/pextrq work with a constant index.
12829 if (isa<ConstantSDNode>(Op.getOperand(1)))
12830 return Op;
12831 }
12832 return SDValue();
12833}
12834
12835/// Extract one bit from mask vector, like v16i1 or v8i1.
12836/// AVX-512 feature.
12837SDValue
12838X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12839 SDValue Vec = Op.getOperand(0);
12840 SDLoc dl(Vec);
12841 MVT VecVT = Vec.getSimpleValueType();
12842 SDValue Idx = Op.getOperand(1);
12843 MVT EltVT = Op.getSimpleValueType();
12844
12845 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12846 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
12847        "Unexpected vector type in ExtractBitFromMaskVector");
12848
12849 // A variable index can't be handled in mask registers,
12850 // so extend the vector to VR512.
12851 if (!isa<ConstantSDNode>(Idx)) {
12852 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12853 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12854 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12855 ExtVT.getVectorElementType(), Ext, Idx);
12856 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12857 }
12858
12859 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12860 const TargetRegisterClass* rc = getRegClassFor(VecVT);
12861 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
12862 rc = getRegClassFor(MVT::v16i1);
12863 unsigned MaxSift = rc->getSize()*8 - 1;
12864 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12865 DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12866 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12867 DAG.getConstant(MaxSift, MVT::i8));
12868 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12869 DAG.getIntPtrConstant(0));
12870}
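// Illustrative sketch (not part of the original source). The constant-index
// path above isolates bit IdxVal of a mask register by shifting it up to the
// most significant bit position and then back down to bit 0; the scalar
// analogue below shows the same trick on a plain integer mask, with MaxBit
// playing the role of MaxSift. The helper name extractMaskBit is hypothetical.
#include <cstdint>

static uint64_t extractMaskBit(uint64_t Mask, unsigned IdxVal, unsigned Bits) {
  unsigned MaxBit = Bits - 1;                            // e.g. 15 for a 16-bit mask
  Mask <<= (MaxBit - IdxVal);                            // move bit IdxVal to the top
  Mask &= (Bits == 64) ? ~0ULL : ((1ULL << Bits) - 1);   // stay within the width
  return Mask >> MaxBit;                                 // move it down to bit 0
}
// Example: extractMaskBit(0x4 /* binary 0100 */, 2, 16) == 1.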
12871
12872SDValue
12873X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12874 SelectionDAG &DAG) const {
12875 SDLoc dl(Op);
12876 SDValue Vec = Op.getOperand(0);
12877 MVT VecVT = Vec.getSimpleValueType();
12878 SDValue Idx = Op.getOperand(1);
12879
12880 if (Op.getSimpleValueType() == MVT::i1)
12881 return ExtractBitFromMaskVector(Op, DAG);
12882
12883 if (!isa<ConstantSDNode>(Idx)) {
12884 if (VecVT.is512BitVector() ||
12885 (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12886 VecVT.getVectorElementType().getSizeInBits() == 32)) {
12887
12888 MVT MaskEltVT =
12889 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12890 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12891 MaskEltVT.getSizeInBits());
12892
12893 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12894 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12895 getZeroVector(MaskVT, Subtarget, DAG, dl),
12896 Idx, DAG.getConstant(0, getPointerTy()));
12897 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12899 Perm, DAG.getConstant(0, getPointerTy()));
12900 }
12901 return SDValue();
12902 }
12903
12904 // If this is a 256-bit vector result, first extract the 128-bit vector and
12905 // then extract the element from the 128-bit vector.
12906 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12907
12908 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12909 // Get the 128-bit vector.
12910 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12911 MVT EltVT = VecVT.getVectorElementType();
12912
12913 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12914
12915 //if (IdxVal >= NumElems/2)
12916 // IdxVal -= NumElems/2;
12917 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12918 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12919 DAG.getConstant(IdxVal, MVT::i32));
12920 }
12921
12922 assert(VecVT.is128BitVector() && "Unexpected vector length");
12923
12924 if (Subtarget->hasSSE41()) {
12925 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12926 if (Res.getNode())
12927 return Res;
12928 }
12929
12930 MVT VT = Op.getSimpleValueType();
12931 // TODO: handle v16i8.
12932 if (VT.getSizeInBits() == 16) {
12933 SDValue Vec = Op.getOperand(0);
12934 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12935 if (Idx == 0)
12936 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12937 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12938 DAG.getNode(ISD::BITCAST, dl,
12939 MVT::v4i32, Vec),
12940 Op.getOperand(1)));
12941    // Transform it so it matches pextrw, which produces a 32-bit result.
12942 MVT EltVT = MVT::i32;
12943 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12944 Op.getOperand(0), Op.getOperand(1));
12945 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12946 DAG.getValueType(VT));
12947 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12948 }
12949
12950 if (VT.getSizeInBits() == 32) {
12951 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12952 if (Idx == 0)
12953 return Op;
12954
12955 // SHUFPS the element to the lowest double word, then movss.
12956 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12957 MVT VVT = Op.getOperand(0).getSimpleValueType();
12958 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12959 DAG.getUNDEF(VVT), Mask);
12960 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12961 DAG.getIntPtrConstant(0));
12962 }
12963
12964 if (VT.getSizeInBits() == 64) {
12965 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12966 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12967 // to match extract_elt for f64.
12968 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12969 if (Idx == 0)
12970 return Op;
12971
12972 // UNPCKHPD the element to the lowest double word, then movsd.
12973 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12974 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12975 int Mask[2] = { 1, -1 };
12976 MVT VVT = Op.getOperand(0).getSimpleValueType();
12977 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12978 DAG.getUNDEF(VVT), Mask);
12979 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12980 DAG.getIntPtrConstant(0));
12981 }
12982
12983 return SDValue();
12984}
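
// An illustrative sketch (not part of this file) of the index arithmetic used
// above when extracting from a 256-bit or 512-bit vector: the element index is
// split into a 128-bit chunk number and an index within that chunk, so the
// subtraction in the code is simply IdxVal % ElemsPerChunk.
struct WideIndexSketch { unsigned Chunk, IdxInChunk; };
static WideIndexSketch splitWideIndexSketch(unsigned IdxVal, unsigned EltSizeInBits) {
  unsigned ElemsPerChunk = 128 / EltSizeInBits;
  return { IdxVal / ElemsPerChunk,   // which 128-bit chunk to extract
           IdxVal % ElemsPerChunk }; // element index within that chunk
}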
12985
12986/// Insert one bit to mask vector, like v16i1 or v8i1.
12987/// AVX-512 feature.
12988SDValue
12989X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12990 SDLoc dl(Op);
12991 SDValue Vec = Op.getOperand(0);
12992 SDValue Elt = Op.getOperand(1);
12993 SDValue Idx = Op.getOperand(2);
12994 MVT VecVT = Vec.getSimpleValueType();
12995
12996 if (!isa<ConstantSDNode>(Idx)) {
12997    // Non-constant index. Extend the source and destination,
12998    // insert the element, and then truncate the result.
12999 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13000 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13001 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13002 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13003 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13004 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13005 }
13006
13007 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13008 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13009 if (Vec.getOpcode() == ISD::UNDEF)
13010 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13011 DAG.getConstant(IdxVal, MVT::i8));
13012 const TargetRegisterClass* rc = getRegClassFor(VecVT);
13013 unsigned MaxSift = rc->getSize()*8 - 1;
13014 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13015 DAG.getConstant(MaxSift, MVT::i8));
13016 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13017 DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13018 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13019}
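
// A standalone scalar sketch (illustrative only, not part of this file) of the
// insert path above. Shifting the element left by Width-1 and back right by
// Width-1 - Idx keeps only bit 0 of Elt, repositioned at bit Idx; the result is
// then ORed into the existing mask, mirroring the VSHLI/VSRLI/OR sequence. The
// WidthMask emulates the fixed width of the mask register.
static uint64_t insertMaskBitSketch(uint64_t Mask, uint64_t Elt, unsigned Idx,
                                    unsigned Width) {
  unsigned MaxShift = Width - 1;
  uint64_t WidthMask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  uint64_t Bit = ((Elt << MaxShift) & WidthMask) >> (MaxShift - Idx);
  return (Mask | Bit) & WidthMask;
}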
13020
13021SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13022 SelectionDAG &DAG) const {
13023 MVT VT = Op.getSimpleValueType();
13024 MVT EltVT = VT.getVectorElementType();
13025
13026 if (EltVT == MVT::i1)
13027 return InsertBitToMaskVector(Op, DAG);
13028
13029 SDLoc dl(Op);
13030 SDValue N0 = Op.getOperand(0);
13031 SDValue N1 = Op.getOperand(1);
13032 SDValue N2 = Op.getOperand(2);
13033 if (!isa<ConstantSDNode>(N2))
13034 return SDValue();
13035 auto *N2C = cast<ConstantSDNode>(N2);
13036 unsigned IdxVal = N2C->getZExtValue();
13037
13038 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13039 // into that, and then insert the subvector back into the result.
13040 if (VT.is256BitVector() || VT.is512BitVector()) {
13041 // Get the desired 128-bit vector half.
13042 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13043
13044 // Insert the element into the desired half.
13045 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13046 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13047
13048 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13049 DAG.getConstant(IdxIn128, MVT::i32));
13050
13051    // Insert the changed part back into the wider vector.
13052 return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13053 }
13054  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13055
13056 if (Subtarget->hasSSE41()) {
13057 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13058 unsigned Opc;
13059 if (VT == MVT::v8i16) {
13060 Opc = X86ISD::PINSRW;
13061 } else {
13062        assert(VT == MVT::v16i8);
13063 Opc = X86ISD::PINSRB;
13064 }
13065
13066      // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
13067 // argument.
13068 if (N1.getValueType() != MVT::i32)
13069 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13070 if (N2.getValueType() != MVT::i32)
13071 N2 = DAG.getIntPtrConstant(IdxVal);
13072 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13073 }
13074
13075 if (EltVT == MVT::f32) {
13076 // Bits [7:6] of the constant are the source select. This will always be
13077 // zero here. The DAG Combiner may combine an extract_elt index into
13078 // these
13079 // bits. For example (insert (extract, 3), 2) could be matched by
13080 // putting
13081 // the '3' into bits [7:6] of X86ISD::INSERTPS.
13082 // Bits [5:4] of the constant are the destination select. This is the
13083 // value of the incoming immediate.
13084 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13085 // combine either bitwise AND or insert of float 0.0 to set these bits.
13086 N2 = DAG.getIntPtrConstant(IdxVal << 4);
13087      // Create this as a scalar-to-vector.
13088 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13089 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13090 }
13091
13092 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13093 // PINSR* works with constant index.
13094 return Op;
13095 }
13096 }
13097
13098 if (EltVT == MVT::i8)
13099 return SDValue();
13100
13101 if (EltVT.getSizeInBits() == 16) {
13102    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13103 // as its second argument.
13104 if (N1.getValueType() != MVT::i32)
13105 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13106 if (N2.getValueType() != MVT::i32)
13107 N2 = DAG.getIntPtrConstant(IdxVal);
13108 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13109 }
13110 return SDValue();
13111}
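
// A hypothetical helper (illustrative only, not an LLVM API) showing how the
// insertps immediate described in the comments above is laid out. The lowering
// only sets the destination field, which is why it emits IdxVal << 4.
static unsigned insertpsImmediateSketch(unsigned SrcSelect, unsigned DstSelect,
                                        unsigned ZeroMask) {
  return ((SrcSelect & 0x3) << 6) |  // bits [7:6]: source element select
         ((DstSelect & 0x3) << 4) |  // bits [5:4]: destination element select
         (ZeroMask & 0xf);           // bits [3:0]: zero mask
}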
13112
13113static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13114 SDLoc dl(Op);
13115 MVT OpVT = Op.getSimpleValueType();
13116
13117  // If this is a 256-bit or wider vector result, first insert into a 128-bit
13118  // vector and then insert into the wider vector.
13119 if (!OpVT.is128BitVector()) {
13120 // Insert into a 128-bit vector.
13121 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13122 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13123 OpVT.getVectorNumElements() / SizeFactor);
13124
13125 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13126
13127 // Insert the 128-bit vector.
13128 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13129 }
13130
13131 if (OpVT == MVT::v1i64 &&
13132 Op.getOperand(0).getValueType() == MVT::i64)
13133 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13134
13135 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13136  assert(OpVT.is128BitVector() && "Expected an SSE type!");
13137 return DAG.getNode(ISD::BITCAST, dl, OpVT,
13138 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13139}
13140
13141// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13142// a simple subregister reference or explicit instructions to grab
13143// upper bits of a vector.
13144static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13145 SelectionDAG &DAG) {
13146 SDLoc dl(Op);
13147 SDValue In = Op.getOperand(0);
13148 SDValue Idx = Op.getOperand(1);
13149 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13150 MVT ResVT = Op.getSimpleValueType();
13151 MVT InVT = In.getSimpleValueType();
13152
13153 if (Subtarget->hasFp256()) {
13154 if (ResVT.is128BitVector() &&
13155 (InVT.is256BitVector() || InVT.is512BitVector()) &&
13156 isa<ConstantSDNode>(Idx)) {
13157 return Extract128BitVector(In, IdxVal, DAG, dl);
13158 }
13159 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13160 isa<ConstantSDNode>(Idx)) {
13161 return Extract256BitVector(In, IdxVal, DAG, dl);
13162 }
13163 }
13164 return SDValue();
13165}
13166
13167// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13168// simple superregister reference or explicit instructions to insert
13169// the upper bits of a vector.
13170static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13171 SelectionDAG &DAG) {
13172 if (!Subtarget->hasAVX())
13173 return SDValue();
13174
13175 SDLoc dl(Op);
13176 SDValue Vec = Op.getOperand(0);
13177 SDValue SubVec = Op.getOperand(1);
13178 SDValue Idx = Op.getOperand(2);
13179 MVT OpVT = Op.getSimpleValueType();
13180 MVT SubVecVT = SubVec.getSimpleValueType();
13181
13182 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13183 SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
13184 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13185 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13186 }
13187
13188 if (OpVT.is512BitVector() &&
13189 SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
13190 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13191 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13192 }
13193
13194 return SDValue();
13195}
13196
13197// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13198// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13199// one of the above mentioned nodes. It has to be wrapped because otherwise
13200// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13201// be used to form addressing mode. These wrapped nodes will be selected
13202// into MOV32ri.
13203SDValue
13204X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13205 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13206
13207 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13208 // global base reg.
13209 unsigned char OpFlag = 0;
13210 unsigned WrapperKind = X86ISD::Wrapper;
13211 CodeModel::Model M = DAG.getTarget().getCodeModel();
13212
13213 if (Subtarget->isPICStyleRIPRel() &&
13214 (M == CodeModel::Small || M == CodeModel::Kernel))
13215 WrapperKind = X86ISD::WrapperRIP;
13216 else if (Subtarget->isPICStyleGOT())
13217 OpFlag = X86II::MO_GOTOFF;
13218 else if (Subtarget->isPICStyleStubPIC())
13219 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13220
13221 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13222 CP->getAlignment(),
13223 CP->getOffset(), OpFlag);
13224 SDLoc DL(CP);
13225 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13226 // With PIC, the address is actually $g + Offset.
13227 if (OpFlag) {
13228 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13229 DAG.getNode(X86ISD::GlobalBaseReg,
13230 SDLoc(), getPointerTy()),
13231 Result);
13232 }
13233
13234 return Result;
13235}
13236
13237SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13238 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13239
13240 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13241 // global base reg.
13242 unsigned char OpFlag = 0;
13243 unsigned WrapperKind = X86ISD::Wrapper;
13244 CodeModel::Model M = DAG.getTarget().getCodeModel();
13245
13246 if (Subtarget->isPICStyleRIPRel() &&
13247 (M == CodeModel::Small || M == CodeModel::Kernel))
13248 WrapperKind = X86ISD::WrapperRIP;
13249 else if (Subtarget->isPICStyleGOT())
13250 OpFlag = X86II::MO_GOTOFF;
13251 else if (Subtarget->isPICStyleStubPIC())
13252 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13253
13254 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13255 OpFlag);
13256 SDLoc DL(JT);
13257 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13258
13259 // With PIC, the address is actually $g + Offset.
13260 if (OpFlag)
13261 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13262 DAG.getNode(X86ISD::GlobalBaseReg,
13263 SDLoc(), getPointerTy()),
13264 Result);
13265
13266 return Result;
13267}
13268
13269SDValue
13270X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13271 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13272
13273 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13274 // global base reg.
13275 unsigned char OpFlag = 0;
13276 unsigned WrapperKind = X86ISD::Wrapper;
13277 CodeModel::Model M = DAG.getTarget().getCodeModel();
13278
13279 if (Subtarget->isPICStyleRIPRel() &&
13280 (M == CodeModel::Small || M == CodeModel::Kernel)) {
13281 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13282 OpFlag = X86II::MO_GOTPCREL;
13283 WrapperKind = X86ISD::WrapperRIP;
13284 } else if (Subtarget->isPICStyleGOT()) {
13285 OpFlag = X86II::MO_GOT;
13286 } else if (Subtarget->isPICStyleStubPIC()) {
13287 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13288 } else if (Subtarget->isPICStyleStubNoDynamic()) {
13289 OpFlag = X86II::MO_DARWIN_NONLAZY;
13290 }
13291
13292 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13293
13294 SDLoc DL(Op);
13295 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13296
13297 // With PIC, the address is actually $g + Offset.
13298 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13299 !Subtarget->is64Bit()) {
13300 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13301 DAG.getNode(X86ISD::GlobalBaseReg,
13302 SDLoc(), getPointerTy()),
13303 Result);
13304 }
13305
13306 // For symbols that require a load from a stub to get the address, emit the
13307 // load.
13308 if (isGlobalStubReference(OpFlag))
13309 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13310 MachinePointerInfo::getGOT(), false, false, false, 0);
13311
13312 return Result;
13313}
13314
13315SDValue
13316X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13317 // Create the TargetBlockAddressAddress node.
13318 unsigned char OpFlags =
13319 Subtarget->ClassifyBlockAddressReference();
13320 CodeModel::Model M = DAG.getTarget().getCodeModel();
13321 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13322 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13323 SDLoc dl(Op);
13324 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13325 OpFlags);
13326
13327 if (Subtarget->isPICStyleRIPRel() &&
13328 (M == CodeModel::Small || M == CodeModel::Kernel))
13329 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13330 else
13331 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13332
13333 // With PIC, the address is actually $g + Offset.
13334 if (isGlobalRelativeToPICBase(OpFlags)) {
13335 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13336 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13337 Result);
13338 }
13339
13340 return Result;
13341}
13342
13343SDValue
13344X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13345 int64_t Offset, SelectionDAG &DAG) const {
13346 // Create the TargetGlobalAddress node, folding in the constant
13347 // offset if it is legal.
13348 unsigned char OpFlags =
13349 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13350 CodeModel::Model M = DAG.getTarget().getCodeModel();
13351 SDValue Result;
13352 if (OpFlags == X86II::MO_NO_FLAG &&
13353 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13354 // A direct static reference to a global.
13355 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13356 Offset = 0;
13357 } else {
13358 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13359 }
13360
13361 if (Subtarget->isPICStyleRIPRel() &&
13362 (M == CodeModel::Small || M == CodeModel::Kernel))
13363 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13364 else
13365 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13366
13367 // With PIC, the address is actually $g + Offset.
13368 if (isGlobalRelativeToPICBase(OpFlags)) {
13369 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13370 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13371 Result);
13372 }
13373
13374 // For globals that require a load from a stub to get the address, emit the
13375 // load.
13376 if (isGlobalStubReference(OpFlags))
13377 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13378 MachinePointerInfo::getGOT(), false, false, false, 0);
13379
13380 // If there was a non-zero offset that we didn't fold, create an explicit
13381 // addition for it.
13382 if (Offset != 0)
13383 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13384 DAG.getConstant(Offset, getPointerTy()));
13385
13386 return Result;
13387}
13388
13389SDValue
13390X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13391 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13392 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13393 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13394}
13395
13396static SDValue
13397GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13398 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13399 unsigned char OperandFlags, bool LocalDynamic = false) {
13400 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13401 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13402 SDLoc dl(GA);
13403 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13404 GA->getValueType(0),
13405 GA->getOffset(),
13406 OperandFlags);
13407
13408 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13409 : X86ISD::TLSADDR;
13410
13411 if (InFlag) {
13412 SDValue Ops[] = { Chain, TGA, *InFlag };
13413 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13414 } else {
13415 SDValue Ops[] = { Chain, TGA };
13416 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13417 }
13418
13419 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13420 MFI->setAdjustsStack(true);
13421 MFI->setHasCalls(true);
13422
13423 SDValue Flag = Chain.getValue(1);
13424 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13425}
13426
13427// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13428static SDValue
13429LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13430 const EVT PtrVT) {
13431 SDValue InFlag;
13432 SDLoc dl(GA); // ? function entry point might be better
13433 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13434 DAG.getNode(X86ISD::GlobalBaseReg,
13435 SDLoc(), PtrVT), InFlag);
13436 InFlag = Chain.getValue(1);
13437
13438 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13439}
13440
13441// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13442static SDValue
13443LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13444 const EVT PtrVT) {
13445 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13446 X86::RAX, X86II::MO_TLSGD);
13447}
13448
13449static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13450 SelectionDAG &DAG,
13451 const EVT PtrVT,
13452 bool is64Bit) {
13453 SDLoc dl(GA);
13454
13455 // Get the start address of the TLS block for this module.
13456 X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13457 .getInfo<X86MachineFunctionInfo>();
13458 MFI->incNumLocalDynamicTLSAccesses();
13459
13460 SDValue Base;
13461 if (is64Bit) {
13462 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13463 X86II::MO_TLSLD, /*LocalDynamic=*/true);
13464 } else {
13465 SDValue InFlag;
13466 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13467 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13468 InFlag = Chain.getValue(1);
13469 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13470 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13471 }
13472
13473 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13474 // of Base.
13475
13476 // Build x@dtpoff.
13477 unsigned char OperandFlags = X86II::MO_DTPOFF;
13478 unsigned WrapperKind = X86ISD::Wrapper;
13479 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13480 GA->getValueType(0),
13481 GA->getOffset(), OperandFlags);
13482 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13483
13484 // Add x@dtpoff with the base.
13485 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13486}
13487
13488// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13489static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13490 const EVT PtrVT, TLSModel::Model model,
13491 bool is64Bit, bool isPIC) {
13492 SDLoc dl(GA);
13493
13494 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13495 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13496 is64Bit ? 257 : 256));
13497
13498 SDValue ThreadPointer =
13499 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13500 MachinePointerInfo(Ptr), false, false, false, 0);
13501
13502 unsigned char OperandFlags = 0;
13503 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
13504 // initialexec.
13505 unsigned WrapperKind = X86ISD::Wrapper;
13506 if (model == TLSModel::LocalExec) {
13507 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13508 } else if (model == TLSModel::InitialExec) {
13509 if (is64Bit) {
13510 OperandFlags = X86II::MO_GOTTPOFF;
13511 WrapperKind = X86ISD::WrapperRIP;
13512 } else {
13513 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13514 }
13515 } else {
13516    llvm_unreachable("Unexpected model");
13517 }
13518
13519 // emit "addl x@ntpoff,%eax" (local exec)
13520 // or "addl x@indntpoff,%eax" (initial exec)
13521 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13522 SDValue TGA =
13523 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13524 GA->getOffset(), OperandFlags);
13525 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13526
13527 if (model == TLSModel::InitialExec) {
13528 if (isPIC && !is64Bit) {
13529 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13530 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13531 Offset);
13532 }
13533
13534 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13535 MachinePointerInfo::getGOT(), false, false, false, 0);
13536 }
13537
13538 // The address of the thread local variable is the add of the thread
13539 // pointer with the offset of the variable.
13540 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13541}
13542
13543SDValue
13544X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13545
13546 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13547 const GlobalValue *GV = GA->getGlobal();
13548
13549 if (Subtarget->isTargetELF()) {
13550 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13551
13552 switch (model) {
13553 case TLSModel::GeneralDynamic:
13554 if (Subtarget->is64Bit())
13555 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13556 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13557 case TLSModel::LocalDynamic:
13558 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13559 Subtarget->is64Bit());
13560 case TLSModel::InitialExec:
13561 case TLSModel::LocalExec:
13562 return LowerToTLSExecModel(
13563 GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13564 DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13565 }
13566    llvm_unreachable("Unknown TLS model.");
13567 }
13568
13569 if (Subtarget->isTargetDarwin()) {
13570 // Darwin only has one model of TLS. Lower to that.
13571 unsigned char OpFlag = 0;
13572 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13573 X86ISD::WrapperRIP : X86ISD::Wrapper;
13574
13575 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13576 // global base reg.
13577 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13578 !Subtarget->is64Bit();
13579 if (PIC32)
13580 OpFlag = X86II::MO_TLVP_PIC_BASE;
13581 else
13582 OpFlag = X86II::MO_TLVP;
13583 SDLoc DL(Op);
13584 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13585 GA->getValueType(0),
13586 GA->getOffset(), OpFlag);
13587 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13588
13589 // With PIC32, the address is actually $g + Offset.
13590 if (PIC32)
13591 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13592 DAG.getNode(X86ISD::GlobalBaseReg,
13593 SDLoc(), getPointerTy()),
13594 Offset);
13595
13596 // Lowering the machine isd will make sure everything is in the right
13597 // location.
13598 SDValue Chain = DAG.getEntryNode();
13599 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13600 SDValue Args[] = { Chain, Offset };
13601 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13602
13603 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13604 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13605 MFI->setAdjustsStack(true);
13606
13607 // And our return value (tls address) is in the standard call return value
13608 // location.
13609 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13610 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13611 Chain.getValue(1));
13612 }
13613
13614 if (Subtarget->isTargetKnownWindowsMSVC() ||
13615 Subtarget->isTargetWindowsGNU()) {
13616 // Just use the implicit TLS architecture
13617    // Need to generate something similar to:
13618 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13619 // ; from TEB
13620 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
13621 // mov rcx, qword [rdx+rcx*8]
13622 // mov eax, .tls$:tlsvar
13623 // [rax+rcx] contains the address
13624 // Windows 64bit: gs:0x58
13625 // Windows 32bit: fs:__tls_array
13626
13627 SDLoc dl(GA);
13628 SDValue Chain = DAG.getEntryNode();
13629
13630 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13631 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13632 // use its literal value of 0x2C.
13633 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13634 ? Type::getInt8PtrTy(*DAG.getContext(),
13635 256)
13636 : Type::getInt32PtrTy(*DAG.getContext(),
13637 257));
13638
13639 SDValue TlsArray =
13640 Subtarget->is64Bit()
13641 ? DAG.getIntPtrConstant(0x58)
13642 : (Subtarget->isTargetWindowsGNU()
13643 ? DAG.getIntPtrConstant(0x2C)
13644 : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13645
13646 SDValue ThreadPointer =
13647 DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13648 MachinePointerInfo(Ptr), false, false, false, 0);
13649
13650 // Load the _tls_index variable
13651 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13652 if (Subtarget->is64Bit())
13653 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13654 IDX, MachinePointerInfo(), MVT::i32,
13655 false, false, false, 0);
13656 else
13657 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13658 false, false, false, 0);
13659
13660 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13661 getPointerTy());
13662 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13663
13664 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13665 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13666 false, false, false, 0);
13667
13668 // Get the offset of start of .tls section
13669 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13670 GA->getValueType(0),
13671 GA->getOffset(), X86II::MO_SECREL);
13672 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13673
13674 // The address of the thread local variable is the add of the thread
13675 // pointer with the offset of the variable.
13676 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13677 }
13678
13679  llvm_unreachable("TLS not implemented for this target.");
13680}
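
// An illustrative pseudo-C sketch (parameter names are assumptions, not a real
// API) of the address computation materialized for the Windows targets above:
// ThreadLocalStoragePointer is loaded from gs:[0x58] on Win64 (or
// fs:[__tls_array] on Win32), _tls_index selects this module's TLS block, and
// the variable's @SECREL offset is added last.
static char *windowsImplicitTLSAddressSketch(char **ThreadLocalStoragePointer,
                                             unsigned TlsIndex,
                                             unsigned SecRelOffset) {
  char *ModuleTlsBlock = ThreadLocalStoragePointer[TlsIndex]; // the [rdx+rcx*8] load
  return ModuleTlsBlock + SecRelOffset;                       // add .tls$:tlsvar
}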
13681
13682/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13683/// and take a 2 x i32 value to shift plus a shift amount.
13684static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13685  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13686 MVT VT = Op.getSimpleValueType();
13687 unsigned VTBits = VT.getSizeInBits();
13688 SDLoc dl(Op);
13689 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13690 SDValue ShOpLo = Op.getOperand(0);
13691 SDValue ShOpHi = Op.getOperand(1);
13692 SDValue ShAmt = Op.getOperand(2);
13693 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13694 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13695 // during isel.
13696 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13697 DAG.getConstant(VTBits - 1, MVT::i8));
13698 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13699 DAG.getConstant(VTBits - 1, MVT::i8))
13700 : DAG.getConstant(0, VT);
13701
13702 SDValue Tmp2, Tmp3;
13703 if (Op.getOpcode() == ISD::SHL_PARTS) {
13704 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13705 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13706 } else {
13707 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13708 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13709 }
13710
13711  // If the shift amount is larger than or equal to the width of a part, we can't
13712 // rely on the results of shld/shrd. Insert a test and select the appropriate
13713 // values for large shift amounts.
13714 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13715 DAG.getConstant(VTBits, MVT::i8));
13716 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13717 AndNode, DAG.getConstant(0, MVT::i8));
13718
13719 SDValue Hi, Lo;
13720 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13721 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13722 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13723
13724 if (Op.getOpcode() == ISD::SHL_PARTS) {
13725 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13726 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13727 } else {
13728 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13729 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13730 }
13731
13732 SDValue Ops[2] = { Lo, Hi };
13733 return DAG.getMergeValues(Ops, dl);
13734}
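
// A standalone sketch (illustrative only, 32-bit parts assumed) of what the
// SHL_PARTS path above computes. SHLD only uses the shift count modulo the
// part width, so the count is masked for the plain SHL and the test against
// VTBits (bit 5 of the amount) plus a CMOV handles shifts of a full part or
// more.
struct ShiftPartsSketch { uint32_t Lo, Hi; };
static ShiftPartsSketch shlPartsSketch(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  unsigned Safe = Amt & 31;                                      // the AND with VTBits-1
  uint32_t Tmp2 = (Hi << Safe) | (Safe ? Lo >> (32 - Safe) : 0); // SHLD(Hi, Lo, Amt)
  uint32_t Tmp3 = Lo << Safe;                                    // SHL(Lo, SafeShAmt)
  if (Amt & 32)                                                  // CMP (Amt & VTBits), then CMOVNE
    return {0, Tmp3};                                            // Lo := 0, Hi := Tmp3
  return {Tmp3, Tmp2};                                           // Lo := Tmp3, Hi := Tmp2
}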
13735
13736SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13737 SelectionDAG &DAG) const {
13738 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13739 SDLoc dl(Op);
13740
13741 if (SrcVT.isVector()) {
13742 if (SrcVT.getVectorElementType() == MVT::i1) {
13743 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13744 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13745 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13746 Op.getOperand(0)));
13747 }
13748 return SDValue();
13749 }
13750
13751  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13752         "Unknown SINT_TO_FP to lower!");
13753
13754 // These are really Legal; return the operand so the caller accepts it as
13755 // Legal.
13756 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13757 return Op;
13758 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13759 Subtarget->is64Bit()) {
13760 return Op;
13761 }
13762
13763 unsigned Size = SrcVT.getSizeInBits()/8;
13764 MachineFunction &MF = DAG.getMachineFunction();
13765 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13766 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13767 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13768 StackSlot,
13769 MachinePointerInfo::getFixedStack(SSFI),
13770 false, false, 0);
13771 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13772}
13773
13774SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13775 SDValue StackSlot,
13776 SelectionDAG &DAG) const {
13777 // Build the FILD
13778 SDLoc DL(Op);
13779 SDVTList Tys;
13780 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13781 if (useSSE)
13782 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13783 else
13784 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13785
13786 unsigned ByteSize = SrcVT.getSizeInBits()/8;
13787
13788 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13789 MachineMemOperand *MMO;
13790 if (FI) {
13791 int SSFI = FI->getIndex();
13792 MMO =
13793 DAG.getMachineFunction()
13794 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13795 MachineMemOperand::MOLoad, ByteSize, ByteSize);
13796 } else {
13797 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13798 StackSlot = StackSlot.getOperand(1);
13799 }
13800 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13801 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13802 X86ISD::FILD, DL,
13803 Tys, Ops, SrcVT, MMO);
13804
13805 if (useSSE) {
13806 Chain = Result.getValue(1);
13807 SDValue InFlag = Result.getValue(2);
13808
13809 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13810 // shouldn't be necessary except that RFP cannot be live across
13811 // multiple blocks. When stackifier is fixed, they can be uncoupled.
13812 MachineFunction &MF = DAG.getMachineFunction();
13813 unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13814 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13815 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13816 Tys = DAG.getVTList(MVT::Other);
13817 SDValue Ops[] = {
13818 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13819 };
13820 MachineMemOperand *MMO =
13821 DAG.getMachineFunction()
13822 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13823 MachineMemOperand::MOStore, SSFISize, SSFISize);
13824
13825 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13826 Ops, Op.getValueType(), MMO);
13827 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13828 MachinePointerInfo::getFixedStack(SSFI),
13829 false, false, false, 0);
13830 }
13831
13832 return Result;
13833}
13834
13835// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13836SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13837 SelectionDAG &DAG) const {
13838  // This algorithm is not obvious. Here is what we're trying to output:
13839 /*
13840 movq %rax, %xmm0
13841 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13842 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13843 #ifdef __SSE3__
13844 haddpd %xmm0, %xmm0
13845 #else
13846 pshufd $0x4e, %xmm0, %xmm1
13847 addpd %xmm1, %xmm0
13848 #endif
13849 */
13850
13851 SDLoc dl(Op);
13852 LLVMContext *Context = DAG.getContext();
13853
13854 // Build some magic constants.
13855 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13856 Constant *C0 = ConstantDataVector::get(*Context, CV0);
13857 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13858
13859 SmallVector<Constant*,2> CV1;
13860 CV1.push_back(
13861 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13862 APInt(64, 0x4330000000000000ULL))));
13863 CV1.push_back(
13864 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13865 APInt(64, 0x4530000000000000ULL))));
13866 Constant *C1 = ConstantVector::get(CV1);
13867 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
13868
13869 // Load the 64-bit value into an XMM register.
13870 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13871 Op.getOperand(0));
13872 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13873 MachinePointerInfo::getConstantPool(),
13874 false, false, false, 16);
13875 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13876 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13877 CLod0);
13878
13879 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13880 MachinePointerInfo::getConstantPool(),
13881 false, false, false, 16);
13882 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13883 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13884 SDValue Result;
13885
13886 if (Subtarget->hasSSE3()) {
13887 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13888 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13889 } else {
13890 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13891 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13892 S2F, 0x4E, DAG);
13893 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13894 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13895 Sub);
13896 }
13897
13898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13899 DAG.getIntPtrConstant(0));
13900}
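
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the same bias trick as the vector code above: the low and high 32-bit halves
// are planted in the mantissas of 2^52 and 2^84, the biases are subtracted
// (subpd), and the two halves are summed (haddpd, or pshufd+addpd).
static double uint64ToDoubleBiasSketch(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // bits of 2^52 + lo32
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // bits of 2^84 + hi32*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  Lo -= 0x1.0p52;  // subpd with c1[0]
  Hi -= 0x1.0p84;  // subpd with c1[1] (0x1.0p52 * 0x1.0p32)
  return Hi + Lo;  // horizontal add; the only rounding step
}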
13901
13902// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13903SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13904 SelectionDAG &DAG) const {
13905 SDLoc dl(Op);
13906 // FP constant to bias correct the final result.
13907 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13908 MVT::f64);
13909
13910 // Load the 32-bit value into an XMM register.
13911 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13912 Op.getOperand(0));
13913
13914 // Zero out the upper parts of the register.
13915 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13916
13917 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13918 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13919 DAG.getIntPtrConstant(0));
13920
13921 // Or the load with the bias.
13922 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13923 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13924 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13925 MVT::v2f64, Load)),
13926 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13927 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13928 MVT::v2f64, Bias)));
13929 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13930 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13931 DAG.getIntPtrConstant(0));
13932
13933 // Subtract the bias.
13934 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13935
13936 // Handle final rounding.
13937 EVT DestVT = Op.getValueType();
13938
13939 if (DestVT.bitsLT(MVT::f64))
13940 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13941 DAG.getIntPtrConstant(0));
13942 if (DestVT.bitsGT(MVT::f64))
13943 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13944
13945 // Handle final rounding.
13946 return Sub;
13947}
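
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the OR-with-bias trick above: ORing the 32-bit value into the mantissa of
// 2^52 yields 2^52 + x exactly, and subtracting the Bias constant recovers x
// as a double with no rounding.
static double uint32ToDoubleBiasSketch(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // bit pattern of 2^52 + X
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                       // the FSUB with the Bias constant
}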
13948
13949static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13950 const X86Subtarget &Subtarget) {
13951 // The algorithm is the following:
13952 // #ifdef __SSE4_1__
13953 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13954 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13955 // (uint4) 0x53000000, 0xaa);
13956 // #else
13957 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13958 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13959 // #endif
13960 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13961 // return (float4) lo + fhi;
13962
13963 SDLoc DL(Op);
13964 SDValue V = Op->getOperand(0);
13965 EVT VecIntVT = V.getValueType();
13966 bool Is128 = VecIntVT == MVT::v4i32;
13967 EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13968  // If we convert to something other than the supported type, e.g., to v4f64,
13969 // abort early.
13970 if (VecFloatVT != Op->getValueType(0))
13971 return SDValue();
13972
13973 unsigned NumElts = VecIntVT.getVectorNumElements();
13974  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13975         "Unsupported custom type");
13976  assert(NumElts <= 8 && "The size of the constant array must be fixed");
13977
13978  // In the #ifdef/#else code, we have in common:
13979 // - The vector of constants:
13980 // -- 0x4b000000
13981 // -- 0x53000000
13982 // - A shift:
13983 // -- v >> 16
13984
13985 // Create the splat vector for 0x4b000000.
13986 SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
13987 SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
13988 CstLow, CstLow, CstLow, CstLow};
13989 SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13990 makeArrayRef(&CstLowArray[0], NumElts));
13991 // Create the splat vector for 0x53000000.
13992 SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
13993 SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
13994 CstHigh, CstHigh, CstHigh, CstHigh};
13995 SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13996 makeArrayRef(&CstHighArray[0], NumElts));
13997
13998 // Create the right shift.
13999 SDValue CstShift = DAG.getConstant(16, MVT::i32);
14000 SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14001 CstShift, CstShift, CstShift, CstShift};
14002 SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14003 makeArrayRef(&CstShiftArray[0], NumElts));
14004 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14005
14006 SDValue Low, High;
14007 if (Subtarget.hasSSE41()) {
14008 EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14009 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14010 SDValue VecCstLowBitcast =
14011 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14012 SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14013 // Low will be bitcasted right away, so do not bother bitcasting back to its
14014 // original type.
14015 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14016 VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14017 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14018 // (uint4) 0x53000000, 0xaa);
14019 SDValue VecCstHighBitcast =
14020 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14021 SDValue VecShiftBitcast =
14022 DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14023 // High will be bitcasted right away, so do not bother bitcasting back to
14024 // its original type.
14025 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14026 VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14027 } else {
14028 SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14029 SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14030 CstMask, CstMask, CstMask);
14031 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14032 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14033 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14034
14035 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14036 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14037 }
14038
14039 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14040 SDValue CstFAdd = DAG.getConstantFP(
14041 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14042 SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14043 CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14044 SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14045 makeArrayRef(&CstFAddArray[0], NumElts));
14046
14047 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14048 SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14049 SDValue FHigh =
14050 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14051 // return (float4) lo + fhi;
14052 SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14053 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14054}
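
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the #else branch described above: the low 16 bits ride in the mantissa of
// 2^23 and the high 16 bits in the mantissa of 2^39; subtracting (2^39 + 2^23)
// and adding the halves reconstructs the value with a single rounding at the
// final add.
static float uint32ToFloatSplitSketch(uint32_t V) {
  uint32_t LoBits = 0x4b000000u | (V & 0xffffu); // bits of 2^23 + lo16, exact
  uint32_t HiBits = 0x53000000u | (V >> 16);     // bits of 2^39 + hi16*2^16, exact
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // the CstFAdd constant, as a subtraction
  return Lo + FHi;
}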
14055
14056SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14057 SelectionDAG &DAG) const {
14058 SDValue N0 = Op.getOperand(0);
14059 MVT SVT = N0.getSimpleValueType();
14060 SDLoc dl(Op);
14061
14062 switch (SVT.SimpleTy) {
14063 default:
14064    llvm_unreachable("Custom UINT_TO_FP is not supported!");
14065 case MVT::v4i8:
14066 case MVT::v4i16:
14067 case MVT::v8i8:
14068 case MVT::v8i16: {
14069 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14070 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14071 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14072 }
14073 case MVT::v4i32:
14074 case MVT::v8i32:
14075 return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14076 }
14077  llvm_unreachable(nullptr);
14078}
14079
14080SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14081 SelectionDAG &DAG) const {
14082 SDValue N0 = Op.getOperand(0);
14083 SDLoc dl(Op);
14084
14085 if (Op.getValueType().isVector())
14086 return lowerUINT_TO_FP_vec(Op, DAG);
14087
14088 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14089 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14090 // the optimization here.
14091 if (DAG.SignBitIsZero(N0))
14092 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14093
14094 MVT SrcVT = N0.getSimpleValueType();
14095 MVT DstVT = Op.getSimpleValueType();
14096 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14097 return LowerUINT_TO_FP_i64(Op, DAG);
14098 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14099 return LowerUINT_TO_FP_i32(Op, DAG);
14100 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14101 return SDValue();
14102
14103 // Make a 64-bit buffer, and use it to build an FILD.
14104 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14105 if (SrcVT == MVT::i32) {
14106 SDValue WordOff = DAG.getConstant(4, getPointerTy());
14107 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14108 getPointerTy(), StackSlot, WordOff);
14109 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14110 StackSlot, MachinePointerInfo(),
14111 false, false, 0);
14112 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14113 OffsetSlot, MachinePointerInfo(),
14114 false, false, 0);
14115 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14116 return Fild;
14117 }
14118
14119  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14120 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14121 StackSlot, MachinePointerInfo(),
14122 false, false, 0);
14123 // For i64 source, we need to add the appropriate power of 2 if the input
14124 // was negative. This is the same as the optimization in
14125  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14126 // we must be careful to do the computation in x87 extended precision, not
14127 // in SSE. (The generic code can't know it's OK to do this, or how to.)
14128 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14129 MachineMemOperand *MMO =
14130 DAG.getMachineFunction()
14131 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14132 MachineMemOperand::MOLoad, 8, 8);
14133
14134 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14135 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14136 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14137 MVT::i64, MMO);
14138
14139 APInt FF(32, 0x5F800000ULL);
14140
14141 // Check whether the sign bit is set.
14142 SDValue SignSet = DAG.getSetCC(dl,
14143 getSetCCResultType(*DAG.getContext(), MVT::i64),
14144 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14145 ISD::SETLT);
14146
14147 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14148 SDValue FudgePtr = DAG.getConstantPool(
14149 ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14150 getPointerTy());
14151
14152 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14153 SDValue Zero = DAG.getIntPtrConstant(0);
14154 SDValue Four = DAG.getIntPtrConstant(4);
14155 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14156 Zero, Four);
14157 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14158
14159 // Load the value out, extending it from f32 to f80.
14160 // FIXME: Avoid the extend by constructing the right constant pool?
14161 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14162 FudgePtr, MachinePointerInfo::getConstantPool(),
14163 MVT::f32, false, false, false, 4);
14164 // Extend everything to 80 bits to force it to be done on x87.
14165 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14166 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14167}
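
// A standalone sketch (illustrative only; assumes long double is the x87
// 80-bit type, as the comments above require) of the i64 path: FILD converts
// the bits as a signed value, and when the sign bit was set the 0x5F800000
// fudge constant (2^64 as a float) is added back before the final rounding.
static double uint64ToDoubleFILDSketch(uint64_t X) {
  long double V = (long double)(int64_t)X; // what FILD of the 64-bit slot yields
  if ((int64_t)X < 0)
    V += 0x1.0p64L;                        // add 2^64 when the sign bit was set
  return (double)V;                        // the FP_ROUND to DstVT
}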
14168
14169std::pair<SDValue,SDValue>
14170X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14171 bool IsSigned, bool IsReplace) const {
14172 SDLoc DL(Op);
14173
14174 EVT DstTy = Op.getValueType();
14175
14176 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14177    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14178 DstTy = MVT::i64;
14179 }
14180
14181  assert(DstTy.getSimpleVT() <= MVT::i64 &&
14182         DstTy.getSimpleVT() >= MVT::i16 &&
14183         "Unknown FP_TO_INT to lower!");
14184
14185 // These are really Legal.
14186 if (DstTy == MVT::i32 &&
14187 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14188 return std::make_pair(SDValue(), SDValue());
14189 if (Subtarget->is64Bit() &&
14190 DstTy == MVT::i64 &&
14191 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14192 return std::make_pair(SDValue(), SDValue());
14193
14194 // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14195 // stack slot, or into the FTOL runtime function.
14196 MachineFunction &MF = DAG.getMachineFunction();
14197 unsigned MemSize = DstTy.getSizeInBits()/8;
14198 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14199 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14200
14201 unsigned Opc;
14202 if (!IsSigned && isIntegerTypeFTOL(DstTy))
14203 Opc = X86ISD::WIN_FTOL;
14204 else
14205 switch (DstTy.getSimpleVT().SimpleTy) {
14206    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14207 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14208 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14209 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14210 }
14211
14212 SDValue Chain = DAG.getEntryNode();
14213 SDValue Value = Op.getOperand(0);
14214 EVT TheVT = Op.getOperand(0).getValueType();
14215 // FIXME This causes a redundant load/store if the SSE-class value is already
14216 // in memory, such as if it is on the callstack.
14217 if (isScalarFPTypeInSSEReg(TheVT)) {
14218 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14219 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14220 MachinePointerInfo::getFixedStack(SSFI),
14221 false, false, 0);
14222 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14223 SDValue Ops[] = {
14224 Chain, StackSlot, DAG.getValueType(TheVT)
14225 };
14226
14227 MachineMemOperand *MMO =
14228 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14229 MachineMemOperand::MOLoad, MemSize, MemSize);
14230 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14231 Chain = Value.getValue(1);
14232 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14233 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14234 }
14235
14236 MachineMemOperand *MMO =
14237 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14238 MachineMemOperand::MOStore, MemSize, MemSize);
14239
14240 if (Opc != X86ISD::WIN_FTOL) {
14241 // Build the FP_TO_INT*_IN_MEM
14242 SDValue Ops[] = { Chain, Value, StackSlot };
14243 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14244 Ops, DstTy, MMO);
14245 return std::make_pair(FIST, StackSlot);
14246 } else {
14247 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14248 DAG.getVTList(MVT::Other, MVT::Glue),
14249 Chain, Value);
14250 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14251 MVT::i32, ftol.getValue(1));
14252 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14253 MVT::i32, eax.getValue(2));
14254 SDValue Ops[] = { eax, edx };
14255 SDValue pair = IsReplace
14256 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14257 : DAG.getMergeValues(Ops, DL);
14258 return std::make_pair(pair, SDValue());
14259 }
14260}
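// A minimal standalone sketch of what the FIST-based path above produces at
// run time, using a hypothetical helper (fpToSint64ViaStackSlot) that is not
// part of this file: the FP value is converted and stored into a stack
// temporary, and the integer result is then loaded back from that slot.
#include <cstdint>

static int64_t fpToSint64ViaStackSlot(double V) {
  // Stand-in for the frame index created with CreateStackObject(); assumes V
  // is in range for i64, as the FP_TO_SINT node itself does.
  alignas(8) int64_t Slot;
  Slot = static_cast<int64_t>(V); // FP_TO_INT64_IN_MEM: convert and store
  return Slot;                    // the load emitted later in LowerFP_TO_SINT
}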
14261
14262static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14263 const X86Subtarget *Subtarget) {
14264 MVT VT = Op->getSimpleValueType(0);
14265 SDValue In = Op->getOperand(0);
14266 MVT InVT = In.getSimpleValueType();
14267 SDLoc dl(Op);
14268
14269 // Optimize vectors in AVX mode:
14270 //
14271 // v8i16 -> v8i32
14272 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
14273 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
14274 // Concat upper and lower parts.
14275 //
14276 // v4i32 -> v4i64
14277 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
14278 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
14279 // Concat upper and lower parts.
14280 //
14281
14282 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14283 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14284 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14285 return SDValue();
14286
14287 if (Subtarget->hasInt256())
14288 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14289
14290 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14291 SDValue Undef = DAG.getUNDEF(InVT);
14292 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14293 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14294 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14295
14296 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14297 VT.getVectorNumElements()/2);
14298
14299 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14300 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14301
14302 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14303}
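// A minimal standalone sketch of the unpack-based widening above, assuming
// little-endian lane order; zextV8i16ToV8i32 is a hypothetical helper, not
// part of this lowering. Interleaving the source words with words from a
// zero vector puts a zero into the high half of every 32-bit lane, which is
// exactly a zero extension.
#include <array>
#include <cstdint>

static std::array<uint32_t, 8> zextV8i16ToV8i32(const std::array<uint16_t, 8> &In) {
  const uint16_t Zero = 0; // one lane of the getZeroVector() operand
  std::array<uint32_t, 8> Out;
  for (unsigned i = 0; i != 8; ++i)
    Out[i] = static_cast<uint32_t>(In[i]) |       // low word: the element
             (static_cast<uint32_t>(Zero) << 16); // high word: the zero
  // In the DAG, unpacklo covers elements 0..3, unpackhi covers 4..7, and the
  // two halves are rejoined with CONCAT_VECTORS.
  return Out;
}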
14304
14305static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14306 SelectionDAG &DAG) {
14307 MVT VT = Op->getSimpleValueType(0);
14308 SDValue In = Op->getOperand(0);
14309 MVT InVT = In.getSimpleValueType();
14310 SDLoc DL(Op);
14311 unsigned int NumElts = VT.getVectorNumElements();
14312 if (NumElts != 8 && NumElts != 16)
14313 return SDValue();
14314
14315 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14316 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14317
14318 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14319 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14320 // Now only the mask-extension case remains.
14321 assert(InVT.getVectorElementType() == MVT::i1);
14322 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14323 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14324 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14325 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14326 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14327 MachinePointerInfo::getConstantPool(),
14328 false, false, false, Alignment);
14329
14330 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14331 if (VT.is512BitVector())
14332 return Brcst;
14333 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14334}
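// A standalone sketch of the mask-extension case handled above, with a
// hypothetical helper (zextMaskToV8i64): zero-extending a vector of i1 mask
// bits amounts to selecting the constant 1 or 0 per lane, which is what the
// VBROADCASTM of the loaded splat constant performs in one step.
#include <array>
#include <bitset>
#include <cstdint>

static std::array<uint64_t, 8> zextMaskToV8i64(const std::bitset<8> &Mask) {
  std::array<uint64_t, 8> Out;
  for (unsigned i = 0; i != 8; ++i)
    Out[i] = Mask[i] ? 1u : 0u; // broadcast the constant 1 under the mask
  return Out;
}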
14335
14336static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14337 SelectionDAG &DAG) {
14338 if (Subtarget->hasFp256()) {
14339 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14340 if (Res.getNode())
14341 return Res;
14342 }
14343
14344 return SDValue();
14345}
14346
14347static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14348 SelectionDAG &DAG) {
14349 SDLoc DL(Op);
14350 MVT VT = Op.getSimpleValueType();
14351 SDValue In = Op.getOperand(0);
14352 MVT SVT = In.getSimpleValueType();
14353
14354 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14355 return LowerZERO_EXTEND_AVX512(Op, DAG);
14356
14357 if (Subtarget->hasFp256()) {
14358 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14359 if (Res.getNode())
14360 return Res;
14361 }
14362
14363 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14364 VT.getVectorNumElements() != SVT.getVectorNumElements());
14365 return SDValue();
14366}
14367
14368SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14369 SDLoc DL(Op);
14370 MVT VT = Op.getSimpleValueType();
14371 SDValue In = Op.getOperand(0);
14372 MVT InVT = In.getSimpleValueType();
14373
14374 if (VT == MVT::i1) {
14375 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14376 "Invalid scalar TRUNCATE operation");
14377 if (InVT.getSizeInBits() >= 32)
14378 return SDValue();
14379 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14380 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14381 }
14382 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14383 "Invalid TRUNCATE operation");
14384
14385 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14386 if (VT.getVectorElementType().getSizeInBits() >=8)
14387 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14388
14389 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14390 unsigned NumElts = InVT.getVectorNumElements();
14391 assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14392 if (InVT.getSizeInBits() < 512) {
14393 MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14394 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14395 InVT = ExtVT;
14396 }
14397
14398 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14399 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14400 SDValue CP = DAG.getConstantPool(C, getPointerTy());
14401 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14402 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14403 MachinePointerInfo::getConstantPool(),
14404 false, false, false, Alignment);
14405 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14406 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14407 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14408 }
14409
14410 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14411 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14412 if (Subtarget->hasInt256()) {
14413 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14414 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14415 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14416 ShufMask);
14417 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14418 DAG.getIntPtrConstant(0));
14419 }
14420
14421 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14422 DAG.getIntPtrConstant(0));
14423 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14424 DAG.getIntPtrConstant(2));
14425 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14426 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14427 static const int ShufMask[] = {0, 2, 4, 6};
14428 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14429 }
14430
14431 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14432 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14433 if (Subtarget->hasInt256()) {
14434 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14435
14436 SmallVector<SDValue,32> pshufbMask;
14437 for (unsigned i = 0; i < 2; ++i) {
14438 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14439 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14440 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14441 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14442 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14443 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14444 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14445 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14446 for (unsigned j = 0; j < 8; ++j)
14447 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14448 }
14449 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14450 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14451 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14452
14453 static const int ShufMask[] = {0, 2, -1, -1};
14454 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
14455 &ShufMask[0]);
14456 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14457 DAG.getIntPtrConstant(0));
14458 return DAG.getNode(ISD::BITCAST, DL, VT, In);
14459 }
14460
14461 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14462 DAG.getIntPtrConstant(0));
14463
14464 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14465 DAG.getIntPtrConstant(4));
14466
14467 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14468 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14469
14470 // The PSHUFB mask:
14471 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
14472 -1, -1, -1, -1, -1, -1, -1, -1};
14473
14474 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14475 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14476 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14477
14478 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14479 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14480
14481 // The MOVLHPS Mask:
14482 static const int ShufMask2[] = {0, 1, 4, 5};
14483 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14484 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14485 }
14486
14487 // Handle truncation of V256 to V128 using shuffles.
14488 if (!VT.is128BitVector() || !InVT.is256BitVector())
14489 return SDValue();
14490
14491 assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14492
14493 unsigned NumElems = VT.getVectorNumElements();
14494 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14495
14496 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14497 // Prepare truncation shuffle mask
14498 for (unsigned i = 0; i != NumElems; ++i)
14499 MaskVec[i] = i * 2;
14500 SDValue V = DAG.getVectorShuffle(NVT, DL,
14501 DAG.getNode(ISD::BITCAST, DL, NVT, In),
14502 DAG.getUNDEF(NVT), &MaskVec[0]);
14503 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14504 DAG.getIntPtrConstant(0));
14505}
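// A standalone sketch of the shuffle-based truncations above, assuming
// little-endian lane order; truncV4i64ToV4i32 is a hypothetical helper.
// Keeping only the even 32-bit lanes (mask {0, 2, 4, 6}) of the bitcast
// input drops the high half of every 64-bit element, which is exactly an
// integer truncation.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> truncV4i64ToV4i32(const std::array<uint64_t, 4> &In) {
  std::array<uint32_t, 4> Out;
  for (unsigned i = 0; i != 4; ++i)
    Out[i] = static_cast<uint32_t>(In[i]); // low 32 bits == even lane 2*i
  return Out;
}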
14506
14507SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14508 SelectionDAG &DAG) const {
14509 assert(!Op.getSimpleValueType().isVector());
14510
14511 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14512 /*IsSigned=*/ true, /*IsReplace=*/ false);
14513 SDValue FIST = Vals.first, StackSlot = Vals.second;
14514 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14515 if (!FIST.getNode()) return Op;
14516
14517 if (StackSlot.getNode())
14518 // Load the result.
14519 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14520 FIST, StackSlot, MachinePointerInfo(),
14521 false, false, false, 0);
14522
14523 // The node is the result.
14524 return FIST;
14525}
14526
14527SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14528 SelectionDAG &DAG) const {
14529 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14530 /*IsSigned=*/ false, /*IsReplace=*/ false);
14531 SDValue FIST = Vals.first, StackSlot = Vals.second;
14532 assert(FIST.getNode() && "Unexpected failure");
14533
14534 if (StackSlot.getNode())
14535 // Load the result.
14536 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14537 FIST, StackSlot, MachinePointerInfo(),
14538 false, false, false, 0);
14539
14540 // The node is the result.
14541 return FIST;
14542}
14543
14544static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14545 SDLoc DL(Op);
14546 MVT VT = Op.getSimpleValueType();
14547 SDValue In = Op.getOperand(0);
14548 MVT SVT = In.getSimpleValueType();
14549
14550 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14551
14552 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14553 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14554 In, DAG.getUNDEF(SVT)));
14555}
14556
14557/// The only differences between FABS and FNEG are the mask and the logic op.
14558/// FNEG also has a folding opportunity for FNEG(FABS(x)).
14559static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14560 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14561 "Wrong opcode for lowering FABS or FNEG.");
14562
14563 bool IsFABS = (Op.getOpcode() == ISD::FABS);
14564
14565 // If this is a FABS and it has an FNEG user, bail out to fold the combination
14566 // into an FNABS. We'll lower the FABS after that if it is still in use.
14567 if (IsFABS)
14568 for (SDNode *User : Op->uses())
14569 if (User->getOpcode() == ISD::FNEG)
14570 return Op;
14571
14572 SDValue Op0 = Op.getOperand(0);
14573 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14574
14575 SDLoc dl(Op);
14576 MVT VT = Op.getSimpleValueType();
14577 // Assume scalar op for initialization; update for vector if needed.
14578 // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14579 // generate a 16-byte vector constant and logic op even for the scalar case.
14580 // Using a 16-byte mask allows folding the load of the mask with
14581 // the logic op, so it can save (~4 bytes) on code size.
14582 MVT EltVT = VT;
14583 unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14584 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14585 // decide if we should generate a 16-byte constant mask when we only need 4 or
14586 // 8 bytes for the scalar case.
14587 if (VT.isVector()) {
14588 EltVT = VT.getVectorElementType();
14589 NumElts = VT.getVectorNumElements();
14590 }
14591
14592 unsigned EltBits = EltVT.getSizeInBits();
14593 LLVMContext *Context = DAG.getContext();
14594 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14595 APInt MaskElt =
14596 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14597 Constant *C = ConstantInt::get(*Context, MaskElt);
14598 C = ConstantVector::getSplat(NumElts, C);
14599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14600 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14601 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14602 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14603 MachinePointerInfo::getConstantPool(),
14604 false, false, false, Alignment);
14605
14606 if (VT.isVector()) {
14607 // For a vector, cast operands to a vector type, perform the logic op,
14608 // and cast the result back to the original value type.
14609 MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14610 SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14611 SDValue Operand = IsFNABS ?
14612 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14613 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14614 unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14615 return DAG.getNode(ISD::BITCAST, dl, VT,
14616 DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14617 }
14618
14619 // If not vector, then scalar.
14620 unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14621 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14622 return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14623}
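// A standalone scalar sketch of the constant masks built above for the f32
// case (fabsViaMask and fnegViaMask are hypothetical helpers): FABS clears
// the sign bit with an AND against 0x7fffffff, FNEG flips it with an XOR
// against 0x80000000, and FNABS would set it with an OR.
#include <cstdint>
#include <cstring>

static float fabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;              // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

static float fnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;              // APInt::getSignBit(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}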
14624
14625static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14627 LLVMContext *Context = DAG.getContext();
14628 SDValue Op0 = Op.getOperand(0);
14629 SDValue Op1 = Op.getOperand(1);
14630 SDLoc dl(Op);
14631 MVT VT = Op.getSimpleValueType();
14632 MVT SrcVT = Op1.getSimpleValueType();
14633
14634 // If second operand is smaller, extend it first.
14635 if (SrcVT.bitsLT(VT)) {
14636 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14637 SrcVT = VT;
14638 }
14639 // And if it is bigger, shrink it first.
14640 if (SrcVT.bitsGT(VT)) {
14641 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14642 SrcVT = VT;
14643 }
14644
14645 // At this point the operands and the result should have the same
14646 // type, and that won't be f80 since that is not custom lowered.
14647
14648 const fltSemantics &Sem =
14649 VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14650 const unsigned SizeInBits = VT.getSizeInBits();
14651
14652 SmallVector<Constant *, 4> CV(
14653 VT == MVT::f64 ? 2 : 4,
14654 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14655
14656 // First, clear all bits but the sign bit from the second operand (sign).
14657 CV[0] = ConstantFP::get(*Context,
14658 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14659 Constant *C = ConstantVector::get(CV);
14660 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14661 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14662 MachinePointerInfo::getConstantPool(),
14663 false, false, false, 16);
14664 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14665
14666 // Next, clear the sign bit from the first operand (magnitude).
14667 // If it's a constant, we can clear it here.
14668 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14669 APFloat APF = Op0CN->getValueAPF();
14670 // If the magnitude is a positive zero, the sign bit alone is enough.
14671 if (APF.isPosZero())
14672 return SignBit;
14673 APF.clearSign();
14674 CV[0] = ConstantFP::get(*Context, APF);
14675 } else {
14676 CV[0] = ConstantFP::get(
14677 *Context,
14678 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14679 }
14680 C = ConstantVector::get(CV);
14681 CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14682 SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14683 MachinePointerInfo::getConstantPool(),
14684 false, false, false, 16);
14685 // If the magnitude operand wasn't a constant, we need to AND out the sign.
14686 if (!isa<ConstantFPSDNode>(Op0))
14687 Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14688
14689 // OR the magnitude value with the sign bit.
14690 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14691}
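// A standalone scalar sketch of the two masks used above for f64
// (copysignViaMasks is a hypothetical helper): keep only the sign bit of the
// sign operand, clear the sign bit of the magnitude operand, then OR the two.
#include <cstdint>
#include <cstring>

static double copysignViaMasks(double Mag, double Sgn) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sgn, sizeof(S));
  uint64_t SignBit = S & 0x8000000000000000ULL; // getHighBitsSet(64, 1)
  uint64_t Rest    = M & 0x7fffffffffffffffULL; // getLowBitsSet(64, 63)
  uint64_t R = Rest | SignBit;                  // X86ISD::FOR of the two parts
  double Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}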
14692
14693static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14694 SDValue N0 = Op.getOperand(0);
14695 SDLoc dl(Op);
14696 MVT VT = Op.getSimpleValueType();
14697
14698 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14699 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14700 DAG.getConstant(1, VT));
14701 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14702}
14703
14704// Check whether an OR'd tree is PTEST-able.
14705static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14706 SelectionDAG &DAG) {
14707 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14708
14709 if (!Subtarget->hasSSE41())
14710 return SDValue();
14711
14712 if (!Op->hasOneUse())
14713 return SDValue();
14714
14715 SDNode *N = Op.getNode();
14716 SDLoc DL(N);
14717
14718 SmallVector<SDValue, 8> Opnds;
14719 DenseMap<SDValue, unsigned> VecInMap;
14720 SmallVector<SDValue, 8> VecIns;
14721 EVT VT = MVT::Other;
14722
14723 // Recognize a special case where a vector is cast into a wide integer to
14724 // test all 0s.
14725 Opnds.push_back(N->getOperand(0));
14726 Opnds.push_back(N->getOperand(1));
14727
14728 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14729 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14730 // BFS traverse all OR'd operands.
14731 if (I->getOpcode() == ISD::OR) {
14732 Opnds.push_back(I->getOperand(0));
14733 Opnds.push_back(I->getOperand(1));
14734 // Re-evaluate the number of nodes to be traversed.
14735 e += 2; // 2 more nodes (LHS and RHS) are pushed.
14736 continue;
14737 }
14738
14739 // Quit if this is not an EXTRACT_VECTOR_ELT.
14740 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14741 return SDValue();
14742
14744 // Quit if the index is not a constant.
14744 SDValue Idx = I->getOperand(1);
14745 if (!isa<ConstantSDNode>(Idx))
14746 return SDValue();
14747
14748 SDValue ExtractedFromVec = I->getOperand(0);
14749 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14750 if (M == VecInMap.end()) {
14751 VT = ExtractedFromVec.getValueType();
14752 // Quit if not 128/256-bit vector.
14753 if (!VT.is128BitVector() && !VT.is256BitVector())
14754 return SDValue();
14755 // Quit if not the same type.
14756 if (VecInMap.begin() != VecInMap.end() &&
14757 VT != VecInMap.begin()->first.getValueType())
14758 return SDValue();
14759 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14760 VecIns.push_back(ExtractedFromVec);
14761 }
14762 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14763 }
14764
14765 assert((VT.is128BitVector() || VT.is256BitVector()) &&
14766 "Not extracted from 128-/256-bit vector.");
14767
14768 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14769
14770 for (DenseMap<SDValue, unsigned>::const_iterator
14771 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14772 // Quit if not all elements are used.
14773 if (I->second != FullMask)
14774 return SDValue();
14775 }
14776
14777 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14778
14779 // Cast all vectors into TestVT for PTEST.
14780 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14781 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14782
14783 // If more than one full vector is evaluated, OR them first before PTEST.
14784 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14785 // Each iteration will OR 2 nodes and append the result until there is only
14786 // 1 node left, i.e. the final OR'd value of all vectors.
14787 SDValue LHS = VecIns[Slot];
14788 SDValue RHS = VecIns[Slot + 1];
14789 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14790 }
14791
14792 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14793 VecIns.back(), VecIns.back());
14794}
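// A standalone sketch of the pattern this routine recognizes, using a
// hypothetical helper (allZeroViaOrTree): a wide value built by OR-ing every
// extracted element of a vector is zero exactly when the whole vector is
// zero, so the OR tree can be replaced by a single PTEST of the vector.
#include <array>
#include <cstdint>

static bool allZeroViaOrTree(const std::array<uint64_t, 4> &Vec) {
  uint64_t Acc = 0;
  for (uint64_t Elt : Vec)
    Acc |= Elt;        // models the OR'd tree of EXTRACT_VECTOR_ELTs
  return Acc == 0;     // the condition PTEST evaluates on the vector itself
}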
14795
14796/// \brief return true if \c Op has a use that doesn't just read flags.
14797static bool hasNonFlagsUse(SDValue Op) {
14798 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14799 ++UI) {
14800 SDNode *User = *UI;
14801 unsigned UOpNo = UI.getOperandNo();
14802 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14803 // Look past the truncate.
14804 UOpNo = User->use_begin().getOperandNo();
14805 User = *User->use_begin();
14806 }
14807
14808 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14809 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14810 return true;
14811 }
14812 return false;
14813}
14814
14815/// Emit nodes that will be selected as "test Op0,Op0", or something
14816/// equivalent.
14817SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
14818 SelectionDAG &DAG) const {
14819 if (Op.getValueType() == MVT::i1)
14820 // KORTEST instruction should be selected
14821 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14822 DAG.getConstant(0, Op.getValueType()));
14823
14824 // CF and OF aren't always set the way we want. Determine which
14825 // of these we need.
14826 bool NeedCF = false;
14827 bool NeedOF = false;
14828 switch (X86CC) {
14829 default: break;
14830 case X86::COND_A: case X86::COND_AE:
14831 case X86::COND_B: case X86::COND_BE:
14832 NeedCF = true;
14833 break;
14834 case X86::COND_G: case X86::COND_GE:
14835 case X86::COND_L: case X86::COND_LE:
14836 case X86::COND_O: case X86::COND_NO: {
14837 // Check if we really need to set the
14838 // Overflow flag. If NoSignedWrap is present,
14839 // that is not actually needed.
14840 switch (Op->getOpcode()) {
14841 case ISD::ADD:
14842 case ISD::SUB:
14843 case ISD::MUL:
14844 case ISD::SHL: {
14845 const BinaryWithFlagsSDNode *BinNode =
14846 cast<BinaryWithFlagsSDNode>(Op.getNode());
14847 if (BinNode->hasNoSignedWrap())
14848 break;
14849 }
14850 default:
14851 NeedOF = true;
14852 break;
14853 }
14854 break;
14855 }
14856 }
14857 // See if we can use the EFLAGS value from the operand instead of
14858 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14859 // we prove that the arithmetic won't overflow, we can't use OF or CF.
14860 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14861 // Emit a CMP with 0, which is the TEST pattern.
14862 //if (Op.getValueType() == MVT::i1)
14863 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
14864 // DAG.getConstant(0, MVT::i1));
14865 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14866 DAG.getConstant(0, Op.getValueType()));
14867 }
14868 unsigned Opcode = 0;
14869 unsigned NumOperands = 0;
14870
14871 // Truncate operations may prevent the merge of the SETCC instruction
14872 // and the arithmetic instruction before it. Attempt to truncate the operands
14873 // of the arithmetic instruction and use a reduced bit-width instruction.
14874 bool NeedTruncation = false;
14875 SDValue ArithOp = Op;
14876 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14877 SDValue Arith = Op->getOperand(0);
14878 // Both the trunc and the arithmetic op need to have one user each.
14879 if (Arith->hasOneUse())
14880 switch (Arith.getOpcode()) {
14881 default: break;
14882 case ISD::ADD:
14883 case ISD::SUB:
14884 case ISD::AND:
14885 case ISD::OR:
14886 case ISD::XOR: {
14887 NeedTruncation = true;
14888 ArithOp = Arith;
14889 }
14890 }
14891 }
14892
14893 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14894 // which may be the result of a CAST. We use the variable 'Op', which is the
14895 // non-casted variable when we check for possible users.
14896 switch (ArithOp.getOpcode()) {
14897 case ISD::ADD:
14898 // Due to an isel shortcoming, be conservative if this add is likely to be
14899 // selected as part of a load-modify-store instruction. When the root node
14900 // in a match is a store, isel doesn't know how to remap non-chain non-flag
14901 // uses of other nodes in the match, such as the ADD in this case. This
14902 // leads to the ADD being left around and reselected, with the result being
14903 // two adds in the output. Alas, even if none of our users are stores, that
14904 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
14905 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
14906 // climbing the DAG back to the root, and it doesn't seem to be worth the
14907 // effort.
14908 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14909 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14910 if (UI->getOpcode() != ISD::CopyToReg &&
14911 UI->getOpcode() != ISD::SETCC &&
14912 UI->getOpcode() != ISD::STORE)
14913 goto default_case;
14914
14915 if (ConstantSDNode *C =
14916 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14917 // An add of one will be selected as an INC.
14918 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
14919 Opcode = X86ISD::INC;
14920 NumOperands = 1;
14921 break;
14922 }
14923
14924 // An add of negative one (subtract of one) will be selected as a DEC.
14925 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
14926 Opcode = X86ISD::DEC;
14927 NumOperands = 1;
14928 break;
14929 }
14930 }
14931
14932 // Otherwise use a regular EFLAGS-setting add.
14933 Opcode = X86ISD::ADD;
14934 NumOperands = 2;
14935 break;
14936 case ISD::SHL:
14937 case ISD::SRL:
14938 // If we have a constant logical shift that's only used in a comparison
14939 // against zero turn it into an equivalent AND. This allows turning it into
14940 // a TEST instruction later.
14941 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14942 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14943 EVT VT = Op.getValueType();
14944 unsigned BitWidth = VT.getSizeInBits();
14945 unsigned ShAmt = Op->getConstantOperandVal(1);
14946 if (ShAmt >= BitWidth) // Avoid undefined shifts.
14947 break;
14948 APInt Mask = ArithOp.getOpcode() == ISD::SRL
14949 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14950 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14951 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14952 break;
14953 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14954 DAG.getConstant(Mask, VT));
14955 DAG.ReplaceAllUsesWith(Op, New);
14956 Op = New;
14957 }
14958 break;
14959
14960 case ISD::AND:
14961 // If the primary result of the AND isn't used, don't bother using X86ISD::AND,
14962 // because a TEST instruction will be better.
14963 if (!hasNonFlagsUse(Op))
14964 break;
14965 // FALL THROUGH
14966 case ISD::SUB:
14967 case ISD::OR:
14968 case ISD::XOR:
14969 // Due to the ISEL shortcoming noted above, be conservative if this op is
14970 // likely to be selected as part of a load-modify-store instruction.
14971 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14972 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14973 if (UI->getOpcode() == ISD::STORE)
14974 goto default_case;
14975
14976 // Otherwise use a regular EFLAGS-setting instruction.
14977 switch (ArithOp.getOpcode()) {
14978 default: llvm_unreachable("unexpected operator!");
14979 case ISD::SUB: Opcode = X86ISD::SUB; break;
14980 case ISD::XOR: Opcode = X86ISD::XOR; break;
14981 case ISD::AND: Opcode = X86ISD::AND; break;
14982 case ISD::OR: {
14983 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14984 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
14985 if (EFLAGS.getNode())
14986 return EFLAGS;
14987 }
14988 Opcode = X86ISD::OR;
14989 break;
14990 }
14991 }
14992
14993 NumOperands = 2;
14994 break;
14995 case X86ISD::ADD:
14996 case X86ISD::SUB:
14997 case X86ISD::INC:
14998 case X86ISD::DEC:
14999 case X86ISD::OR:
15000 case X86ISD::XOR:
15001 case X86ISD::AND:
15002 return SDValue(Op.getNode(), 1);
15003 default:
15004 default_case:
15005 break;
15006 }
15007
15008 // If we found that truncation is beneficial, perform the truncation and
15009 // update 'Op'.
15010 if (NeedTruncation) {
15011 EVT VT = Op.getValueType();
15012 SDValue WideVal = Op->getOperand(0);
15013 EVT WideVT = WideVal.getValueType();
15014 unsigned ConvertedOp = 0;
15015 // Use a target machine opcode to prevent further DAGCombine
15016 // optimizations that may separate the arithmetic operations
15017 // from the setcc node.
15018 switch (WideVal.getOpcode()) {
15019 default: break;
15020 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15021 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15022 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15023 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15024 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15025 }
15026
15027 if (ConvertedOp) {
15028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15029 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15030 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15031 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15032 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15033 }
15034 }
15035 }
15036
15037 if (Opcode == 0)
15038 // Emit a CMP with 0, which is the TEST pattern.
15039 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15040 DAG.getConstant(0, Op.getValueType()));
15041
15042 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15043 SmallVector<SDValue, 4> Ops;
15044 for (unsigned i = 0; i != NumOperands; ++i)
15045 Ops.push_back(Op.getOperand(i));
15046
15047 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15048 DAG.ReplaceAllUsesWith(Op, New);
15049 return SDValue(New.getNode(), 1);
15050}
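// A standalone sketch of the shift-to-TEST rewrite in the ISD::SRL case
// above (srlIsZero is a hypothetical helper, and ShAmt is assumed to be less
// than the bit width): comparing a constant logical shift against zero only
// asks whether the surviving bits are zero, so the shift can be replaced by
// an AND with the corresponding mask, which later becomes a TEST.
#include <cstdint>

static bool srlIsZero(uint32_t X, unsigned ShAmt) {
  uint32_t Mask = ~0u << ShAmt;  // APInt::getHighBitsSet(32, 32 - ShAmt)
  return (X & Mask) == 0;        // same truth value as (X >> ShAmt) == 0
}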
15051
15052/// Emit nodes that will be selected as "cmp Op0,Op1", or something
15053/// equivalent.
15054SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15055 SDLoc dl, SelectionDAG &DAG) const {
15056 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15057 if (C->getAPIntValue() == 0)
15058 return EmitTest(Op0, X86CC, dl, DAG);
15059
15060 if (Op0.getValueType() == MVT::i1)
15061 llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15062 }
15063
15064 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15065 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15066 // Do the comparison at i32 if it's smaller, besides the Atom case.
15067 // This avoids subregister aliasing issues. Keep the smaller reference
15068 // if we're optimizing for size, however, as that'll allow better folding
15069 // of memory operations.
15070 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15071 !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15072 AttributeSet::FunctionIndex, Attribute::MinSize) &&
15073 !Subtarget->isAtom()) {
15074 unsigned ExtendOp =
15075 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15076 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15077 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15078 }
15079 // Use SUB instead of CMP to enable CSE between SUB and CMP.
15080 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15081 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15082 Op0, Op1);
15083 return SDValue(Sub.getNode(), 1);
15084 }
15085 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15086}
15087
15088/// Convert a comparison if required by the subtarget.
15089SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15090 SelectionDAG &DAG) const {
15091 // If the subtarget does not support the FUCOMI instruction, floating-point
15092 // comparisons have to be converted.
15093 if (Subtarget->hasCMov() ||
15094 Cmp.getOpcode() != X86ISD::CMP ||
15095 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15096 !Cmp.getOperand(1).getValueType().isFloatingPoint())
15097 return Cmp;
15098
15099 // The instruction selector will select an FUCOM instruction instead of
15100 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15101 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15102 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15103 SDLoc dl(Cmp);
15104 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15105 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15106 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15107 DAG.getConstant(8, MVT::i8));
15108 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15109 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15110}
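// A standalone sketch of the FPSW-to-EFLAGS transfer built above
// (conditionByteFromFPSW is a hypothetical helper): FNSTSW stores the x87
// status word, the SRL by 8 moves the condition bits into the low byte, and
// SAHF copies that byte into EFLAGS.
#include <cstdint>

static uint8_t conditionByteFromFPSW(uint16_t FPSW) {
  return static_cast<uint8_t>(FPSW >> 8); // the byte SAHF loads into EFLAGS
}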
15111
15112/// The minimum architected relative accuracy is 2^-12. We need one
15113/// Newton-Raphson step to have a good float result (24 bits of precision).
15114SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15115 DAGCombinerInfo &DCI,
15116 unsigned &RefinementSteps,
15117 bool &UseOneConstNR) const {
15118 // FIXME: We should use instruction latency models to calculate the cost of
15119 // each potential sequence, but this is very hard to do reliably because
15120 // at least Intel's Core* chips have variable timing based on the number of
15121 // significant digits in the divisor and/or sqrt operand.
15122 if (!Subtarget->useSqrtEst())
15123 return SDValue();
15124
15125 EVT VT = Op.getValueType();
15126
15127 // SSE1 has rsqrtss and rsqrtps.
15128 // TODO: Add support for AVX512 (v16f32).
15129 // It is likely not profitable to do this for f64 because a double-precision
15130 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15131 // instructions: convert to single, rsqrtss, convert back to double, refine
15132 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15133 // along with FMA, this could be a throughput win.
15134 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15135 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15136 RefinementSteps = 1;
15137 UseOneConstNR = false;
15138 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15139 }
15140 return SDValue();
15141}
15142
15143/// The minimum architected relative accuracy is 2^-12. We need one
15144/// Newton-Raphson step to have a good float result (24 bits of precision).
15145SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15146 DAGCombinerInfo &DCI,
15147 unsigned &RefinementSteps) const {
15148 // FIXME: We should use instruction latency models to calculate the cost of
15149 // each potential sequence, but this is very hard to do reliably because
15150 // at least Intel's Core* chips have variable timing based on the number of
15151 // significant digits in the divisor.
15152 if (!Subtarget->useReciprocalEst())
15153 return SDValue();
15154
15155 EVT VT = Op.getValueType();
15156
15157 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15158 // TODO: Add support for AVX512 (v16f32).
15159 // It is likely not profitable to do this for f64 because a double-precision
15160 // reciprocal estimate with refinement on x86 prior to FMA requires
15161 // 15 instructions: convert to single, rcpss, convert back to double, refine
15162 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15163 // along with FMA, this could be a throughput win.
15164 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15165 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15166 RefinementSteps = ReciprocalEstimateRefinementSteps;
15167 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15168 }
15169 return SDValue();
15170}
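// A standalone sketch of the Newton-Raphson refinement that the estimate
// nodes above rely on (refineRecip and refineRsqrt are hypothetical helpers):
// one step applied to the ~2^-12-accurate hardware estimate x0 is enough for
// a ~24-bit float result.
//   reciprocal of a:   x1 = x0 * (2 - a * x0)
//   rsqrt of a:        x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
static float refineRecip(float A, float X0) {
  return X0 * (2.0f - A * X0);
}

static float refineRsqrt(float A, float X0) {
  return X0 * (1.5f - 0.5f * A * X0 * X0);
}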
15171
15172static bool isAllOnes(SDValue V) {
15173 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15174 return C && C->isAllOnesValue();
15175}
15176
15177/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15178/// if it's possible.
15179SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15180 SDLoc dl, SelectionDAG &DAG) const {
15181 SDValue Op0 = And.getOperand(0);
15182 SDValue Op1 = And.getOperand(1);
15183 if (Op0.getOpcode() == ISD::TRUNCATE)
15184 Op0 = Op0.getOperand(0);
15185 if (Op1.getOpcode() == ISD::TRUNCATE)
15186 Op1 = Op1.getOperand(0);
15187
15188 SDValue LHS, RHS;
15189 if (Op1.getOpcode() == ISD::SHL)
15190 std::swap(Op0, Op1);
15191 if (Op0.getOpcode() == ISD::SHL) {
15192 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15193 if (And00C->getZExtValue() == 1) {
15194 // If we looked past a truncate, check that it's only truncating away
15195 // known zeros.
15196 unsigned BitWidth = Op0.getValueSizeInBits();
15197 unsigned AndBitWidth = And.getValueSizeInBits();
15198 if (BitWidth > AndBitWidth) {
15199 APInt Zeros, Ones;
15200 DAG.computeKnownBits(Op0, Zeros, Ones);
15201 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15202 return SDValue();
15203 }
15204 LHS = Op1;
15205 RHS = Op0.getOperand(1);
15206 }
15207 } else if (Op1.getOpcode() == ISD::Constant) {
15208 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15209 uint64_t AndRHSVal = AndRHS->getZExtValue();
15210 SDValue AndLHS = Op0;
15211
15212 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15213 LHS = AndLHS.getOperand(0);
15214 RHS = AndLHS.getOperand(1);
15215 }
15216
15217 // Use BT if the immediate can't be encoded in a TEST instruction.
15218 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15219 LHS = AndLHS;
15220 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15221 }
15222 }
15223
15224 if (LHS.getNode()) {
15225 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
15226 // instruction. Since the shift amount is in-range-or-undefined, we know
15227 // that doing a bittest on the i32 value is ok. We extend to i32 because
15228 // the encoding for the i16 version is larger than the i32 version.
15229 // Also promote i16 to i32 for performance / code size reasons.
15230 if (LHS.getValueType() == MVT::i8 ||
15231 LHS.getValueType() == MVT::i16)
15232 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15233
15234 // If the operand types disagree, extend the shift amount to match. Since
15235 // BT ignores high bits (like shifts) we can use anyextend.
15236 if (LHS.getValueType() != RHS.getValueType())
15237 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15238
15239 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15240 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15241 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15242 DAG.getConstant(Cond, MVT::i8), BT);
15243 }
15244
15245 return SDValue();
15246}
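// A standalone sketch of the equivalence this routine exploits (bitTest is a
// hypothetical helper, and N is assumed to be in range): comparing
// "X & (1 << N)" or "(X >> N) & 1" against zero is a single bit test of bit
// N, which is what BT places into CF.
#include <cstdint>

static bool bitTest(uint64_t X, unsigned N) {
  return ((X >> N) & 1u) != 0;
}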
15247
15248/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15249/// mask CMPs.
15250static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15251 SDValue &Op1) {
15252 unsigned SSECC;
15253 bool Swap = false;
15254
15255 // SSE Condition code mapping:
15256 // 0 - EQ
15257 // 1 - LT
15258 // 2 - LE
15259 // 3 - UNORD
15260 // 4 - NEQ
15261 // 5 - NLT
15262 // 6 - NLE
15263 // 7 - ORD
15264 switch (SetCCOpcode) {
15265 default: llvm_unreachable("Unexpected SETCC condition");
15266 case ISD::SETOEQ:
15267 case ISD::SETEQ: SSECC = 0; break;
15268 case ISD::SETOGT:
15269 case ISD::SETGT: Swap = true; // Fallthrough
15270 case ISD::SETLT:
15271 case ISD::SETOLT: SSECC = 1; break;
15272 case ISD::SETOGE:
15273 case ISD::SETGE: Swap = true; // Fallthrough
15274 case ISD::SETLE:
15275 case ISD::SETOLE: SSECC = 2; break;
15276 case ISD::SETUO: SSECC = 3; break;
15277 case ISD::SETUNE:
15278 case ISD::SETNE: SSECC = 4; break;
15279 case ISD::SETULE: Swap = true; // Fallthrough
15280 case ISD::SETUGE: SSECC = 5; break;
15281 case ISD::SETULT: Swap = true; // Fallthrough
15282 case ISD::SETUGT: SSECC = 6; break;
15283 case ISD::SETO: SSECC = 7; break;
15284 case ISD::SETUEQ:
15285 case ISD::SETONE: SSECC = 8; break;
15286 }
15287 if (Swap)
15288 std::swap(Op0, Op1);
15289
15290 return SSECC;
15291}
15292
15293// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
15294// ones, and then concatenate the result back.
15295static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15296 MVT VT = Op.getSimpleValueType();
15297
15298 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15299 "Unsupported value type for operation");
15300
15301 unsigned NumElems = VT.getVectorNumElements();
15302 SDLoc dl(Op);
15303 SDValue CC = Op.getOperand(2);
15304
15305 // Extract the LHS vectors
15306 SDValue LHS = Op.getOperand(0);
15307 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15308 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15309
15310 // Extract the RHS vectors
15311 SDValue RHS = Op.getOperand(1);
15312 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15313 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15314
15315 // Issue the operation on the smaller types and concatenate the result back
15316 MVT EltVT = VT.getVectorElementType();
15317 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15318 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15319 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15320 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15321}
15322
15323static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15324 const X86Subtarget *Subtarget) {
15325 SDValue Op0 = Op.getOperand(0);
15326 SDValue Op1 = Op.getOperand(1);
15327 SDValue CC = Op.getOperand(2);
15328 MVT VT = Op.getSimpleValueType();
15329 SDLoc dl(Op);
15330
15331 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15332 Op.getValueType().getScalarType() == MVT::i1 &&
15333 "Cannot set masked compare for this operation");
15334
15335 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15336 unsigned Opc = 0;
15337 bool Unsigned = false;
15338 bool Swap = false;
15339 unsigned SSECC;
15340 switch (SetCCOpcode) {
15341 default: llvm_unreachable("Unexpected SETCC condition");
15342 case ISD::SETNE: SSECC = 4; break;
15343 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
15344 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15345 case ISD::SETLT: Swap = true; //fall-through
15346 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
15347 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15348 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15349 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
15350 case ISD::SETULE: Unsigned = true; //fall-through
15351 case ISD::SETLE: SSECC = 2; break;
15352 }
15353
15354 if (Swap)
15355 std::swap(Op0, Op1);
15356 if (Opc)
15357 return DAG.getNode(Opc, dl, VT, Op0, Op1);
15358 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15359 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15360 DAG.getConstant(SSECC, MVT::i8));
15361}
15362
15363/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15364/// operand \p Op1. If non-trivial (for example because it's not constant)
15365/// return an empty value.
15366static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15367{
15368 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15369 if (!BV)
15370 return SDValue();
15371
15372 MVT VT = Op1.getSimpleValueType();
15373 MVT EVT = VT.getVectorElementType();
15374 unsigned n = VT.getVectorNumElements();
15375 SmallVector<SDValue, 8> ULTOp1;
15376
15377 for (unsigned i = 0; i < n; ++i) {
15378 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15379 if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15380 return SDValue();
15381
15382 // Avoid underflow.
15383 APInt Val = Elt->getAPIntValue();
15384 if (Val == 0)
15385 return SDValue();
15386
15387 ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15388 }
15389
15390 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15391}
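// A standalone sketch of the rewrite above (ultViaUle is a hypothetical
// helper): for an unsigned constant C != 0, "x <u C" has the same truth
// value as "x <=u C - 1"; the Val == 0 bail-out above is what makes the
// C - 1 computation safe.
#include <cstdint>

static bool ultViaUle(uint32_t X, uint32_t C) {
  return X <= C - 1u; // caller guarantees C != 0
}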
15392
15393static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15394 SelectionDAG &DAG) {
15395 SDValue Op0 = Op.getOperand(0);
15396 SDValue Op1 = Op.getOperand(1);
15397 SDValue CC = Op.getOperand(2);
15398 MVT VT = Op.getSimpleValueType();
15399 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15400 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15401 SDLoc dl(Op);
15402
15403 if (isFP) {
15404#ifndef NDEBUG
15405 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15406 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15407#endif
15408
15409 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15410 unsigned Opc = X86ISD::CMPP;
15411 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15412 assert(VT.getVectorNumElements() <= 16);
15413 Opc = X86ISD::CMPM;
15414 }
15415 // In the two special cases we can't handle, emit two comparisons.
15416 if (SSECC == 8) {
15417 unsigned CC0, CC1;
15418 unsigned CombineOpc;
15419 if (SetCCOpcode == ISD::SETUEQ) {
15420 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15421 } else {
15422 assert(SetCCOpcode == ISD::SETONE);
15423 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15424 }
15425
15426 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15427 DAG.getConstant(CC0, MVT::i8));
15428 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15429 DAG.getConstant(CC1, MVT::i8));
15430 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15431 }
15432 // Handle all other FP comparisons here.
15433 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15434 DAG.getConstant(SSECC, MVT::i8));
15435 }
15436
15437 // Break 256-bit integer vector compare into smaller ones.
15438 if (VT.is256BitVector() && !Subtarget->hasInt256())
15439 return Lower256IntVSETCC(Op, DAG);
15440
15441 bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15442 EVT OpVT = Op1.getValueType();
15443 if (Subtarget->hasAVX512()) {
15444 if (Op1.getValueType().is512BitVector() ||
15445 (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15446 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15447 return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15448
15449 // In the AVX-512 architecture setcc returns a mask with i1 elements,
15450 // but there is no compare instruction for i8 and i16 elements in KNL.
15451 // 512-bit operands are not an issue here; those
15452 // types are illegal.
15453 if (MaskResult &&
15454 (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15455 OpVT.getVectorElementType().getSizeInBits() >= 8))
15456 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15457 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15458 }
15459
15460 // We are handling one of the integer comparisons here. Since SSE only has
15461 // GT and EQ comparisons for integers, some comparisons may require swapping
15462 // operands and multiple operations.
15463 unsigned Opc;
15464 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15465 bool Subus = false;
15466
15467 switch (SetCCOpcode) {
15468 default: llvm_unreachable("Unexpected SETCC condition");
15469 case ISD::SETNE: Invert = true;
15470 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
15471 case ISD::SETLT: Swap = true;
15472 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
15473 case ISD::SETGE: Swap = true;
15474 case ISD::SETLE: Opc = X86ISD::PCMPGT;
15475 Invert = true; break;
15476 case ISD::SETULT: Swap = true;
15477 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15478 FlipSigns = true; break;
15479 case ISD::SETUGE: Swap = true;
15480 case ISD::SETULE: Opc = X86ISD::PCMPGT;
15481 FlipSigns = true; Invert = true; break;
15482 }
15483
15484 // Special case: Use min/max operations for SETULE/SETUGE
15485 MVT VET = VT.getVectorElementType();
15486 bool hasMinMax =
15487 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15488 || (Subtarget->hasSSE2() && (VET == MVT::i8));
15489
15490 if (hasMinMax) {
15491 switch (SetCCOpcode) {
15492 default: break;
15493 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15494 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15495 }
15496
15497 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15498 }
15499
15500 bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15501 if (!MinMax && hasSubus) {
15502 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15503 // Op0 u<= Op1:
15504 // t = psubus Op0, Op1
15505 // pcmpeq t, <0..0>
15506 switch (SetCCOpcode) {
15507 default: break;
15508 case ISD::SETULT: {
15509 // If the comparison is against a constant, we can turn this into a
15510 // setule. With psubus, setule does not require a swap. This is
15511 // beneficial because the constant in the register is no longer
15512 // clobbered as the destination, so it can be hoisted out of a loop.
15513 // Only do this pre-AVX, since with AVX vpcmp* is no longer destructive and the rewrite gains nothing.
15514 if (Subtarget->hasAVX())
15515 break;
15516 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15517 if (ULEOp1.getNode()) {
15518 Op1 = ULEOp1;
15519 Subus = true; Invert = false; Swap = false;
15520 }
15521 break;
15522 }
15523 // Psubus is better than flip-sign because it requires no inversion.
15524 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
15525 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15526 }
15527
15528 if (Subus) {
15529 Opc = X86ISD::SUBUS;
15530 FlipSigns = false;
15531 }
15532 }
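As a side note on the PSUBUS special case above (comment at lines 15502-15505): for unsigned values, Op0 u<= Op1 exactly when the saturating difference Op0 -sat Op1 is zero, which is what "psubus; pcmpeq 0" tests per element. A small scalar sketch of that identity, with hypothetical names and uint8_t standing in for the vector lanes:

  #include <cassert>
  #include <cstdint>

  // Unsigned saturating subtract, the scalar analogue of PSUBUSB.
  static uint8_t psubusScalar(uint8_t a, uint8_t b) {
    return a > b ? (uint8_t)(a - b) : 0;
  }

  int main() {
    // a <=u b  <=>  psubus(a, b) == 0.
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b)
        assert((psubusScalar((uint8_t)a, (uint8_t)b) == 0) == (a <= b));
    return 0;
  }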
15533
15534 if (Swap)
15535 std::swap(Op0, Op1);
15536
15537 // Check that the operation in question is available (most are plain SSE2,
15538 // but PCMPGTQ and PCMPEQQ have different requirements).
15539 if (VT == MVT::v2i64) {
15540 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15541 assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15542
15543 // First cast everything to the right type.
15544 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15545 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15546
15547 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15548 // bits of the inputs before performing those operations. The lower
15549 // compare is always unsigned.
15550 SDValue SB;
15551 if (FlipSigns) {
15552 SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15553 } else {
15554 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15555 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15556 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15557 Sign, Zero, Sign, Zero);
15558 }
15559 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15560 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15561
15562 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15563 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15564 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15565
15566 // Create masks for only the low parts/high parts of the 64 bit integers.
15567 static const int MaskHi[] = { 1, 1, 3, 3 };
15568 static const int MaskLo[] = { 0, 0, 2, 2 };
15569 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15570 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15571 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15572
15573 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15574 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15575
15576 if (Invert)
15577 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15578
15579 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15580 }
15581
15582 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15583 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15584 // pcmpeqd + pshufd + pand.
15585 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15586
15587 // First cast everything to the right type.
15588 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15589 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15590
15591 // Do the compare.
15592 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15593
15594 // Make sure the lower and upper halves are both all-ones.
15595 static const int Mask[] = { 1, 0, 3, 2 };
15596 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15597 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15598
15599 if (Invert)
15600 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15601
15602 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15603 }
15604 }
15605
15606 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15607 // bits of the inputs before performing those operations.
15608 if (FlipSigns) {
15609 EVT EltVT = VT.getVectorElementType();
15610 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15611 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15612 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15613 }
15614
15615 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15616
15617 // If the logical-not of the result is required, perform that now.
15618 if (Invert)
15619 Result = DAG.getNOT(dl, Result, VT);
15620
15621 if (MinMax)
15622 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15623
15624 if (Subus)
15625 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15626 getZeroVector(VT, Subtarget, DAG, dl));
15627
15628 return Result;
15629}
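The SSE2-only v2i64 path above (comment at line 15562) rests on a halves identity: a signed 64-bit greater-than can be decided by comparing the high 32-bit halves signed and, on a tie, the low halves unsigned; flipping the sign bit of the low halves is what lets the signed PCMPGTD act as an unsigned compare there. A scalar sketch of that identity checked on a few boundary values; names are hypothetical and an arithmetic right shift on negative values is assumed, as x86 compilers provide:

  #include <cassert>
  #include <cstdint>

  // (a > b) for int64_t, decided from 32-bit halves the way the lowering does:
  // high halves compared signed, low halves compared unsigned.
  static bool sgt64ViaHalves(int64_t a, int64_t b) {
    int32_t ahi = (int32_t)(a >> 32), bhi = (int32_t)(b >> 32);
    uint32_t alo = (uint32_t)a, blo = (uint32_t)b;
    return (ahi > bhi) || (ahi == bhi && alo > blo);
  }

  int main() {
    const int64_t vals[] = {INT64_MIN, -1, 0, 1, 0x100000000LL, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals)
        assert(sgt64ViaHalves(a, b) == (a > b));
    return 0;
  }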
15630
15631SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15632
15633 MVT VT = Op.getSimpleValueType();
15634
15635 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15636
15637 assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15638 && "SetCC type must be 8-bit or 1-bit integer");
15639 SDValue Op0 = Op.getOperand(0);
15640 SDValue Op1 = Op.getOperand(1);
15641 SDLoc dl(Op);
15642 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15643
15644 // Optimize to BT if possible.
15645 // Lower (X & (1 << N)) == 0 to BT(X, N).
15646 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15647 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15648 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15649 Op1.getOpcode() == ISD::Constant &&
15650 cast<ConstantSDNode>(Op1)->isNullValue() &&
15651 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15652 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15653 if (NewSetCC.getNode()) {
15654 if (VT == MVT::i1)
15655 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15656 return NewSetCC;
15657 }
15658 }
15659
15660 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
15661 // these.
15662 if (Op1.getOpcode() == ISD::Constant &&
15663 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15664 cast<ConstantSDNode>(Op1)->isNullValue()) &&
15665 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15666
15667 // If the input is a setcc, then reuse the input setcc or use a new one with
15668 // the inverted condition.
15669 if (Op0.getOpcode() == X86ISD::SETCC) {
15670 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15671 bool Invert = (CC == ISD::SETNE) ^
15672 cast<ConstantSDNode>(Op1)->isNullValue();
15673 if (!Invert)
15674 return Op0;
15675
15676 CCode = X86::GetOppositeBranchCondition(CCode);
15677 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15678 DAG.getConstant(CCode, MVT::i8),
15679 Op0.getOperand(1));
15680 if (VT == MVT::i1)
15681 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15682 return SetCC;
15683 }
15684 }
15685 if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15686 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15687 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15688
15689 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15690 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15691 }
15692
15693 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15694 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15695 if (X86CC == X86::COND_INVALID)
15696 return SDValue();
15697
15698 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15699 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15700 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15701 DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15702 if (VT == MVT::i1)
15703 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15704 return SetCC;
15705}
15706
15707// isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
15708static bool isX86LogicalCmp(SDValue Op) {
15709 unsigned Opc = Op.getNode()->getOpcode();
15710 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15711 Opc == X86ISD::SAHF)
15712 return true;
15713 if (Op.getResNo() == 1 &&
15714 (Opc == X86ISD::ADD ||
15715 Opc == X86ISD::SUB ||
15716 Opc == X86ISD::ADC ||
15717 Opc == X86ISD::SBB ||
15718 Opc == X86ISD::SMUL ||
15719 Opc == X86ISD::UMUL ||
15720 Opc == X86ISD::INC ||
15721 Opc == X86ISD::DEC ||
15722 Opc == X86ISD::OR ||
15723 Opc == X86ISD::XOR ||
15724 Opc == X86ISD::AND))
15725 return true;
15726
15727 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15728 return true;
15729
15730 return false;
15731}
15732
15733static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15734 if (V.getOpcode() != ISD::TRUNCATE)
15735 return false;
15736
15737 SDValue VOp0 = V.getOperand(0);
15738 unsigned InBits = VOp0.getValueSizeInBits();
15739 unsigned Bits = V.getValueSizeInBits();
15740 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15741}
15742
15743SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15744 bool addTest = true;
15745 SDValue Cond = Op.getOperand(0);
15746 SDValue Op1 = Op.getOperand(1);
15747 SDValue Op2 = Op.getOperand(2);
15748 SDLoc DL(Op);
15749 EVT VT = Op1.getValueType();
15750 SDValue CC;
15751
15752 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15753 // are available. Otherwise fp cmovs get lowered into a less efficient branch
15754 // sequence later on.
15755 if (Cond.getOpcode() == ISD::SETCC &&
15756 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15757 (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15758 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15759 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15760 int SSECC = translateX86FSETCC(
15761 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15762
15763 if (SSECC != 8) {
15764 if (Subtarget->hasAVX512()) {
15765 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15766 DAG.getConstant(SSECC, MVT::i8));
15767 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15768 }
15769 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15770 DAG.getConstant(SSECC, MVT::i8));
15771 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15772 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15773 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15774 }
15775 }
15776
15777 if (Cond.getOpcode() == ISD::SETCC) {
15778 SDValue NewCond = LowerSETCC(Cond, DAG);
15779 if (NewCond.getNode())
15780 Cond = NewCond;
15781 }
15782
15783 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15784 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15785 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15786 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15787 if (Cond.getOpcode() == X86ISD::SETCC &&
15788 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15789 isZero(Cond.getOperand(1).getOperand(1))) {
15790 SDValue Cmp = Cond.getOperand(1);
15791
15792 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15793
15794 if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15795 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15796 SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15797
15798 SDValue CmpOp0 = Cmp.getOperand(0);
15799 // Apply further optimizations for special cases
15800 // (select (x != 0), -1, 0) -> neg & sbb
15801 // (select (x == 0), 0, -1) -> neg & sbb
15802 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15803 if (YC->isNullValue() &&
15804 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15805 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15806 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15807 DAG.getConstant(0, CmpOp0.getValueType()),
15808 CmpOp0);
15809 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15810 DAG.getConstant(X86::COND_B, MVT::i8),
15811 SDValue(Neg.getNode(), 1));
15812 return Res;
15813 }
15814
15815 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15816 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15817 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15818
15819 SDValue Res = // Res = 0 or -1.
15820 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15821 DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15822
15823 if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15824 Res = DAG.getNOT(DL, Res, Res.getValueType());
15825
15826 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15827 if (!N2C || !N2C->isNullValue())
15828 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15829 return Res;
15830 }
15831 }
15832
15833 // Look past (and (setcc_carry (cmp ...)), 1).
15834 if (Cond.getOpcode() == ISD::AND &&
15835 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15836 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15837 if (C && C->getAPIntValue() == 1)
15838 Cond = Cond.getOperand(0);
15839 }
15840
15841 // If condition flag is set by a X86ISD::CMP, then use it as the condition
15842 // setting operand in place of the X86ISD::SETCC.
15843 unsigned CondOpcode = Cond.getOpcode();
15844 if (CondOpcode == X86ISD::SETCC ||
15845 CondOpcode == X86ISD::SETCC_CARRY) {
15846 CC = Cond.getOperand(0);
15847
15848 SDValue Cmp = Cond.getOperand(1);
15849 unsigned Opc = Cmp.getOpcode();
15850 MVT VT = Op.getSimpleValueType();
15851
15852 bool IllegalFPCMov = false;
15853 if (VT.isFloatingPoint() && !VT.isVector() &&
15854 !isScalarFPTypeInSSEReg(VT)) // FPStack?
15855 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15856
15857 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15858 Opc == X86ISD::BT) { // FIXME
15859 Cond = Cmp;
15860 addTest = false;
15861 }
15862 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15863 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15864 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15865 Cond.getOperand(0).getValueType() != MVT::i8)) {
15866 SDValue LHS = Cond.getOperand(0);
15867 SDValue RHS = Cond.getOperand(1);
15868 unsigned X86Opcode;
15869 unsigned X86Cond;
15870 SDVTList VTs;
15871 switch (CondOpcode) {
15872 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15873 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15874 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15875 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15876 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15877 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15878 default: llvm_unreachable("unexpected overflowing operator");
15879 }
15880 if (CondOpcode == ISD::UMULO)
15881 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15882 MVT::i32);
15883 else
15884 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15885
15886 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15887
15888 if (CondOpcode == ISD::UMULO)
15889 Cond = X86Op.getValue(2);
15890 else
15891 Cond = X86Op.getValue(1);
15892
15893 CC = DAG.getConstant(X86Cond, MVT::i8);
15894 addTest = false;
15895 }
15896
15897 if (addTest) {
15898 // Look past the truncate if the high bits are known zero.
15899 if (isTruncWithZeroHighBitsInput(Cond, DAG))
15900 Cond = Cond.getOperand(0);
15901
15902 // We know the result of AND is compared against zero. Try to match
15903 // it to BT.
15904 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15905 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15906 if (NewSetCC.getNode()) {
15907 CC = NewSetCC.getOperand(0);
15908 Cond = NewSetCC.getOperand(1);
15909 addTest = false;
15910 }
15911 }
15912 }
15913
15914 if (addTest) {
15915 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15916 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15917 }
15918
15919 // a < b ? -1 : 0 -> RES = ~setcc_carry
15920 // a < b ? 0 : -1 -> RES = setcc_carry
15921 // a >= b ? -1 : 0 -> RES = setcc_carry
15922 // a >= b ? 0 : -1 -> RES = ~setcc_carry
15923 if (Cond.getOpcode() == X86ISD::SUB) {
15924 Cond = ConvertCmpIfNecessary(Cond, DAG);
15925 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15926
15927 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15928 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15929 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15930 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15931 if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15932 return DAG.getNOT(DL, Res, Res.getValueType());
15933 return Res;
15934 }
15935 }
15936
15937 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
15938 // widen the cmov and push the truncate through. This avoids introducing a new
15939 // branch during isel and doesn't add any extensions.
15940 if (Op.getValueType() == MVT::i8 &&
15941 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15942 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15943 if (T1.getValueType() == T2.getValueType() &&
15944 // Blacklist CopyFromReg to avoid partial register stalls.
15945 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15946 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15947 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15948 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15949 }
15950 }
15951
15952 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15953 // condition is true.
15954 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15955 SDValue Ops[] = { Op2, Op1, CC, Cond };
15956 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15957}
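A scalar illustration of the "neg & sbb" special case noted at lines 15800-15801 above: NEG sets the carry flag exactly when its operand is nonzero, and SBB of a register with itself materializes -CF, so (x != 0) ? -1 : 0 (and its inverse) needs no branch. The helper name below is hypothetical and only models the flag behaviour:

  #include <cassert>
  #include <cstdint>

  static uint32_t selectNeZeroAllOnes(uint32_t x) {
    uint32_t carry = (x != 0) ? 1u : 0u;  // CF after "neg x"
    return 0u - carry;                    // "sbb r, r" yields -CF: all-ones or 0
  }

  int main() {
    assert(selectNeZeroAllOnes(0) == 0u);
    assert(selectNeZeroAllOnes(123) == 0xFFFFFFFFu);
    return 0;
  }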
15958
15959static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15960 SelectionDAG &DAG) {
15961 MVT VT = Op->getSimpleValueType(0);
15962 SDValue In = Op->getOperand(0);
15963 MVT InVT = In.getSimpleValueType();
15964 MVT VTElt = VT.getVectorElementType();
15965 MVT InVTElt = InVT.getVectorElementType();
15966 SDLoc dl(Op);
15967
15968 // SKX processor
15969 if ((InVTElt == MVT::i1) &&
15970 (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15971 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15972
15973 ((Subtarget->hasBWI() && VT.is512BitVector() &&
15974 VTElt.getSizeInBits() <= 16)) ||
15975
15976 ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15977 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15978
15979 ((Subtarget->hasDQI() && VT.is512BitVector() &&
15980 VTElt.getSizeInBits() >= 32))))
15981 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15982
15983 unsigned int NumElts = VT.getVectorNumElements();
15984
15985 if (NumElts != 8 && NumElts != 16)
15986 return SDValue();
15987
15988 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
15989 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
15990 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
15991 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15992 }
15993
15994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15995 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
15996
15997 MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
15998 Constant *C = ConstantInt::get(*DAG.getContext(),
15999 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16000
16001 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16002 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16003 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16004 MachinePointerInfo::getConstantPool(),
16005 false, false, false, Alignment);
16006 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16007 if (VT.is512BitVector())
16008 return Brcst;
16009 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16010}
16011
16012static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16013 SelectionDAG &DAG) {
16014 MVT VT = Op->getSimpleValueType(0);
16015 SDValue In = Op->getOperand(0);
16016 MVT InVT = In.getSimpleValueType();
16017 SDLoc dl(Op);
16018
16019 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16020 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16021
16022 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16023 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16024 (VT != MVT::v16i16 || InVT != MVT::v16i8))
16025 return SDValue();
16026
16027 if (Subtarget->hasInt256())
16028 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16029
16030 // Optimize vectors in AVX mode:
16031 // sign extend v8i16 to v8i32 and
16032 // v4i32 to v4i64.
16033 //
16034 // Divide the input vector into two parts;
16035 // for v4i32 the shuffle masks will be { 0, 1, -1, -1} and {2, 3, -1, -1}.
16036 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16037 // then concatenate the vectors back to the original VT.
16038
16039 unsigned NumElems = InVT.getVectorNumElements();
16040 SDValue Undef = DAG.getUNDEF(InVT);
16041
16042 SmallVector<int,8> ShufMask1(NumElems, -1);
16043 for (unsigned i = 0; i != NumElems/2; ++i)
16044 ShufMask1[i] = i;
16045
16046 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16047
16048 SmallVector<int,8> ShufMask2(NumElems, -1);
16049 for (unsigned i = 0; i != NumElems/2; ++i)
16050 ShufMask2[i] = i + NumElems/2;
16051
16052 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16053
16054 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16055 VT.getVectorNumElements()/2);
16056
16057 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16058 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16059
16060 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16061}
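A sketch of the AVX1 path above with plain arrays standing in for the vectors: each half is extracted with a shuffle and sign-extended on its own, then the two results are concatenated back to the full width. Names are hypothetical:

  #include <array>
  #include <cassert>
  #include <cstdint>

  static std::array<int32_t, 8> sextV8I16ToV8I32(const std::array<int16_t, 8> &in) {
    std::array<int32_t, 8> out{};
    for (int i = 0; i < 4; ++i) out[i] = in[i];          // extend the low half
    for (int i = 0; i < 4; ++i) out[4 + i] = in[4 + i];  // extend the high half
    return out;                                          // concat of both halves
  }

  int main() {
    std::array<int16_t, 8> in{-1, 2, -3, 4, -5, 6, -7, INT16_MIN};
    auto out = sextV8I16ToV8I32(in);
    assert(out[0] == -1 && out[6] == -7 && out[7] == INT16_MIN);
    return 0;
  }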
16062
16063// Lower vector extended loads using a shuffle. If SSSE3 is not available we
16064// may emit an illegal shuffle but the expansion is still better than scalar
16065// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16066// we'll emit a shuffle and an arithmetic shift.
16067// TODO: It is possible to support ZExt by zeroing the undef values during
16068// the shuffle phase or after the shuffle.
16069static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16070 SelectionDAG &DAG) {
16071 MVT RegVT = Op.getSimpleValueType();
16072 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16073 assert(RegVT.isInteger() &&
16074 "We only custom lower integer vector sext loads.");
16075
16076 // Nothing useful we can do without SSE2 shuffles.
16077 assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16078
16079 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16080 SDLoc dl(Ld);
16081 EVT MemVT = Ld->getMemoryVT();
16082 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16083 unsigned RegSz = RegVT.getSizeInBits();
16084
16085 ISD::LoadExtType Ext = Ld->getExtensionType();
16086
16087 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16088 && "Only anyext and sext are currently implemented.");
16089 assert(MemVT != RegVT && "Cannot extend to the same type");
16090 assert(MemVT.isVector() && "Must load a vector from memory");
16091
16092 unsigned NumElems = RegVT.getVectorNumElements();
16093 unsigned MemSz = MemVT.getSizeInBits();
16094 assert(RegSz > MemSz && "Register size must be greater than the mem size");
16095
16096 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16097 // The only way in which we have a legal 256-bit vector result but not the
16098 // integer 256-bit operations needed to directly lower a sextload is if we
16099 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16100 // a 128-bit vector and a normal sign_extend to 256-bits that should get
16101 // correctly legalized. We do this late to allow the canonical form of
16102 // sextload to persist throughout the rest of the DAG combiner -- it wants
16103 // to fold together any extensions it can, and so will fuse a sign_extend
16104 // of an sextload into a sextload targeting a wider value.
16105 SDValue Load;
16106 if (MemSz == 128) {
16107 // Just switch this to a normal load.
16108 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16109 "it must be a legal 128-bit vector "
16110 "type!");
16111 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16112 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16113 Ld->isInvariant(), Ld->getAlignment());
16114 } else {
16115 assert(MemSz < 128 &&
16116 "Can't extend a type wider than 128 bits to a 256 bit vector!");
16117 // Do an sext load to a 128-bit vector type. We want to use the same
16118 // number of elements, but elements half as wide. This will end up being
16119 // recursively lowered by this routine, but will succeed as we definitely
16120 // have all the necessary features if we're using AVX1.
16121 EVT HalfEltVT =
16122 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16123 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16124 Load =
16125 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16126 Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16127 Ld->isNonTemporal(), Ld->isInvariant(),
16128 Ld->getAlignment());
16129 }
16130
16131 // Replace chain users with the new chain.
16132 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16133 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16134
16135 // Finally, do a normal sign-extend to the desired register.
16136 return DAG.getSExtOrTrunc(Load, dl, RegVT);
16137 }
16138
16139 // All sizes must be a power of two.
16140 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16141 "Non-power-of-two elements are not custom lowered!");
16142
16143 // Attempt to load the original value using scalar loads.
16144 // Find the largest scalar type that divides the total loaded size.
16145 MVT SclrLoadTy = MVT::i8;
16146 for (MVT Tp : MVT::integer_valuetypes()) {
16147 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16148 SclrLoadTy = Tp;
16149 }
16150 }
16151
16152 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16153 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16154 (64 <= MemSz))
16155 SclrLoadTy = MVT::f64;
16156
16157 // Calculate the number of scalar loads that we need to perform
16158 // in order to load our vector from memory.
16159 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16160
16161 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16162 "Can only lower sext loads with a single scalar load!");
16163
16164 unsigned loadRegZize = RegSz;
16165 if (Ext == ISD::SEXTLOAD && RegSz == 256)
16166 loadRegZize /= 2;
16167
16168 // Represent our vector as a sequence of elements which are the
16169 // largest scalar that we can load.
16170 EVT LoadUnitVecVT = EVT::getVectorVT(
16171 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16172
16173 // Represent the data using the same element type that is stored in
16174 // memory. In practice, we "widen" MemVT.
16175 EVT WideVecVT =
16176 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16177 loadRegZize / MemVT.getScalarType().getSizeInBits());
16178
16179 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16180 "Invalid vector type");
16181
16182 // We can't shuffle using an illegal type.
16183 assert(TLI.isTypeLegal(WideVecVT) &&
16184 "We only lower types that form legal widened vector types");
16185
16186 SmallVector<SDValue, 8> Chains;
16187 SDValue Ptr = Ld->getBasePtr();
16188 SDValue Increment =
16189 DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16190 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16191
16192 for (unsigned i = 0; i < NumLoads; ++i) {
16193 // Perform a single load.
16194 SDValue ScalarLoad =
16195 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16196 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16197 Ld->getAlignment());
16198 Chains.push_back(ScalarLoad.getValue(1));
16199 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16200 // another round of DAGCombining.
16201 if (i == 0)
16202 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16203 else
16204 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16205 ScalarLoad, DAG.getIntPtrConstant(i));
16206
16207 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16208 }
16209
16210 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16211
16212 // Bitcast the loaded value to a vector of the original element type, in
16213 // the size of the target vector type.
16214 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16215 unsigned SizeRatio = RegSz / MemSz;
16216
16217 if (Ext == ISD::SEXTLOAD) {
16218 // If we have SSE4.1, we can directly emit a VSEXT node.
16219 if (Subtarget->hasSSE41()) {
16220 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16221 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16222 return Sext;
16223 }
16224
16225 // Otherwise we'll shuffle the small elements into the high bits of the
16226 // larger type and perform an arithmetic shift. If the shift is not legal
16227 // it's better to scalarize.
16228 assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16229 "We can't implement a sext load without an arithmetic right shift!");
16230
16231 // Redistribute the loaded elements into the different locations.
16232 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16233 for (unsigned i = 0; i != NumElems; ++i)
16234 ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16235
16236 SDValue Shuff = DAG.getVectorShuffle(
16237 WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16238
16239 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16240
16241 // Build the arithmetic shift.
16242 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16243 MemVT.getVectorElementType().getSizeInBits();
16244 Shuff =
16245 DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16246
16247 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16248 return Shuff;
16249 }
16250
16251 // Redistribute the loaded elements into the different locations.
16252 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16253 for (unsigned i = 0; i != NumElems; ++i)
16254 ShuffleVec[i * SizeRatio] = i;
16255
16256 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16257 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16258
16259 // Bitcast to the requested type.
16260 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16261 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16262 return Shuff;
16263}
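The shuffle-plus-arithmetic-shift fallback above has a simple scalar counterpart: placing the narrow element in the top bits of the wide lane (what the mask index i * SizeRatio + SizeRatio - 1 at line 16234 arranges per lane) and shifting right arithmetically reproduces the sign extension. A minimal sketch, assuming the two's-complement arithmetic shift that x86 compilers provide; names are hypothetical:

  #include <cassert>
  #include <cstdint>

  static int32_t sext8ViaShift(uint8_t v) {
    int32_t widened = (int32_t)((uint32_t)v << 24);  // narrow bits in the top byte
    return widened >> 24;                            // arithmetic shift restores sign
  }

  int main() {
    assert(sext8ViaShift(0x7F) == 127);
    assert(sext8ViaShift(0x80) == -128);
    assert(sext8ViaShift(0xFF) == -1);
    return 0;
  }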
16264
16265// isAndOrOfSetCCs - Return true if the node is an ISD::AND or
16266// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16267// from the AND / OR.
16268static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16269 Opc = Op.getOpcode();
16270 if (Opc != ISD::OR && Opc != ISD::AND)
16271 return false;
16272 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16273 Op.getOperand(0).hasOneUse() &&
16274 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16275 Op.getOperand(1).hasOneUse());
16276}
16277
16278// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
16279// and 1, and the SETCC node has a single use.
16280static bool isXor1OfSetCC(SDValue Op) {
16281 if (Op.getOpcode() != ISD::XOR)
16282 return false;
16283 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16284 if (N1C && N1C->getAPIntValue() == 1) {
16285 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16286 Op.getOperand(0).hasOneUse();
16287 }
16288 return false;
16289}
16290
16291SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16292 bool addTest = true;
16293 SDValue Chain = Op.getOperand(0);
16294 SDValue Cond = Op.getOperand(1);
16295 SDValue Dest = Op.getOperand(2);
16296 SDLoc dl(Op);
16297 SDValue CC;
16298 bool Inverted = false;
16299
16300 if (Cond.getOpcode() == ISD::SETCC) {
16301 // Check for setcc([su]{add,sub,mul}o == 0).
16302 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16303 isa<ConstantSDNode>(Cond.getOperand(1)) &&
16304 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16305 Cond.getOperand(0).getResNo() == 1 &&
16306 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16307 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16308 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16309 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16310 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16311 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16312 Inverted = true;
16313 Cond = Cond.getOperand(0);
16314 } else {
16315 SDValue NewCond = LowerSETCC(Cond, DAG);
16316 if (NewCond.getNode())
16317 Cond = NewCond;
16318 }
16319 }
16320#if 0
16321 // FIXME: LowerXALUO doesn't handle these!!
16322 else if (Cond.getOpcode() == X86ISD::ADD ||
16323 Cond.getOpcode() == X86ISD::SUB ||
16324 Cond.getOpcode() == X86ISD::SMUL ||
16325 Cond.getOpcode() == X86ISD::UMUL)
16326 Cond = LowerXALUO(Cond, DAG);
16327#endif
16328
16329 // Look past (and (setcc_carry (cmp ...)), 1).
16330 if (Cond.getOpcode() == ISD::AND &&
16331 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16332 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16333 if (C && C->getAPIntValue() == 1)
16334 Cond = Cond.getOperand(0);
16335 }
16336
16337 // If condition flag is set by a X86ISD::CMP, then use it as the condition
16338 // setting operand in place of the X86ISD::SETCC.
16339 unsigned CondOpcode = Cond.getOpcode();
16340 if (CondOpcode == X86ISD::SETCC ||
16341 CondOpcode == X86ISD::SETCC_CARRY) {
16342 CC = Cond.getOperand(0);
16343
16344 SDValue Cmp = Cond.getOperand(1);
16345 unsigned Opc = Cmp.getOpcode();
16346 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16347 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16348 Cond = Cmp;
16349 addTest = false;
16350 } else {
16351 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16352 default: break;
16353 case X86::COND_O:
16354 case X86::COND_B:
16355 // These can only come from an arithmetic instruction with overflow,
16356 // e.g. SADDO, UADDO.
16357 Cond = Cond.getNode()->getOperand(1);
16358 addTest = false;
16359 break;
16360 }
16361 }
16362 }
16363 CondOpcode = Cond.getOpcode();
16364 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16365 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16366 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16367 Cond.getOperand(0).getValueType() != MVT::i8)) {
16368 SDValue LHS = Cond.getOperand(0);
16369 SDValue RHS = Cond.getOperand(1);
16370 unsigned X86Opcode;
16371 unsigned X86Cond;
16372 SDVTList VTs;
16373 // Keep this in sync with LowerXALUO, otherwise we might create redundant
16374 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16375 // X86ISD::INC).
16376 switch (CondOpcode) {
16377 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16378 case ISD::SADDO:
16379 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16380 if (C->isOne()) {
16381 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16382 break;
16383 }
16384 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16385 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16386 case ISD::SSUBO:
16387 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16388 if (C->isOne()) {
16389 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16390 break;
16391 }
16392 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16393 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16394 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16395 default: llvm_unreachable("unexpected overflowing operator");
16396 }
16397 if (Inverted)
16398 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16399 if (CondOpcode == ISD::UMULO)
16400 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16401 MVT::i32);
16402 else
16403 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16404
16405 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16406
16407 if (CondOpcode == ISD::UMULO)
16408 Cond = X86Op.getValue(2);
16409 else
16410 Cond = X86Op.getValue(1);
16411
16412 CC = DAG.getConstant(X86Cond, MVT::i8);
16413 addTest = false;
16414 } else {
16415 unsigned CondOpc;
16416 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16417 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16418 if (CondOpc == ISD::OR) {
16419 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16420 // two branches instead of an explicit OR instruction with a
16421 // separate test.
16422 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16423 isX86LogicalCmp(Cmp)) {
16424 CC = Cond.getOperand(0).getOperand(0);
16425 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16426 Chain, Dest, CC, Cmp);
16427 CC = Cond.getOperand(1).getOperand(0);
16428 Cond = Cmp;
16429 addTest = false;
16430 }
16431 } else { // ISD::AND
16432 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16433 // two branches instead of an explicit AND instruction with a
16434 // separate test. However, we only do this if this block doesn't
16435 // have a fall-through edge, because this requires an explicit
16436 // jmp when the condition is false.
16437 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16438 isX86LogicalCmp(Cmp) &&
16439 Op.getNode()->hasOneUse()) {
16440 X86::CondCode CCode =
16441 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16442 CCode = X86::GetOppositeBranchCondition(CCode);
16443 CC = DAG.getConstant(CCode, MVT::i8);
16444 SDNode *User = *Op.getNode()->use_begin();
16445 // Look for an unconditional branch following this conditional branch.
16446 // We need this because we need to reverse the successors in order
16447 // to implement FCMP_OEQ.
16448 if (User->getOpcode() == ISD::BR) {
16449 SDValue FalseBB = User->getOperand(1);
16450 SDNode *NewBR =
16451 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16452 assert(NewBR == User);
16453 (void)NewBR;
16454 Dest = FalseBB;
16455
16456 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16457 Chain, Dest, CC, Cmp);
16458 X86::CondCode CCode =
16459 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16460 CCode = X86::GetOppositeBranchCondition(CCode);
16461 CC = DAG.getConstant(CCode, MVT::i8);
16462 Cond = Cmp;
16463 addTest = false;
16464 }
16465 }
16466 }
16467 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16468 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
16469 // It should be transformed by the dag combiner except when the condition
16470 // is set by an arithmetic-with-overflow node.
16471 X86::CondCode CCode =
16472 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16473 CCode = X86::GetOppositeBranchCondition(CCode);
16474 CC = DAG.getConstant(CCode, MVT::i8);
16475 Cond = Cond.getOperand(0).getOperand(1);
16476 addTest = false;
16477 } else if (Cond.getOpcode() == ISD::SETCC &&
16478 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16479 // For FCMP_OEQ, we can emit
16480 // two branches instead of an explicit AND instruction with a
16481 // separate test. However, we only do this if this block doesn't
16482 // have a fall-through edge, because this requires an explicit
16483 // jmp when the condition is false.
16484 if (Op.getNode()->hasOneUse()) {
16485 SDNode *User = *Op.getNode()->use_begin();
16486 // Look for an unconditional branch following this conditional branch.
16487 // We need this because we need to reverse the successors in order
16488 // to implement FCMP_OEQ.
16489 if (User->getOpcode() == ISD::BR) {
16490 SDValue FalseBB = User->getOperand(1);
16491 SDNode *NewBR =
16492 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16493 assert(NewBR == User);
16494 (void)NewBR;
16495 Dest = FalseBB;
16496
16497 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16498 Cond.getOperand(0), Cond.getOperand(1));
16499 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16500 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16501 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16502 Chain, Dest, CC, Cmp);
16503 CC = DAG.getConstant(X86::COND_P, MVT::i8);
16504 Cond = Cmp;
16505 addTest = false;
16506 }
16507 }
16508 } else if (Cond.getOpcode() == ISD::SETCC &&
16509 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16510 // For FCMP_UNE, we can emit
16511 // two branches instead of an explicit AND instruction with a
16512 // separate test. However, we only do this if this block doesn't
16513 // have a fall-through edge, because this requires an explicit
16514 // jmp when the condition is false.
16515 if (Op.getNode()->hasOneUse()) {
16516 SDNode *User = *Op.getNode()->use_begin();
16517 // Look for an unconditional branch following this conditional branch.
16518 // We need this because we need to reverse the successors in order
16519 // to implement FCMP_UNE.
16520 if (User->getOpcode() == ISD::BR) {
16521 SDValue FalseBB = User->getOperand(1);
16522 SDNode *NewBR =
16523 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16524 assert(NewBR == User);
16525 (void)NewBR;
16526
16527 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16528 Cond.getOperand(0), Cond.getOperand(1));
16529 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16530 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16531 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16532 Chain, Dest, CC, Cmp);
16533 CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16534 Cond = Cmp;
16535 addTest = false;
16536 Dest = FalseBB;
16537 }
16538 }
16539 }
16540 }
16541
16542 if (addTest) {
16543 // Look past the truncate if the high bits are known zero.
16544 if (isTruncWithZeroHighBitsInput(Cond, DAG))
16545 Cond = Cond.getOperand(0);
16546
16547 // We know the result of AND is compared against zero. Try to match
16548 // it to BT.
16549 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16550 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16551 if (NewSetCC.getNode()) {
16552 CC = NewSetCC.getOperand(0);
16553 Cond = NewSetCC.getOperand(1);
16554 addTest = false;
16555 }
16556 }
16557 }
16558
16559 if (addTest) {
16560 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16561 CC = DAG.getConstant(X86Cond, MVT::i8);
16562 Cond = EmitTest(Cond, X86Cond, dl, DAG);
16563 }
16564 Cond = ConvertCmpIfNecessary(Cond, DAG);
16565 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16566 Chain, Dest, CC, Cond);
16567}
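
Aside: a minimal sketch of the condition-code pairing used by the SETOEQ and SETUNE paths above (the enum and helper below are hypothetical, not LLVM API). FCMP_OEQ is true only when ZF is set and PF is clear after the floating-point compare, so the lowering jumps to the false block on COND_NE and again on COND_P; FCMP_UNE mirrors this, branching to the true block on COND_NE and to the false block on COND_NP.

    // Illustrative only; assumes the unconditional successor has already been
    // retargeted via UpdateNodeOperands as in the code above.
    enum class BranchCond { NE, P, NP };
    struct TwoBranchLowering {
      BranchCond First;  // first conditional jump emitted
      BranchCond Second; // second conditional jump emitted
    };
    inline TwoBranchLowering lowerFpEqualityBranch(bool IsOrderedEq) {
      return IsOrderedEq ? TwoBranchLowering{BranchCond::NE, BranchCond::P}   // FCMP_OEQ
                         : TwoBranchLowering{BranchCond::NE, BranchCond::NP}; // FCMP_UNE
    }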
16568
16569// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16570// Calls to _alloca are needed to probe the stack when allocating more than 4k
16571// bytes in one go. Touching the stack at 4K increments is necessary to ensure
16572// that the guard pages used by the OS virtual memory manager are allocated in
16573// correct sequence.
16574SDValue
16575X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16576 SelectionDAG &DAG) const {
16577 MachineFunction &MF = DAG.getMachineFunction();
16578 bool SplitStack = MF.shouldSplitStack();
16579 bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16580 SplitStack;
16581 SDLoc dl(Op);
16582
16583 if (!Lower) {
16584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16585 SDNode* Node = Op.getNode();
16586
16587 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16588 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16589        " not tell us which reg is the stack pointer!");
16590 EVT VT = Node->getValueType(0);
16591 SDValue Tmp1 = SDValue(Node, 0);
16592 SDValue Tmp2 = SDValue(Node, 1);
16593 SDValue Tmp3 = Node->getOperand(2);
16594 SDValue Chain = Tmp1.getOperand(0);
16595
16596 // Chain the dynamic stack allocation so that it doesn't modify the stack
16597 // pointer when other instructions are using the stack.
16598 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16599 SDLoc(Node));
16600
16601 SDValue Size = Tmp2.getOperand(1);
16602 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16603 Chain = SP.getValue(1);
16604 unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16605 const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16606 unsigned StackAlign = TFI.getStackAlignment();
16607 Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16608 if (Align > StackAlign)
16609 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16610 DAG.getConstant(-(uint64_t)Align, VT));
16611 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16612
16613 Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16614 DAG.getIntPtrConstant(0, true), SDValue(),
16615 SDLoc(Node));
16616
16617 SDValue Ops[2] = { Tmp1, Tmp2 };
16618 return DAG.getMergeValues(Ops, dl);
16619 }
16620
16621 // Get the inputs.
16622 SDValue Chain = Op.getOperand(0);
16623 SDValue Size = Op.getOperand(1);
16624 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16625 EVT VT = Op.getNode()->getValueType(0);
16626
16627 bool Is64Bit = Subtarget->is64Bit();
16628 EVT SPTy = getPointerTy();
16629
16630 if (SplitStack) {
16631 MachineRegisterInfo &MRI = MF.getRegInfo();
16632
16633 if (Is64Bit) {
16634 // The 64-bit implementation of segmented stacks needs to clobber both r10
16635 // and r11, which makes it impossible to use along with nested parameters.
16636 const Function *F = MF.getFunction();
16637
16638 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16639 I != E; ++I)
16640 if (I->hasNestAttr())
16641 report_fatal_error("Cannot use segmented stacks with functions that "
16642 "have nested arguments.");
16643 }
16644
16645 const TargetRegisterClass *AddrRegClass =
16646 getRegClassFor(getPointerTy());
16647 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16648 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16649 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16650 DAG.getRegister(Vreg, SPTy));
16651 SDValue Ops1[2] = { Value, Chain };
16652 return DAG.getMergeValues(Ops1, dl);
16653 } else {
16654 SDValue Flag;
16655 const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16656
16657 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16658 Flag = Chain.getValue(1);
16659 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16660
16661 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16662
16663 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16664 DAG.getSubtarget().getRegisterInfo());
16665 unsigned SPReg = RegInfo->getStackRegister();
16666 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16667 Chain = SP.getValue(1);
16668
16669 if (Align) {
16670 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16671 DAG.getConstant(-(uint64_t)Align, VT));
16672 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16673 }
16674
16675 SDValue Ops1[2] = { SP, Chain };
16676 return DAG.getMergeValues(Ops1, dl);
16677 }
16678}
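
Aside: the function above picks one of three strategies; a minimal sketch with hypothetical names that mirrors its Lower/SplitStack checks (on Windows the WIN_ALLOCA node is later expanded into the probing call described in the comment before the function).

    // Illustrative summary of the dispatch above, not an LLVM API.
    enum class DynAllocaStrategy { GenericSPAdjust, SegmentedStack, WindowsProbe };
    inline DynAllocaStrategy pickDynAllocaStrategy(bool IsOSWindows, bool IsMachO,
                                                   bool SplitStack) {
      bool Lower = (IsOSWindows && !IsMachO) || SplitStack;
      if (!Lower)
        return DynAllocaStrategy::GenericSPAdjust; // CALLSEQ + SUB/AND on the stack pointer
      if (SplitStack)
        return DynAllocaStrategy::SegmentedStack;  // X86ISD::SEG_ALLOCA
      return DynAllocaStrategy::WindowsProbe;      // X86ISD::WIN_ALLOCA
    }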
16679
16680SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16681 MachineFunction &MF = DAG.getMachineFunction();
16682 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16683
16684 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16685 SDLoc DL(Op);
16686
16687 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16688 // vastart just stores the address of the VarArgsFrameIndex slot into the
16689 // memory location argument.
16690 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16691 getPointerTy());
16692 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16693 MachinePointerInfo(SV), false, false, 0);
16694 }
16695
16696 // __va_list_tag:
16697 // gp_offset (0 - 6 * 8)
16698 // fp_offset (48 - 48 + 8 * 16)
16699 // overflow_arg_area (point to parameters coming in memory).
16700 // reg_save_area
16701 SmallVector<SDValue, 8> MemOps;
16702 SDValue FIN = Op.getOperand(1);
16703 // Store gp_offset
16704 SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16705 DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16706 MVT::i32),
16707 FIN, MachinePointerInfo(SV), false, false, 0);
16708 MemOps.push_back(Store);
16709
16710 // Store fp_offset
16711 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16712 FIN, DAG.getIntPtrConstant(4));
16713 Store = DAG.getStore(Op.getOperand(0), DL,
16714 DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16715 MVT::i32),
16716 FIN, MachinePointerInfo(SV, 4), false, false, 0);
16717 MemOps.push_back(Store);
16718
16719 // Store ptr to overflow_arg_area
16720 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16721 FIN, DAG.getIntPtrConstant(4));
16722 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16723 getPointerTy());
16724 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16725 MachinePointerInfo(SV, 8),
16726 false, false, 0);
16727 MemOps.push_back(Store);
16728
16729 // Store ptr to reg_save_area.
16730 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16731 FIN, DAG.getIntPtrConstant(8));
16732 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16733 getPointerTy());
16734 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16735 MachinePointerInfo(SV, 16), false, false, 0);
16736 MemOps.push_back(Store);
16737 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16738}
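
Aside: the four stores above fill in the SysV x86-64 __va_list_tag record; a sketch of the layout being assumed, with offsets matching the +4, +4 and +8 pointer increments in the code (the struct and field names are illustrative, not types used by this file).

    struct VaListTag64 {
      unsigned GpOffset;        // offset 0:  next unused GPR slot, 0..48
      unsigned FpOffset;        // offset 4:  next unused XMM slot, 48..176
      void    *OverflowArgArea; // offset 8:  arguments that were passed on the stack
      void    *RegSaveArea;     // offset 16: the register save block
    };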
16739
16740SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16741 assert(Subtarget->is64Bit() &&
16742        "LowerVAARG only handles 64-bit va_arg!");
16743 assert((Subtarget->isTargetLinux() ||
16744         Subtarget->isTargetDarwin()) &&
16745        "Unhandled target in LowerVAARG");
16746 assert(Op.getNode()->getNumOperands() == 4);
16747 SDValue Chain = Op.getOperand(0);
16748 SDValue SrcPtr = Op.getOperand(1);
16749 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16750 unsigned Align = Op.getConstantOperandVal(3);
16751 SDLoc dl(Op);
16752
16753 EVT ArgVT = Op.getNode()->getValueType(0);
16754 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16755 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16756 uint8_t ArgMode;
16757
16758 // Decide which area this value should be read from.
16759 // TODO: Implement the AMD64 ABI in its entirety. This simple
16760 // selection mechanism works only for the basic types.
16761 if (ArgVT == MVT::f80) {
16762 llvm_unreachable("va_arg for f80 not yet implemented");
16763 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16764 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
16765 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16766 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
16767 } else {
16768 llvm_unreachable("Unhandled argument type in LowerVAARG");
16769 }
16770
16771 if (ArgMode == 2) {
16772 // Sanity Check: Make sure using fp_offset makes sense.
16773 assert(!DAG.getTarget().Options.UseSoftFloat &&
16774        !(DAG.getMachineFunction()
16775          .getFunction()->getAttributes()
16776          .hasAttribute(AttributeSet::FunctionIndex,
16777                        Attribute::NoImplicitFloat)) &&
16778        Subtarget->hasSSE1());
16779 }
16780
16781 // Insert VAARG_64 node into the DAG
16782 // VAARG_64 returns two values: Variable Argument Address, Chain
16783 SmallVector<SDValue, 11> InstOps;
16784 InstOps.push_back(Chain);
16785 InstOps.push_back(SrcPtr);
16786 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16787 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16788 InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16789 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
16790 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16791 VTs, InstOps, MVT::i64,
16792 MachinePointerInfo(SV),
16793 /*Align=*/0,
16794 /*Volatile=*/false,
16795 /*ReadMem=*/true,
16796 /*WriteMem=*/true);
16797 Chain = VAARG.getValue(1);
16798
16799 // Load the next argument and return it
16800 return DAG.getLoad(ArgVT, dl,
16801 Chain,
16802 VAARG,
16803 MachinePointerInfo(),
16804 false, false, false, 0);
16805}
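
Aside: a worked example of the ArgMode selection above (illustrative); f80 and any type matching neither check hit the llvm_unreachable paths.

    //   va_arg(ap, int)          ArgVT = i32, ArgSize = 4  -> ArgMode = 1 (gp_offset)
    //   va_arg(ap, long long)    ArgVT = i64, ArgSize = 8  -> ArgMode = 1 (gp_offset)
    //   va_arg(ap, double)       ArgVT = f64, ArgSize = 8  -> ArgMode = 2 (fp_offset)
    //   va_arg(ap, long double)  ArgVT = f80               -> not yet implemented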
16806
16807static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16808 SelectionDAG &DAG) {
16809 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16810 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16811 SDValue Chain = Op.getOperand(0);
16812 SDValue DstPtr = Op.getOperand(1);
16813 SDValue SrcPtr = Op.getOperand(2);
16814 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16815 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16816 SDLoc DL(Op);
16817
16818 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16819 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16820 false,
16821 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16822}
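
Aside: the constant 24 in the memcpy above is sizeof(__va_list_tag) on x86-64, i.e. 4 + 4 + 8 + 8 bytes; assuming the VaListTag64 sketch shown after LowerVASTART, this could be documented as:

    static_assert(sizeof(VaListTag64) == 24,
                  "x86-64 va_list is 4 + 4 + 8 + 8 = 24 bytes");
    // The alignment argument of 8 likewise matches the alignment of the pointer fields.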
16823
16824// getTargetVShiftByConstNode - Handle vector element shifts where the shift
16825// amount is a constant. Takes immediate version of shift as input.
16826static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16827 SDValue SrcOp, uint64_t ShiftAmt,
16828 SelectionDAG &DAG) {
16829 MVT ElementType = VT.getVectorElementType();
16830
16831 // Fold this packed shift into its first operand if ShiftAmt is 0.
16832 if (ShiftAmt == 0)
16833 return SrcOp;
16834
16835 // Check for ShiftAmt >= element width
16836 if (ShiftAmt >= ElementType.getSizeInBits()) {
16837 if (Opc == X86ISD::VSRAI)
16838 ShiftAmt = ElementType.getSizeInBits() - 1;
16839 else
16840 return DAG.getConstant(0, VT);
16841 }
16842
16843 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16844        && "Unknown target vector shift-by-constant node");
16845
16846 // Fold this packed vector shift into a build vector if SrcOp is a
16847 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16848 if (VT == SrcOp.getSimpleValueType() &&
16849 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16850 SmallVector<SDValue, 8> Elts;
16851 unsigned NumElts = SrcOp->getNumOperands();
16852 ConstantSDNode *ND;
16853
16854 switch(Opc) {
16855 default: llvm_unreachable(nullptr);
16856 case X86ISD::VSHLI:
16857 for (unsigned i=0; i!=NumElts; ++i) {
16858 SDValue CurrentOp = SrcOp->getOperand(i);
16859 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16860 Elts.push_back(CurrentOp);
16861 continue;
16862 }
16863 ND = cast<ConstantSDNode>(CurrentOp);
16864 const APInt &C = ND->getAPIntValue();
16865 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16866 }
16867 break;
16868 case X86ISD::VSRLI:
16869 for (unsigned i=0; i!=NumElts; ++i) {
16870 SDValue CurrentOp = SrcOp->getOperand(i);
16871 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16872 Elts.push_back(CurrentOp);
16873 continue;
16874 }
16875 ND = cast<ConstantSDNode>(CurrentOp);
16876 const APInt &C = ND->getAPIntValue();
16877 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16878 }
16879 break;
16880 case X86ISD::VSRAI:
16881 for (unsigned i=0; i!=NumElts; ++i) {
16882 SDValue CurrentOp = SrcOp->getOperand(i);
16883 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16884 Elts.push_back(CurrentOp);
16885 continue;
16886 }
16887 ND = cast<ConstantSDNode>(CurrentOp);
16888 const APInt &C = ND->getAPIntValue();
16889 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16890 }
16891 break;
16892 }
16893
16894 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16895 }
16896
16897 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16898}
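
Aside: a worked example of the constant folding above (illustrative values). Note also the clamp at the top of the function: an arithmetic shift by at least the element width is clamped to width - 1, while an equally large logical shift folds straight to the zero vector.

    //   Opc = X86ISD::VSHLI, VT = v4i32, ShiftAmt = 2
    //   SrcOp  = BUILD_VECTOR <1, 2, undef, 8>
    //   result = BUILD_VECTOR <4, 8, undef, 32>   (no shift instruction is emitted)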
16899
16900// getTargetVShiftNode - Handle vector element shifts where the shift amount
16901// may or may not be a constant. Takes immediate version of shift as input.
16902static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16903 SDValue SrcOp, SDValue ShAmt,
16904 SelectionDAG &DAG) {
16905 MVT SVT = ShAmt.getSimpleValueType();
16906 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16907
16908 // Catch shift-by-constant.
16909 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16910 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16911 CShAmt->getZExtValue(), DAG);
16912
16913 // Change opcode to non-immediate version
16914 switch (Opc) {
16915 default: llvm_unreachable("Unknown target vector shift node");
16916 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16917 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16918 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16919 }
16920
16921 const X86Subtarget &Subtarget =
16922 DAG.getTarget().getSubtarget<X86Subtarget>();
16923 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16924 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16925 // Let the shuffle legalizer expand this shift amount node.
16926 SDValue Op0 = ShAmt.getOperand(0);
16927 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16928 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16929 } else {
16930 // Need to build a vector containing shift amount.
16931 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
16932 SmallVector<SDValue, 4> ShOps;
16933 ShOps.push_back(ShAmt);
16934 if (SVT == MVT::i32) {
16935 ShOps.push_back(DAG.getConstant(0, SVT));
16936 ShOps.push_back(DAG.getUNDEF(SVT));
16937 }
16938 ShOps.push_back(DAG.getUNDEF(SVT));
16939
16940 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16941 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16942 }
16943
16944 // The return type has to be a 128-bit type with the same element
16945 // type as the input type.
16946 MVT EltVT = VT.getVectorElementType();
16947 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16948
16949 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16950 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16951}
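
Aside: for a non-constant shift amount the else branch above widens the scalar count into a 128-bit vector because the SSE/AVX packed shifts read only the low 64 bits of their count operand; an illustrative trace for an i32 amount:

    //   ShAmt : i32 (runtime value)
    //   -> BUILD_VECTOR <ShAmt, 0, undef, undef> : v4i32
    //   -> BITCAST to the 128-bit vector with VT's element type (e.g. v8i16)
    //   -> X86ISD::VSHL / VSRL / VSRA (SrcOp, widened count)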
16952
16953/// \brief Return (and \p Op, \p Mask) for compare instructions or
16954/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16955/// necessary casting for \p Mask when lowering masking intrinsics.
16956static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16957 SDValue PreservedSrc,
16958 const X86Subtarget *Subtarget,
16959 SelectionDAG &DAG) {
16960 EVT VT = Op.getValueType();
16961 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16962 MVT::i1, VT.getVectorNumElements());
16963 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16964 Mask.getValueType().getSizeInBits());
16965 SDLoc dl(Op);
16966
16967 assert(MaskVT.isSimple() && "invalid mask type");
16968
16969 if (isAllOnes(Mask))
16970 return Op;
16971
16972 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
16973 // are extracted by EXTRACT_SUBVECTOR.
16974 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16975 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16976 DAG.getIntPtrConstant(0));
16977
16978 switch (Op.getOpcode()) {
16979 default: break;
16980 case X86ISD::PCMPEQM:
16981 case X86ISD::PCMPGTM:
16982 case X86ISD::CMPM:
16983 case X86ISD::CMPMU:
16984 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16985 }
16986 if (PreservedSrc.getOpcode() == ISD::UNDEF)
16987 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16988 return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
16989}
16990
16991/// \brief Creates an SDNode for a predicated scalar operation.
16992/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
16993 /// The mask comes in as MVT::i8 and should be truncated
16994 /// to MVT::i1 while lowering masking intrinsics.
16995/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
16996/// "X86select" instead of "vselect". We just can't create the "vselect" node for
16997/// a scalar instruction.
16998static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
16999 SDValue PreservedSrc,
17000 const X86Subtarget *Subtarget,
17001 SelectionDAG &DAG) {
17002 if (isAllOnes(Mask))
17003 return Op;
17004
17005 EVT VT = Op.getValueType();
17006 SDLoc dl(Op);
17007 // The mask should be of type MVT::i1
17008 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17009
17010 if (PreservedSrc.getOpcode() == ISD::UNDEF)
17011 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17012 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17013}
17014
17015static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17016 SelectionDAG &DAG) {
17017 SDLoc dl(Op);
17018 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17019 EVT VT = Op.getValueType();
17020 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17021 if (IntrData) {
17022 switch(IntrData->Type) {
17023 case INTR_TYPE_1OP:
17024 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17025 case INTR_TYPE_2OP:
17026 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17027 Op.getOperand(2));
17028 case INTR_TYPE_3OP:
17029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17030 Op.getOperand(2), Op.getOperand(3));
17031 case INTR_TYPE_1OP_MASK_RM: {
17032 SDValue Src = Op.getOperand(1);
17033 SDValue Src0 = Op.getOperand(2);
17034 SDValue Mask = Op.getOperand(3);
17035 SDValue RoundingMode = Op.getOperand(4);
17036 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17037 RoundingMode),
17038 Mask, Src0, Subtarget, DAG);
17039 }
17040 case INTR_TYPE_SCALAR_MASK_RM: {
17041 SDValue Src1 = Op.getOperand(1);
17042 SDValue Src2 = Op.getOperand(2);
17043 SDValue Src0 = Op.getOperand(3);
17044 SDValue Mask = Op.getOperand(4);
17045 SDValue RoundingMode = Op.getOperand(5);
17046 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17047 RoundingMode),
17048 Mask, Src0, Subtarget, DAG);
17049 }
17050 case INTR_TYPE_2OP_MASK: {
17051 SDValue Mask = Op.getOperand(4);
17052 SDValue PassThru = Op.getOperand(3);
17053 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17054 if (IntrWithRoundingModeOpcode != 0) {
17055 unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17056 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17057 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17058 dl, Op.getValueType(),
17059 Op.getOperand(1), Op.getOperand(2),
17060 Op.getOperand(3), Op.getOperand(5)),
17061 Mask, PassThru, Subtarget, DAG);
17062 }
17063 }
17064 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17065 Op.getOperand(1),
17066 Op.getOperand(2)),
17067 Mask, PassThru, Subtarget, DAG);
17068 }
17069 case FMA_OP_MASK: {
17070 SDValue Src1 = Op.getOperand(1);
17071 SDValue Src2 = Op.getOperand(2);
17072 SDValue Src3 = Op.getOperand(3);
17073 SDValue Mask = Op.getOperand(4);
17074 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17075 if (IntrWithRoundingModeOpcode != 0) {
17076 SDValue Rnd = Op.getOperand(5);
17077 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17078 X86::STATIC_ROUNDING::CUR_DIRECTION)
17079 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17080 dl, Op.getValueType(),
17081 Src1, Src2, Src3, Rnd),
17082 Mask, Src1, Subtarget, DAG);
17083 }
17084 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17085 dl, Op.getValueType(),
17086 Src1, Src2, Src3),
17087 Mask, Src1, Subtarget, DAG);
17088 }
17089 case CMP_MASK:
17090 case CMP_MASK_CC: {
17091 // Comparison intrinsics with masks.
17092 // Example of transformation:
17093 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17094 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17095 // (i8 (bitcast
17096 // (v8i1 (insert_subvector undef,
17097 // (v2i1 (and (PCMPEQM %a, %b),
17098 // (extract_subvector
17099 // (v8i1 (bitcast %mask)), 0))), 0))))
17100 EVT VT = Op.getOperand(1).getValueType();
17101 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17102 VT.getVectorNumElements());
17103 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17104 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17105 Mask.getValueType().getSizeInBits());
17106 SDValue Cmp;
17107 if (IntrData->Type == CMP_MASK_CC) {
17108 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17109 Op.getOperand(2), Op.getOperand(3));
17110 } else {
17111 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17112 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17113 Op.getOperand(2));
17114 }
17115 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17116 DAG.getTargetConstant(0, MaskVT),
17117 Subtarget, DAG);
17118 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17119 DAG.getUNDEF(BitcastVT), CmpMask,
17120 DAG.getIntPtrConstant(0));
17121 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17122 }
17123 case COMI: { // Comparison intrinsics
17124 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17125 SDValue LHS = Op.getOperand(1);
17126 SDValue RHS = Op.getOperand(2);
17127 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17128 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17129 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17130 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17131 DAG.getConstant(X86CC, MVT::i8), Cond);
17132 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17133 }
17134 case VSHIFT:
17135 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17136 Op.getOperand(1), Op.getOperand(2), DAG);
17137 case VSHIFT_MASK:
17138 return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17139 Op.getSimpleValueType(),
17140 Op.getOperand(1),
17141 Op.getOperand(2), DAG),
17142 Op.getOperand(4), Op.getOperand(3), Subtarget,
17143 DAG);
17144 case COMPRESS_EXPAND_IN_REG: {
17145 SDValue Mask = Op.getOperand(3);
17146 SDValue DataToCompress = Op.getOperand(1);
17147 SDValue PassThru = Op.getOperand(2);
17148 if (isAllOnes(Mask)) // return data as is
17149 return Op.getOperand(1);
17150 EVT VT = Op.getValueType();
17151 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17152 VT.getVectorNumElements());
17153 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17154 Mask.getValueType().getSizeInBits());
17155 SDLoc dl(Op);
17156 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17157 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17158 DAG.getIntPtrConstant(0));
17159
17160 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17161 PassThru);
17162 }
17163 case BLEND: {
17164 SDValue Mask = Op.getOperand(3);
17165 EVT VT = Op.getValueType();
17166 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17167 VT.getVectorNumElements());
17168 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17169 Mask.getValueType().getSizeInBits());
17170 SDLoc dl(Op);
17171 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17172 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17173 DAG.getIntPtrConstant(0));
17174 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17175 Op.getOperand(2));
17176 }
17177 default:
17178 break;
17179 }
17180 }
17181
17182 switch (IntNo) {
17183 default: return SDValue(); // Don't custom lower most intrinsics.
17184
17185 case Intrinsic::x86_avx512_mask_valign_q_512:
17186 case Intrinsic::x86_avx512_mask_valign_d_512:
17187 // Vector source operands are swapped.
17188 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17189 Op.getValueType(), Op.getOperand(2),
17190 Op.getOperand(1),
17191 Op.getOperand(3)),
17192 Op.getOperand(5), Op.getOperand(4),
17193 Subtarget, DAG);
17194
17195 // ptest and testp intrinsics. The intrinsic these come from are designed to
17196 // return an integer value, not just an instruction so lower it to the ptest
17197 // or testp pattern and a setcc for the result.
17198 case Intrinsic::x86_sse41_ptestz:
17199 case Intrinsic::x86_sse41_ptestc:
17200 case Intrinsic::x86_sse41_ptestnzc:
17201 case Intrinsic::x86_avx_ptestz_256:
17202 case Intrinsic::x86_avx_ptestc_256:
17203 case Intrinsic::x86_avx_ptestnzc_256:
17204 case Intrinsic::x86_avx_vtestz_ps:
17205 case Intrinsic::x86_avx_vtestc_ps:
17206 case Intrinsic::x86_avx_vtestnzc_ps:
17207 case Intrinsic::x86_avx_vtestz_pd:
17208 case Intrinsic::x86_avx_vtestc_pd:
17209 case Intrinsic::x86_avx_vtestnzc_pd:
17210 case Intrinsic::x86_avx_vtestz_ps_256:
17211 case Intrinsic::x86_avx_vtestc_ps_256:
17212 case Intrinsic::x86_avx_vtestnzc_ps_256:
17213 case Intrinsic::x86_avx_vtestz_pd_256:
17214 case Intrinsic::x86_avx_vtestc_pd_256:
17215 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17216 bool IsTestPacked = false;
17217 unsigned X86CC;
17218 switch (IntNo) {
17219 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17220 case Intrinsic::x86_avx_vtestz_ps:
17221 case Intrinsic::x86_avx_vtestz_pd:
17222 case Intrinsic::x86_avx_vtestz_ps_256:
17223 case Intrinsic::x86_avx_vtestz_pd_256:
17224 IsTestPacked = true; // Fallthrough
17225 case Intrinsic::x86_sse41_ptestz:
17226 case Intrinsic::x86_avx_ptestz_256:
17227 // ZF = 1
17228 X86CC = X86::COND_E;
17229 break;
17230 case Intrinsic::x86_avx_vtestc_ps:
17231 case Intrinsic::x86_avx_vtestc_pd:
17232 case Intrinsic::x86_avx_vtestc_ps_256:
17233 case Intrinsic::x86_avx_vtestc_pd_256:
17234 IsTestPacked = true; // Fallthrough
17235 case Intrinsic::x86_sse41_ptestc:
17236 case Intrinsic::x86_avx_ptestc_256:
17237 // CF = 1
17238 X86CC = X86::COND_B;
17239 break;
17240 case Intrinsic::x86_avx_vtestnzc_ps:
17241 case Intrinsic::x86_avx_vtestnzc_pd:
17242 case Intrinsic::x86_avx_vtestnzc_ps_256:
17243 case Intrinsic::x86_avx_vtestnzc_pd_256:
17244 IsTestPacked = true; // Fallthrough
17245 case Intrinsic::x86_sse41_ptestnzc:
17246 case Intrinsic::x86_avx_ptestnzc_256:
17247 // ZF and CF = 0
17248 X86CC = X86::COND_A;
17249 break;
17250 }
17251
17252 SDValue LHS = Op.getOperand(1);
17253 SDValue RHS = Op.getOperand(2);
17254 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17255 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17256 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17257 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17258 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17259 }
17260 case Intrinsic::x86_avx512_kortestz_w:
17261 case Intrinsic::x86_avx512_kortestc_w: {
17262 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17263 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17264 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17265 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17266 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17267 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17268 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17269 }
17270
17271 case Intrinsic::x86_sse42_pcmpistria128:
17272 case Intrinsic::x86_sse42_pcmpestria128:
17273 case Intrinsic::x86_sse42_pcmpistric128:
17274 case Intrinsic::x86_sse42_pcmpestric128:
17275 case Intrinsic::x86_sse42_pcmpistrio128:
17276 case Intrinsic::x86_sse42_pcmpestrio128:
17277 case Intrinsic::x86_sse42_pcmpistris128:
17278 case Intrinsic::x86_sse42_pcmpestris128:
17279 case Intrinsic::x86_sse42_pcmpistriz128:
17280 case Intrinsic::x86_sse42_pcmpestriz128: {
17281 unsigned Opcode;
17282 unsigned X86CC;
17283 switch (IntNo) {
17284 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
17285 case Intrinsic::x86_sse42_pcmpistria128:
17286 Opcode = X86ISD::PCMPISTRI;
17287 X86CC = X86::COND_A;
17288 break;
17289 case Intrinsic::x86_sse42_pcmpestria128:
17290 Opcode = X86ISD::PCMPESTRI;
17291 X86CC = X86::COND_A;
17292 break;
17293 case Intrinsic::x86_sse42_pcmpistric128:
17294 Opcode = X86ISD::PCMPISTRI;
17295 X86CC = X86::COND_B;
17296 break;
17297 case Intrinsic::x86_sse42_pcmpestric128:
17298 Opcode = X86ISD::PCMPESTRI;
17299 X86CC = X86::COND_B;
17300 break;
17301 case Intrinsic::x86_sse42_pcmpistrio128:
17302 Opcode = X86ISD::PCMPISTRI;
17303 X86CC = X86::COND_O;
17304 break;
17305 case Intrinsic::x86_sse42_pcmpestrio128:
17306 Opcode = X86ISD::PCMPESTRI;
17307 X86CC = X86::COND_O;
17308 break;
17309 case Intrinsic::x86_sse42_pcmpistris128:
17310 Opcode = X86ISD::PCMPISTRI;
17311 X86CC = X86::COND_S;
17312 break;
17313 case Intrinsic::x86_sse42_pcmpestris128:
17314 Opcode = X86ISD::PCMPESTRI;
17315 X86CC = X86::COND_S;
17316 break;
17317 case Intrinsic::x86_sse42_pcmpistriz128:
17318 Opcode = X86ISD::PCMPISTRI;
17319 X86CC = X86::COND_E;
17320 break;
17321 case Intrinsic::x86_sse42_pcmpestriz128:
17322 Opcode = X86ISD::PCMPESTRI;
17323 X86CC = X86::COND_E;
17324 break;
17325 }
17326 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17327 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17328 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17329 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17330 DAG.getConstant(X86CC, MVT::i8),
17331 SDValue(PCMP.getNode(), 1));
17332 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17333 }
17334
17335 case Intrinsic::x86_sse42_pcmpistri128:
17336 case Intrinsic::x86_sse42_pcmpestri128: {
17337 unsigned Opcode;
17338 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17339 Opcode = X86ISD::PCMPISTRI;
17340 else
17341 Opcode = X86ISD::PCMPESTRI;
17342
17343 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17344 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17345 return DAG.getNode(Opcode, dl, VTs, NewOps);
17346 }
17347 }
17348}
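
Aside: the ptest/testp and pcmpistri/pcmpestri cases above share one shape: emit the flag-producing node, materialize the requested flag with X86ISD::SETCC, and zero-extend the result to i32. An illustrative trace for one intrinsic (the final instruction names are what selection is expected to produce, not something this function emits directly):

    //   int_x86_sse41_ptestz(a, b)
    //   -> Test  = X86ISD::PTEST(a, b)           ; sets ZF/CF
    //   -> SetCC = X86ISD::SETCC(COND_E, Test)   ; 1 iff ZF == 1
    //   -> ZERO_EXTEND to i32                    ; typically PTEST + SETE + MOVZX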
17349
17350static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17351 SDValue Src, SDValue Mask, SDValue Base,
17352 SDValue Index, SDValue ScaleOp, SDValue Chain,
17353 const X86Subtarget * Subtarget) {
17354 SDLoc dl(Op);
17355 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17356 assert(C && "Invalid scale type");
17357 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17358 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17359 Index.getSimpleValueType().getVectorNumElements());
17360 SDValue MaskInReg;
17361 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17362 if (MaskC)
17363 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17364 else
17365 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17366 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17367 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17368 SDValue Segment = DAG.getRegister(0, MVT::i32);
17369 if (Src.getOpcode() == ISD::UNDEF)
17370 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17371 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17372 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17373 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17374 return DAG.getMergeValues(RetOps, dl);
17375}
17376
17377static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17378 SDValue Src, SDValue Mask, SDValue Base,
17379 SDValue Index, SDValue ScaleOp, SDValue Chain) {
17380 SDLoc dl(Op);
17381 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17382 assert(C && "Invalid scale type");
17383 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17384 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17385 SDValue Segment = DAG.getRegister(0, MVT::i32);
17386 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17387 Index.getSimpleValueType().getVectorNumElements());
17388 SDValue MaskInReg;
17389 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17390 if (MaskC)
17391 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17392 else
17393 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17394 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17395 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17396 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17397 return SDValue(Res, 1);
17398}
17399
17400static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17401 SDValue Mask, SDValue Base, SDValue Index,
17402 SDValue ScaleOp, SDValue Chain) {
17403 SDLoc dl(Op);
17404 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17405 assert(C && "Invalid scale type");
17406 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17407 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17408 SDValue Segment = DAG.getRegister(0, MVT::i32);
17409 EVT MaskVT =
17410 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17411 SDValue MaskInReg;
17412 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17413 if (MaskC)
17414 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17415 else
17416 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17417 //SDVTList VTs = DAG.getVTList(MVT::Other);
17418 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17419 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17420 return SDValue(Res, 0);
17421}
17422
17423// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17424// read performance monitor counters (x86_rdpmc).
17425static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17426 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17427 SmallVectorImpl<SDValue> &Results) {
17428 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17429 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17430 SDValue LO, HI;
17431
17432 // The ECX register is used to select the index of the performance counter
17433 // to read.
17434 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17435 N->getOperand(2));
17436 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17437
17438 // Reads the content of a 64-bit performance counter and returns it in the
17439 // registers EDX:EAX.
17440 if (Subtarget->is64Bit()) {
17441 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17442 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17443 LO.getValue(2));
17444 } else {
17445 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17446 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17447 LO.getValue(2));
17448 }
17449 Chain = HI.getValue(1);
17450
17451 if (Subtarget->is64Bit()) {
17452 // The EAX register is loaded with the low-order 32 bits. The EDX register
17453 // is loaded with the supported high-order bits of the counter.
17454 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17455 DAG.getConstant(32, MVT::i8));
17456 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17457 Results.push_back(Chain);
17458 return;
17459 }
17460
17461 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17462 SDValue Ops[] = { LO, HI };
17463 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17464 Results.push_back(Pair);
17465 Results.push_back(Chain);
17466}
17467
17468// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17469// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17470// also used to custom lower READCYCLECOUNTER nodes.
17471static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17472 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17473 SmallVectorImpl<SDValue> &Results) {
17474 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17475 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17476 SDValue LO, HI;
17477
17478 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17479 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17480 // and the EAX register is loaded with the low-order 32 bits.
17481 if (Subtarget->is64Bit()) {
17482 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17483 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17484 LO.getValue(2));
17485 } else {
17486 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17487 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17488 LO.getValue(2));
17489 }
17490 SDValue Chain = HI.getValue(1);
17491
17492 if (Opcode == X86ISD::RDTSCP_DAG) {
17493 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17494
17495 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17496 // the ECX register. Add 'ecx' explicitly to the chain.
17497 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17498 HI.getValue(2));
17499 // Explicitly store the content of ECX at the location passed in input
17500 // to the 'rdtscp' intrinsic.
17501 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17502 MachinePointerInfo(), false, false, 0);
17503 }
17504
17505 if (Subtarget->is64Bit()) {
17506 // The EDX register is loaded with the high-order 32 bits of the MSR, and
17507 // the EAX register is loaded with the low-order 32 bits.
17508 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17509 DAG.getConstant(32, MVT::i8));
17510 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17511 Results.push_back(Chain);
17512 return;
17513 }
17514
17515 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17516 SDValue Ops[] = { LO, HI };
17517 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17518 Results.push_back(Pair);
17519 Results.push_back(Chain);
17520}
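
Aside: on 64-bit targets the two counter halves read above are recombined as (HI << 32) | LO; a minimal sketch in plain C++ (the real code builds ISD::SHL and ISD::OR nodes).

    #include <cstdint>
    inline uint64_t mergeCounterHalves(uint32_t HiEdx, uint32_t LoEax) {
      // e.g. HiEdx = 0x00000001, LoEax = 0x23456789 -> 0x0000000123456789
      return (static_cast<uint64_t>(HiEdx) << 32) | LoEax;
    }
    // On 32-bit targets the pair is kept as an ISD::BUILD_PAIR of two i32 values instead.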
17521
17522static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17523 SelectionDAG &DAG) {
17524 SmallVector<SDValue, 2> Results;
17525 SDLoc DL(Op);
17526 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17527 Results);
17528 return DAG.getMergeValues(Results, DL);
17529}
17530
17531
17532static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17533 SelectionDAG &DAG) {
17534 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17535
17536 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17537 if (!IntrData)
17538 return SDValue();
17539
17540 SDLoc dl(Op);
17541 switch(IntrData->Type) {
17542 default:
17543 llvm_unreachable("Unknown Intrinsic Type");
17544 break;
17545 case RDSEED:
17546 case RDRAND: {
17547 // Emit the node with the right value type.
17548 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17549 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17550
17551 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17552 // Otherwise return the value from Rand, which is always 0, cast to i32.
17553 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17554 DAG.getConstant(1, Op->getValueType(1)),
17555 DAG.getConstant(X86::COND_B, MVT::i32),
17556 SDValue(Result.getNode(), 1) };
17557 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17558 DAG.getVTList(Op->getValueType(1), MVT::Glue),
17559 Ops);
17560
17561 // Return { result, isValid, chain }.
17562 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17563 SDValue(Result.getNode(), 2));
17564 }
17565 case GATHER: {
17566 //gather(v1, mask, index, base, scale);
17567 SDValue Chain = Op.getOperand(0);
17568 SDValue Src = Op.getOperand(2);
17569 SDValue Base = Op.getOperand(3);
17570 SDValue Index = Op.getOperand(4);
17571 SDValue Mask = Op.getOperand(5);
17572 SDValue Scale = Op.getOperand(6);
17573 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17574 Subtarget);
17575 }
17576 case SCATTER: {
17577 //scatter(base, mask, index, v1, scale);
17578 SDValue Chain = Op.getOperand(0);
17579 SDValue Base = Op.getOperand(2);
17580 SDValue Mask = Op.getOperand(3);
17581 SDValue Index = Op.getOperand(4);
17582 SDValue Src = Op.getOperand(5);
17583 SDValue Scale = Op.getOperand(6);
17584 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17585 }
17586 case PREFETCH: {
17587 SDValue Hint = Op.getOperand(6);
17588 unsigned HintVal;
17589 if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17590 (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17591 llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17592 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17593 SDValue Chain = Op.getOperand(0);
17594 SDValue Mask = Op.getOperand(2);
17595 SDValue Index = Op.getOperand(3);
17596 SDValue Base = Op.getOperand(4);
17597 SDValue Scale = Op.getOperand(5);
17598 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17599 }
17600 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17601 case RDTSC: {
17602 SmallVector<SDValue, 2> Results;
17603 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17604 return DAG.getMergeValues(Results, dl);
17605 }
17606 // Read Performance Monitoring Counters.
17607 case RDPMC: {
17608 SmallVector<SDValue, 2> Results;
17609 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17610 return DAG.getMergeValues(Results, dl);
17611 }
17612 // XTEST intrinsics.
17613 case XTEST: {
17614 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17615 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17616 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17617 DAG.getConstant(X86::COND_NE, MVT::i8),
17618 InTrans);
17619 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17620 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17621 Ret, SDValue(InTrans.getNode(), 1));
17622 }
17623 // ADC/ADCX/SBB
17624 case ADX: {
17625 SmallVector<SDValue, 2> Results;
17626 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17627 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17628 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17629 DAG.getConstant(-1, MVT::i8));
17630 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17631 Op.getOperand(4), GenCF.getValue(1));
17632 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17633 Op.getOperand(5), MachinePointerInfo(),
17634 false, false, 0);
17635 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17636 DAG.getConstant(X86::COND_B, MVT::i8),
17637 Res.getValue(1));
17638 Results.push_back(SetCC);
17639 Results.push_back(Store);
17640 return DAG.getMergeValues(Results, dl);
17641 }
17642 case COMPRESS_TO_MEM: {
17643 SDLoc dl(Op);
17644 SDValue Mask = Op.getOperand(4);
17645 SDValue DataToCompress = Op.getOperand(3);
17646 SDValue Addr = Op.getOperand(2);
17647 SDValue Chain = Op.getOperand(0);
17648
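// With an all-ones mask this degenerates to an ordinary store.  Otherwise the
// scalar mask register is reinterpreted as a vector of i1 (bitcast to an i1
// vector as wide as the mask, then the low VT-sized subvector is extracted)
// and the data is compressed against an UNDEF pass-through before the store.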
17649 if (isAllOnes(Mask)) // return just a store
17650 return DAG.getStore(Chain, dl, DataToCompress, Addr,
17651 MachinePointerInfo(), false, false, 0);
17652
17653 EVT VT = DataToCompress.getValueType();
17654 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17655 VT.getVectorNumElements());
17656 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17657 Mask.getValueType().getSizeInBits());
17658 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17659 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17660 DAG.getIntPtrConstant(0));
17661
17662 SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17663 DataToCompress, DAG.getUNDEF(VT));
17664 return DAG.getStore(Chain, dl, Compressed, Addr,
17665 MachinePointerInfo(), false, false, 0);
17666 }
17667 case EXPAND_FROM_MEM: {
17668 SDLoc dl(Op);
17669 SDValue Mask = Op.getOperand(4);
17670 SDValue PathThru = Op.getOperand(3);
17671 SDValue Addr = Op.getOperand(2);
17672 SDValue Chain = Op.getOperand(0);
17673 EVT VT = Op.getValueType();
17674
17675 if (isAllOnes(Mask)) // return just a load
17676 return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17677 false, 0);
17678 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17679 VT.getVectorNumElements());
17680 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17681 Mask.getValueType().getSizeInBits());
17682 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17683 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17684 DAG.getIntPtrConstant(0));
17685
17686 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17687 false, false, false, 0);
17688
17689 SmallVector<SDValue, 2> Results;
17690 Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17691 PathThru));
17692 Results.push_back(Chain);
17693 return DAG.getMergeValues(Results, dl);
17694 }
17695 }
17696}
17697
17698SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17699 SelectionDAG &DAG) const {
17700 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17701 MFI->setReturnAddressIsTaken(true);
17702
17703 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17704 return SDValue();
17705
17706 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17707 SDLoc dl(Op);
17708 EVT PtrVT = getPointerTy();
17709
17710 if (Depth > 0) {
17711 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17712 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17713 DAG.getSubtarget().getRegisterInfo());
17714 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17715 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17716 DAG.getNode(ISD::ADD, dl, PtrVT,
17717 FrameAddr, Offset),
17718 MachinePointerInfo(), false, false, false, 0);
17719 }
17720
17721 // Just load the return address.
17722 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17723 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17724 RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17725}
17726
17727SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17728 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17729 MFI->setFrameAddressIsTaken(true);
17730
17731 EVT VT = Op.getValueType();
17732 SDLoc dl(Op); // FIXME probably not meaningful
17733 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17734 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17735 DAG.getSubtarget().getRegisterInfo());
17736 unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17737 DAG.getMachineFunction());
17738 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17739 (FrameReg == X86::EBP && VT == MVT::i32)) &&
17740 "Invalid Frame Register!");
17741 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17742 while (Depth--)
17743 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17744 MachinePointerInfo(),
17745 false, false, false, 0);
17746 return FrameAddr;
17747}
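// A minimal scalar model of the Depth loop above (illustrative only, assuming
// the saved frame pointer of each frame sits at offset 0, which is what the
// emitted loads rely on): walking N frames up is N dependent loads off RBP/EBP.
static void *frameAddressModel(void *FramePtr, unsigned Depth) {
  while (Depth--)
    FramePtr = *static_cast<void **>(FramePtr); // load the caller's saved frame pointer
  return FramePtr;
}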
17748
17749// FIXME? Maybe this could be a TableGen attribute on some registers and
17750// this table could be generated automatically from RegInfo.
17751unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17752 EVT VT) const {
17753 unsigned Reg = StringSwitch<unsigned>(RegName)
17754 .Case("esp", X86::ESP)
17755 .Case("rsp", X86::RSP)
17756 .Default(0);
17757 if (Reg)
17758 return Reg;
17759 report_fatal_error("Invalid register name global variable");
17760}
17761
17762SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17763 SelectionDAG &DAG) const {
17764 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17765 DAG.getSubtarget().getRegisterInfo());
17766 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17767}
17768
17769SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17770 SDValue Chain = Op.getOperand(0);
17771 SDValue Offset = Op.getOperand(1);
17772 SDValue Handler = Op.getOperand(2);
17773 SDLoc dl (Op);
17774
17775 EVT PtrVT = getPointerTy();
17776 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17777 DAG.getSubtarget().getRegisterInfo());
17778 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17779 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17780 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17781 "Invalid Frame Register!");
17782 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17783 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
17784
17785 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17786 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17787 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17788 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17789 false, false, 0);
17790 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17791
17792 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17793 DAG.getRegister(StoreAddrReg, PtrVT));
17794}
17795
17796SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17797 SelectionDAG &DAG) const {
17798 SDLoc DL(Op);
17799 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17800 DAG.getVTList(MVT::i32, MVT::Other),
17801 Op.getOperand(0), Op.getOperand(1));
17802}
17803
17804SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17805 SelectionDAG &DAG) const {
17806 SDLoc DL(Op);
17807 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17808 Op.getOperand(0), Op.getOperand(1));
17809}
17810
17811static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17812 return Op.getOperand(0);
17813}
17814
17815SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
17816 SelectionDAG &DAG) const {
17817 SDValue Root = Op.getOperand(0);
17818 SDValue Trmp = Op.getOperand(1); // trampoline
17819 SDValue FPtr = Op.getOperand(2); // nested function
17820 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
17821 SDLoc dl (Op);
17822
17823 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17824 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
17825
17826 if (Subtarget->is64Bit()) {
17827 SDValue OutChains[6];
17828
17829 // Large code-model.
17830 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
17831 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
17832
17833 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
17834 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
17835
17836 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
17837
17838 // Load the pointer to the nested function into R11.
17839 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
17840 SDValue Addr = Trmp;
17841 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17842 Addr, MachinePointerInfo(TrmpAddr),
17843 false, false, 0);
17844
17845 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17846 DAG.getConstant(2, MVT::i64));
17847 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
17848 MachinePointerInfo(TrmpAddr, 2),
17849 false, false, 2);
17850
17851 // Load the 'nest' parameter value into R10.
17852 // R10 is specified in X86CallingConv.td
17853 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
17854 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17855 DAG.getConstant(10, MVT::i64));
17856 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17857 Addr, MachinePointerInfo(TrmpAddr, 10),
17858 false, false, 0);
17859
17860 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17861 DAG.getConstant(12, MVT::i64));
17862 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
17863 MachinePointerInfo(TrmpAddr, 12),
17864 false, false, 2);
17865
17866 // Jump to the nested function.
17867 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
17868 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17869 DAG.getConstant(20, MVT::i64));
17870 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17871 Addr, MachinePointerInfo(TrmpAddr, 20),
17872 false, false, 0);
17873
17874 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
17875 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17876 DAG.getConstant(22, MVT::i64));
17877 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
17878 MachinePointerInfo(TrmpAddr, 22),
17879 false, false, 0);
17880
17881 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17882 } else {
17883 const Function *Func =
17884 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
17885 CallingConv::ID CC = Func->getCallingConv();
17886 unsigned NestReg;
17887
17888 switch (CC) {
17889 default:
17890 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 17890)
;
17891 case CallingConv::C:
17892 case CallingConv::X86_StdCall: {
17893 // Pass 'nest' parameter in ECX.
17894 // Must be kept in sync with X86CallingConv.td
17895 NestReg = X86::ECX;
17896
17897 // Check that ECX wasn't needed by an 'inreg' parameter.
17898 FunctionType *FTy = Func->getFunctionType();
17899 const AttributeSet &Attrs = Func->getAttributes();
17900
17901 if (!Attrs.isEmpty() && !Func->isVarArg()) {
17902 unsigned InRegCount = 0;
17903 unsigned Idx = 1;
17904
17905 for (FunctionType::param_iterator I = FTy->param_begin(),
17906 E = FTy->param_end(); I != E; ++I, ++Idx)
17907 if (Attrs.hasAttribute(Idx, Attribute::InReg))
17908 // FIXME: should only count parameters that are lowered to integers.
17909 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
17910
17911 if (InRegCount > 2) {
17912 report_fatal_error("Nest register in use - reduce number of inreg"
17913 " parameters!");
17914 }
17915 }
17916 break;
17917 }
17918 case CallingConv::X86_FastCall:
17919 case CallingConv::X86_ThisCall:
17920 case CallingConv::Fast:
17921 // Pass 'nest' parameter in EAX.
17922 // Must be kept in sync with X86CallingConv.td
17923 NestReg = X86::EAX;
17924 break;
17925 }
17926
17927 SDValue OutChains[4];
17928 SDValue Addr, Disp;
17929
17930 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17931 DAG.getConstant(10, MVT::i32));
17932 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
17933
17934 // This is storing the opcode for MOV32ri.
17935 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
17936 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
17937 OutChains[0] = DAG.getStore(Root, dl,
17938 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
17939 Trmp, MachinePointerInfo(TrmpAddr),
17940 false, false, 0);
17941
17942 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17943 DAG.getConstant(1, MVT::i32));
17944 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
17945 MachinePointerInfo(TrmpAddr, 1),
17946 false, false, 1);
17947
17948 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
17949 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17950 DAG.getConstant(5, MVT::i32));
17951 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
17952 MachinePointerInfo(TrmpAddr, 5),
17953 false, false, 1);
17954
17955 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17956 DAG.getConstant(6, MVT::i32));
17957 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
17958 MachinePointerInfo(TrmpAddr, 6),
17959 false, false, 1);
17960
17961 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17962 }
17963}
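// For reference, the trampoline bytes emitted above are, in the 64-bit case
// (23 bytes):
//   +0   49 BB <FPtr:imm64>   movabs $FPtr, %r11
//   +10  49 BA <Nest:imm64>   movabs $Nest, %r10
//   +20  49 FF E3             jmp    *%r11
// and in the 32-bit case (10 bytes): B8+reg <Nest:imm32> (mov into EAX/ECX as
// the calling convention requires), then E9 <rel32> with the displacement
// taken relative to the end of the jmp at offset 10.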
17964
17965SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
17966 SelectionDAG &DAG) const {
17967 /*
17968 The rounding mode is in bits 11:10 of the x87 control word (FPCW), and has the following
17969 settings:
17970 00 Round to nearest
17971 01 Round to -inf
17972 10 Round to +inf
17973 11 Round to 0
17974
17975 FLT_ROUNDS, on the other hand, expects the following:
17976 -1 Undefined
17977 0 Round to 0
17978 1 Round to nearest
17979 2 Round to +inf
17980 3 Round to -inf
17981
17982 To perform the conversion, we do:
17983 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
17984 */
17985
17986 MachineFunction &MF = DAG.getMachineFunction();
17987 const TargetMachine &TM = MF.getTarget();
17988 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
17989 unsigned StackAlignment = TFI.getStackAlignment();
17990 MVT VT = Op.getSimpleValueType();
17991 SDLoc DL(Op);
17992
17993 // Save FP Control Word to stack slot
17994 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
17995 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
17996
17997 MachineMemOperand *MMO =
17998 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
17999 MachineMemOperand::MOStore, 2, 2);
18000
18001 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18002 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18003 DAG.getVTList(MVT::Other),
18004 Ops, MVT::i16, MMO);
18005
18006 // Load FP Control Word from stack slot
18007 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18008 MachinePointerInfo(), false, false, false, 0);
18009
18010 // Transform as necessary
18011 SDValue CWD1 =
18012 DAG.getNode(ISD::SRL, DL, MVT::i16,
18013 DAG.getNode(ISD::AND, DL, MVT::i16,
18014 CWD, DAG.getConstant(0x800, MVT::i16)),
18015 DAG.getConstant(11, MVT::i8));
18016 SDValue CWD2 =
18017 DAG.getNode(ISD::SRL, DL, MVT::i16,
18018 DAG.getNode(ISD::AND, DL, MVT::i16,
18019 CWD, DAG.getConstant(0x400, MVT::i16)),
18020 DAG.getConstant(9, MVT::i8));
18021
18022 SDValue RetVal =
18023 DAG.getNode(ISD::AND, DL, MVT::i16,
18024 DAG.getNode(ISD::ADD, DL, MVT::i16,
18025 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18026 DAG.getConstant(1, MVT::i16)),
18027 DAG.getConstant(3, MVT::i16));
18028
18029 return DAG.getNode((VT.getSizeInBits() < 16 ?
18030 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18031}
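// A scalar sketch of the conversion above (illustrative only): applied to the
// 16-bit word stored by FNSTCW, the expression maps nearest->1, -inf->3,
// +inf->2 and zero->0, exactly the FLT_ROUNDS encoding.
static int fltRoundsModel(unsigned CW) {
  return (int)(((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3);
}
// e.g. fltRoundsModel(0x0000) == 1 (round to nearest) and
//      fltRoundsModel(0x0C00) == 0 (round toward zero).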
18032
18033static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18034 MVT VT = Op.getSimpleValueType();
18035 EVT OpVT = VT;
18036 unsigned NumBits = VT.getSizeInBits();
18037 SDLoc dl(Op);
18038
18039 Op = Op.getOperand(0);
18040 if (VT == MVT::i8) {
18041 // Zero extend to i32 since there is not an i8 bsr.
18042 OpVT = MVT::i32;
18043 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18044 }
18045
18046 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18047 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18048 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18049
18050 // If src is zero (i.e. bsr sets ZF), returns NumBits.
18051 SDValue Ops[] = {
18052 Op,
18053 DAG.getConstant(NumBits+NumBits-1, OpVT),
18054 DAG.getConstant(X86::COND_E, MVT::i8),
18055 Op.getValue(1)
18056 };
18057 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18058
18059 // Finally xor with NumBits-1.
18060 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18061
18062 if (VT == MVT::i8)
18063 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18064 return Op;
18065}
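// A scalar sketch of the same lowering (illustrative only), using a loop in
// place of BSR: for non-zero input ctlz(x) == (NumBits-1) ^ bsr(x), and the
// CMOV above substitutes 2*NumBits-1 for the BSR result when the input is
// zero so that the final XOR yields NumBits (for i32: 63 ^ 31 == 32).
static unsigned bsrModel(unsigned X) { // index of the highest set bit, X != 0
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}
static unsigned ctlzModel(unsigned X) {
  unsigned Idx = (X == 0) ? 63u : bsrModel(X); // the CMOV on ZF
  return Idx ^ 31u;                            // XOR with NumBits-1
}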
18066
18067static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18068 MVT VT = Op.getSimpleValueType();
18069 EVT OpVT = VT;
18070 unsigned NumBits = VT.getSizeInBits();
18071 SDLoc dl(Op);
18072
18073 Op = Op.getOperand(0);
18074 if (VT == MVT::i8) {
18075 // Zero extend to i32 since there is not an i8 bsr.
18076 OpVT = MVT::i32;
18077 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18078 }
18079
18080 // Issue a bsr (scan bits in reverse).
18081 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18082 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18083
18084 // And xor with NumBits-1.
18085 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18086
18087 if (VT == MVT::i8)
18088 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18089 return Op;
18090}
18091
18092static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18093 MVT VT = Op.getSimpleValueType();
18094 unsigned NumBits = VT.getSizeInBits();
18095 SDLoc dl(Op);
18096 Op = Op.getOperand(0);
18097
18098 // Issue a bsf (scan bits forward) which also sets EFLAGS.
18099 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18100 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18101
18102 // If src is zero (i.e. bsf sets ZF), returns NumBits.
18103 SDValue Ops[] = {
18104 Op,
18105 DAG.getConstant(NumBits, VT),
18106 DAG.getConstant(X86::COND_E, MVT::i8),
18107 Op.getValue(1)
18108 };
18109 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18110}
18111
18112// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18113// ones, and then concatenate the result back.
18114static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18115 MVT VT = Op.getSimpleValueType();
18116
18117 assert(VT.is256BitVector() && VT.isInteger() &&
18118 "Unsupported value type for operation");
18119
18120 unsigned NumElems = VT.getVectorNumElements();
18121 SDLoc dl(Op);
18122
18123 // Extract the LHS vectors
18124 SDValue LHS = Op.getOperand(0);
18125 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18126 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18127
18128 // Extract the RHS vectors
18129 SDValue RHS = Op.getOperand(1);
18130 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18131 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18132
18133 MVT EltVT = VT.getVectorElementType();
18134 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18135
18136 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18137 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18138 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18139}
18140
18141static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18142 assert(Op.getSimpleValueType().is256BitVector() &&
18143 Op.getSimpleValueType().isInteger() &&
18144 "Only handle AVX 256-bit vector integer operation");
18145 return Lower256IntArith(Op, DAG);
18146}
18147
18148static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18149 assert(Op.getSimpleValueType().is256BitVector() &&
18150 Op.getSimpleValueType().isInteger() &&
18151 "Only handle AVX 256-bit vector integer operation");
18152 return Lower256IntArith(Op, DAG);
18153}
18154
18155static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18156 SelectionDAG &DAG) {
18157 SDLoc dl(Op);
18158 MVT VT = Op.getSimpleValueType();
18159
18160 // Decompose 256-bit ops into smaller 128-bit ops.
18161 if (VT.is256BitVector() && !Subtarget->hasInt256())
18162 return Lower256IntArith(Op, DAG);
18163
18164 SDValue A = Op.getOperand(0);
18165 SDValue B = Op.getOperand(1);
18166
18167 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18168 if (VT == MVT::v4i32) {
18169 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18170 "Should not custom lower when pmuldq is available!");
18171
18172 // Extract the odd parts.
18173 static const int UnpackMask[] = { 1, -1, 3, -1 };
18174 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18175 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18176
18177 // Multiply the even parts.
18178 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18179 // Now multiply odd parts.
18180 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18181
18182 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18183 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18184
18185 // Merge the two vectors back together with a shuffle. This expands into 2
18186 // shuffles.
18187 static const int ShufMask[] = { 0, 4, 2, 6 };
18188 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18189 }
18190
18191 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18192 "Only know how to lower V2I64/V4I64/V8I64 multiply");
18193
18194 // Ahi = psrlqi(a, 32);
18195 // Bhi = psrlqi(b, 32);
18196 //
18197 // AloBlo = pmuludq(a, b);
18198 // AloBhi = pmuludq(a, Bhi);
18199 // AhiBlo = pmuludq(Ahi, b);
18200
18201 // AloBhi = psllqi(AloBhi, 32);
18202 // AhiBlo = psllqi(AhiBlo, 32);
18203 // return AloBlo + AloBhi + AhiBlo;
18204
18205 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18206 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18207
18208 // Bit cast to 32-bit vectors for MULUDQ
18209 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18210 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18211 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18212 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18213 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18214 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18215
18216 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18217 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18218 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18219
18220 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18221 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18222
18223 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18224 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18225}
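// A scalar sketch of the decomposition above (illustrative only): writing
// a = (Ahi << 32) | Alo and b = (Bhi << 32) | Blo, the low 64 bits of a*b are
// Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), matching the three PMULUDQs, the two
// 32-bit left shifts and the adds emitted per lane.
static unsigned long long mul64Model(unsigned long long A, unsigned long long B) {
  unsigned long long Alo = A & 0xffffffffULL, Ahi = A >> 32;
  unsigned long long Blo = B & 0xffffffffULL, Bhi = B >> 32;
  unsigned long long AloBlo = Alo * Blo; // pmuludq(a, b)
  unsigned long long AloBhi = Alo * Bhi; // pmuludq(a, Bhi)
  unsigned long long AhiBlo = Ahi * Blo; // pmuludq(Ahi, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32); // wraps mod 2^64, like the vector adds
}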
18226
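// Win64 has no native 128-bit divide or remainder, so these nodes become calls
// to the corresponding 128-bit runtime-library routine: each i128 operand is
// spilled to a 16-byte-aligned stack slot and passed by pointer, and the
// result comes back in a register as v2i64 and is bitcast to the original type.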
18227SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18228 assert(Subtarget->isTargetWin64() && "Unexpected target");
18229 EVT VT = Op.getValueType();
18230 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18231 "Unexpected return type for lowering");
18232
18233 RTLIB::Libcall LC;
18234 bool isSigned;
18235 switch (Op->getOpcode()) {
18236 default: llvm_unreachable("Unexpected request for libcall!");
18237 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
18238 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
18239 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
18240 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
18241 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
18242 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18243 }
18244
18245 SDLoc dl(Op);
18246 SDValue InChain = DAG.getEntryNode();
18247
18248 TargetLowering::ArgListTy Args;
18249 TargetLowering::ArgListEntry Entry;
18250 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18251 EVT ArgVT = Op->getOperand(i).getValueType();
18252 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18253 "Unexpected argument type for lowering");
18254 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18255 Entry.Node = StackPtr;
18256 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18257 false, false, 16);
18258 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18259 Entry.Ty = PointerType::get(ArgTy,0);
18260 Entry.isSExt = false;
18261 Entry.isZExt = false;
18262 Args.push_back(Entry);
18263 }
18264
18265 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18266 getPointerTy());
18267
18268 TargetLowering::CallLoweringInfo CLI(DAG);
18269 CLI.setDebugLoc(dl).setChain(InChain)
18270 .setCallee(getLibcallCallingConv(LC),
18271 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18272 Callee, std::move(Args), 0)
18273 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18274
18275 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18276 return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18277}
18278
18279static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18280 SelectionDAG &DAG) {
18281 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18282 EVT VT = Op0.getValueType();
18283 SDLoc dl(Op);
18284
18285 assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18286 (VT == MVT::v8i32 && Subtarget->hasInt256()));
18287
18288 // PMULxD operations multiply each even value (starting at 0) of LHS with
18289 // the related value of RHS and produce a widen result.
18290 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18291 // => <2 x i64> <ae|cg>
18292 //
18293 // In other words, to have all the results, we need to perform two PMULxD:
18294 // 1. one with the even values.
18295 // 2. one with the odd values.
18296 // To achieve #2, we need to place the odd values at even positions.
18297 //
18298 // Place the odd value at an even position (basically, shift all values 1
18299 // step to the left):
18300 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18301 // <a|b|c|d> => <b|undef|d|undef>
18302 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18303 // <e|f|g|h> => <f|undef|h|undef>
18304 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18305
18306 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18307 // ints.
18308 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18309 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18310 unsigned Opcode =
18311 (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18312 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18313 // => <2 x i64> <ae|cg>
18314 SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18315 DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18316 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18317 // => <2 x i64> <bf|dh>
18318 SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18319 DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18320
18321 // Shuffle it back into the right order.
18322 SDValue Highs, Lows;
18323 if (VT == MVT::v8i32) {
18324 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18325 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18326 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18327 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18328 } else {
18329 const int HighMask[] = {1, 5, 3, 7};
18330 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18331 const int LowMask[] = {0, 4, 2, 6};
18332 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18333 }
18334
18335 // If we have a signed multiply but no PMULDQ, fix up the high parts of the
18336 // unsigned multiply we emitted instead.
18337 if (IsSigned && !Subtarget->hasSSE41()) {
18338 SDValue ShAmt =
18339 DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18340 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18341 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18342 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18343 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18344
18345 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18346 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18347 }
18348
18349 // The first result of MUL_LOHI is actually the low value, followed by the
18350 // high value.
18351 SDValue Ops[] = {Lows, Highs};
18352 return DAG.getMergeValues(Ops, dl);
18353}
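// A scalar sketch of the signed fixup above (illustrative only, for the case
// without PMULDQ): if Hu is the high half of the unsigned product, the high
// half of the signed product is Hu - ((A < 0) ? B : 0) - ((B < 0) ? A : 0),
// which is exactly Highs - (((A >>s 31) & B) + ((B >>s 31) & A)) per lane.
static int mulhsModel(int A, int B) {
  unsigned long long UA = (unsigned)A, UB = (unsigned)B;
  unsigned Hu = (unsigned)((UA * UB) >> 32);                     // unsigned high half (PMULUDQ)
  unsigned Fixup = ((A < 0) ? (unsigned)B : 0u) + ((B < 0) ? (unsigned)A : 0u);
  return (int)(Hu - Fixup);                                      // signed high half
}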
18354
18355static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18356 const X86Subtarget *Subtarget) {
18357 MVT VT = Op.getSimpleValueType();
18358 SDLoc dl(Op);
18359 SDValue R = Op.getOperand(0);
18360 SDValue Amt = Op.getOperand(1);
18361
18362 // Optimize shl/srl/sra with constant shift amount.
18363 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18364 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18365 uint64_t ShiftAmt = ShiftConst->getZExtValue();
18366
18367 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18368 (Subtarget->hasInt256() &&
18369 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18370 (Subtarget->hasAVX512() &&
18371 (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18372 if (Op.getOpcode() == ISD::SHL)
18373 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18374 DAG);
18375 if (Op.getOpcode() == ISD::SRL)
18376 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18377 DAG);
18378 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18379 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18380 DAG);
18381 }
18382
18383 if (VT == MVT::v16i8) {
18384 if (Op.getOpcode() == ISD::SHL) {
18385 // Make a large shift.
18386 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18387 MVT::v8i16, R, ShiftAmt,
18388 DAG);
18389 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18390 // Zero out the rightmost bits.
18391 SmallVector<SDValue, 16> V(16,
18392 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18393 MVT::i8));
18394 return DAG.getNode(ISD::AND, dl, VT, SHL,
18395 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18396 }
18397 if (Op.getOpcode() == ISD::SRL) {
18398 // Make a large shift.
18399 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18400 MVT::v8i16, R, ShiftAmt,
18401 DAG);
18402 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18403 // Zero out the leftmost bits.
18404 SmallVector<SDValue, 16> V(16,
18405 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18406 MVT::i8));
18407 return DAG.getNode(ISD::AND, dl, VT, SRL,
18408 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18409 }
18410 if (Op.getOpcode() == ISD::SRA) {
18411 if (ShiftAmt == 7) {
18412 // R s>> 7 === R s< 0
18413 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18414 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18415 }
18416
18417 // R s>> a === ((R u>> a) ^ m) - m
18418 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18419 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18420 MVT::i8));
18421 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18422 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18423 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18424 return Res;
18425 }
18426 llvm_unreachable("Unknown shift opcode.")::llvm::llvm_unreachable_internal("Unknown shift opcode.", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18426)
;
18427 }
18428
18429 if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18430 if (Op.getOpcode() == ISD::SHL) {
18431 // Make a large shift.
18432 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18433 MVT::v16i16, R, ShiftAmt,
18434 DAG);
18435 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18436 // Zero out the rightmost bits.
18437 SmallVector<SDValue, 32> V(32,
18438 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18439 MVT::i8));
18440 return DAG.getNode(ISD::AND, dl, VT, SHL,
18441 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18442 }
18443 if (Op.getOpcode() == ISD::SRL) {
18444 // Make a large shift.
18445 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18446 MVT::v16i16, R, ShiftAmt,
18447 DAG);
18448 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18449 // Zero out the leftmost bits.
18450 SmallVector<SDValue, 32> V(32,
18451 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18452 MVT::i8));
18453 return DAG.getNode(ISD::AND, dl, VT, SRL,
18454 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18455 }
18456 if (Op.getOpcode() == ISD::SRA) {
18457 if (ShiftAmt == 7) {
18458 // R s>> 7 === R s< 0
18459 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18460 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18461 }
18462
18463 // R s>> a === ((R u>> a) ^ m) - m
18464 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18465 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18466 MVT::i8));
18467 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18468 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18469 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18470 return Res;
18471 }
18472 llvm_unreachable("Unknown shift opcode.")::llvm::llvm_unreachable_internal("Unknown shift opcode.", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18472)
;
18473 }
18474 }
18475 }
18476
18477 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18478 if (!Subtarget->is64Bit() &&
18479 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18480 Amt.getOpcode() == ISD::BITCAST &&
18481 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
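// Each i64 shift amount is visible here as Ratio consecutive narrower
// BUILD_VECTOR constants under the bitcast.  The first loop ORs piece i into
// bit position i*(64/Ratio) to rebuild the 64-bit amount of lane 0, and the
// second loop requires every other i64 lane to rebuild to the same constant
// before a single VSHLI/VSRLI/VSRAI is emitted.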
18482 Amt = Amt.getOperand(0);
18483 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18484 VT.getVectorNumElements();
18485 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18486 uint64_t ShiftAmt = 0;
18487 for (unsigned i = 0; i != Ratio; ++i) {
18488 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18489 if (!C)
18490 return SDValue();
18491 // 6 == Log2(64)
18492 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18493 }
18494 // Check remaining shift amounts.
18495 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18496 uint64_t ShAmt = 0;
18497 for (unsigned j = 0; j != Ratio; ++j) {
18498 ConstantSDNode *C =
18499 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18500 if (!C)
18501 return SDValue();
18502 // 6 == Log2(64)
18503 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18504 }
18505 if (ShAmt != ShiftAmt)
18506 return SDValue();
18507 }
18508 switch (Op.getOpcode()) {
18509 default:
18510 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18510)
;
18511 case ISD::SHL:
18512 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18513 DAG);
18514 case ISD::SRL:
18515 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18516 DAG);
18517 case ISD::SRA:
18518 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18519 DAG);
18520 }
18521 }
18522
18523 return SDValue();
18524}
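// A scalar sketch of the byte arithmetic-shift identity used above
// (illustrative only): x86 has no byte-granularity arithmetic shift, so with
// m = 0x80 >> a the lowering computes ((x u>> a) ^ m) - m, which re-extends
// the shifted-down sign bit.
static int sra8Model(unsigned char X, unsigned A) {
  int U = X >> A;      // logical byte shift (what the vector code synthesizes)
  int M = 0x80 >> A;   // mask with the shifted-down sign-bit position set
  return (U ^ M) - M;  // equals (signed char)X >> A for A in [0,7]
}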
18525
18526static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18527 const X86Subtarget* Subtarget) {
18528 MVT VT = Op.getSimpleValueType();
18529 SDLoc dl(Op);
18530 SDValue R = Op.getOperand(0);
18531 SDValue Amt = Op.getOperand(1);
18532
18533 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18534 VT == MVT::v4i32 || VT == MVT::v8i16 ||
18535 (Subtarget->hasInt256() &&
18536 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18537 VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18538 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18539 SDValue BaseShAmt;
18540 EVT EltVT = VT.getVectorElementType();
18541
18542 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18543 // Check if this build_vector node is doing a splat.
18544 // If so, then set BaseShAmt equal to the splat value.
18545 BaseShAmt = BV->getSplatValue();
18546 if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18547 BaseShAmt = SDValue();
18548 } else {
18549 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18550 Amt = Amt.getOperand(0);
18551
18552 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18553 if (SVN && SVN->isSplat()) {
18554 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18555 SDValue InVec = Amt.getOperand(0);
18556 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18557 assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18558 "Unexpected shuffle index found!");
18559 BaseShAmt = InVec.getOperand(SplatIdx);
18560 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18561 if (ConstantSDNode *C =
18562 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18563 if (C->getZExtValue() == SplatIdx)
18564 BaseShAmt = InVec.getOperand(1);
18565 }
18566 }
18567
18568 if (!BaseShAmt)
18569 // Avoid introducing an extract element from a shuffle.
18570 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18571 DAG.getIntPtrConstant(SplatIdx));
18572 }
18573 }
18574
18575 if (BaseShAmt.getNode()) {
18576 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18577 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18578 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18579 else if (EltVT.bitsLT(MVT::i32))
18580 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18581
18582 switch (Op.getOpcode()) {
18583 default:
18584 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18584)
;
18585 case ISD::SHL:
18586 switch (VT.SimpleTy) {
18587 default: return SDValue();
18588 case MVT::v2i64:
18589 case MVT::v4i32:
18590 case MVT::v8i16:
18591 case MVT::v4i64:
18592 case MVT::v8i32:
18593 case MVT::v16i16:
18594 case MVT::v16i32:
18595 case MVT::v8i64:
18596 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18597 }
18598 case ISD::SRA:
18599 switch (VT.SimpleTy) {
18600 default: return SDValue();
18601 case MVT::v4i32:
18602 case MVT::v8i16:
18603 case MVT::v8i32:
18604 case MVT::v16i16:
18605 case MVT::v16i32:
18606 case MVT::v8i64:
18607 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18608 }
18609 case ISD::SRL:
18610 switch (VT.SimpleTy) {
18611 default: return SDValue();
18612 case MVT::v2i64:
18613 case MVT::v4i32:
18614 case MVT::v8i16:
18615 case MVT::v4i64:
18616 case MVT::v8i32:
18617 case MVT::v16i16:
18618 case MVT::v16i32:
18619 case MVT::v8i64:
18620 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18621 }
18622 }
18623 }
18624 }
18625
18626 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18627 if (!Subtarget->is64Bit() &&
18628 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18629 (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18630 Amt.getOpcode() == ISD::BITCAST &&
18631 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18632 Amt = Amt.getOperand(0);
18633 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18634 VT.getVectorNumElements();
18635 std::vector<SDValue> Vals(Ratio);
18636 for (unsigned i = 0; i != Ratio; ++i)
18637 Vals[i] = Amt.getOperand(i);
18638 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18639 for (unsigned j = 0; j != Ratio; ++j)
18640 if (Vals[j] != Amt.getOperand(i + j))
18641 return SDValue();
18642 }
18643 switch (Op.getOpcode()) {
18644 default:
18645 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18645)
;
18646 case ISD::SHL:
18647 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18648 case ISD::SRL:
18649 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18650 case ISD::SRA:
18651 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18652 }
18653 }
18654
18655 return SDValue();
18656}
18657
18658static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18659 SelectionDAG &DAG) {
18660 MVT VT = Op.getSimpleValueType();
18661 SDLoc dl(Op);
18662 SDValue R = Op.getOperand(0);
18663 SDValue Amt = Op.getOperand(1);
18664 SDValue V;
18665
18666 assert(VT.isVector() && "Custom lowering only for vector shifts!");
18667 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18668
18669 V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18670 if (V.getNode())
18671 return V;
18672
18673 V = LowerScalarVariableShift(Op, DAG, Subtarget);
18674 if (V.getNode())
18675 return V;
18676
18677 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18678 return Op;
18679 // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18680 if (Subtarget->hasInt256()) {
18681 if (Op.getOpcode() == ISD::SRL &&
18682 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18683 VT == MVT::v4i64 || VT == MVT::v8i32))
18684 return Op;
18685 if (Op.getOpcode() == ISD::SHL &&
18686 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18687 VT == MVT::v4i64 || VT == MVT::v8i32))
18688 return Op;
18689 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18690 return Op;
18691 }
18692
18693 // If possible, lower this packed shift into a vector multiply instead of
18694 // expanding it into a sequence of scalar shifts.
18695 // Do this only if the vector shift count is a constant build_vector.
18696 if (Op.getOpcode() == ISD::SHL &&
18697 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18698 (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18699 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18700 SmallVector<SDValue, 8> Elts;
18701 EVT SVT = VT.getScalarType();
18702 unsigned SVTBits = SVT.getSizeInBits();
18703 const APInt &One = APInt(SVTBits, 1);
18704 unsigned NumElems = VT.getVectorNumElements();
18705
18706 for (unsigned i=0; i !=NumElems; ++i) {
18707 SDValue Op = Amt->getOperand(i);
18708 if (Op->getOpcode() == ISD::UNDEF) {
18709 Elts.push_back(Op);
18710 continue;
18711 }
18712
18713 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18714 const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18715 uint64_t ShAmt = C.getZExtValue();
18716 if (ShAmt >= SVTBits) {
18717 Elts.push_back(DAG.getUNDEF(SVT));
18718 continue;
18719 }
18720 Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18721 }
18722 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18723 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18724 }
18725
18726 // Lower SHL with variable shift amount.
18727 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
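// The two constants build 2^Amt through the IEEE-754 single-precision format:
// shifting Amt into the exponent field (bit 23) and adding 0x3f800000 (the
// bit pattern of 1.0f) yields the float 2.0^Amt per lane, FP_TO_SINT turns
// that back into an integer power of two, and the variable shift becomes a
// single vector multiply.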
18728 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18729
18730 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18731 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18732 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18733 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18734 }
18735
18736 // If possible, lower this shift as a sequence of two shifts by
18737 // constant plus a MOVSS/MOVSD instead of scalarizing it.
18738 // Example:
18739 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18740 //
18741 // Could be rewritten as:
18742 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18743 //
18744 // The advantage is that the two shifts from the example would be
18745 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18746 // the vector shift into four scalar shifts plus four pairs of vector
18747 // insert/extract.
18748 if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18749 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18750 unsigned TargetOpcode = X86ISD::MOVSS;
18751 bool CanBeSimplified;
18752 // The splat value for the first packed shift (the 'X' from the example).
18753 SDValue Amt1 = Amt->getOperand(0);
18754 // The splat value for the second packed shift (the 'Y' from the example).
18755 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18756 Amt->getOperand(2);
18757
18758 // See if it is possible to replace this node with a sequence of
18759 // two shifts followed by a MOVSS/MOVSD
18760 if (VT == MVT::v4i32) {
18761 // Check if it is legal to use a MOVSS.
18762 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18763 Amt2 == Amt->getOperand(3);
18764 if (!CanBeSimplified) {
18765 // Otherwise, check if we can still simplify this node using a MOVSD.
18766 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18767 Amt->getOperand(2) == Amt->getOperand(3);
18768 TargetOpcode = X86ISD::MOVSD;
18769 Amt2 = Amt->getOperand(2);
18770 }
18771 } else {
18772 // Do similar checks for the case where the machine value type
18773 // is MVT::v8i16.
18774 CanBeSimplified = Amt1 == Amt->getOperand(1);
18775 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18776 CanBeSimplified = Amt2 == Amt->getOperand(i);
18777
18778 if (!CanBeSimplified) {
18779 TargetOpcode = X86ISD::MOVSD;
18780 CanBeSimplified = true;
18781 Amt2 = Amt->getOperand(4);
18782 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18783 CanBeSimplified = Amt1 == Amt->getOperand(i);
18784 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18785 CanBeSimplified = Amt2 == Amt->getOperand(j);
18786 }
18787 }
18788
18789 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18790 isa<ConstantSDNode>(Amt2)) {
18791 // Replace this node with two shifts followed by a MOVSS/MOVSD.
18792 EVT CastVT = MVT::v4i32;
18793 SDValue Splat1 =
18794 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18795 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18796 SDValue Splat2 =
18797 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18798 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18799 if (TargetOpcode == X86ISD::MOVSD)
18800 CastVT = MVT::v2i64;
18801 SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18802 SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18803 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18804 BitCast1, DAG);
18805 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18806 }
18807 }
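// A minimal standalone sketch (not from this file) modelling the rewrite
// described in the comment above for the v4i32 case: (srl A, <X,Y,Y,Y>)
// becomes two splat shifts blended with a MOVSS-style select that takes only
// element 0 from the X-shift. The helper name is hypothetical.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> srl_xyyy(const std::array<uint32_t, 4> &A,
                                        uint32_t X, uint32_t Y) {
  std::array<uint32_t, 4> ShiftX, ShiftY;
  for (int i = 0; i != 4; ++i) {
    ShiftX[i] = A[i] >> X;              // (srl A, <X,X,X,X>) -> X86ISD::VSRLI
    ShiftY[i] = A[i] >> Y;              // (srl A, <Y,Y,Y,Y>) -> X86ISD::VSRLI
  }
  std::array<uint32_t, 4> R = ShiftY;   // start from the Y-shift
  R[0] = ShiftX[0];                     // MOVSS keeps element 0 of the X-shift
  return R;
}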
18808
18809 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18810 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
18811
18812 // a = a << 5;
18813 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18814 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18815
18816 // Turn 'a' into a mask suitable for VSELECT
18817 SDValue VSelM = DAG.getConstant(0x80, VT);
18818 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18819 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18820
18821 SDValue CM1 = DAG.getConstant(0x0f, VT);
18822 SDValue CM2 = DAG.getConstant(0x3f, VT);
18823
18824 // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18825 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18826 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18827 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18828 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18829
18830 // a += a
18831 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18832 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18833 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18834
18835 // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18836 M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18837 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18838 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18839 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18840
18841 // a += a
18842 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18843 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18844 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18845
18846 // return VSELECT(r, r+r, a);
18847 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18848 DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18849 return R;
18850 }
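// A minimal standalone sketch (not from this file) modelling one byte lane of
// the v16i8 SHL sequence above: the shift amount is moved into the top bits
// (a << 5) and consumed one bit per round, conditionally shifting by 4, 2 and
// finally 1 (as r + r). The CM1/CM2 masks in the DAG code exist only because
// psllw shifts 16-bit lanes and must not leak bits into the neighboring byte;
// a true byte shift needs no mask.
#include <cstdint>

static uint8_t shl_byte_model(uint8_t r, uint8_t amt) {
  uint8_t a = static_cast<uint8_t>(amt << 5);      // bit 7 = amt bit 2, ...
  if (a & 0x80) r = static_cast<uint8_t>(r << 4);  // round 1: shift by 4
  a = static_cast<uint8_t>(a + a);                 // expose next amount bit
  if (a & 0x80) r = static_cast<uint8_t>(r << 2);  // round 2: shift by 2
  a = static_cast<uint8_t>(a + a);
  if (a & 0x80) r = static_cast<uint8_t>(r + r);   // round 3: shift by 1
  return r;
}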
18851
18852 // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18853 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18854 // solution better.
18855 if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18856 MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18857 unsigned ExtOpc =
18858 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18859 R = DAG.getNode(ExtOpc, dl, NewVT, R);
18860 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18861 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18862 DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18863 }
18864
18865 // Decompose 256-bit shifts into smaller 128-bit shifts.
18866 if (VT.is256BitVector()) {
18867 unsigned NumElems = VT.getVectorNumElements();
18868 MVT EltVT = VT.getVectorElementType();
18869 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18870
18871 // Extract the two vectors
18872 SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18873 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18874
18875 // Recreate the shift amount vectors
18876 SDValue Amt1, Amt2;
18877 if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18878 // Constant shift amount
18879 SmallVector<SDValue, 4> Amt1Csts;
18880 SmallVector<SDValue, 4> Amt2Csts;
18881 for (unsigned i = 0; i != NumElems/2; ++i)
18882 Amt1Csts.push_back(Amt->getOperand(i));
18883 for (unsigned i = NumElems/2; i != NumElems; ++i)
18884 Amt2Csts.push_back(Amt->getOperand(i));
18885
18886 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18887 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18888 } else {
18889 // Variable shift amount
18890 Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18891 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18892 }
18893
18894 // Issue new vector shifts for the smaller types
18895 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18896 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18897
18898 // Concatenate the result back
18899 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18900 }
18901
18902 return SDValue();
18903}
18904
18905static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
18906 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
18907 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
18908 // looks for this combo and may remove the "setcc" instruction if the "setcc"
18909 // has only one use.
18910 SDNode *N = Op.getNode();
18911 SDValue LHS = N->getOperand(0);
18912 SDValue RHS = N->getOperand(1);
18913 unsigned BaseOp = 0;
18914 unsigned Cond = 0;
18915 SDLoc DL(Op);
18916 switch (Op.getOpcode()) {
18917 default: llvm_unreachable("Unknown ovf instruction!");
18918 case ISD::SADDO:
18919 // An add of one will be selected as an INC. Note that INC doesn't
18920 // set CF, so we can't do this for UADDO.
18921 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18922 if (C->isOne()) {
18923 BaseOp = X86ISD::INC;
18924 Cond = X86::COND_O;
18925 break;
18926 }
18927 BaseOp = X86ISD::ADD;
18928 Cond = X86::COND_O;
18929 break;
18930 case ISD::UADDO:
18931 BaseOp = X86ISD::ADD;
18932 Cond = X86::COND_B;
18933 break;
18934 case ISD::SSUBO:
18935 // A subtract of one will be selected as a DEC. Note that DEC doesn't
18936 // set CF, so we can't do this for USUBO.
18937 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18938 if (C->isOne()) {
18939 BaseOp = X86ISD::DEC;
18940 Cond = X86::COND_O;
18941 break;
18942 }
18943 BaseOp = X86ISD::SUB;
18944 Cond = X86::COND_O;
18945 break;
18946 case ISD::USUBO:
18947 BaseOp = X86ISD::SUB;
18948 Cond = X86::COND_B;
18949 break;
18950 case ISD::SMULO:
18951 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
18952 Cond = X86::COND_O;
18953 break;
18954 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
18955 if (N->getValueType(0) == MVT::i8) {
18956 BaseOp = X86ISD::UMUL8;
18957 Cond = X86::COND_O;
18958 break;
18959 }
18960 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
18961 MVT::i32);
18962 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
18963
18964 SDValue SetCC =
18965 DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
18966 DAG.getConstant(X86::COND_O, MVT::i32),
18967 SDValue(Sum.getNode(), 2));
18968
18969 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18970 }
18971 }
18972
18973 // Also sets EFLAGS.
18974 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
18975 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
18976
18977 SDValue SetCC =
18978 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
18979 DAG.getConstant(Cond, MVT::i32),
18980 SDValue(Sum.getNode(), 1));
18981
18982 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18983}
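// A hedged source-level illustration (not from this file) of what reaches
// LowerXALUO: the GCC/Clang overflow builtins shown here (their availability
// is an assumption about the toolchain) become ISD::SADDO / ISD::UADDO, which
// the code above lowers to a single ADD whose flags feed SETO (signed
// overflow) or SETB (carry).
#include <cstdint>

static bool sadd_overflows(int32_t a, int32_t b, int32_t &sum) {
  return __builtin_sadd_overflow(a, b, &sum);   // typically: add; seto
}
static bool uadd_overflows(uint32_t a, uint32_t b, uint32_t &sum) {
  return __builtin_uadd_overflow(a, b, &sum);   // typically: add; setb
}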
18984
18985// Sign extension of the low part of vector elements. This may be used either
18986// when sign extend instructions are not available or if the vector element
18987// sizes already match the sign-extended size. If the vector elements are in
18988// their pre-extended size and sign extend instructions are available, that will
18989// be handled by LowerSIGN_EXTEND.
18990SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
18991 SelectionDAG &DAG) const {
18992 SDLoc dl(Op);
18993 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
18994 MVT VT = Op.getSimpleValueType();
18995
18996 if (!Subtarget->hasSSE2() || !VT.isVector())
18997 return SDValue();
18998
18999 unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19000 ExtraVT.getScalarType().getSizeInBits();
19001
19002 switch (VT.SimpleTy) {
19003 default: return SDValue();
19004 case MVT::v8i32:
19005 case MVT::v16i16:
19006 if (!Subtarget->hasFp256())
19007 return SDValue();
19008 if (!Subtarget->hasInt256()) {
19009 // needs to be split
19010 unsigned NumElems = VT.getVectorNumElements();
19011
19012 // Extract the LHS vectors
19013 SDValue LHS = Op.getOperand(0);
19014 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19015 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19016
19017 MVT EltVT = VT.getVectorElementType();
19018 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19019
19020 EVT ExtraEltVT = ExtraVT.getVectorElementType();
19021 unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19022 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19023 ExtraNumElems/2);
19024 SDValue Extra = DAG.getValueType(ExtraVT);
19025
19026 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19027 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19028
19029 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19030 }
19031 // fall through
19032 case MVT::v4i32:
19033 case MVT::v8i16: {
19034 SDValue Op0 = Op.getOperand(0);
19035
19036 // This is a sign extension of some low part of vector elements without
19037 // changing the size of the vector elements themselves:
19038 // Shift-Left + Shift-Right-Algebraic.
19039 SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19040 BitsDiff, DAG);
19041 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19042 DAG);
19043 }
19044 }
19045}
19046
19047/// Returns true if the operand type is exactly twice the native width, and
19048/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19049/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19050/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19051bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19052 const X86Subtarget &Subtarget =
19053 getTargetMachine().getSubtarget<X86Subtarget>();
19054 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19055
19056 if (OpWidth == 64)
19057 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19058 else if (OpWidth == 128)
19059 return Subtarget.hasCmpxchg16b();
19060 else
19061 return false;
19062}
19063
19064bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19065 return needsCmpXchgNb(SI->getValueOperand()->getType());
19066}
19067
19068// Note: this turns large loads into lock cmpxchg8b/16b.
19069 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
19070bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19071 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19072 return needsCmpXchgNb(PTy->getElementType());
19073}
19074
19075bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19076 const X86Subtarget &Subtarget =
19077 getTargetMachine().getSubtarget<X86Subtarget>();
19078 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19079 const Type *MemType = AI->getType();
19080
19081 // If the operand is too big, we must see if cmpxchg8/16b is available
19082 // and default to library calls otherwise.
19083 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19084 return needsCmpXchgNb(MemType);
19085
19086 AtomicRMWInst::BinOp Op = AI->getOperation();
19087 switch (Op) {
19088 default:
19089 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19089)
;
19090 case AtomicRMWInst::Xchg:
19091 case AtomicRMWInst::Add:
19092 case AtomicRMWInst::Sub:
19093 // It's better to use xadd, xsub or xchg for these in all cases.
19094 return false;
19095 case AtomicRMWInst::Or:
19096 case AtomicRMWInst::And:
19097 case AtomicRMWInst::Xor:
19098 // If the atomicrmw's result isn't actually used, we can just add a "lock"
19099 // prefix to a normal instruction for these operations.
19100 return !AI->use_empty();
19101 case AtomicRMWInst::Nand:
19102 case AtomicRMWInst::Max:
19103 case AtomicRMWInst::Min:
19104 case AtomicRMWInst::UMax:
19105 case AtomicRMWInst::UMin:
19106 // These always require a non-trivial set of data operations on x86. We must
19107 // use a cmpxchg loop.
19108 return true;
19109 }
19110}
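// A hedged source-level illustration (not from this file) of the Or/And/Xor
// case above: when the result of the RMW is unused, shouldExpandAtomicRMWInIR
// returns false and the operation can typically be selected as one
// lock-prefixed instruction; when the old value is needed, a cmpxchg loop is
// generated instead.
#include <atomic>
#include <cstdint>

static void set_flag(std::atomic<uint32_t> &flags) {
  flags.fetch_or(1u);                // result unused -> typically "lock or"
}
static uint32_t set_flag_and_read(std::atomic<uint32_t> &flags) {
  return flags.fetch_or(1u);         // result used -> cmpxchg loop
}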
19111
19112static bool hasMFENCE(const X86Subtarget& Subtarget) {
19113 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19114 // no-sse2). There isn't any reason to disable it if the target processor
19115 // supports it.
19116 return Subtarget.hasSSE2() || Subtarget.is64Bit();
19117}
19118
19119LoadInst *
19120X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19121 const X86Subtarget &Subtarget =
19122 getTargetMachine().getSubtarget<X86Subtarget>();
19123 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19124 const Type *MemType = AI->getType();
19125 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19126 // there is no benefit in turning such RMWs into loads, and it is actually
19127 // harmful as it introduces an mfence.
19128 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19129 return nullptr;
19130
19131 auto Builder = IRBuilder<>(AI);
19132 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19133 auto SynchScope = AI->getSynchScope();
19134 // We must restrict the ordering to avoid generating loads with Release or
19135 // ReleaseAcquire orderings.
19136 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19137 auto Ptr = AI->getPointerOperand();
19138
19139 // Before the load we need a fence. Here is an example lifted from
19140 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19141 // is required:
19142 // Thread 0:
19143 // x.store(1, relaxed);
19144 // r1 = y.fetch_add(0, release);
19145 // Thread 1:
19146 // y.fetch_add(42, acquire);
19147 // r2 = x.load(relaxed);
19148 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19149 // lowered to just a load without a fence. An mfence flushes the store buffer,
19150 // making the optimization clearly correct.
19151 // FIXME: the fence is required if isAtLeastRelease(Order), but it is not
19152 // clear whether it is needed otherwise; we might be able to be more
19153 // aggressive on relaxed idempotent RMWs. In practice, they do not look
19154 // useful, so we don't try to be especially clever.
19155 if (SynchScope == SingleThread) {
19156 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19157 // the IR level, so we must wrap it in an intrinsic.
19158 return nullptr;
19159 } else if (hasMFENCE(Subtarget)) {
19160 Function *MFence = llvm::Intrinsic::getDeclaration(M,
19161 Intrinsic::x86_sse2_mfence);
19162 Builder.CreateCall(MFence);
19163 } else {
19164 // FIXME: it might make sense to use a locked operation here but on a
19165 // different cache-line to prevent cache-line bouncing. In practice it
19166 // is probably a small win, and x86 processors without mfence are rare
19167 // enough that we do not bother.
19168 return nullptr;
19169 }
19170
19171 // Finally we can emit the atomic load.
19172 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19173 AI->getType()->getPrimitiveSizeInBits());
19174 Loaded->setAtomic(Order, SynchScope);
19175 AI->replaceAllUsesWith(Loaded);
19176 AI->eraseFromParent();
19177 return Loaded;
19178}
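// A hedged illustration (not from this file) of the pattern
// lowerIdempotentRMWIntoFencedLoad targets: an RMW that does not change the
// value (here a fetch_add of zero) used only for its read. Subject to the
// conditions checked above, it is rewritten into an mfence followed by a
// plain atomic load of the same location.
#include <atomic>
#include <cstdint>

static uint64_t read_counter(std::atomic<uint64_t> &c) {
  return c.fetch_add(0, std::memory_order_acquire); // ~ mfence + mov on x86-64
}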
19179
19180static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19181 SelectionDAG &DAG) {
19182 SDLoc dl(Op);
19183 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19184 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19185 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19186 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19187
19188 // The only fence that needs an instruction is a sequentially-consistent
19189 // cross-thread fence.
19190 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19191 if (hasMFENCE(*Subtarget))
19192 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19193
19194 SDValue Chain = Op.getOperand(0);
19195 SDValue Zero = DAG.getConstant(0, MVT::i32);
19196 SDValue Ops[] = {
19197 DAG.getRegister(X86::ESP, MVT::i32), // Base
19198 DAG.getTargetConstant(1, MVT::i8), // Scale
19199 DAG.getRegister(0, MVT::i32), // Index
19200 DAG.getTargetConstant(0, MVT::i32), // Disp
19201 DAG.getRegister(0, MVT::i32), // Segment.
19202 Zero,
19203 Chain
19204 };
19205 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19206 return SDValue(Res, 0);
19207 }
19208
19209 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19210 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19211}
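// A hedged illustration (not from this file) of the only fence that
// LowerATOMIC_FENCE turns into a real instruction: a sequentially-consistent
// cross-thread fence, emitted as MFENCE when available and otherwise as the
// locked no-op store built above ("lock or dword ptr [esp], 0"). Weaker
// fences become the MEMBARRIER compiler-barrier no-op.
#include <atomic>

static void full_barrier() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}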
19212
19213static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19214 SelectionDAG &DAG) {
19215 MVT T = Op.getSimpleValueType();
19216 SDLoc DL(Op);
19217 unsigned Reg = 0;
19218 unsigned size = 0;
19219 switch(T.SimpleTy) {
19220 default: llvm_unreachable("Invalid value type!")::llvm::llvm_unreachable_internal("Invalid value type!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19220)
;
19221 case MVT::i8: Reg = X86::AL; size = 1; break;
19222 case MVT::i16: Reg = X86::AX; size = 2; break;
19223 case MVT::i32: Reg = X86::EAX; size = 4; break;
19224 case MVT::i64:
19225 assert(Subtarget->is64Bit() && "Node not type legal!");
19226 Reg = X86::RAX; size = 8;
19227 break;
19228 }
19229 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19230 Op.getOperand(2), SDValue());
19231 SDValue Ops[] = { cpIn.getValue(0),
19232 Op.getOperand(1),
19233 Op.getOperand(3),
19234 DAG.getTargetConstant(size, MVT::i8),
19235 cpIn.getValue(1) };
19236 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19237 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19238 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19239 Ops, T, MMO);
19240
19241 SDValue cpOut =
19242 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19243 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19244 MVT::i32, cpOut.getValue(2));
19245 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19246 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19247
19248 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19249 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19250 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19251 return SDValue();
19252}
19253
19254static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19255 SelectionDAG &DAG) {
19256 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19257 MVT DstVT = Op.getSimpleValueType();
19258
19259 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19260 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19261 if (DstVT != MVT::f64)
19262 // This conversion needs to be expanded.
19263 return SDValue();
19264
19265 SDValue InVec = Op->getOperand(0);
19266 SDLoc dl(Op);
19267 unsigned NumElts = SrcVT.getVectorNumElements();
19268 EVT SVT = SrcVT.getVectorElementType();
19269
19270 // Widen the input vector in the case of MVT::v2i32.
19271 // Example: from MVT::v2i32 to MVT::v4i32.
19272 SmallVector<SDValue, 16> Elts;
19273 for (unsigned i = 0, e = NumElts; i != e; ++i)
19274 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19275 DAG.getIntPtrConstant(i)));
19276
19277 // Explicitly mark the extra elements as Undef.
19278 SDValue Undef = DAG.getUNDEF(SVT);
19279 for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19280 Elts.push_back(Undef);
19281
19282 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19283 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19284 SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19285 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19286 DAG.getIntPtrConstant(0));
19287 }
19288
19289 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19290 Subtarget->hasMMX() && "Unexpected custom BITCAST");
19291 assert((DstVT == MVT::i64 ||
19292 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19293 "Unexpected custom BITCAST");
19294 // i64 <=> MMX conversions are Legal.
19295 if (SrcVT==MVT::i64 && DstVT.isVector())
19296 return Op;
19297 if (DstVT==MVT::i64 && SrcVT.isVector())
19298 return Op;
19299 // MMX <=> MMX conversions are Legal.
19300 if (SrcVT.isVector() && DstVT.isVector())
19301 return Op;
19302 // All other conversions need to be expanded.
19303 return SDValue();
19304}
19305
19306static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19307 SelectionDAG &DAG) {
19308 SDNode *Node = Op.getNode();
19309 SDLoc dl(Node);
19310
19311 Op = Op.getOperand(0);
19312 EVT VT = Op.getValueType();
19313 assert((VT.is128BitVector() || VT.is256BitVector()) &&
19314 "CTPOP lowering only implemented for 128/256-bit wide vector types");
19315
19316 unsigned NumElts = VT.getVectorNumElements();
19317 EVT EltVT = VT.getVectorElementType();
19318 unsigned Len = EltVT.getSizeInBits();
19319
19320 // This is the vectorized version of the "best" algorithm from
19321 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19322 // with a minor tweak to use a series of adds + shifts instead of vector
19323 // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19324 //
19325 // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19326 // v8i32 => Always profitable
19327 //
19328 // FIXME: There are a couple of possible improvements:
19329 //
19330 // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19331 // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19332 //
19333 assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19334 "CTPOP not implemented for this vector element type.");
19335
19336 // X86 canonicalizes ANDs to vXi64, so generate the appropriate bitcasts to
19337 // avoid extra legalization.
19338 bool NeedsBitcast = EltVT == MVT::i32;
19339 MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19340
19341 SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19342 SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19343 SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19344
19345 // v = v - ((v >> 1) & 0x55555555...)
19346 SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19347 SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19348 SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19349 if (NeedsBitcast)
19350 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19351
19352 SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19353 SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19354 if (NeedsBitcast)
19355 M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19356
19357 SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19358 if (VT != And.getValueType())
19359 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19360 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19361
19362 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19363 SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19364 SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19365 SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19366 SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19367
19368 Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19369 if (NeedsBitcast) {
19370 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19371 M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19372 Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19373 }
19374
19375 SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19376 SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19377 if (VT != AndRHS.getValueType()) {
19378 AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19379 AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19380 }
19381 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19382
19383 // v = (v + (v >> 4)) & 0x0F0F0F0F...
19384 SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19385 SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19386 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19387 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19388
19389 SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19390 SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19391 if (NeedsBitcast) {
19392 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19393 M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19394 }
19395 And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19396 if (VT != And.getValueType())
19397 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19398
19399 // The algorithm mentioned above uses:
19400 // v = (v * 0x01010101...) >> (Len - 8)
19401 //
19402 // Change it to use vector adds + vector shifts which yield faster results on
19403 // Haswell than using vector integer multiplication.
19404 //
19405 // For i32 elements:
19406 // v = v + (v >> 8)
19407 // v = v + (v >> 16)
19408 //
19409 // For i64 elements:
19410 // v = v + (v >> 8)
19411 // v = v + (v >> 16)
19412 // v = v + (v >> 32)
19413 //
19414 Add = And;
19415 SmallVector<SDValue, 8> Csts;
19416 for (unsigned i = 8; i <= Len/2; i *= 2) {
19417 Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19418 SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19419 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19420 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19421 Csts.clear();
19422 }
19423
19424 // The result is in the least significant 6 bits for i32 and 7 bits for i64.
19425 SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19426 SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19427 SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19428 if (NeedsBitcast) {
19429 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19430 M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19431 }
19432 And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19433 if (VT != And.getValueType())
19434 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19435
19436 return And;
19437}
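// A minimal standalone sketch (not from this file) modelling one i32 lane of
// LowerCTPOP: the classic parallel bit count from the bithacks page, with the
// final multiply replaced by the add+shift chain the comment above describes
// and the 0x3F mask keeping only the low 6 bits.
#include <cstdint>

static uint32_t popcount32_model(uint32_t v) {
  v = v - ((v >> 1) & 0x55555555u);                 // 2-bit sums
  v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit sums
  v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // byte sums
  v = v + (v >> 8);                                 // instead of * 0x01010101
  v = v + (v >> 16);
  return v & 0x3Fu;                                 // result in the low 6 bits
}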
19438
19439static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19440 SDNode *Node = Op.getNode();
19441 SDLoc dl(Node);
19442 EVT T = Node->getValueType(0);
19443 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19444 DAG.getConstant(0, T), Node->getOperand(2));
19445 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19446 cast<AtomicSDNode>(Node)->getMemoryVT(),
19447 Node->getOperand(0),
19448 Node->getOperand(1), negOp,
19449 cast<AtomicSDNode>(Node)->getMemOperand(),
19450 cast<AtomicSDNode>(Node)->getOrdering(),
19451 cast<AtomicSDNode>(Node)->getSynchScope());
19452}
19453
19454static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19455 SDNode *Node = Op.getNode();
19456 SDLoc dl(Node);
19457 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19458
19459 // Convert seq_cst store -> xchg
19460 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19461 // FIXME: On 32-bit, store -> fist or movq would be more efficient
19462 // (The only way to get a 16-byte store is cmpxchg16b)
19463 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19464 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19465 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19466 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19467 cast<AtomicSDNode>(Node)->getMemoryVT(),
19468 Node->getOperand(0),
19469 Node->getOperand(1), Node->getOperand(2),
19470 cast<AtomicSDNode>(Node)->getMemOperand(),
19471 cast<AtomicSDNode>(Node)->getOrdering(),
19472 cast<AtomicSDNode>(Node)->getSynchScope());
19473 return Swap.getValue(1);
19474 }
19475 // Other atomic stores have a simple pattern.
19476 return Op;
19477}
19478
19479static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19480 EVT VT = Op.getNode()->getSimpleValueType(0);
19481
19482 // Let legalize expand this if it isn't a legal type yet.
19483 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19484 return SDValue();
19485
19486 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19487
19488 unsigned Opc;
19489 bool ExtraOp = false;
19490 switch (Op.getOpcode()) {
19491 default: llvm_unreachable("Invalid code")::llvm::llvm_unreachable_internal("Invalid code", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19491)
;
19492 case ISD::ADDC: Opc = X86ISD::ADD; break;
19493 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19494 case ISD::SUBC: Opc = X86ISD::SUB; break;
19495 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19496 }
19497
19498 if (!ExtraOp)
19499 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19500 Op.getOperand(1));
19501 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19502 Op.getOperand(1), Op.getOperand(2));
19503}
19504
19505static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19506 SelectionDAG &DAG) {
19507 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19508
19509 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19510 // which returns the values as { float, float } (in XMM0) or
19511 // { double, double } (which is returned in XMM0, XMM1).
19512 SDLoc dl(Op);
19513 SDValue Arg = Op.getOperand(0);
19514 EVT ArgVT = Arg.getValueType();
19515 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19516
19517 TargetLowering::ArgListTy Args;
19518 TargetLowering::ArgListEntry Entry;
19519
19520 Entry.Node = Arg;
19521 Entry.Ty = ArgTy;
19522 Entry.isSExt = false;
19523 Entry.isZExt = false;
19524 Args.push_back(Entry);
19525
19526 bool isF64 = ArgVT == MVT::f64;
19527 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19528 // the small struct {f32, f32} is returned in (eax, edx). For f64,
19529 // the results are returned via SRet in memory.
19530 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
19531 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19532 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19533
19534 Type *RetTy = isF64
19535 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19536 : (Type*)VectorType::get(ArgTy, 4);
19537
19538 TargetLowering::CallLoweringInfo CLI(DAG);
19539 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19540 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19541
19542 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19543
19544 if (isF64)
19545 // Returned in xmm0 and xmm1.
19546 return CallResult.first;
19547
19548 // Returned in bits 0:31 and 32:64 xmm0.
19549 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19550 CallResult.first, DAG.getIntPtrConstant(0));
19551 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19552 CallResult.first, DAG.getIntPtrConstant(1));
19553 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19554 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19555}
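// A hedged illustration (not from this file) of the pattern that can reach
// LowerFSINCOS on x86-64 Darwin: sin and cos of the same argument. Whether
// the optimizer actually merges the two calls into ISD::FSINCOS depends on
// the pipeline (e.g. errno handling being disabled -- an assumption here);
// when it does, the node is lowered to one __sincos_stret call returning both
// values in XMM0/XMM1.
#include <cmath>

static void sin_and_cos(double x, double &s, double &c) {
  s = std::sin(x);
  c = std::cos(x);
}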
19556
19557/// LowerOperation - Provide custom lowering hooks for some operations.
19558///
19559SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19560 switch (Op.getOpcode()) {
19561 default: llvm_unreachable("Should not custom lower this!")::llvm::llvm_unreachable_internal("Should not custom lower this!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19561)
;
19562 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
19563 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19564 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19565 return LowerCMP_SWAP(Op, Subtarget, DAG);
19566 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
19567 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
19568 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
19569 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
19570 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
19571 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
19572 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
19573 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19574 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
19575 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19576 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19577 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
19578 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
19579 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
19580 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
19581 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
19582 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
19583 case ISD::SHL_PARTS:
19584 case ISD::SRA_PARTS:
19585 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
19586 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
19587 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
19588 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
19589 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
19590 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19591 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
19592 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
19593 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
19594 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
19595 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
19596 case ISD::FABS:
19597 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
19598 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
19599 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
19600 case ISD::SETCC: return LowerSETCC(Op, DAG);
19601 case ISD::SELECT: return LowerSELECT(Op, DAG);
19602 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
19603 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
19604 case ISD::VASTART: return LowerVASTART(Op, DAG);
19605 case ISD::VAARG: return LowerVAARG(Op, DAG);
19606 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
19607 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19608 case ISD::INTRINSIC_VOID:
19609 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19610 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
19611 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
19612 case ISD::FRAME_TO_ARGS_OFFSET:
19613 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19614 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19615 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
19616 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
19617 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
19618 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
19619 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
19620 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
19621 case ISD::CTLZ: return LowerCTLZ(Op, DAG);
19622 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19623 case ISD::CTTZ: return LowerCTTZ(Op, DAG);
19624 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
19625 case ISD::UMUL_LOHI:
19626 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
19627 case ISD::SRA:
19628 case ISD::SRL:
19629 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
19630 case ISD::SADDO:
19631 case ISD::UADDO:
19632 case ISD::SSUBO:
19633 case ISD::USUBO:
19634 case ISD::SMULO:
19635 case ISD::UMULO: return LowerXALUO(Op, DAG);
19636 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19637 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
19638 case ISD::ADDC:
19639 case ISD::ADDE:
19640 case ISD::SUBC:
19641 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19642 case ISD::ADD: return LowerADD(Op, DAG);
19643 case ISD::SUB: return LowerSUB(Op, DAG);
19644 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
19645 }
19646}
19647
19648/// ReplaceNodeResults - Replace a node with an illegal result type
19649/// with a new node built out of custom code.
19650void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19651 SmallVectorImpl<SDValue>&Results,
19652 SelectionDAG &DAG) const {
19653 SDLoc dl(N);
19654 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19655 switch (N->getOpcode()) {
19656 default:
19657 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19657)
;
19658 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19659 case X86ISD::FMINC:
19660 case X86ISD::FMIN:
19661 case X86ISD::FMAXC:
19662 case X86ISD::FMAX: {
19663 EVT VT = N->getValueType(0);
19664 if (VT != MVT::v2f32)
19665 llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.")::llvm::llvm_unreachable_internal("Unexpected type (!= v2f32) on FMIN/FMAX."
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19665)
;
19666 SDValue UNDEF = DAG.getUNDEF(VT);
19667 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19668 N->getOperand(0), UNDEF);
19669 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19670 N->getOperand(1), UNDEF);
19671 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19672 return;
19673 }
19674 case ISD::SIGN_EXTEND_INREG:
19675 case ISD::ADDC:
19676 case ISD::ADDE:
19677 case ISD::SUBC:
19678 case ISD::SUBE:
19679 // We don't want to expand or promote these.
19680 return;
19681 case ISD::SDIV:
19682 case ISD::UDIV:
19683 case ISD::SREM:
19684 case ISD::UREM:
19685 case ISD::SDIVREM:
19686 case ISD::UDIVREM: {
19687 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19688 Results.push_back(V);
19689 return;
19690 }
19691 case ISD::FP_TO_SINT:
19692 case ISD::FP_TO_UINT: {
19693 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19694
19695 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19696 return;
19697
19698 std::pair<SDValue,SDValue> Vals =
19699 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19700 SDValue FIST = Vals.first, StackSlot = Vals.second;
19701 if (FIST.getNode()) {
19702 EVT VT = N->getValueType(0);
19703 // Return a load from the stack slot.
19704 if (StackSlot.getNode())
19705 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19706 MachinePointerInfo(),
19707 false, false, false, 0));
19708 else
19709 Results.push_back(FIST);
19710 }
19711 return;
19712 }
19713 case ISD::UINT_TO_FP: {
19714 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19715 if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19716 N->getValueType(0) != MVT::v2f32)
19717 return;
19718 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19719 N->getOperand(0));
19720 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19721 MVT::f64);
19722 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19723 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19724 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19725 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19726 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19727 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19728 return;
19729 }
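// A minimal standalone sketch (not from this file) modelling one lane of the
// v2i32 UINT_TO_FP expansion above, assuming IEEE-754 doubles: OR the 32-bit
// value into the mantissa of 2^52 (bit pattern 0x4330000000000000), which is
// exactly 2^52 + x as a double, then subtract the 2^52 bias; the final float
// cast corresponds to the X86ISD::VFPROUND.
#include <cstdint>
#include <cstring>

static float uint32_to_float_model(uint32_t x) {
  uint64_t bits = 0x4330000000000000ULL | x;  // ISD::OR with the VBias bits
  double d;
  std::memcpy(&d, &bits, sizeof(d));          // ISD::BITCAST to a v2f64 lane
  return static_cast<float>(d - 4503599627370496.0); // FSUB of 2^52, then round
}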
19730 case ISD::FP_ROUND: {
19731 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19732 return;
19733 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19734 Results.push_back(V);
19735 return;
19736 }
19737 case ISD::INTRINSIC_W_CHAIN: {
19738 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19739 switch (IntNo) {
19740 default : llvm_unreachable("Do not know how to custom type "
19741 "legalize this intrinsic operation!");
19742 case Intrinsic::x86_rdtsc:
19743 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19744 Results);
19745 case Intrinsic::x86_rdtscp:
19746 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19747 Results);
19748 case Intrinsic::x86_rdpmc:
19749 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19750 }
19751 }
19752 case ISD::READCYCLECOUNTER: {
19753 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19754 Results);
19755 }
19756 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19757 EVT T = N->getValueType(0);
19758 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19759 bool Regs64bit = T == MVT::i128;
19760 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19761 SDValue cpInL, cpInH;
19762 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19763 DAG.getConstant(0, HalfT));
19764 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19765 DAG.getConstant(1, HalfT));
19766 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19767 Regs64bit ? X86::RAX : X86::EAX,
19768 cpInL, SDValue());
19769 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19770 Regs64bit ? X86::RDX : X86::EDX,
19771 cpInH, cpInL.getValue(1));
19772 SDValue swapInL, swapInH;
19773 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19774 DAG.getConstant(0, HalfT));
19775 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19776 DAG.getConstant(1, HalfT));
19777 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19778 Regs64bit ? X86::RBX : X86::EBX,
19779 swapInL, cpInH.getValue(1));
19780 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19781 Regs64bit ? X86::RCX : X86::ECX,
19782 swapInH, swapInL.getValue(1));
19783 SDValue Ops[] = { swapInH.getValue(0),
19784 N->getOperand(1),
19785 swapInH.getValue(1) };
19786 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19787 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19788 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19789 X86ISD::LCMPXCHG8_DAG;
19790 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19791 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19792 Regs64bit ? X86::RAX : X86::EAX,
19793 HalfT, Result.getValue(1));
19794 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19795 Regs64bit ? X86::RDX : X86::EDX,
19796 HalfT, cpOutL.getValue(2));
19797 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19798
19799 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19800 MVT::i32, cpOutH.getValue(2));
19801 SDValue Success =
19802 DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19803 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19804 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19805
19806 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19807 Results.push_back(Success);
19808 Results.push_back(EFLAGS.getValue(1));
19809 return;
19810 }
19811 case ISD::ATOMIC_SWAP:
19812 case ISD::ATOMIC_LOAD_ADD:
19813 case ISD::ATOMIC_LOAD_SUB:
19814 case ISD::ATOMIC_LOAD_AND:
19815 case ISD::ATOMIC_LOAD_OR:
19816 case ISD::ATOMIC_LOAD_XOR:
19817 case ISD::ATOMIC_LOAD_NAND:
19818 case ISD::ATOMIC_LOAD_MIN:
19819 case ISD::ATOMIC_LOAD_MAX:
19820 case ISD::ATOMIC_LOAD_UMIN:
19821 case ISD::ATOMIC_LOAD_UMAX:
19822 case ISD::ATOMIC_LOAD: {
19823 // Delegate to generic TypeLegalization. Situations we can really handle
19824 // should have already been dealt with by AtomicExpandPass.cpp.
19825 break;
19826 }
19827 case ISD::BITCAST: {
19828 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19829 EVT DstVT = N->getValueType(0);
19830 EVT SrcVT = N->getOperand(0)->getValueType(0);
19831
19832 if (SrcVT != MVT::f64 ||
19833 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19834 return;
19835
19836 unsigned NumElts = DstVT.getVectorNumElements();
19837 EVT SVT = DstVT.getVectorElementType();
19838 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19839 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19840 MVT::v2f64, N->getOperand(0));
19841 SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19842
19843 if (ExperimentalVectorWideningLegalization) {
19844 // If we are legalizing vectors by widening, we already have the desired
19845 // legal vector type, just return it.
19846 Results.push_back(ToVecInt);
19847 return;
19848 }
19849
19850 SmallVector<SDValue, 8> Elts;
19851 for (unsigned i = 0, e = NumElts; i != e; ++i)
19852 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19853 ToVecInt, DAG.getIntPtrConstant(i)));
19854
19855 Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19856 }
19857 }
19858}
19859
19860const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19861 switch (Opcode) {
19862 default: return nullptr;
19863 case X86ISD::BSF: return "X86ISD::BSF";
19864 case X86ISD::BSR: return "X86ISD::BSR";
19865 case X86ISD::SHLD: return "X86ISD::SHLD";
19866 case X86ISD::SHRD: return "X86ISD::SHRD";
19867 case X86ISD::FAND: return "X86ISD::FAND";
19868 case X86ISD::FANDN: return "X86ISD::FANDN";
19869 case X86ISD::FOR: return "X86ISD::FOR";
19870 case X86ISD::FXOR: return "X86ISD::FXOR";
19871 case X86ISD::FSRL: return "X86ISD::FSRL";
19872 case X86ISD::FILD: return "X86ISD::FILD";
19873 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
19874 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19875 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19876 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19877 case X86ISD::FLD: return "X86ISD::FLD";
19878 case X86ISD::FST: return "X86ISD::FST";
19879 case X86ISD::CALL: return "X86ISD::CALL";
19880 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
19881 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
19882 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
19883 case X86ISD::BT: return "X86ISD::BT";
19884 case X86ISD::CMP: return "X86ISD::CMP";
19885 case X86ISD::COMI: return "X86ISD::COMI";
19886 case X86ISD::UCOMI: return "X86ISD::UCOMI";
19887 case X86ISD::CMPM: return "X86ISD::CMPM";
19888 case X86ISD::CMPMU: return "X86ISD::CMPMU";
19889 case X86ISD::SETCC: return "X86ISD::SETCC";
19890 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
19891 case X86ISD::FSETCC: return "X86ISD::FSETCC";
19892 case X86ISD::CMOV: return "X86ISD::CMOV";
19893 case X86ISD::BRCOND: return "X86ISD::BRCOND";
19894 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
19895 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
19896 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
19897 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
19898 case X86ISD::Wrapper: return "X86ISD::Wrapper";
19899 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
19900 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
19901 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
19902 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
19903 case X86ISD::PINSRB: return "X86ISD::PINSRB";
19904 case X86ISD::PINSRW: return "X86ISD::PINSRW";
19905 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
19906 case X86ISD::ANDNP: return "X86ISD::ANDNP";
19907 case X86ISD::PSIGN: return "X86ISD::PSIGN";
19908 case X86ISD::BLENDI: return "X86ISD::BLENDI";
19909 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
19910 case X86ISD::SUBUS: return "X86ISD::SUBUS";
19911 case X86ISD::HADD: return "X86ISD::HADD";
19912 case X86ISD::HSUB: return "X86ISD::HSUB";
19913 case X86ISD::FHADD: return "X86ISD::FHADD";
19914 case X86ISD::FHSUB: return "X86ISD::FHSUB";
19915 case X86ISD::UMAX: return "X86ISD::UMAX";
19916 case X86ISD::UMIN: return "X86ISD::UMIN";
19917 case X86ISD::SMAX: return "X86ISD::SMAX";
19918 case X86ISD::SMIN: return "X86ISD::SMIN";
19919 case X86ISD::FMAX: return "X86ISD::FMAX";
19920 case X86ISD::FMIN: return "X86ISD::FMIN";
19921 case X86ISD::FMAXC: return "X86ISD::FMAXC";
19922 case X86ISD::FMINC: return "X86ISD::FMINC";
19923 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
19924 case X86ISD::FRCP: return "X86ISD::FRCP";
19925 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
19926 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
19927 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
19928 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
19929 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
19930 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
19931 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
19932 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
19933 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
19934 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
19935 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
19936 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
19937 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
19938 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
19939 case X86ISD::VZEXT: return "X86ISD::VZEXT";
19940 case X86ISD::VSEXT: return "X86ISD::VSEXT";
19941 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
19942 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
19943 case X86ISD::VINSERT: return "X86ISD::VINSERT";
19944 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
19945 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
19946 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
19947 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
19948 case X86ISD::VSHL: return "X86ISD::VSHL";
19949 case X86ISD::VSRL: return "X86ISD::VSRL";
19950 case X86ISD::VSRA: return "X86ISD::VSRA";
19951 case X86ISD::VSHLI: return "X86ISD::VSHLI";
19952 case X86ISD::VSRLI: return "X86ISD::VSRLI";
19953 case X86ISD::VSRAI: return "X86ISD::VSRAI";
19954 case X86ISD::CMPP: return "X86ISD::CMPP";
19955 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
19956 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
19957 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
19958 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
19959 case X86ISD::ADD: return "X86ISD::ADD";
19960 case X86ISD::SUB: return "X86ISD::SUB";
19961 case X86ISD::ADC: return "X86ISD::ADC";
19962 case X86ISD::SBB: return "X86ISD::SBB";
19963 case X86ISD::SMUL: return "X86ISD::SMUL";
19964 case X86ISD::UMUL: return "X86ISD::UMUL";
19965 case X86ISD::SMUL8: return "X86ISD::SMUL8";
19966 case X86ISD::UMUL8: return "X86ISD::UMUL8";
19967 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
19968 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
19969 case X86ISD::INC: return "X86ISD::INC";
19970 case X86ISD::DEC: return "X86ISD::DEC";
19971 case X86ISD::OR: return "X86ISD::OR";
19972 case X86ISD::XOR: return "X86ISD::XOR";
19973 case X86ISD::AND: return "X86ISD::AND";
19974 case X86ISD::BEXTR: return "X86ISD::BEXTR";
19975 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
19976 case X86ISD::PTEST: return "X86ISD::PTEST";
19977 case X86ISD::TESTP: return "X86ISD::TESTP";
19978 case X86ISD::TESTM: return "X86ISD::TESTM";
19979 case X86ISD::TESTNM: return "X86ISD::TESTNM";
19980 case X86ISD::KORTEST: return "X86ISD::KORTEST";
19981 case X86ISD::PACKSS: return "X86ISD::PACKSS";
19982 case X86ISD::PACKUS: return "X86ISD::PACKUS";
19983 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
19984 case X86ISD::VALIGN: return "X86ISD::VALIGN";
19985 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
19986 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
19987 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
19988 case X86ISD::SHUFP: return "X86ISD::SHUFP";
19989 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
19990 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
19991 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
19992 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
19993 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
19994 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
19995 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
19996 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
19997 case X86ISD::MOVSD: return "X86ISD::MOVSD";
19998 case X86ISD::MOVSS: return "X86ISD::MOVSS";
19999 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
20000 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
20001 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
20002 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
20003 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
20004 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
20005 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
20006 case X86ISD::VPERMV: return "X86ISD::VPERMV";
20007 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
20008 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
20009 case X86ISD::VPERMI: return "X86ISD::VPERMI";
20010 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
20011 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
20012 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20013 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
20014 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
20015 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
20016 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
20017 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
20018 case X86ISD::SAHF: return "X86ISD::SAHF";
20019 case X86ISD::RDRAND: return "X86ISD::RDRAND";
20020 case X86ISD::RDSEED: return "X86ISD::RDSEED";
20021 case X86ISD::FMADD: return "X86ISD::FMADD";
20022 case X86ISD::FMSUB: return "X86ISD::FMSUB";
20023 case X86ISD::FNMADD: return "X86ISD::FNMADD";
20024 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
20025 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
20026 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
20027 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
20028 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
20029 case X86ISD::XTEST: return "X86ISD::XTEST";
20030 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
20031 case X86ISD::EXPAND: return "X86ISD::EXPAND";
20032 case X86ISD::SELECT: return "X86ISD::SELECT";
20033 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
20034 case X86ISD::RCP28: return "X86ISD::RCP28";
20035 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
20036 }
20037}
20038
20039// isLegalAddressingMode - Return true if the addressing mode represented
20040// by AM is legal for this target, for a load/store of the specified type.
20041bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20042 Type *Ty) const {
20043 // X86 supports extremely general addressing modes.
20044 CodeModel::Model M = getTargetMachine().getCodeModel();
20045 Reloc::Model R = getTargetMachine().getRelocationModel();
20046
20047 // X86 allows a sign-extended 32-bit immediate field as a displacement.
20048 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20049 return false;
20050
20051 if (AM.BaseGV) {
20052 unsigned GVFlags =
20053 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20054
20055 // If a reference to this global requires an extra load, we can't fold it.
20056 if (isGlobalStubReference(GVFlags))
20057 return false;
20058
20059 // If BaseGV requires a register for the PIC base, we cannot also have a
20060 // BaseReg specified.
20061 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20062 return false;
20063
20064 // If lower 4G is not available, then we must use rip-relative addressing.
20065 if ((M != CodeModel::Small || R != Reloc::Static) &&
20066 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20067 return false;
20068 }
20069
20070 switch (AM.Scale) {
20071 case 0:
20072 case 1:
20073 case 2:
20074 case 4:
20075 case 8:
20076 // These scales always work.
20077 break;
20078 case 3:
20079 case 5:
20080 case 9:
20081 // These scales are formed with basereg+scalereg. Only accept if there is
20082 // no basereg yet.
20083 if (AM.HasBaseReg)
20084 return false;
20085 break;
20086 default: // Other stuff never works.
20087 return false;
20088 }
20089
20090 return true;
20091}
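// Illustrative sketch of what the rules above accept (register names are just
// examples, not taken from this file):
//   movl 16(%rdi,%rcx,4), %eax   // base + 4*index + disp: Scale == 4, legal
//   leaq (%rdi,%rdi,2), %rax     // 3*x folded as base + 2*base: Scale == 3,
//                                // legal only because no other base register
//                                // is in use
// A mode such as base + 7*index has no x86 encoding and falls into the
// rejecting default case above.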
20092
20093bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20094 unsigned Bits = Ty->getScalarSizeInBits();
20095
20096 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20097 // particularly cheaper than those without.
20098 if (Bits == 8)
20099 return false;
20100
20101 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20102 // variable shifts just as cheap as scalar ones.
20103 if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20104 return false;
20105
20106 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20107 // fully general vector.
20108 return true;
20109}
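// Rough example of the trade-off above (AVX2, illustrative operands only):
//   vpsllvd %ymm1, %ymm0, %ymm2   // per-element shift amounts, one instruction
//   vpslld  %xmm1, %ymm0, %ymm2   // all elements shifted by one scalar amount
// With AVX2 both forms cost about the same, so returning false keeps the fully
// general vector shift; without AVX2 the scalar-amount form is clearly cheaper.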
20110
20111bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20112 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20113 return false;
20114 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20115 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20116 return NumBits1 > NumBits2;
20117}
20118
20119bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20120 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20121 return false;
20122
20123 if (!isTypeLegal(EVT::getEVT(Ty1)))
20124 return false;
20125
20126 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20127
20128 // Assuming the caller doesn't have a zeroext or signext return parameter,
20129 // truncation all the way down to i1 is valid.
20130 return true;
20131}
20132
20133bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20134 return isInt<32>(Imm);
20135}
20136
20137bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20138 // Can also use sub to handle negated immediates.
20139 return isInt<32>(Imm);
20140}
20141
20142bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20143 if (!VT1.isInteger() || !VT2.isInteger())
20144 return false;
20145 unsigned NumBits1 = VT1.getSizeInBits();
20146 unsigned NumBits2 = VT2.getSizeInBits();
20147 return NumBits1 > NumBits2;
20148}
20149
20150bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20151 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20152 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20153}
20154
20155bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20156 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20157 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20158}
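// Example of the implicit zero-extension being relied on here:
//   movl %esi, %eax   // writing a 32-bit register clears bits 63:32 of %rax
// so an explicit i32 -> i64 zext costs nothing extra.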
20159
20160bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20161 EVT VT1 = Val.getValueType();
20162 if (isZExtFree(VT1, VT2))
20163 return true;
20164
20165 if (Val.getOpcode() != ISD::LOAD)
20166 return false;
20167
20168 if (!VT1.isSimple() || !VT1.isInteger() ||
20169 !VT2.isSimple() || !VT2.isInteger())
20170 return false;
20171
20172 switch (VT1.getSimpleVT().SimpleTy) {
20173 default: break;
20174 case MVT::i8:
20175 case MVT::i16:
20176 case MVT::i32:
20177 // X86 has 8, 16, and 32-bit zero-extending loads.
20178 return true;
20179 }
20180
20181 return false;
20182}
20183
20184bool
20185X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20186 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20187 return false;
20188
20189 VT = VT.getScalarType();
20190
20191 if (!VT.isSimple())
20192 return false;
20193
20194 switch (VT.getSimpleVT().SimpleTy) {
20195 case MVT::f32:
20196 case MVT::f64:
20197 return true;
20198 default:
20199 break;
20200 }
20201
20202 return false;
20203}
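// e.g. when FMA3/FMA4 is available, (fadd (fmul a, b), c) on f32/f64 (or the
// corresponding vector types) can become a single vfmadd* instruction instead
// of a separate multiply and add, which is why we answer true for those types.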
20204
20205bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20206 // i16 instructions are longer (0x66 prefix) and potentially slower.
20207 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20208}
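// For instance (illustrative encoding detail): "addw $1, %ax" carries a 0x66
// operand-size prefix and may cause partial-register stalls on some
// microarchitectures, while "addl $1, %eax" does not, so narrowing i32 to i16
// is reported as unprofitable.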
20209
20210/// isShuffleMaskLegal - Targets can use this to indicate that they only
20211/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20212/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20213/// are assumed to be legal.
20214bool
20215X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20216 EVT VT) const {
20217 if (!VT.isSimple())
20218 return false;
20219
20220 MVT SVT = VT.getSimpleVT();
20221
20222 // Very little shuffling can be done for 64-bit vectors right now.
20223 if (VT.getSizeInBits() == 64)
20224 return false;
20225
20226 // This is an experimental legality test that is tailored to match the
20227 // legality test of the experimental lowering more closely. They are gated
20228 // separately to ease testing of performance differences.
20229 if (ExperimentalVectorShuffleLegality)
20230 // We only care that the types being shuffled are legal. The lowering can
20231 // handle any possible shuffle mask that results.
20232 return isTypeLegal(SVT);
20233
20234 // If this is a single-input shuffle with no 128 bit lane crossings we can
20235 // lower it into pshufb.
20236 if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20237 (SVT.is256BitVector() && Subtarget->hasInt256())) {
20238 bool isLegal = true;
20239 for (unsigned I = 0, E = M.size(); I != E; ++I) {
20240 if (M[I] >= (int)SVT.getVectorNumElements() ||
20241 ShuffleCrosses128bitLane(SVT, I, M[I])) {
20242 isLegal = false;
20243 break;
20244 }
20245 }
20246 if (isLegal)
20247 return true;
20248 }
20249
20250 // FIXME: blends, shifts.
20251 return (SVT.getVectorNumElements() == 2 ||
20252 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20253 isMOVLMask(M, SVT) ||
20254 isCommutedMOVLMask(M, SVT) ||
20255 isMOVHLPSMask(M, SVT) ||
20256 isSHUFPMask(M, SVT) ||
20257 isSHUFPMask(M, SVT, /* Commuted */ true) ||
20258 isPSHUFDMask(M, SVT) ||
20259 isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20260 isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20261 isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20262 isPALIGNRMask(M, SVT, Subtarget) ||
20263 isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20264 isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20265 isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20266 isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20267 isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20268 (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20269}
20270
20271bool
20272X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20273 EVT VT) const {
20274 if (!VT.isSimple())
20275 return false;
20276
20277 MVT SVT = VT.getSimpleVT();
20278
20279 // This is an experimental legality test that is tailored to match the
20280 // legality test of the experimental lowering more closely. They are gated
20281 // separately to ease testing of performance differences.
20282 if (ExperimentalVectorShuffleLegality)
20283 // The new vector shuffle lowering is very good at managing zero-inputs.
20284 return isShuffleMaskLegal(Mask, VT);
20285
20286 unsigned NumElts = SVT.getVectorNumElements();
20287 // FIXME: This collection of masks seems suspect.
20288 if (NumElts == 2)
20289 return true;
20290 if (NumElts == 4 && SVT.is128BitVector()) {
20291 return (isMOVLMask(Mask, SVT) ||
20292 isCommutedMOVLMask(Mask, SVT, true) ||
20293 isSHUFPMask(Mask, SVT) ||
20294 isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20295 isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20296 Subtarget->hasInt256()));
20297 }
20298 return false;
20299}
20300
20301//===----------------------------------------------------------------------===//
20302// X86 Scheduler Hooks
20303//===----------------------------------------------------------------------===//
20304
20305/// Utility function to emit xbegin specifying the start of an RTM region.
20306static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20307 const TargetInstrInfo *TII) {
20308 DebugLoc DL = MI->getDebugLoc();
20309
20310 const BasicBlock *BB = MBB->getBasicBlock();
20311 MachineFunction::iterator I = MBB;
20312 ++I;
20313
20314 // For the v = xbegin(), we generate
20315 //
20316 // thisMBB:
20317 // xbegin sinkMBB
20318 //
20319 // mainMBB:
20320 // eax = -1
20321 //
20322 // sinkMBB:
20323 // v = eax
20324
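// (For reference: on a successful transaction start XBEGIN falls through, so
// mainMBB materializes -1, the conventional _XBEGIN_STARTED value; on an abort
// the hardware resumes at sinkMBB with the abort status already in EAX.)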
20325 MachineBasicBlock *thisMBB = MBB;
20326 MachineFunction *MF = MBB->getParent();
20327 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20328 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20329 MF->insert(I, mainMBB);
20330 MF->insert(I, sinkMBB);
20331
20332 // Transfer the remainder of BB and its successor edges to sinkMBB.
20333 sinkMBB->splice(sinkMBB->begin(), MBB,
20334 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20335 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20336
20337 // thisMBB:
20338 // xbegin sinkMBB
20339 // # fallthrough to mainMBB
20340 // # abort path to sinkMBB
20341 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20342 thisMBB->addSuccessor(mainMBB);
20343 thisMBB->addSuccessor(sinkMBB);
20344
20345 // mainMBB:
20346 // EAX = -1
20347 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20348 mainMBB->addSuccessor(sinkMBB);
20349
20350 // sinkMBB:
20351 // EAX is live into the sinkMBB
20352 sinkMBB->addLiveIn(X86::EAX);
20353 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20354 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20355 .addReg(X86::EAX);
20356
20357 MI->eraseFromParent();
20358 return sinkMBB;
20359}
20360
20361// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20362// or XMM0_V32I8 in AVX all of this code can be replaced with that
20363// in the .td file.
20364static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20365 const TargetInstrInfo *TII) {
20366 unsigned Opc;
20367 switch (MI->getOpcode()) {
20368 default: llvm_unreachable("illegal opcode!");
20369 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
20370 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20371 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
20372 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20373 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
20374 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20375 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
20376 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20377 }
20378
20379 DebugLoc dl = MI->getDebugLoc();
20380 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20381
20382 unsigned NumArgs = MI->getNumOperands();
20383 for (unsigned i = 1; i < NumArgs; ++i) {
20384 MachineOperand &Op = MI->getOperand(i);
20385 if (!(Op.isReg() && Op.isImplicit()))
20386 MIB.addOperand(Op);
20387 }
20388 if (MI->hasOneMemOperand())
20389 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20390
20391 BuildMI(*BB, MI, dl,
20392 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20393 .addReg(X86::XMM0);
20394
20395 MI->eraseFromParent();
20396 return BB;
20397}
20398
20399// FIXME: Custom handling because TableGen doesn't support multiple implicit
20400// defs in an instruction pattern
20401static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20402 const TargetInstrInfo *TII) {
20403 unsigned Opc;
20404 switch (MI->getOpcode()) {
20405 default: llvm_unreachable("illegal opcode!");
20406 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
20407 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20408 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
20409 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20410 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
20411 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20412 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
20413 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20414 }
20415
20416 DebugLoc dl = MI->getDebugLoc();
20417 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20418
20419 unsigned NumArgs = MI->getNumOperands(); // remove the results
20420 for (unsigned i = 1; i < NumArgs; ++i) {
20421 MachineOperand &Op = MI->getOperand(i);
20422 if (!(Op.isReg() && Op.isImplicit()))
20423 MIB.addOperand(Op);
20424 }
20425 if (MI->hasOneMemOperand())
20426 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20427
20428 BuildMI(*BB, MI, dl,
20429 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20430 .addReg(X86::ECX);
20431
20432 MI->eraseFromParent();
20433 return BB;
20434}
20435
20436static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20437 const TargetInstrInfo *TII,
20438 const X86Subtarget* Subtarget) {
20439 DebugLoc dl = MI->getDebugLoc();
20440
20441 // Address into RAX/EAX, other two args into ECX, EDX.
20442 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20443 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20444 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20445 for (int i = 0; i < X86::AddrNumOperands; ++i)
20446 MIB.addOperand(MI->getOperand(i));
20447
20448 unsigned ValOps = X86::AddrNumOperands;
20449 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20450 .addReg(MI->getOperand(ValOps).getReg());
20451 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20452 .addReg(MI->getOperand(ValOps+1).getReg());
20453
20454 // The instruction doesn't actually take any operands though.
20455 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20456
20457 MI->eraseFromParent(); // The pseudo is gone now.
20458 return BB;
20459}
20460
20461MachineBasicBlock *
20462X86TargetLowering::EmitVAARG64WithCustomInserter(
20463 MachineInstr *MI,
20464 MachineBasicBlock *MBB) const {
20465 // Emit va_arg instruction on X86-64.
20466
20467 // Operands to this pseudo-instruction:
20468 // 0 ) Output : destination address (reg)
20469 // 1-5) Input : va_list address (addr, i64mem)
20470 // 6 ) ArgSize : Size (in bytes) of vararg type
20471 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20472 // 8 ) Align : Alignment of type
20473 // 9 ) EFLAGS (implicit-def)
20474
20475 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20476 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20477
20478 unsigned DestReg = MI->getOperand(0).getReg();
20479 MachineOperand &Base = MI->getOperand(1);
20480 MachineOperand &Scale = MI->getOperand(2);
20481 MachineOperand &Index = MI->getOperand(3);
20482 MachineOperand &Disp = MI->getOperand(4);
20483 MachineOperand &Segment = MI->getOperand(5);
20484 unsigned ArgSize = MI->getOperand(6).getImm();
20485 unsigned ArgMode = MI->getOperand(7).getImm();
20486 unsigned Align = MI->getOperand(8).getImm();
20487
20488 // Memory Reference
20489 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20490 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20491 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20492
20493 // Machine Information
20494 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20495 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20496 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20497 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20498 DebugLoc DL = MI->getDebugLoc();
20499
20500 // struct va_list {
20501 // i32 gp_offset
20502 // i32 fp_offset
20503 // i64 overflow_area (address)
20504 // i64 reg_save_area (address)
20505 // }
20506 // sizeof(va_list) = 24
20507 // alignment(va_list) = 8
20508
20509 unsigned TotalNumIntRegs = 6;
20510 unsigned TotalNumXMMRegs = 8;
20511 bool UseGPOffset = (ArgMode == 1);
20512 bool UseFPOffset = (ArgMode == 2);
20513 unsigned MaxOffset = TotalNumIntRegs * 8 +
20514 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
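// Concretely, with the SysV AMD64 register save area this gives MaxOffset = 48
// (6 GP registers * 8 bytes) when only gp_offset is used, and 48 + 8 * 16 = 176
// when fp_offset is in use, since the 8 XMM registers occupy 16-byte slots.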
20515
20516 /* Align ArgSize to a multiple of 8 */
20517 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20518 bool NeedsAlign = (Align > 8);
20519
20520 MachineBasicBlock *thisMBB = MBB;
20521 MachineBasicBlock *overflowMBB;
20522 MachineBasicBlock *offsetMBB;
20523 MachineBasicBlock *endMBB;
20524
20525 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
20526 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
20527 unsigned OffsetReg = 0;
20528
20529 if (!UseGPOffset && !UseFPOffset) {
20530 // If we only pull from the overflow region, we don't create a branch.
20531 // We don't need to alter control flow.
20532 OffsetDestReg = 0; // unused
20533 OverflowDestReg = DestReg;
20534
20535 offsetMBB = nullptr;
20536 overflowMBB = thisMBB;
20537 endMBB = thisMBB;
20538 } else {
20539 // First emit code to check if gp_offset (or fp_offset) is below the bound.
20540 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20541 // If not, pull from overflow_area. (branch to overflowMBB)
20542 //
20543 // thisMBB
20544 // | .
20545 // | .
20546 // offsetMBB overflowMBB
20547 // | .
20548 // | .
20549 // endMBB
20550
20551 // Registers for the PHI in endMBB
20552 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20553 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20554
20555 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20556 MachineFunction *MF = MBB->getParent();
20557 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20558 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20559 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20560
20561 MachineFunction::iterator MBBIter = MBB;
20562 ++MBBIter;
20563
20564 // Insert the new basic blocks
20565 MF->insert(MBBIter, offsetMBB);
20566 MF->insert(MBBIter, overflowMBB);
20567 MF->insert(MBBIter, endMBB);
20568
20569 // Transfer the remainder of MBB and its successor edges to endMBB.
20570 endMBB->splice(endMBB->begin(), thisMBB,
20571 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20572 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20573
20574 // Make offsetMBB and overflowMBB successors of thisMBB
20575 thisMBB->addSuccessor(offsetMBB);
20576 thisMBB->addSuccessor(overflowMBB);
20577
20578 // endMBB is a successor of both offsetMBB and overflowMBB
20579 offsetMBB->addSuccessor(endMBB);
20580 overflowMBB->addSuccessor(endMBB);
20581
20582 // Load the offset value into a register
20583 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20584 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20585 .addOperand(Base)
20586 .addOperand(Scale)
20587 .addOperand(Index)
20588 .addDisp(Disp, UseFPOffset ? 4 : 0)
20589 .addOperand(Segment)
20590 .setMemRefs(MMOBegin, MMOEnd);
20591
20592 // Check if there is enough room left to pull this argument.
20593 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20594 .addReg(OffsetReg)
20595 .addImm(MaxOffset + 8 - ArgSizeA8);
20596
20597 // Branch to "overflowMBB" if offset >= max
20598 // Fall through to "offsetMBB" otherwise
20599 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20600 .addMBB(overflowMBB);
20601 }
20602
20603 // In offsetMBB, emit code to use the reg_save_area.
20604 if (offsetMBB) {
20605 assert(OffsetReg != 0);
20606
20607 // Read the reg_save_area address.
20608 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20609 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20610 .addOperand(Base)
20611 .addOperand(Scale)
20612 .addOperand(Index)
20613 .addDisp(Disp, 16)
20614 .addOperand(Segment)
20615 .setMemRefs(MMOBegin, MMOEnd);
20616
20617 // Zero-extend the offset
20618 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20619 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20620 .addImm(0)
20621 .addReg(OffsetReg)
20622 .addImm(X86::sub_32bit);
20623
20624 // Add the offset to the reg_save_area to get the final address.
20625 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20626 .addReg(OffsetReg64)
20627 .addReg(RegSaveReg);
20628
20629 // Compute the offset for the next argument
20630 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20631 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20632 .addReg(OffsetReg)
20633 .addImm(UseFPOffset ? 16 : 8);
20634
20635 // Store it back into the va_list.
20636 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20637 .addOperand(Base)
20638 .addOperand(Scale)
20639 .addOperand(Index)
20640 .addDisp(Disp, UseFPOffset ? 4 : 0)
20641 .addOperand(Segment)
20642 .addReg(NextOffsetReg)
20643 .setMemRefs(MMOBegin, MMOEnd);
20644
20645 // Jump to endMBB
20646 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20647 .addMBB(endMBB);
20648 }
20649
20650 //
20651 // Emit code to use overflow area
20652 //
20653
20654 // Load the overflow_area address into a register.
20655 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20656 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20657 .addOperand(Base)
20658 .addOperand(Scale)
20659 .addOperand(Index)
20660 .addDisp(Disp, 8)
20661 .addOperand(Segment)
20662 .setMemRefs(MMOBegin, MMOEnd);
20663
20664 // If we need to align it, do so. Otherwise, just copy the address
20665 // to OverflowDestReg.
20666 if (NeedsAlign) {
20667 // Align the overflow address
20668 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20669 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20670
20671 // aligned_addr = (addr + (align-1)) & ~(align-1)
20672 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20673 .addReg(OverflowAddrReg)
20674 .addImm(Align-1);
20675
20676 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20677 .addReg(TmpReg)
20678 .addImm(~(uint64_t)(Align-1));
20679 } else {
20680 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20681 .addReg(OverflowAddrReg);
20682 }
20683
20684 // Compute the next overflow address after this argument.
20685 // (the overflow address should be kept 8-byte aligned)
20686 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20687 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20688 .addReg(OverflowDestReg)
20689 .addImm(ArgSizeA8);
20690
20691 // Store the new overflow address.
20692 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20693 .addOperand(Base)
20694 .addOperand(Scale)
20695 .addOperand(Index)
20696 .addDisp(Disp, 8)
20697 .addOperand(Segment)
20698 .addReg(NextAddrReg)
20699 .setMemRefs(MMOBegin, MMOEnd);
20700
20701 // If we branched, emit the PHI to the front of endMBB.
20702 if (offsetMBB) {
20703 BuildMI(*endMBB, endMBB->begin(), DL,
20704 TII->get(X86::PHI), DestReg)
20705 .addReg(OffsetDestReg).addMBB(offsetMBB)
20706 .addReg(OverflowDestReg).addMBB(overflowMBB);
20707 }
20708
20709 // Erase the pseudo instruction
20710 MI->eraseFromParent();
20711
20712 return endMBB;
20713}
20714
20715MachineBasicBlock *
20716X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20717 MachineInstr *MI,
20718 MachineBasicBlock *MBB) const {
20719 // Emit code to save XMM registers to the stack. The ABI says that the
20720 // number of registers to save is given in %al, so it's theoretically
20721 // possible to do an indirect jump trick to avoid saving all of them;
20722 // however, this code takes a simpler approach and just executes all
20723 // of the stores if %al is non-zero. It's less code, and it's probably
20724 // easier on the hardware branch predictor, and stores aren't all that
20725 // expensive anyway.
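// Illustrative caller-side view (not emitted here): for a variadic call such
// as printf("%f %f\n", a, b), the SysV AMD64 ABI has the caller set %al to an
// upper bound on the number of vector registers used (2 in that example);
// below we only test %al for zero and, if it is non-zero, store all eight XMM
// argument registers.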
20726
20727 // Create the new basic blocks. One block contains all the XMM stores,
20728 // and one block is the final destination regardless of whether any
20729 // stores were performed.
20730 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20731 MachineFunction *F = MBB->getParent();
20732 MachineFunction::iterator MBBIter = MBB;
20733 ++MBBIter;
20734 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20735 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20736 F->insert(MBBIter, XMMSaveMBB);
20737 F->insert(MBBIter, EndMBB);
20738
20739 // Transfer the remainder of MBB and its successor edges to EndMBB.
20740 EndMBB->splice(EndMBB->begin(), MBB,
20741 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20742 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20743
20744 // The original block will now fall through to the XMM save block.
20745 MBB->addSuccessor(XMMSaveMBB);
20746 // The XMMSaveMBB will fall through to the end block.
20747 XMMSaveMBB->addSuccessor(EndMBB);
20748
20749 // Now add the instructions.
20750 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20751 DebugLoc DL = MI->getDebugLoc();
20752
20753 unsigned CountReg = MI->getOperand(0).getReg();
20754 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20755 int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20756
20757 if (!Subtarget->isTargetWin64()) {
20758 // If %al is 0, branch around the XMM save block.
20759 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20760 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20761 MBB->addSuccessor(EndMBB);
20762 }
20763
20764 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20765 // that was just emitted, but clearly shouldn't be "saved".
20766 assert((MI->getNumOperands() <= 3 ||
20767 !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20768 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20769 && "Expected last argument to be EFLAGS");
20770 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20771 // In the XMM save block, save all the XMM argument registers.
20772 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20773 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20774 MachineMemOperand *MMO =
20775 F->getMachineMemOperand(
20776 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20777 MachineMemOperand::MOStore,
20778 /*Size=*/16, /*Align=*/16);
20779 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20780 .addFrameIndex(RegSaveFrameIndex)
20781 .addImm(/*Scale=*/1)
20782 .addReg(/*IndexReg=*/0)
20783 .addImm(/*Disp=*/Offset)
20784 .addReg(/*Segment=*/0)
20785 .addReg(MI->getOperand(i).getReg())
20786 .addMemOperand(MMO);
20787 }
20788
20789 MI->eraseFromParent(); // The pseudo instruction is gone now.
20790
20791 return EndMBB;
20792}
20793
20794// The EFLAGS operand of SelectItr might be missing a kill marker
20795// because there were multiple uses of EFLAGS, and ISel didn't know
20796// which to mark. Figure out whether SelectItr should have had a
20797// kill marker, and set it if it should. Returns the correct kill
20798// marker value.
20799static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20800 MachineBasicBlock* BB,
20801 const TargetRegisterInfo* TRI) {
20802 // Scan forward through BB for a use/def of EFLAGS.
20803 MachineBasicBlock::iterator miI(std::next(SelectItr));
20804 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20805 const MachineInstr& mi = *miI;
20806 if (mi.readsRegister(X86::EFLAGS))
20807 return false;
20808 if (mi.definesRegister(X86::EFLAGS))
20809 break; // Should have kill-flag - update below.
20810 }
20811
20812 // If we hit the end of the block, check whether EFLAGS is live into a
20813 // successor.
20814 if (miI == BB->end()) {
20815 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20816 sEnd = BB->succ_end();
20817 sItr != sEnd; ++sItr) {
20818 MachineBasicBlock* succ = *sItr;
20819 if (succ->isLiveIn(X86::EFLAGS))
20820 return false;
20821 }
20822 }
20823
20824 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20825 // out. SelectMI should have a kill flag on EFLAGS.
20826 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20827 return true;
20828}
20829
20830MachineBasicBlock *
20831X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20832 MachineBasicBlock *BB) const {
20833 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20834 DebugLoc DL = MI->getDebugLoc();
20835
20836 // To "insert" a SELECT_CC instruction, we actually have to insert the
20837 // diamond control-flow pattern. The incoming instruction knows the
20838 // destination vreg to set, the condition code register to branch on, the
20839 // true/false values to select between, and a branch opcode to use.
20840 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20841 MachineFunction::iterator It = BB;
20842 ++It;
20843
20844 // thisMBB:
20845 // ...
20846 // TrueVal = ...
20847 // cmpTY ccX, r1, r2
20848 // bCC copy1MBB
20849 // fallthrough --> copy0MBB
20850 MachineBasicBlock *thisMBB = BB;
20851 MachineFunction *F = BB->getParent();
20852 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20853 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20854 F->insert(It, copy0MBB);
20855 F->insert(It, sinkMBB);
20856
20857 // If the EFLAGS register isn't dead in the terminator, then claim that it's
20858 // live into the sink and copy blocks.
20859 const TargetRegisterInfo *TRI =
20860 BB->getParent()->getSubtarget().getRegisterInfo();
20861 if (!MI->killsRegister(X86::EFLAGS) &&
20862 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20863 copy0MBB->addLiveIn(X86::EFLAGS);
20864 sinkMBB->addLiveIn(X86::EFLAGS);
20865 }
20866
20867 // Transfer the remainder of BB and its successor edges to sinkMBB.
20868 sinkMBB->splice(sinkMBB->begin(), BB,
20869 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20870 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20871
20872 // Add the true and fallthrough blocks as its successors.
20873 BB->addSuccessor(copy0MBB);
20874 BB->addSuccessor(sinkMBB);
20875
20876 // Create the conditional branch instruction.
20877 unsigned Opc =
20878 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20879 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20880
20881 // copy0MBB:
20882 // %FalseValue = ...
20883 // # fallthrough to sinkMBB
20884 copy0MBB->addSuccessor(sinkMBB);
20885
20886 // sinkMBB:
20887 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20888 // ...
20889 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20890 TII->get(X86::PHI), MI->getOperand(0).getReg())
20891 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20892 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20893
20894 MI->eraseFromParent(); // The pseudo instruction is gone now.
20895 return sinkMBB;
20896}
20897
20898MachineBasicBlock *
20899X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20900 MachineBasicBlock *BB) const {
20901 MachineFunction *MF = BB->getParent();
20902 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20903 DebugLoc DL = MI->getDebugLoc();
20904 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20905
20906 assert(MF->shouldSplitStack());
20907
20908 const bool Is64Bit = Subtarget->is64Bit();
20909 const bool IsLP64 = Subtarget->isTarget64BitLP64();
20910
20911 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
20912 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
20913
20914 // BB:
20915 // ... [Till the alloca]
20916 // If stacklet is not large enough, jump to mallocMBB
20917 //
20918 // bumpMBB:
20919 // Allocate by subtracting from RSP
20920 // Jump to continueMBB
20921 //
20922 // mallocMBB:
20923 // Allocate by call to runtime
20924 //
20925 // continueMBB:
20926 // ...
20927 // [rest of original BB]
20928 //
20929
20930 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20931 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20932 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20933
20934 MachineRegisterInfo &MRI = MF->getRegInfo();
20935 const TargetRegisterClass *AddrRegClass =
20936 getRegClassFor(getPointerTy());
20937
20938 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20939 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20940 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
20941 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
20942 sizeVReg = MI->getOperand(1).getReg(),
20943 physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
20944
20945 MachineFunction::iterator MBBIter = BB;
20946 ++MBBIter;
20947
20948 MF->insert(MBBIter, bumpMBB);
20949 MF->insert(MBBIter, mallocMBB);
20950 MF->insert(MBBIter, continueMBB);
20951
20952 continueMBB->splice(continueMBB->begin(), BB,
20953 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20954 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
20955
20956 // Add code to the main basic block to check if the stack limit has been hit,
20957 // and if so, jump to mallocMBB otherwise to bumpMBB.
20958 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
20959 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
20960 .addReg(tmpSPVReg).addReg(sizeVReg);
20961 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
20962 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
20963 .addReg(SPLimitVReg);
20964 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
20965
20966 // bumpMBB simply decreases the stack pointer, since we know the current
20967 // stacklet has enough space.
20968 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
20969 .addReg(SPLimitVReg);
20970 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
20971 .addReg(SPLimitVReg);
20972 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
20973
20974 // Calls into a routine in libgcc to allocate more space from the heap.
20975 const uint32_t *RegMask = MF->getTarget()
20976 .getSubtargetImpl()
20977 ->getRegisterInfo()
20978 ->getCallPreservedMask(CallingConv::C);
20979 if (IsLP64) {
20980 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
20981 .addReg(sizeVReg);
20982 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20983 .addExternalSymbol("__morestack_allocate_stack_space")
20984 .addRegMask(RegMask)
20985 .addReg(X86::RDI, RegState::Implicit)
20986 .addReg(X86::RAX, RegState::ImplicitDefine);
20987 } else if (Is64Bit) {
20988 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
20989 .addReg(sizeVReg);
20990 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20991 .addExternalSymbol("__morestack_allocate_stack_space")
20992 .addRegMask(RegMask)
20993 .addReg(X86::EDI, RegState::Implicit)
20994 .addReg(X86::EAX, RegState::ImplicitDefine);
20995 } else {
20996 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
20997 .addImm(12);
20998 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
20999 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21000 .addExternalSymbol("__morestack_allocate_stack_space")
21001 .addRegMask(RegMask)
21002 .addReg(X86::EAX, RegState::ImplicitDefine);
21003 }
21004
21005 if (!Is64Bit)
21006 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21007 .addImm(16);
21008
21009 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21010 .addReg(IsLP64 ? X86::RAX : X86::EAX);
21011 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21012
21013 // Set up the CFG correctly.
21014 BB->addSuccessor(bumpMBB);
21015 BB->addSuccessor(mallocMBB);
21016 mallocMBB->addSuccessor(continueMBB);
21017 bumpMBB->addSuccessor(continueMBB);
21018
21019 // Take care of the PHI nodes.
21020 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21021 MI->getOperand(0).getReg())
21022 .addReg(mallocPtrVReg).addMBB(mallocMBB)
21023 .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21024
21025 // Delete the original pseudo instruction.
21026 MI->eraseFromParent();
21027
21028 // And we're done.
21029 return continueMBB;
21030}
21031
21032MachineBasicBlock *
21033X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21034 MachineBasicBlock *BB) const {
21035 DebugLoc DL = MI->getDebugLoc();
21036
21037 assert(!Subtarget->isTargetMachO());
21038
21039 X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21040
21041 MI->eraseFromParent(); // The pseudo instruction is gone now.
21042 return BB;
21043}
21044
21045MachineBasicBlock *
21046X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21047 MachineBasicBlock *BB) const {
21048 // This is pretty easy. We take the value loaded from the relocation,
21049 // stick it in either RDI (x86-64) or EAX (x86), and make an indirect
21050 // call through it. The return value will then be in the normal return
21051 // register.
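// On x86-64 Darwin the sequence emitted below is roughly (illustrative):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)            ## thread-local address returned in %rax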
21052 MachineFunction *F = BB->getParent();
21053 const X86InstrInfo *TII =
21054 static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21055 DebugLoc DL = MI->getDebugLoc();
21056
21057 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21058 assert(MI->getOperand(3).isGlobal() && "This should be a global");
21059
21060 // Get a register mask for the lowered call.
21061 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21062 // proper register mask.
21063 const uint32_t *RegMask = F->getTarget()
21064 .getSubtargetImpl()
21065 ->getRegisterInfo()
21066 ->getCallPreservedMask(CallingConv::C);
21067 if (Subtarget->is64Bit()) {
21068 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21069 TII->get(X86::MOV64rm), X86::RDI)
21070 .addReg(X86::RIP)
21071 .addImm(0).addReg(0)
21072 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21073 MI->getOperand(3).getTargetFlags())
21074 .addReg(0);
21075 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21076 addDirectMem(MIB, X86::RDI);
21077 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21078 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21079 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21080 TII->get(X86::MOV32rm), X86::EAX)
21081 .addReg(0)
21082 .addImm(0).addReg(0)
21083 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21084 MI->getOperand(3).getTargetFlags())
21085 .addReg(0);
21086 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21087 addDirectMem(MIB, X86::EAX);
21088 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21089 } else {
21090 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21091 TII->get(X86::MOV32rm), X86::EAX)
21092 .addReg(TII->getGlobalBaseReg(F))
21093 .addImm(0).addReg(0)
21094 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21095 MI->getOperand(3).getTargetFlags())
21096 .addReg(0);
21097 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21098 addDirectMem(MIB, X86::EAX);
21099 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21100 }
21101
21102 MI->eraseFromParent(); // The pseudo instruction is gone now.
21103 return BB;
21104}
21105
21106MachineBasicBlock *
21107X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21108 MachineBasicBlock *MBB) const {
21109 DebugLoc DL = MI->getDebugLoc();
21110 MachineFunction *MF = MBB->getParent();
21111 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21112 MachineRegisterInfo &MRI = MF->getRegInfo();
21113
21114 const BasicBlock *BB = MBB->getBasicBlock();
21115 MachineFunction::iterator I = MBB;
21116 ++I;
21117
21118 // Memory Reference
21119 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21120 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21121
21122 unsigned DstReg;
21123 unsigned MemOpndSlot = 0;
21124
21125 unsigned CurOp = 0;
21126
21127 DstReg = MI->getOperand(CurOp++).getReg();
21128 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21129 assert(RC->hasType(MVT::i32) && "Invalid destination!");
21130 unsigned mainDstReg = MRI.createVirtualRegister(RC);
21131 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21132
21133 MemOpndSlot = CurOp;
21134
21135 MVT PVT = getPointerTy();
21136 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21137 "Invalid Pointer Size!");
21138
21139 // For v = setjmp(buf), we generate
21140 //
21141 // thisMBB:
21142 // buf[LabelOffset] = restoreMBB
21143 // SjLjSetup restoreMBB
21144 //
21145 // mainMBB:
21146 // v_main = 0
21147 //
21148 // sinkMBB:
21149 // v = phi(main, restore)
21150 //
21151 // restoreMBB:
21152 // if base pointer being used, load it from frame
21153 // v_restore = 1
21154
21155 MachineBasicBlock *thisMBB = MBB;
21156 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21157 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21158 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21159 MF->insert(I, mainMBB);
21160 MF->insert(I, sinkMBB);
21161 MF->push_back(restoreMBB);
21162
21163 MachineInstrBuilder MIB;
21164
21165 // Transfer the remainder of BB and its successor edges to sinkMBB.
21166 sinkMBB->splice(sinkMBB->begin(), MBB,
21167 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21168 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21169
21170 // thisMBB:
21171 unsigned PtrStoreOpc = 0;
21172 unsigned LabelReg = 0;
21173 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21174 Reloc::Model RM = MF->getTarget().getRelocationModel();
21175 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21176 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21177
21178 // Prepare IP either in reg or imm.
21179 if (!UseImmLabel) {
21180 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21181 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21182 LabelReg = MRI.createVirtualRegister(PtrRC);
21183 if (Subtarget->is64Bit()) {
21184 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21185 .addReg(X86::RIP)
21186 .addImm(0)
21187 .addReg(0)
21188 .addMBB(restoreMBB)
21189 .addReg(0);
21190 } else {
21191 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21192 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21193 .addReg(XII->getGlobalBaseReg(MF))
21194 .addImm(0)
21195 .addReg(0)
21196 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21197 .addReg(0);
21198 }
21199 } else
21200 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21201 // Store IP
21202 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21203 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21204 if (i == X86::AddrDisp)
21205 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21206 else
21207 MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21208 }
21209 if (!UseImmLabel)
21210 MIB.addReg(LabelReg);
21211 else
21212 MIB.addMBB(restoreMBB);
21213 MIB.setMemRefs(MMOBegin, MMOEnd);
21214 // Setup
21215 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21216 .addMBB(restoreMBB);
21217
21218 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21219 MF->getSubtarget().getRegisterInfo());
21220 MIB.addRegMask(RegInfo->getNoPreservedMask());
21221 thisMBB->addSuccessor(mainMBB);
21222 thisMBB->addSuccessor(restoreMBB);
21223
21224 // mainMBB:
21225 // EAX = 0
21226 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21227 mainMBB->addSuccessor(sinkMBB);
21228
21229 // sinkMBB:
21230 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21231 TII->get(X86::PHI), DstReg)
21232 .addReg(mainDstReg).addMBB(mainMBB)
21233 .addReg(restoreDstReg).addMBB(restoreMBB);
21234
21235 // restoreMBB:
21236 if (RegInfo->hasBasePointer(*MF)) {
21237 const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21238 const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21239 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21240 X86FI->setRestoreBasePointer(MF);
21241 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21242 unsigned BasePtr = RegInfo->getBaseRegister();
21243 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21244 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21245 FramePtr, true, X86FI->getRestoreBasePointerOffset())
21246 .setMIFlag(MachineInstr::FrameSetup);
21247 }
21248 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21249 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21250 restoreMBB->addSuccessor(sinkMBB);
21251
21252 MI->eraseFromParent();
21253 return sinkMBB;
21254}
21255
21256MachineBasicBlock *
21257X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21258 MachineBasicBlock *MBB) const {
21259 DebugLoc DL = MI->getDebugLoc();
21260 MachineFunction *MF = MBB->getParent();
21261 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21262 MachineRegisterInfo &MRI = MF->getRegInfo();
21263
21264 // Memory Reference
21265 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21266 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21267
21268 MVT PVT = getPointerTy();
21269 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21270 "Invalid Pointer Size!");
21271
21272 const TargetRegisterClass *RC =
21273 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21274 unsigned Tmp = MRI.createVirtualRegister(RC);
21275 // Since FP is only updated here but NOT referenced, it's treated as GPR.
21276 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21277 MF->getSubtarget().getRegisterInfo());
21278 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21279 unsigned SP = RegInfo->getStackRegister();
21280
21281 MachineInstrBuilder MIB;
21282
21283 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21284 const int64_t SPOffset = 2 * PVT.getStoreSize();
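// The jump buffer addressed by this pseudo's memory operands is read as
// pointer-sized slots: slot 0 holds the saved frame pointer, slot 1
// (LabelOffset) the resume address, and slot 2 (SPOffset) the saved stack
// pointer; the three loads below restore FP, IP and SP from those slots.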
21285
21286 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21287 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21288
21289 // Reload FP
21290 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21291 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21292 MIB.addOperand(MI->getOperand(i));
21293 MIB.setMemRefs(MMOBegin, MMOEnd);
21294 // Reload IP
21295 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21296 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21297 if (i == X86::AddrDisp)
21298 MIB.addDisp(MI->getOperand(i), LabelOffset);
21299 else
21300 MIB.addOperand(MI->getOperand(i));
21301 }
21302 MIB.setMemRefs(MMOBegin, MMOEnd);
21303 // Reload SP
21304 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21305 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21306 if (i == X86::AddrDisp)
21307 MIB.addDisp(MI->getOperand(i), SPOffset);
21308 else
21309 MIB.addOperand(MI->getOperand(i));
21310 }
21311 MIB.setMemRefs(MMOBegin, MMOEnd);
21312 // Jump
21313 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21314
21315 MI->eraseFromParent();
21316 return MBB;
21317}
21318
21319// Replace 213-type (isel default) FMA3 instructions with 231-type for
21320// accumulator loops. Writing back to the accumulator allows the coalescer
21321// to remove extra copies in the loop.
21322MachineBasicBlock *
21323X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21324 MachineBasicBlock *MBB) const {
21325 MachineOperand &AddendOp = MI->getOperand(3);
21326
21327 // Bail out early if the addend isn't a register - we can't switch these.
21328 if (!AddendOp.isReg())
21329 return MBB;
21330
21331 MachineFunction &MF = *MBB->getParent();
21332 MachineRegisterInfo &MRI = MF.getRegInfo();
21333
21334 // Check whether the addend is defined by a PHI:
21335 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21336 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21337 if (!AddendDef.isPHI())
21338 return MBB;
21339
21340 // Look for the following pattern:
21341 // loop:
21342 // %addend = phi [%entry, 0], [%loop, %result]
21343 // ...
21344 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21345
21346 // Replace with:
21347 // loop:
21348 // %addend = phi [%entry, 0], [%loop, %result]
21349 // ...
21350 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21351
21352 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21353 assert(AddendDef.getOperand(i).isReg());
21354 MachineOperand PHISrcOp = AddendDef.getOperand(i);
21355 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21356 if (&PHISrcInst == MI) {
21357 // Found a matching instruction.
21358 unsigned NewFMAOpc = 0;
21359 switch (MI->getOpcode()) {
21360 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21361 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21362 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21363 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21364 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21365 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21366 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21367 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21368 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21369 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21370 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21371 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21372 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21373 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21374 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21375 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21376 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21377 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21378 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21379 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21380
21381 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21382 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21383 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21384 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21385 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21386 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21387 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21388 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21389 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21390 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21391 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21392 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21393 default: llvm_unreachable("Unrecognized FMA variant.");
21394 }
21395
21396 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21397 MachineInstrBuilder MIB =
21398 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21399 .addOperand(MI->getOperand(0))
21400 .addOperand(MI->getOperand(3))
21401 .addOperand(MI->getOperand(2))
21402 .addOperand(MI->getOperand(1));
21403 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21404 MI->eraseFromParent();
21405 }
21406 }
21407
21408 return MBB;
21409}
21410
21411MachineBasicBlock *
21412X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21413 MachineBasicBlock *BB) const {
21414 switch (MI->getOpcode()) {
21415 default: llvm_unreachable("Unexpected instr type to insert");
21416 case X86::TAILJMPd64:
21417 case X86::TAILJMPr64:
21418 case X86::TAILJMPm64:
21419 case X86::TAILJMPd64_REX:
21420 case X86::TAILJMPr64_REX:
21421 case X86::TAILJMPm64_REX:
21422 llvm_unreachable("TAILJMP64 would not be touched here.");
21423 case X86::TCRETURNdi64:
21424 case X86::TCRETURNri64:
21425 case X86::TCRETURNmi64:
21426 return BB;
21427 case X86::WIN_ALLOCA:
21428 return EmitLoweredWinAlloca(MI, BB);
21429 case X86::SEG_ALLOCA_32:
21430 case X86::SEG_ALLOCA_64:
21431 return EmitLoweredSegAlloca(MI, BB);
21432 case X86::TLSCall_32:
21433 case X86::TLSCall_64:
21434 return EmitLoweredTLSCall(MI, BB);
21435 case X86::CMOV_GR8:
21436 case X86::CMOV_FR32:
21437 case X86::CMOV_FR64:
21438 case X86::CMOV_V4F32:
21439 case X86::CMOV_V2F64:
21440 case X86::CMOV_V2I64:
21441 case X86::CMOV_V8F32:
21442 case X86::CMOV_V4F64:
21443 case X86::CMOV_V4I64:
21444 case X86::CMOV_V16F32:
21445 case X86::CMOV_V8F64:
21446 case X86::CMOV_V8I64:
21447 case X86::CMOV_GR16:
21448 case X86::CMOV_GR32:
21449 case X86::CMOV_RFP32:
21450 case X86::CMOV_RFP64:
21451 case X86::CMOV_RFP80:
21452 return EmitLoweredSelect(MI, BB);
21453
21454 case X86::FP32_TO_INT16_IN_MEM:
21455 case X86::FP32_TO_INT32_IN_MEM:
21456 case X86::FP32_TO_INT64_IN_MEM:
21457 case X86::FP64_TO_INT16_IN_MEM:
21458 case X86::FP64_TO_INT32_IN_MEM:
21459 case X86::FP64_TO_INT64_IN_MEM:
21460 case X86::FP80_TO_INT16_IN_MEM:
21461 case X86::FP80_TO_INT32_IN_MEM:
21462 case X86::FP80_TO_INT64_IN_MEM: {
21463 MachineFunction *F = BB->getParent();
21464 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21465 DebugLoc DL = MI->getDebugLoc();
21466
21467 // Change the floating point control register to use "round towards zero"
21468 // mode when truncating to an integer value.
21469 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21470 addFrameReference(BuildMI(*BB, MI, DL,
21471 TII->get(X86::FNSTCW16m)), CWFrameIdx);
21472
21473 // Load the old value of the high byte of the control word...
21474 unsigned OldCW =
21475 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21476 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21477 CWFrameIdx);
21478
21479 // Set the high part to be round to zero...
21480 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21481 .addImm(0xC7F);
21482
21483 // Reload the modified control word now...
21484 addFrameReference(BuildMI(*BB, MI, DL,
21485 TII->get(X86::FLDCW16m)), CWFrameIdx);
21486
21487 // Restore the memory image of control word to original value
21488 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21489 .addReg(OldCW);
21490
21491 // Get the X86 opcode to use.
21492 unsigned Opc;
21493 switch (MI->getOpcode()) {
21494 default: llvm_unreachable("illegal opcode!");
21495 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21496 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21497 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21498 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21499 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21500 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21501 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21502 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21503 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21504 }
21505
21506 X86AddressMode AM;
21507 MachineOperand &Op = MI->getOperand(0);
21508 if (Op.isReg()) {
21509 AM.BaseType = X86AddressMode::RegBase;
21510 AM.Base.Reg = Op.getReg();
21511 } else {
21512 AM.BaseType = X86AddressMode::FrameIndexBase;
21513 AM.Base.FrameIndex = Op.getIndex();
21514 }
21515 Op = MI->getOperand(1);
21516 if (Op.isImm())
21517 AM.Scale = Op.getImm();
21518 Op = MI->getOperand(2);
21519 if (Op.isImm())
21520 AM.IndexReg = Op.getImm();
21521 Op = MI->getOperand(3);
21522 if (Op.isGlobal()) {
21523 AM.GV = Op.getGlobal();
21524 } else {
21525 AM.Disp = Op.getImm();
21526 }
21527 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21528 .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21529
21530 // Reload the original control word now.
21531 addFrameReference(BuildMI(*BB, MI, DL,
21532 TII->get(X86::FLDCW16m)), CWFrameIdx);
21533
21534 MI->eraseFromParent(); // The pseudo instruction is gone now.
21535 return BB;
21536 }
21537 // String/text processing lowering.
21538 case X86::PCMPISTRM128REG:
21539 case X86::VPCMPISTRM128REG:
21540 case X86::PCMPISTRM128MEM:
21541 case X86::VPCMPISTRM128MEM:
21542 case X86::PCMPESTRM128REG:
21543 case X86::VPCMPESTRM128REG:
21544 case X86::PCMPESTRM128MEM:
21545 case X86::VPCMPESTRM128MEM:
21546 assert(Subtarget->hasSSE42() &&
21547 "Target must have SSE4.2 or AVX features enabled");
21548 return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21549
21550 // String/text processing lowering.
21551 case X86::PCMPISTRIREG:
21552 case X86::VPCMPISTRIREG:
21553 case X86::PCMPISTRIMEM:
21554 case X86::VPCMPISTRIMEM:
21555 case X86::PCMPESTRIREG:
21556 case X86::VPCMPESTRIREG:
21557 case X86::PCMPESTRIMEM:
21558 case X86::VPCMPESTRIMEM:
21559 assert(Subtarget->hasSSE42() &&
21560 "Target must have SSE4.2 or AVX features enabled");
21561 return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21562
21563 // Thread synchronization.
21564 case X86::MONITOR:
21565 return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21566 Subtarget);
21567
21568 // xbegin
21569 case X86::XBEGIN:
21570 return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21571
21572 case X86::VASTART_SAVE_XMM_REGS:
21573 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21574
21575 case X86::VAARG_64:
21576 return EmitVAARG64WithCustomInserter(MI, BB);
21577
21578 case X86::EH_SjLj_SetJmp32:
21579 case X86::EH_SjLj_SetJmp64:
21580 return emitEHSjLjSetJmp(MI, BB);
21581
21582 case X86::EH_SjLj_LongJmp32:
21583 case X86::EH_SjLj_LongJmp64:
21584 return emitEHSjLjLongJmp(MI, BB);
21585
21586 case TargetOpcode::STATEPOINT:
21587 // As an implementation detail, STATEPOINT shares the STACKMAP format at
21588 // this point in the process. We diverge later.
21589 return emitPatchPoint(MI, BB);
21590
21591 case TargetOpcode::STACKMAP:
21592 case TargetOpcode::PATCHPOINT:
21593 return emitPatchPoint(MI, BB);
21594
21595 case X86::VFMADDPDr213r:
21596 case X86::VFMADDPSr213r:
21597 case X86::VFMADDSDr213r:
21598 case X86::VFMADDSSr213r:
21599 case X86::VFMSUBPDr213r:
21600 case X86::VFMSUBPSr213r:
21601 case X86::VFMSUBSDr213r:
21602 case X86::VFMSUBSSr213r:
21603 case X86::VFNMADDPDr213r:
21604 case X86::VFNMADDPSr213r:
21605 case X86::VFNMADDSDr213r:
21606 case X86::VFNMADDSSr213r:
21607 case X86::VFNMSUBPDr213r:
21608 case X86::VFNMSUBPSr213r:
21609 case X86::VFNMSUBSDr213r:
21610 case X86::VFNMSUBSSr213r:
21611 case X86::VFMADDSUBPDr213r:
21612 case X86::VFMADDSUBPSr213r:
21613 case X86::VFMSUBADDPDr213r:
21614 case X86::VFMSUBADDPSr213r:
21615 case X86::VFMADDPDr213rY:
21616 case X86::VFMADDPSr213rY:
21617 case X86::VFMSUBPDr213rY:
21618 case X86::VFMSUBPSr213rY:
21619 case X86::VFNMADDPDr213rY:
21620 case X86::VFNMADDPSr213rY:
21621 case X86::VFNMSUBPDr213rY:
21622 case X86::VFNMSUBPSr213rY:
21623 case X86::VFMADDSUBPDr213rY:
21624 case X86::VFMADDSUBPSr213rY:
21625 case X86::VFMSUBADDPDr213rY:
21626 case X86::VFMSUBADDPSr213rY:
21627 return emitFMA3Instr(MI, BB);
21628 }
21629}
21630
21631//===----------------------------------------------------------------------===//
21632// X86 Optimization Hooks
21633//===----------------------------------------------------------------------===//
21634
21635void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21636 APInt &KnownZero,
21637 APInt &KnownOne,
21638 const SelectionDAG &DAG,
21639 unsigned Depth) const {
21640 unsigned BitWidth = KnownZero.getBitWidth();
21641 unsigned Opc = Op.getOpcode();
21642 assert((Opc >= ISD::BUILTIN_OP_END ||
21643 Opc == ISD::INTRINSIC_WO_CHAIN ||
21644 Opc == ISD::INTRINSIC_W_CHAIN ||
21645 Opc == ISD::INTRINSIC_VOID) &&
21646 "Should use MaskedValueIsZero if you don't know whether Op"
21647 " is a target node!");
21648
21649 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
21650 switch (Opc) {
21651 default: break;
21652 case X86ISD::ADD:
21653 case X86ISD::SUB:
21654 case X86ISD::ADC:
21655 case X86ISD::SBB:
21656 case X86ISD::SMUL:
21657 case X86ISD::UMUL:
21658 case X86ISD::INC:
21659 case X86ISD::DEC:
21660 case X86ISD::OR:
21661 case X86ISD::XOR:
21662 case X86ISD::AND:
21663 // These nodes' second result is a boolean.
21664 if (Op.getResNo() == 0)
21665 break;
21666 // Fallthrough
21667 case X86ISD::SETCC:
21668 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21669 break;
21670 case ISD::INTRINSIC_WO_CHAIN: {
21671 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21672 unsigned NumLoBits = 0;
21673 switch (IntId) {
21674 default: break;
21675 case Intrinsic::x86_sse_movmsk_ps:
21676 case Intrinsic::x86_avx_movmsk_ps_256:
21677 case Intrinsic::x86_sse2_movmsk_pd:
21678 case Intrinsic::x86_avx_movmsk_pd_256:
21679 case Intrinsic::x86_mmx_pmovmskb:
21680 case Intrinsic::x86_sse2_pmovmskb_128:
21681 case Intrinsic::x86_avx2_pmovmskb: {
21682 // High bits of movmskp{s|d}, pmovmskb are known zero.
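// For example, movmskps on a v4f32 operand defines only the low four bits
// of its i32 result, so the upper 28 bits can be reported as known zero.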
21683 switch (IntId) {
21684 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21685 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
21686 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
21687 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
21688 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
21689 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
21690 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
21691 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
21692 }
21693 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21694 break;
21695 }
21696 }
21697 break;
21698 }
21699 }
21700}
21701
21702unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21703 SDValue Op,
21704 const SelectionDAG &,
21705 unsigned Depth) const {
21706 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21707 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21708 return Op.getValueType().getScalarType().getSizeInBits();
21709
21710 // Fallback case.
21711 return 1;
21712}
21713
21714/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21715/// node is a GlobalAddress + offset.
21716bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21717 const GlobalValue* &GA,
21718 int64_t &Offset) const {
21719 if (N->getOpcode() == X86ISD::Wrapper) {
21720 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21721 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21722 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21723 return true;
21724 }
21725 }
21726 return TargetLowering::isGAPlusOffset(N, GA, Offset);
21727}
21728
21729/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21730 /// same as extracting the high 128-bit part of a 256-bit vector and then
21731 /// inserting the result into the low part of a new 256-bit vector.
21732static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21733 EVT VT = SVOp->getValueType(0);
21734 unsigned NumElems = VT.getVectorNumElements();
21735
21736 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21737 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21738 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21739 SVOp->getMaskElt(j) >= 0)
21740 return false;
21741
21742 return true;
21743}
21744
21745/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21746 /// same as extracting the low 128-bit part of a 256-bit vector and then
21747 /// inserting the result into the high part of a new 256-bit vector.
21748static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21749 EVT VT = SVOp->getValueType(0);
21750 unsigned NumElems = VT.getVectorNumElements();
21751
21752 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21753 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21754 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21755 SVOp->getMaskElt(j) >= 0)
21756 return false;
21757
21758 return true;
21759}
21760
21761/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21762static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21763 TargetLowering::DAGCombinerInfo &DCI,
21764 const X86Subtarget* Subtarget) {
21765 SDLoc dl(N);
21766 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21767 SDValue V1 = SVOp->getOperand(0);
21768 SDValue V2 = SVOp->getOperand(1);
21769 EVT VT = SVOp->getValueType(0);
21770 unsigned NumElems = VT.getVectorNumElements();
21771
21772 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21773 V2.getOpcode() == ISD::CONCAT_VECTORS) {
21774 //
21775 // 0,0,0,...
21776 // |
21777 // V UNDEF BUILD_VECTOR UNDEF
21778 // \ / \ /
21779 // CONCAT_VECTOR CONCAT_VECTOR
21780 // \ /
21781 // \ /
21782 // RESULT: V + zero extended
21783 //
21784 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21785 V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21786 V1.getOperand(1).getOpcode() != ISD::UNDEF)
21787 return SDValue();
21788
21789 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21790 return SDValue();
21791
21792 // To match the shuffle mask, the first half of the mask should
21793 // be exactly the first vector, and all the rest a splat with the
21794 // first element of the second one.
21795 for (unsigned i = 0; i != NumElems/2; ++i)
21796 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21797 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21798 return SDValue();
21799
21800 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21801 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21802 if (Ld->hasNUsesOfValue(1, 0)) {
21803 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21804 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21805 SDValue ResNode =
21806 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21807 Ld->getMemoryVT(),
21808 Ld->getPointerInfo(),
21809 Ld->getAlignment(),
21810 false/*isVolatile*/, true/*ReadMem*/,
21811 false/*WriteMem*/);
21812
21813 // Make sure the newly-created LOAD is in the same position as Ld in
21814 // terms of dependency. We create a TokenFactor for Ld and ResNode,
21815 // and update uses of Ld's output chain to use the TokenFactor.
21816 if (Ld->hasAnyUseOfValue(1)) {
21817 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21818 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21819 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21820 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21821 SDValue(ResNode.getNode(), 1));
21822 }
21823
21824 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21825 }
21826 }
21827
21828 // Emit a zeroed vector and insert the desired subvector on its
21829 // first half.
21830 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21831 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21832 return DCI.CombineTo(N, InsV);
21833 }
21834
21835 //===--------------------------------------------------------------------===//
21836 // Combine some shuffles into subvector extracts and inserts:
21837 //
21838
21839 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21840 if (isShuffleHigh128VectorInsertLow(SVOp)) {
21841 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21842 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21843 return DCI.CombineTo(N, InsV);
21844 }
21845
21846 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21847 if (isShuffleLow128VectorInsertHigh(SVOp)) {
21848 SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21849 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21850 return DCI.CombineTo(N, InsV);
21851 }
21852
21853 return SDValue();
21854}
21855
21856/// \brief Combine an arbitrary chain of shuffles into a single instruction if
21857/// possible.
21858///
21859 /// This is the leaf of the recursive combine below. When we have found some
21860/// chain of single-use x86 shuffle instructions and accumulated the combined
21861/// shuffle mask represented by them, this will try to pattern match that mask
21862/// into either a single instruction if there is a special purpose instruction
21863/// for this operation, or into a PSHUFB instruction which is a fully general
21864/// instruction but should only be used to replace chains over a certain depth.
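///
/// For example, a float-domain mask of <0, 0> is matched to MOVDDUP (with
/// SSE3) or MOVLHPS, and <0, 0, 2, 2> to MOVSLDUP (SSE3 only), before deeper
/// chains fall back to the generic PSHUFB path.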
21865static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21866 int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21867 TargetLowering::DAGCombinerInfo &DCI,
21868 const X86Subtarget *Subtarget) {
21869 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
21870
21871 // Find the operand that enters the chain. Note that multiple uses are OK
21872 // here, we're not going to remove the operand we find.
21873 SDValue Input = Op.getOperand(0);
21874 while (Input.getOpcode() == ISD::BITCAST)
21875 Input = Input.getOperand(0);
21876
21877 MVT VT = Input.getSimpleValueType();
21878 MVT RootVT = Root.getSimpleValueType();
21879 SDLoc DL(Root);
21880
21881 // Just remove no-op shuffle masks.
21882 if (Mask.size() == 1) {
21883 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
21884 /*AddTo*/ true);
21885 return true;
21886 }
21887
21888 // Use the float domain if the operand type is a floating point type.
21889 bool FloatDomain = VT.isFloatingPoint();
21890
21891 // For floating point shuffles, we don't have free copies in the shuffle
21892 // instructions or the ability to load as part of the instruction, so
21893 // canonicalize their shuffles to UNPCK or MOV variants.
21894 //
21895 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
21896 // vectors because it can have a load folded into it that UNPCK cannot. This
21897 // doesn't preclude something switching to the shorter encoding post-RA.
21898 if (FloatDomain) {
21899 if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
21900 bool Lo = Mask.equals(0, 0);
21901 unsigned Shuffle;
21902 MVT ShuffleVT;
21903 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
21904 // is no slower than UNPCKLPD but has the option to fold the input operand
21905 // into even an unaligned memory load.
21906 if (Lo && Subtarget->hasSSE3()) {
21907 Shuffle = X86ISD::MOVDDUP;
21908 ShuffleVT = MVT::v2f64;
21909 } else {
21910 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
21911 // than the UNPCK variants.
21912 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
21913 ShuffleVT = MVT::v4f32;
21914 }
21915 if (Depth == 1 && Root->getOpcode() == Shuffle)
21916 return false; // Nothing to do!
21917 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21918 DCI.AddToWorklist(Op.getNode());
21919 if (Shuffle == X86ISD::MOVDDUP)
21920 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21921 else
21922 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21923 DCI.AddToWorklist(Op.getNode());
21924 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21925 /*AddTo*/ true);
21926 return true;
21927 }
21928 if (Subtarget->hasSSE3() &&
21929 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
21930 bool Lo = Mask.equals(0, 0, 2, 2);
21931 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
21932 MVT ShuffleVT = MVT::v4f32;
21933 if (Depth == 1 && Root->getOpcode() == Shuffle)
21934 return false; // Nothing to do!
21935 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21936 DCI.AddToWorklist(Op.getNode());
21937 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21938 DCI.AddToWorklist(Op.getNode());
21939 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21940 /*AddTo*/ true);
21941 return true;
21942 }
21943 if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
21944 bool Lo = Mask.equals(0, 0, 1, 1);
21945 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21946 MVT ShuffleVT = MVT::v4f32;
21947 if (Depth == 1 && Root->getOpcode() == Shuffle)
21948 return false; // Nothing to do!
21949 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21950 DCI.AddToWorklist(Op.getNode());
21951 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21952 DCI.AddToWorklist(Op.getNode());
21953 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21954 /*AddTo*/ true);
21955 return true;
21956 }
21957 }
21958
21959 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
21960 // variants as none of these have single-instruction variants that are
21961 // superior to the UNPCK formulation.
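// For example, the v8i16 mask <0, 0, 1, 1, 2, 2, 3, 3> becomes a single
// UNPCKL of the input with itself, and <4, 4, 5, 5, 6, 6, 7, 7> the
// corresponding UNPCKH.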
21962 if (!FloatDomain &&
21963 (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
21964 Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
21965 Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
21966 Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
21967 15))) {
21968 bool Lo = Mask[0] == 0;
21969 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21970 if (Depth == 1 && Root->getOpcode() == Shuffle)
21971 return false; // Nothing to do!
21972 MVT ShuffleVT;
21973 switch (Mask.size()) {
21974 case 8:
21975 ShuffleVT = MVT::v8i16;
21976 break;
21977 case 16:
21978 ShuffleVT = MVT::v16i8;
21979 break;
21980 default:
21981 llvm_unreachable("Impossible mask size!");
21982 };
21983 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21984 DCI.AddToWorklist(Op.getNode());
21985 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21986 DCI.AddToWorklist(Op.getNode());
21987 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21988 /*AddTo*/ true);
21989 return true;
21990 }
21991
21992 // Don't try to re-form single instruction chains under any circumstances now
21993 // that we've done encoding canonicalization for them.
21994 if (Depth < 2)
21995 return false;
21996
21997 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
21998 // can replace them with a single PSHUFB instruction profitably. Intel's
21999 // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
22000 // in practice PSHUFB tends to be *very* fast so we're more aggressive.
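// As a sketch of the mask expansion below: a v4i32 mask <1, 0, 3, 2> has
// Ratio = 16/4 = 4 and becomes the byte mask
// <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>, with 255 (a zeroing index for
// PSHUFB) substituted for any SM_SentinelZero element.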
22001 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22002 SmallVector<SDValue, 16> PSHUFBMask;
22003 assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22004 int Ratio = 16 / Mask.size();
22005 for (unsigned i = 0; i < 16; ++i) {
22006 if (Mask[i / Ratio] == SM_SentinelUndef) {
22007 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22008 continue;
22009 }
22010 int M = Mask[i / Ratio] != SM_SentinelZero
22011 ? Ratio * Mask[i / Ratio] + i % Ratio
22012 : 255;
22013 PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22014 }
22015 Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22016 DCI.AddToWorklist(Op.getNode());
22017 SDValue PSHUFBMaskOp =
22018 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22019 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22020 Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22021 DCI.AddToWorklist(Op.getNode());
22022 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22023 /*AddTo*/ true);
22024 return true;
22025 }
22026
22027 // Failed to find any combines.
22028 return false;
22029}
22030
22031/// \brief Fully generic combining of x86 shuffle instructions.
22032///
22033/// This should be the last combine run over the x86 shuffle instructions. Once
22034/// they have been fully optimized, this will recursively consider all chains
22035/// of single-use shuffle instructions, build a generic model of the cumulative
22036/// shuffle operation, and check for simpler instructions which implement this
22037/// operation. We use this primarily for two purposes:
22038///
22039/// 1) Collapse generic shuffles to specialized single instructions when
22040/// equivalent. In most cases, this is just an encoding size win, but
22041/// sometimes we will collapse multiple generic shuffles into a single
22042/// special-purpose shuffle.
22043/// 2) Look for sequences of shuffle instructions with 3 or more total
22044/// instructions, and replace them with the slightly more expensive SSSE3
22045/// PSHUFB instruction if available. We do this as the last combining step
22046/// to ensure we avoid using PSHUFB if we can implement the shuffle with
22047 /// a suitable short sequence of other instructions. The PSHUFB will either
22048/// use a register or have to read from memory and so is slightly (but only
22049/// slightly) more expensive than the other shuffle instructions.
22050///
22051/// Because this is inherently a quadratic operation (for each shuffle in
22052/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22053/// This should never be an issue in practice as the shuffle lowering doesn't
22054/// produce sequences of more than 8 instructions.
22055///
22056/// FIXME: We will currently miss some cases where the redundant shuffling
22057/// would simplify under the threshold for PSHUFB formation because of
22058/// combine-ordering. To fix this, we should do the redundant instruction
22059/// combining in this recursive walk.
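///
/// For example, two back-to-back "PSHUFD <2,3,0,1>" nodes can accumulate to
/// the identity mask, in which case the whole chain is simply replaced by its
/// original input, while longer chains of word/byte shuffles may instead fold
/// into a single PSHUFB when SSSE3 is available.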
22060static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22061 ArrayRef<int> RootMask,
22062 int Depth, bool HasPSHUFB,
22063 SelectionDAG &DAG,
22064 TargetLowering::DAGCombinerInfo &DCI,
22065 const X86Subtarget *Subtarget) {
22066 // Bound the depth of our recursive combine because this is ultimately
22067 // quadratic in nature.
22068 if (Depth > 8)
22069 return false;
22070
22071 // Directly rip through bitcasts to find the underlying operand.
22072 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22073 Op = Op.getOperand(0);
22074
22075 MVT VT = Op.getSimpleValueType();
22076 if (!VT.isVector())
22077 return false; // Bail if we hit a non-vector.
22078 // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22079 // version should be added.
22080 if (VT.getSizeInBits() != 128)
22081 return false;
22082
22083 assert(Root.getSimpleValueType().isVector() &&
22084 "Shuffles operate on vector types!");
22085 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22086 "Can only combine shuffles of the same vector register size.");
22087
22088 if (!isTargetShuffle(Op.getOpcode()))
22089 return false;
22090 SmallVector<int, 16> OpMask;
22091 bool IsUnary;
22092 bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22093 // We only can combine unary shuffles which we can decode the mask for.
22094 if (!HaveMask || !IsUnary)
22095 return false;
22096
22097 assert(VT.getVectorNumElements() == OpMask.size() &&
22098 "Different mask size from vector size!");
22099 assert(((RootMask.size() > OpMask.size() &&
22100 RootMask.size() % OpMask.size() == 0) ||
22101 (OpMask.size() > RootMask.size() &&
22102 OpMask.size() % RootMask.size() == 0) ||
22103 OpMask.size() == RootMask.size()) &&
22104 "The smaller number of elements must divide the larger.");
22105 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22106 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22107 assert(((RootRatio == 1 && OpRatio == 1) ||
22108 (RootRatio == 1) != (OpRatio == 1)) &&
22109 "Must not have a ratio for both incoming and op masks!");
22110
22111 SmallVector<int, 16> Mask;
22112 Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22113
22114 // Merge this shuffle operation's mask into our accumulated mask. Note that
22115 // this shuffle's mask will be the first applied to the input, followed by the
22116 // root mask to get us all the way to the root value arrangement. The reason
22117 // for this order is that we are recursing up the operation chain.
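// For instance, with RootMask = <1, 0> (so RootRatio = 2) over an operand
// whose OpMask = <2, 3, 0, 1> (OpRatio = 1), element i = 0 maps through
// RootMaskedIdx = 1*2+0 = 2 to OpMask[2] = 0, and the full composition is the
// identity <0, 1, 2, 3>: the two half-swapping shuffles cancel out.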
22118 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22119 int RootIdx = i / RootRatio;
22120 if (RootMask[RootIdx] < 0) {
22121 // This is a zero or undef lane, we're done.
22122 Mask.push_back(RootMask[RootIdx]);
22123 continue;
22124 }
22125
22126 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22127 int OpIdx = RootMaskedIdx / OpRatio;
22128 if (OpMask[OpIdx] < 0) {
22129 // The incoming lanes are zero or undef, it doesn't matter which ones we
22130 // are using.
22131 Mask.push_back(OpMask[OpIdx]);
22132 continue;
22133 }
22134
22135 // Ok, we have non-zero lanes, map them through.
22136 Mask.push_back(OpMask[OpIdx] * OpRatio +
22137 RootMaskedIdx % OpRatio);
22138 }
22139
22140 // See if we can recurse into the operand to combine more things.
22141 switch (Op.getOpcode()) {
22142 case X86ISD::PSHUFB:
22143 HasPSHUFB = true;
22144 case X86ISD::PSHUFD:
22145 case X86ISD::PSHUFHW:
22146 case X86ISD::PSHUFLW:
22147 if (Op.getOperand(0).hasOneUse() &&
22148 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22149 HasPSHUFB, DAG, DCI, Subtarget))
22150 return true;
22151 break;
22152
22153 case X86ISD::UNPCKL:
22154 case X86ISD::UNPCKH:
22155 assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22156 // We can't check for single use; we have to check that this shuffle is the only user.
22157 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22158 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22159 HasPSHUFB, DAG, DCI, Subtarget))
22160 return true;
22161 break;
22162 }
22163
22164 // Minor canonicalization of the accumulated shuffle mask to make it easier
22165 // to match below. All this does is detect masks with sequential pairs of
22166 // elements, and shrink them to the half-width mask. It does this in a loop
22167 // so it will reduce the size of the mask to the minimal width mask which
22168 // performs an equivalent shuffle.
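// For instance, <2, 3, 0, 1> shrinks to the half-width mask <1, 0>, and an
// identity mask keeps halving until only a single element remains.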
22169 SmallVector<int, 16> WidenedMask;
22170 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22171 Mask = std::move(WidenedMask);
22172 WidenedMask.clear();
22173 }
22174
22175 return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22176 Subtarget);
22177}
22178
22179/// \brief Get the PSHUF-style mask from PSHUF node.
22180///
22181 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
22182/// PSHUF-style masks that can be reused with such instructions.
22183static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22184 SmallVector<int, 4> Mask;
22185 bool IsUnary;
22186 bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22187 (void)HaveMask;
22188 assert(HaveMask);
22189
22190 switch (N.getOpcode()) {
22191 case X86ISD::PSHUFD:
22192 return Mask;
22193 case X86ISD::PSHUFLW:
22194 Mask.resize(4);
22195 return Mask;
22196 case X86ISD::PSHUFHW:
22197 Mask.erase(Mask.begin(), Mask.begin() + 4);
22198 for (int &M : Mask)
22199 M -= 4;
22200 return Mask;
22201 default:
22202 llvm_unreachable("No valid shuffle instruction found!");
22203 }
22204}
22205
22206/// \brief Search for a combinable shuffle across a chain ending in pshufd.
22207///
22208/// We walk up the chain and look for a combinable shuffle, skipping over
22209/// shuffles that we could hoist this shuffle's transformation past without
22210/// altering anything.
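///
/// For example, when the single-use chain below this PSHUFD leads to another
/// dword shuffle, the two masks are composed into that earlier shuffle and any
/// skipped half-word shuffles are rebuilt on top of the result.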
22211static SDValue
22212combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22213 SelectionDAG &DAG,
22214 TargetLowering::DAGCombinerInfo &DCI) {
22215 assert(N.getOpcode() == X86ISD::PSHUFD &&
22216 "Called with something other than an x86 128-bit half shuffle!");
22217 SDLoc DL(N);
22218
22219 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22220 // of the shuffles in the chain so that we can form a fresh chain to replace
22221 // this one.
22222 SmallVector<SDValue, 8> Chain;
22223 SDValue V = N.getOperand(0);
22224 for (; V.hasOneUse(); V = V.getOperand(0)) {
22225 switch (V.getOpcode()) {
22226 default:
22227 return SDValue(); // Nothing combined!
22228
22229 case ISD::BITCAST:
22230 // Skip bitcasts as we always know the type for the target specific
22231 // instructions.
22232 continue;
22233
22234 case X86ISD::PSHUFD:
22235 // Found another dword shuffle.
22236 break;
22237
22238 case X86ISD::PSHUFLW:
22239 // Check that the low words (being shuffled) are the identity in the
22240 // dword shuffle, and the high words are self-contained.
22241 if (Mask[0] != 0 || Mask[1] != 1 ||
22242 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22243 return SDValue();
22244
22245 Chain.push_back(V);
22246 continue;
22247
22248 case X86ISD::PSHUFHW:
22249 // Check that the high words (being shuffled) are the identity in the
22250 // dword shuffle, and the low words are self-contained.
22251 if (Mask[2] != 2 || Mask[3] != 3 ||
22252 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22253 return SDValue();
22254
22255 Chain.push_back(V);
22256 continue;
22257
22258 case X86ISD::UNPCKL:
22259 case X86ISD::UNPCKH:
22260 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22261 // shuffle into a preceding word shuffle.
22262 if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22263 return SDValue();
22264
22265 // Search for a half-shuffle which we can combine with.
22266 unsigned CombineOp =
22267 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22268 if (V.getOperand(0) != V.getOperand(1) ||
22269 !V->isOnlyUserOf(V.getOperand(0).getNode()))
22270 return SDValue();
22271 Chain.push_back(V);
22272 V = V.getOperand(0);
22273 do {
22274 switch (V.getOpcode()) {
22275 default:
22276 return SDValue(); // Nothing to combine.
22277
22278 case X86ISD::PSHUFLW:
22279 case X86ISD::PSHUFHW:
22280 if (V.getOpcode() == CombineOp)
22281 break;
22282
22283 Chain.push_back(V);
22284
22285 // Fallthrough!
22286 case ISD::BITCAST:
22287 V = V.getOperand(0);
22288 continue;
22289 }
22290 break;
22291 } while (V.hasOneUse());
22292 break;
22293 }
22294 // Break out of the loop if we break out of the switch.
22295 break;
22296 }
22297
22298 if (!V.hasOneUse())
22299 // We fell out of the loop without finding a viable combining instruction.
22300 return SDValue();
22301
22302 // Merge this node's mask and our incoming mask.
22303 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22304 for (int &M : Mask)
22305 M = VMask[M];
22306 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22307 getV4X86ShuffleImm8ForMask(Mask, DAG));
22308
22309 // Rebuild the chain around this new shuffle.
22310 while (!Chain.empty()) {
22311 SDValue W = Chain.pop_back_val();
22312
22313 if (V.getValueType() != W.getOperand(0).getValueType())
22314 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22315
22316 switch (W.getOpcode()) {
22317 default:
22318 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22319
22320 case X86ISD::UNPCKL:
22321 case X86ISD::UNPCKH:
22322 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22323 break;
22324
22325 case X86ISD::PSHUFD:
22326 case X86ISD::PSHUFLW:
22327 case X86ISD::PSHUFHW:
22328 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22329 break;
22330 }
22331 }
22332 if (V.getValueType() != N.getValueType())
22333 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22334
22335 // Return the new chain to replace N.
22336 return V;
22337}
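The mask merge in combineRedundantDWordShuffle above (the M = VMask[M] loop) is plain composition of two 4-lane shuffle masks. A minimal standalone sketch, not from the analyzed file and with hypothetical mask values:

#include <array>
#include <cstdio>

int main() {
  // Inner shuffle V: result lane i is read from input lane VMask[i].
  std::array<int, 4> VMask = {2, 3, 0, 1};
  // Outer shuffle N applied to V's result: lane i is read from lane NMask[i].
  std::array<int, 4> NMask = {1, 0, 3, 2};
  // The single equivalent shuffle reads input lane VMask[NMask[i]],
  // which is exactly what the "M = VMask[M]" loop computes in place.
  for (int i = 0; i < 4; ++i)
    std::printf("composed[%d] = %d\n", i, VMask[NMask[i]]);
  return 0;
}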
22338
22339/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22340///
22341/// We walk up the chain, skipping shuffles of the other half and looking
22342/// through shuffles which switch halves trying to find a shuffle of the same
22343/// pair of dwords.
22344static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22345 SelectionDAG &DAG,
22346 TargetLowering::DAGCombinerInfo &DCI) {
22347 assert(
22348 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22349 "Called with something other than an x86 128-bit half shuffle!");
22350 SDLoc DL(N);
22351 unsigned CombineOpcode = N.getOpcode();
22352
22353 // Walk up a single-use chain looking for a combinable shuffle.
22354 SDValue V = N.getOperand(0);
22355 for (; V.hasOneUse(); V = V.getOperand(0)) {
22356 switch (V.getOpcode()) {
22357 default:
22358 return false; // Nothing combined!
22359
22360 case ISD::BITCAST:
22361 // Skip bitcasts as we always know the type for the target specific
22362 // instructions.
22363 continue;
22364
22365 case X86ISD::PSHUFLW:
22366 case X86ISD::PSHUFHW:
22367 if (V.getOpcode() == CombineOpcode)
22368 break;
22369
22370 // Other-half shuffles are no-ops.
22371 continue;
22372 }
22373 // Break out of the loop if we break out of the switch.
22374 break;
22375 }
22376
22377 if (!V.hasOneUse())
22378 // We fell out of the loop without finding a viable combining instruction.
22379 return false;
22380
22381 // Combine away the bottom node as its shuffle will be accumulated into
22382 // a preceding shuffle.
22383 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22384
22385 // Record the old value.
22386 SDValue Old = V;
22387
22388 // Merge this node's mask and our incoming mask (adjusted to account for all
22389 // the pshufd instructions encountered).
22390 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22391 for (int &M : Mask)
22392 M = VMask[M];
22393 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22394 getV4X86ShuffleImm8ForMask(Mask, DAG));
22395
22396 // Check that the shuffles didn't cancel each other out. If not, we need to
22397 // combine to the new one.
22398 if (Old != V)
22399 // Replace the combinable shuffle with the combined one, updating all users
22400 // so that we re-evaluate the chain here.
22401 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22402
22403 return true;
22404}
22405
22406/// \brief Try to combine x86 target specific shuffles.
22407static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22408 TargetLowering::DAGCombinerInfo &DCI,
22409 const X86Subtarget *Subtarget) {
22410 SDLoc DL(N);
22411 MVT VT = N.getSimpleValueType();
22412 SmallVector<int, 4> Mask;
22413
22414 switch (N.getOpcode()) {
22415 case X86ISD::PSHUFD:
22416 case X86ISD::PSHUFLW:
22417 case X86ISD::PSHUFHW:
22418 Mask = getPSHUFShuffleMask(N);
22419 assert(Mask.size() == 4);
22420 break;
22421 default:
22422 return SDValue();
22423 }
22424
22425 // Nuke no-op shuffles that show up after combining.
22426 if (isNoopShuffleMask(Mask))
22427 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22428
22429 // Look for simplifications involving one or two shuffle instructions.
22430 SDValue V = N.getOperand(0);
22431 switch (N.getOpcode()) {
22432 default:
22433 break;
22434 case X86ISD::PSHUFLW:
22435 case X86ISD::PSHUFHW:
22436 assert(VT == MVT::v8i16);
22437 (void)VT;
22438
22439 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22440 return SDValue(); // We combined away this shuffle, so we're done.
22441
22442 // See if this reduces to a PSHUFD which is no more expensive and can
22443 // combine with more operations. Note that it has to at least flip the
22444 // dwords as otherwise it would have been removed as a no-op.
22445 if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22446 int DMask[] = {0, 1, 2, 3};
22447 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22448 DMask[DOffset + 0] = DOffset + 1;
22449 DMask[DOffset + 1] = DOffset + 0;
22450 V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22451 DCI.AddToWorklist(V.getNode());
22452 V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22453 getV4X86ShuffleImm8ForMask(DMask, DAG));
22454 DCI.AddToWorklist(V.getNode());
22455 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22456 }
22457
22458 // Look for shuffle patterns which can be implemented as a single unpack.
22459 // FIXME: This doesn't handle the location of the PSHUFD generically, and
22460 // only works when we have a PSHUFD followed by two half-shuffles.
22461 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22462 (V.getOpcode() == X86ISD::PSHUFLW ||
22463 V.getOpcode() == X86ISD::PSHUFHW) &&
22464 V.getOpcode() != N.getOpcode() &&
22465 V.hasOneUse()) {
22466 SDValue D = V.getOperand(0);
22467 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22468 D = D.getOperand(0);
22469 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22470 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22471 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22472 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22473 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22474 int WordMask[8];
22475 for (int i = 0; i < 4; ++i) {
22476 WordMask[i + NOffset] = Mask[i] + NOffset;
22477 WordMask[i + VOffset] = VMask[i] + VOffset;
22478 }
22479 // Map the word mask through the DWord mask.
22480 int MappedMask[8];
22481 for (int i = 0; i < 8; ++i)
22482 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22483 const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22484 const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22485 if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22486 std::begin(UnpackLoMask)) ||
22487 std::equal(std::begin(MappedMask), std::end(MappedMask),
22488 std::begin(UnpackHiMask))) {
22489 // We can replace all three shuffles with an unpack.
22490 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22491 DCI.AddToWorklist(V.getNode());
22492 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22493 : X86ISD::UNPCKH,
22494 DL, MVT::v8i16, V, V);
22495 }
22496 }
22497 }
22498
22499 break;
22500
22501 case X86ISD::PSHUFD:
22502 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22503 return NewN;
22504
22505 break;
22506 }
22507
22508 return SDValue();
22509}
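The single-unpack recognition in PerformTargetShuffleCombine relies on mapping an 8-lane word mask through a 4-lane dword mask: a dword shuffle moves words in pairs, so output word i of a PSHUFD reads input word 2*DMask[i/2] + i%2. A minimal standalone sketch of that composition, with hypothetical mask values (not taken from the file):

#include <cstdio>

int main() {
  const int DMask[4] = {1, 0, 2, 3};                // hypothetical PSHUFD mask
  const int WordMask[8] = {0, 0, 1, 1, 4, 5, 6, 7}; // hypothetical word mask over its result
  // Composition used above: MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2.
  for (int i = 0; i < 8; ++i)
    std::printf("MappedMask[%d] = %d\n", i,
                2 * DMask[WordMask[i] / 2] + WordMask[i] % 2);
  // If the mapped mask comes out as {0,0,1,1,2,2,3,3} or {4,4,5,5,6,6,7,7},
  // the whole chain behaves like a single UNPCKL/UNPCKH of the original input.
  return 0;
}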
22510
22511/// \brief Try to combine a shuffle into a target-specific add-sub node.
22512///
22513/// We combine this directly on the abstract vector shuffle nodes so it is
22514/// easier to generically match. We also insert dummy vector shuffle nodes for
22515/// the operands which explicitly discard the lanes which are unused by this
22516/// operation to try to flow through the rest of the combiner the fact that
22517/// they're unused.
22518static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22519 SDLoc DL(N);
22520 EVT VT = N->getValueType(0);
22521
22522 // We only handle target-independent shuffles.
22523 // FIXME: It would be easy and harmless to use the target shuffle mask
22524 // extraction tool to support more.
22525 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22526 return SDValue();
22527
22528 auto *SVN = cast<ShuffleVectorSDNode>(N);
22529 ArrayRef<int> Mask = SVN->getMask();
22530 SDValue V1 = N->getOperand(0);
22531 SDValue V2 = N->getOperand(1);
22532
22533 // We require the first shuffle operand to be the SUB node, and the second to
22534 // be the ADD node.
22535 // FIXME: We should support the commuted patterns.
22536 if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22537 return SDValue();
22538
22539 // If there are other uses of these operations we can't fold them.
22540 if (!V1->hasOneUse() || !V2->hasOneUse())
22541 return SDValue();
22542
22543 // Ensure that both operations have the same operands. Note that we can
22544 // commute the FADD operands.
22545 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22546 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22547 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22548 return SDValue();
22549
22550 // We're looking for blends between FADD and FSUB nodes. We insist on these
22551 // nodes being lined up in a specific expected pattern.
22552 if (!(isShuffleEquivalent(Mask, 0, 3) ||
22553 isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22554 isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22555 return SDValue();
22556
22557 // Only specific types are legal at this point, assert so we notice if and
22558 // when these change.
22559 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22560 VT == MVT::v4f64) &&
22561 "Unknown vector type encountered!");
22562
22563 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22564}
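The shuffle masks checked in combineShuffleToAddSub above encode exactly the ADDSUB lane pattern: for v4f32, mask <0,5,2,7> takes the FSUB result in even lanes and the FADD result in odd lanes. A scalar standalone sketch (hypothetical values, not from the file):

#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> Sub, Add, Blend;
  for (int i = 0; i < 4; ++i) {
    Sub[i] = A[i] - B[i]; // first shuffle operand (lanes 0..3): the FSUB node
    Add[i] = A[i] + B[i]; // second shuffle operand (lanes 4..7): the FADD node
  }
  const int Mask[4] = {0, 5, 2, 7}; // the v4f32 pattern tested above
  for (int i = 0; i < 4; ++i)
    Blend[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
  // Blend = {A0-B0, A1+B1, A2-B2, A3+B3}: subtract in even lanes, add in odd
  // lanes, which is the ADDSUB (addsubps) behaviour.
  for (float F : Blend)
    std::printf("%g ", F);
  std::printf("\n");
  return 0;
}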
22565
22566/// PerformShuffleCombine - Performs several different shuffle combines.
22567static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22568 TargetLowering::DAGCombinerInfo &DCI,
22569 const X86Subtarget *Subtarget) {
22570 SDLoc dl(N);
22571 SDValue N0 = N->getOperand(0);
22572 SDValue N1 = N->getOperand(1);
22573 EVT VT = N->getValueType(0);
22574
22575 // Don't create instructions with illegal types after legalize types has run.
22576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22577 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
1. Taking false branch
22578 return SDValue();
22579
22580 // If we have legalized the vector types, look for blends of FADD and FSUB
22581 // nodes that we can fuse into an ADDSUB node.
22582 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22583 if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22584 return AddSub;
22585
22586 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22587 if (Subtarget->hasFp256() && VT.is256BitVector() &&
22588 N->getOpcode() == ISD::VECTOR_SHUFFLE)
22589 return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22590
22591 // During Type Legalization, when promoting illegal vector types,
22592 // the backend might introduce new shuffle dag nodes and bitcasts.
22593 //
22594 // This code performs the following transformation:
22595 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22596 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22597 //
22598 // We do this only if both the bitcast and the BINOP dag nodes have
22599 // one use. Also, perform this transformation only if the new binary
22600 // operation is legal. This is to avoid introducing dag nodes that
22601 // potentially need to be further expanded (or custom lowered) into a
22602 // less optimal sequence of dag nodes.
22603 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22604 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22605 N0.getOpcode() == ISD::BITCAST) {
22606 SDValue BC0 = N0.getOperand(0);
22607 EVT SVT = BC0.getValueType();
22608 unsigned Opcode = BC0.getOpcode();
22609 unsigned NumElts = VT.getVectorNumElements();
22610
22611 if (BC0.hasOneUse() && SVT.isVector() &&
22612 SVT.getVectorNumElements() * 2 == NumElts &&
22613 TLI.isOperationLegal(Opcode, VT)) {
22614 bool CanFold = false;
22615 switch (Opcode) {
22616 default : break;
22617 case ISD::ADD :
22618 case ISD::FADD :
22619 case ISD::SUB :
22620 case ISD::FSUB :
22621 case ISD::MUL :
22622 case ISD::FMUL :
22623 CanFold = true;
22624 }
22625
22626 unsigned SVTNumElts = SVT.getVectorNumElements();
22627 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22628 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22629 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22630 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22631 CanFold = SVOp->getMaskElt(i) < 0;
22632
22633 if (CanFold) {
22634 SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22635 SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22636 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22637 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22638 }
22639 }
22640 }
22641
22642 // Only handle 128-bit wide vectors from here on.
22643 if (!VT.is128BitVector())
2. Taking false branch
22644 return SDValue();
22645
22646 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22647 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22648 // consecutive, non-overlapping, and in the right order.
22649 SmallVector<SDValue, 16> Elts;
22650 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
3. Assuming 'i' is equal to 'e'
4. Loop condition is false. Execution continues on line 22653
22651 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22652
22653 SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
5. Calling 'EltsFromConsecutiveLoads'
22654 if (LD.getNode())
22655 return LD;
22656
22657 if (isTargetShuffle(N->getOpcode())) {
22658 SDValue Shuffle =
22659 PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22660 if (Shuffle.getNode())
22661 return Shuffle;
22662
22663 // Try recursively combining arbitrary sequences of x86 shuffle
22664 // instructions into higher-order shuffles. We do this after combining
22665 // specific PSHUF instruction sequences into their minimal form so that we
22666 // can evaluate how many specialized shuffle instructions are involved in
22667 // a particular chain.
22668 SmallVector<int, 1> NonceMask; // Just a placeholder.
22669 NonceMask.push_back(0);
22670 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22671 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22672 DCI, Subtarget))
22673 return SDValue(); // This routine will use CombineTo to replace N.
22674 }
22675
22676 return SDValue();
22677}
22678
22679/// PerformTruncateCombine - Converts a truncate operation into
22680/// a sequence of vector shuffle operations.
22681/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
22682static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22683 TargetLowering::DAGCombinerInfo &DCI,
22684 const X86Subtarget *Subtarget) {
22685 return SDValue();
22686}
22687
22688/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22689/// specific shuffle of a load can be folded into a single element load.
22690/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22691/// shuffles have been custom lowered so we need to handle those here.
22692static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22693 TargetLowering::DAGCombinerInfo &DCI) {
22694 if (DCI.isBeforeLegalizeOps())
22695 return SDValue();
22696
22697 SDValue InVec = N->getOperand(0);
22698 SDValue EltNo = N->getOperand(1);
22699
22700 if (!isa<ConstantSDNode>(EltNo))
22701 return SDValue();
22702
22703 EVT OriginalVT = InVec.getValueType();
22704
22705 if (InVec.getOpcode() == ISD::BITCAST) {
22706 // Don't duplicate a load with other uses.
22707 if (!InVec.hasOneUse())
22708 return SDValue();
22709 EVT BCVT = InVec.getOperand(0).getValueType();
22710 if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22711 return SDValue();
22712 InVec = InVec.getOperand(0);
22713 }
22714
22715 EVT CurrentVT = InVec.getValueType();
22716
22717 if (!isTargetShuffle(InVec.getOpcode()))
22718 return SDValue();
22719
22720 // Don't duplicate a load with other uses.
22721 if (!InVec.hasOneUse())
22722 return SDValue();
22723
22724 SmallVector<int, 16> ShuffleMask;
22725 bool UnaryShuffle;
22726 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22727 ShuffleMask, UnaryShuffle))
22728 return SDValue();
22729
22730 // Select the input vector, guarding against an out-of-range extract index.
22731 unsigned NumElems = CurrentVT.getVectorNumElements();
22732 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22733 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22734 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22735 : InVec.getOperand(1);
22736
22737 // If inputs to shuffle are the same for both ops, then allow 2 uses
22738 unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22739 InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22740
22741 if (LdNode.getOpcode() == ISD::BITCAST) {
22742 // Don't duplicate a load with other uses.
22743 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22744 return SDValue();
22745
22746 AllowedUses = 1; // only allow 1 load use if we have a bitcast
22747 LdNode = LdNode.getOperand(0);
22748 }
22749
22750 if (!ISD::isNormalLoad(LdNode.getNode()))
22751 return SDValue();
22752
22753 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22754
22755 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22756 return SDValue();
22757
22758 EVT EltVT = N->getValueType(0);
22759 // If there's a bitcast before the shuffle, check if the load type and
22760 // alignment is valid.
22761 unsigned Align = LN0->getAlignment();
22762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22763 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22764 EltVT.getTypeForEVT(*DAG.getContext()));
22765
22766 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22767 return SDValue();
22768
22769 // All checks match so transform back to vector_shuffle so that DAG combiner
22770 // can finish the job
22771 SDLoc dl(N);
22772
22773 // Create a shuffle node, taking into account the case that it's a unary shuffle.
22774 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22775 : InVec.getOperand(1);
22776 Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22777 InVec.getOperand(0), Shuffle,
22778 &ShuffleMask[0]);
22779 Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22780 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22781 EltNo);
22782}
22783
22784/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22785/// generation and convert it from being a bunch of shuffles and extracts
22786/// into a somewhat faster sequence. For i686, the best sequence is apparently
22787/// storing the value and loading scalars back, while for x64 we should
22788/// use 64-bit extracts and shifts.
22789static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22790 TargetLowering::DAGCombinerInfo &DCI) {
22791 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22792 if (NewOp.getNode())
22793 return NewOp;
22794
22795 SDValue InputVector = N->getOperand(0);
22796
22797 // Detect whether we are trying to convert from mmx to i32 and the bitcast
22798 // from mmx to v2i32 has a single usage.
22799 if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22800 InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22801 InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22802 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22803 N->getValueType(0),
22804 InputVector.getNode()->getOperand(0));
22805
22806 // Only operate on vectors of 4 elements, where the alternative shuffling
22807 // gets to be more expensive.
22808 if (InputVector.getValueType() != MVT::v4i32)
22809 return SDValue();
22810
22811 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
22812 // single use which is a sign-extend or zero-extend, and all elements are
22813 // used.
22814 SmallVector<SDNode *, 4> Uses;
22815 unsigned ExtractedElements = 0;
22816 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22817 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22818 if (UI.getUse().getResNo() != InputVector.getResNo())
22819 return SDValue();
22820
22821 SDNode *Extract = *UI;
22822 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22823 return SDValue();
22824
22825 if (Extract->getValueType(0) != MVT::i32)
22826 return SDValue();
22827 if (!Extract->hasOneUse())
22828 return SDValue();
22829 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22830 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22831 return SDValue();
22832 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22833 return SDValue();
22834
22835 // Record which element was extracted.
22836 ExtractedElements |=
22837 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22838
22839 Uses.push_back(Extract);
22840 }
22841
22842 // If not all the elements were used, this may not be worthwhile.
22843 if (ExtractedElements != 15)
22844 return SDValue();
22845
22846 // Ok, we've now decided to do the transformation.
22847 // If 64-bit shifts are legal, use the extract-shift sequence,
22848 // otherwise bounce the vector off the cache.
22849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22850 SDValue Vals[4];
22851 SDLoc dl(InputVector);
22852
22853 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22854 SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22855 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22856 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22857 DAG.getConstant(0, VecIdxTy));
22858 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22859 DAG.getConstant(1, VecIdxTy));
22860
22861 SDValue ShAmt = DAG.getConstant(32,
22862 DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22863 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22864 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22865 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22866 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22867 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22868 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
22869 } else {
22870 // Store the value to a temporary stack slot.
22871 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
22872 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
22873 MachinePointerInfo(), false, false, 0);
22874
22875 EVT ElementType = InputVector.getValueType().getVectorElementType();
22876 unsigned EltSize = ElementType.getSizeInBits() / 8;
22877
22878 // Replace each use (extract) with a load of the appropriate element.
22879 for (unsigned i = 0; i < 4; ++i) {
22880 uint64_t Offset = EltSize * i;
22881 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
22882
22883 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
22884 StackPtr, OffsetVal);
22885
22886 // Load the scalar.
22887 Vals[i] = DAG.getLoad(ElementType, dl, Ch,
22888 ScalarAddr, MachinePointerInfo(),
22889 false, false, false, 0);
22890
22891 }
22892 }
22893
22894 // Replace the extracts
22895 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
22896 UE = Uses.end(); UI != UE; ++UI) {
22897 SDNode *Extract = *UI;
22898
22899 SDValue Idx = Extract->getOperand(1);
22900 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
22901 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
22902 }
22903
22904 // The replacement was made in place; don't return anything.
22905 return SDValue();
22906}
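The "extract-shift" path in PerformEXTRACT_VECTOR_ELTCombine recovers the four i32 lanes of a v4i32 from the two i64 halves of its bitcast using a 32-bit shift and a truncate. A standalone scalar sketch of the same bit arithmetic; it assumes a little-endian host, as on x86, and is not code from the analyzed file:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint32_t Lanes[4] = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
  uint64_t Halves[2];
  std::memcpy(Halves, Lanes, sizeof Lanes);      // the BITCAST to v2i64
  for (int i = 0; i < 4; ++i) {
    uint64_t Half = Halves[i / 2];               // EXTRACT_VECTOR_ELT of an i64 half
    // Same low 32 bits as the SRA-by-32 + TRUNCATE combination used above.
    uint32_t Val = (uint32_t)(i % 2 ? Half >> 32 : Half);
    std::printf("lane %d = 0x%08x\n", i, Val);
  }
  return 0;
}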
22907
22908/// \brief Matches a VSELECT onto min/max, or returns 0 if the node doesn't match.
22909static std::pair<unsigned, bool>
22910matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
22911 SelectionDAG &DAG, const X86Subtarget *Subtarget) {
22912 if (!VT.isVector())
22913 return std::make_pair(0, false);
22914
22915 bool NeedSplit = false;
22916 switch (VT.getSimpleVT().SimpleTy) {
22917 default: return std::make_pair(0, false);
22918 case MVT::v4i64:
22919 case MVT::v2i64:
22920 if (!Subtarget->hasVLX())
22921 return std::make_pair(0, false);
22922 break;
22923 case MVT::v64i8:
22924 case MVT::v32i16:
22925 if (!Subtarget->hasBWI())
22926 return std::make_pair(0, false);
22927 break;
22928 case MVT::v16i32:
22929 case MVT::v8i64:
22930 if (!Subtarget->hasAVX512())
22931 return std::make_pair(0, false);
22932 break;
22933 case MVT::v32i8:
22934 case MVT::v16i16:
22935 case MVT::v8i32:
22936 if (!Subtarget->hasAVX2())
22937 NeedSplit = true;
22938 if (!Subtarget->hasAVX())
22939 return std::make_pair(0, false);
22940 break;
22941 case MVT::v16i8:
22942 case MVT::v8i16:
22943 case MVT::v4i32:
22944 if (!Subtarget->hasSSE2())
22945 return std::make_pair(0, false);
22946 }
22947
22948 // SSE2 has only a small subset of the operations.
22949 bool hasUnsigned = Subtarget->hasSSE41() ||
22950 (Subtarget->hasSSE2() && VT == MVT::v16i8);
22951 bool hasSigned = Subtarget->hasSSE41() ||
22952 (Subtarget->hasSSE2() && VT == MVT::v8i16);
22953
22954 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
22955
22956 unsigned Opc = 0;
22957 // Check for x CC y ? x : y.
22958 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
22959 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
22960 switch (CC) {
22961 default: break;
22962 case ISD::SETULT:
22963 case ISD::SETULE:
22964 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22965 case ISD::SETUGT:
22966 case ISD::SETUGE:
22967 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22968 case ISD::SETLT:
22969 case ISD::SETLE:
22970 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22971 case ISD::SETGT:
22972 case ISD::SETGE:
22973 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22974 }
22975 // Check for x CC y ? y : x -- a min/max with reversed arms.
22976 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
22977 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
22978 switch (CC) {
22979 default: break;
22980 case ISD::SETULT:
22981 case ISD::SETULE:
22982 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22983 case ISD::SETUGT:
22984 case ISD::SETUGE:
22985 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22986 case ISD::SETLT:
22987 case ISD::SETLE:
22988 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22989 case ISD::SETGT:
22990 case ISD::SETGE:
22991 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22992 }
22993 }
22994
22995 return std::make_pair(Opc, NeedSplit);
22996}
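Per lane, the select patterns matchIntegerMINMAX recognizes are the usual conditional-move spellings of min and max; reversing the select arms flips min into max and vice versa. A scalar sketch with hypothetical values:

#include <algorithm>
#include <cassert>

int main() {
  int x = -3, y = 7;
  assert((x < y ? x : y) == std::min(x, y)); // SETLT, arms (x, y)   -> SMIN
  assert((x > y ? x : y) == std::max(x, y)); // SETGT, arms (x, y)   -> SMAX
  assert((x < y ? y : x) == std::max(x, y)); // SETLT, reversed arms -> SMAX
  return 0;
}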
22997
22998static SDValue
22999transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23000 const X86Subtarget *Subtarget) {
23001 SDLoc dl(N);
23002 SDValue Cond = N->getOperand(0);
23003 SDValue LHS = N->getOperand(1);
23004 SDValue RHS = N->getOperand(2);
23005
23006 if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23007 SDValue CondSrc = Cond->getOperand(0);
23008 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23009 Cond = CondSrc->getOperand(0);
23010 }
23011
23012 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23013 return SDValue();
23014
23015 // A vselect where all conditions and data are constants can be optimized into
23016 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23017 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23018 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23019 return SDValue();
23020
23021 unsigned MaskValue = 0;
23022 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23023 return SDValue();
23024
23025 MVT VT = N->getSimpleValueType(0);
23026 unsigned NumElems = VT.getVectorNumElements();
23027 SmallVector<int, 8> ShuffleMask(NumElems, -1);
23028 for (unsigned i = 0; i < NumElems; ++i) {
23029 // Be sure we emit undef where we can.
23030 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23031 ShuffleMask[i] = -1;
23032 else
23033 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23034 }
23035
23036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23037 if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23038 return SDValue();
23039 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23040}
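transformVSELECTtoBlendVECTOR_SHUFFLE turns a bit-per-lane blend mask into a two-operand shuffle mask, where indices of NumElems or more select from the second operand. A standalone sketch with a hypothetical mask value:

#include <cstdio>

int main() {
  const unsigned NumElems = 4;
  const unsigned MaskValue = 0x5; // bits 0 and 2 set
  for (unsigned i = 0; i < NumElems; ++i)
    std::printf("ShuffleMask[%u] = %u\n", i,
                i + NumElems * ((MaskValue >> i) & 1));
  // Prints 4, 1, 6, 3: lanes 0 and 2 come from the second operand
  // (indices 4..7), lanes 1 and 3 from the first (indices 0..3).
  return 0;
}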
23041
23042/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23043/// nodes.
23044static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23045 TargetLowering::DAGCombinerInfo &DCI,
23046 const X86Subtarget *Subtarget) {
23047 SDLoc DL(N);
23048 SDValue Cond = N->getOperand(0);
23049 // Get the LHS/RHS of the select.
23050 SDValue LHS = N->getOperand(1);
23051 SDValue RHS = N->getOperand(2);
23052 EVT VT = LHS.getValueType();
23053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23054
23055 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23056 // instructions match the semantics of the common C idiom x<y?x:y but not
23057 // x<=y?x:y, because of how they handle negative zero (which can be
23058 // ignored in unsafe-math mode).
23059 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23060 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23061 VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23062 (Subtarget->hasSSE2() ||
23063 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23064 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23065
23066 unsigned Opcode = 0;
23067 // Check for x CC y ? x : y.
23068 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23069 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23070 switch (CC) {
23071 default: break;
23072 case ISD::SETULT:
23073 // Converting this to a min would handle NaNs incorrectly, and swapping
23074 // the operands would cause it to handle comparisons between positive
23075 // and negative zero incorrectly.
23076 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23077 if (!DAG.getTarget().Options.UnsafeFPMath &&
23078 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23079 break;
23080 std::swap(LHS, RHS);
23081 }
23082 Opcode = X86ISD::FMIN;
23083 break;
23084 case ISD::SETOLE:
23085 // Converting this to a min would handle comparisons between positive
23086 // and negative zero incorrectly.
23087 if (!DAG.getTarget().Options.UnsafeFPMath &&
23088 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23089 break;
23090 Opcode = X86ISD::FMIN;
23091 break;
23092 case ISD::SETULE:
23093 // Converting this to a min would handle both negative zeros and NaNs
23094 // incorrectly, but we can swap the operands to fix both.
23095 std::swap(LHS, RHS);
23096 case ISD::SETOLT:
23097 case ISD::SETLT:
23098 case ISD::SETLE:
23099 Opcode = X86ISD::FMIN;
23100 break;
23101
23102 case ISD::SETOGE:
23103 // Converting this to a max would handle comparisons between positive
23104 // and negative zero incorrectly.
23105 if (!DAG.getTarget().Options.UnsafeFPMath &&
23106 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23107 break;
23108 Opcode = X86ISD::FMAX;
23109 break;
23110 case ISD::SETUGT:
23111 // Converting this to a max would handle NaNs incorrectly, and swapping
23112 // the operands would cause it to handle comparisons between positive
23113 // and negative zero incorrectly.
23114 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23115 if (!DAG.getTarget().Options.UnsafeFPMath &&
23116 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23117 break;
23118 std::swap(LHS, RHS);
23119 }
23120 Opcode = X86ISD::FMAX;
23121 break;
23122 case ISD::SETUGE:
23123 // Converting this to a max would handle both negative zeros and NaNs
23124 // incorrectly, but we can swap the operands to fix both.
23125 std::swap(LHS, RHS);
23126 case ISD::SETOGT:
23127 case ISD::SETGT:
23128 case ISD::SETGE:
23129 Opcode = X86ISD::FMAX;
23130 break;
23131 }
23132 // Check for x CC y ? y : x -- a min/max with reversed arms.
23133 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23134 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23135 switch (CC) {
23136 default: break;
23137 case ISD::SETOGE:
23138 // Converting this to a min would handle comparisons between positive
23139 // and negative zero incorrectly, and swapping the operands would
23140 // cause it to handle NaNs incorrectly.
23141 if (!DAG.getTarget().Options.UnsafeFPMath &&
23142 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23143 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23144 break;
23145 std::swap(LHS, RHS);
23146 }
23147 Opcode = X86ISD::FMIN;
23148 break;
23149 case ISD::SETUGT:
23150 // Converting this to a min would handle NaNs incorrectly.
23151 if (!DAG.getTarget().Options.UnsafeFPMath &&
23152 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23153 break;
23154 Opcode = X86ISD::FMIN;
23155 break;
23156 case ISD::SETUGE:
23157 // Converting this to a min would handle both negative zeros and NaNs
23158 // incorrectly, but we can swap the operands to fix both.
23159 std::swap(LHS, RHS);
23160 case ISD::SETOGT:
23161 case ISD::SETGT:
23162 case ISD::SETGE:
23163 Opcode = X86ISD::FMIN;
23164 break;
23165
23166 case ISD::SETULT:
23167 // Converting this to a max would handle NaNs incorrectly.
23168 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23169 break;
23170 Opcode = X86ISD::FMAX;
23171 break;
23172 case ISD::SETOLE:
23173 // Converting this to a max would handle comparisons between positive
23174 // and negative zero incorrectly, and swapping the operands would
23175 // cause it to handle NaNs incorrectly.
23176 if (!DAG.getTarget().Options.UnsafeFPMath &&
23177 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23178 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23179 break;
23180 std::swap(LHS, RHS);
23181 }
23182 Opcode = X86ISD::FMAX;
23183 break;
23184 case ISD::SETULE:
23185 // Converting this to a max would handle both negative zeros and NaNs
23186 // incorrectly, but we can swap the operands to fix both.
23187 std::swap(LHS, RHS);
23188 case ISD::SETOLT:
23189 case ISD::SETLT:
23190 case ISD::SETLE:
23191 Opcode = X86ISD::FMAX;
23192 break;
23193 }
23194 }
23195
23196 if (Opcode)
23197 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23198 }
23199
23200 EVT CondVT = Cond.getValueType();
23201 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23202 CondVT.getVectorElementType() == MVT::i1) {
23203 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23204 // lowering on KNL. In this case we convert it to
23205 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23206 // The same situation for all 128 and 256-bit vectors of i8 and i16.
23207 // Since SKX these selects have a proper lowering.
23208 EVT OpVT = LHS.getValueType();
23209 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23210 (OpVT.getVectorElementType() == MVT::i8 ||
23211 OpVT.getVectorElementType() == MVT::i16) &&
23212 !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23213 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23214 DCI.AddToWorklist(Cond.getNode());
23215 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23216 }
23217 }
23218 // If this is a select between two integer constants, try to do some
23219 // optimizations.
23220 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23221 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23222 // Don't do this for crazy integer types.
23223 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23224 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23225 // so that TrueC (the true value) is larger than FalseC.
23226 bool NeedsCondInvert = false;
23227
23228 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23229 // Efficiently invertible.
23230 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
23231 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
23232 isa<ConstantSDNode>(Cond.getOperand(1))))) {
23233 NeedsCondInvert = true;
23234 std::swap(TrueC, FalseC);
23235 }
23236
23237 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
23238 if (FalseC->getAPIntValue() == 0 &&
23239 TrueC->getAPIntValue().isPowerOf2()) {
23240 if (NeedsCondInvert) // Invert the condition if needed.
23241 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23242 DAG.getConstant(1, Cond.getValueType()));
23243
23244 // Zero extend the condition if needed.
23245 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23246
23247 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23248 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23249 DAG.getConstant(ShAmt, MVT::i8));
23250 }
23251
23252 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
23253 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23254 if (NeedsCondInvert) // Invert the condition if needed.
23255 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23256 DAG.getConstant(1, Cond.getValueType()));
23257
23258 // Zero extend the condition if needed.
23259 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23260 FalseC->getValueType(0), Cond);
23261 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23262 SDValue(FalseC, 0));
23263 }
23264
23265 // Optimize cases that will turn into an LEA instruction. This requires
23266 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23267 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23268 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23269 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23270
23271 bool isFastMultiplier = false;
23272 if (Diff < 10) {
23273 switch ((unsigned char)Diff) {
23274 default: break;
23275 case 1: // result = add base, cond
23276 case 2: // result = lea base( , cond*2)
23277 case 3: // result = lea base(cond, cond*2)
23278 case 4: // result = lea base( , cond*4)
23279 case 5: // result = lea base(cond, cond*4)
23280 case 8: // result = lea base( , cond*8)
23281 case 9: // result = lea base(cond, cond*8)
23282 isFastMultiplier = true;
23283 break;
23284 }
23285 }
23286
23287 if (isFastMultiplier) {
23288 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23289 if (NeedsCondInvert) // Invert the condition if needed.
23290 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23291 DAG.getConstant(1, Cond.getValueType()));
23292
23293 // Zero extend the condition if needed.
23294 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23295 Cond);
23296 // Scale the condition by the difference.
23297 if (Diff != 1)
23298 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23299 DAG.getConstant(Diff, Cond.getValueType()));
23300
23301 // Add the base if non-zero.
23302 if (FalseC->getAPIntValue() != 0)
23303 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23304 SDValue(FalseC, 0));
23305 return Cond;
23306 }
23307 }
23308 }
23309 }
23310
23311 // Canonicalize max and min:
23312 // (x > y) ? x : y -> (x >= y) ? x : y
23313 // (x < y) ? x : y -> (x <= y) ? x : y
23314 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23315 // the need for an extra compare
23316 // against zero. e.g.
23317 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23318 // subl %esi, %edi
23319 // testl %edi, %edi
23320 // movl $0, %eax
23321 // cmovgl %edi, %eax
23322 // =>
23323 // xorl %eax, %eax
23324 // subl %esi, $edi
23325 // cmovsl %eax, %edi
23326 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23327 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23328 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23329 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23330 switch (CC) {
23331 default: break;
23332 case ISD::SETLT:
23333 case ISD::SETGT: {
23334 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23335 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23336 Cond.getOperand(0), Cond.getOperand(1), NewCC);
23337 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23338 }
23339 }
23340 }
23341
23342 // Early exit check
23343 if (!TLI.isTypeLegal(VT))
23344 return SDValue();
23345
23346 // Match VSELECTs into subs with unsigned saturation.
23347 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23348 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23349 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23350 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23351 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23352
23353 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23354 // left side invert the predicate to simplify logic below.
23355 SDValue Other;
23356 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23357 Other = RHS;
23358 CC = ISD::getSetCCInverse(CC, true);
23359 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23360 Other = LHS;
23361 }
23362
23363 if (Other.getNode() && Other->getNumOperands() == 2 &&
23364 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23365 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23366 SDValue CondRHS = Cond->getOperand(1);
23367
23368 // Look for a general sub with unsigned saturation first.
23369 // x >= y ? x-y : 0 --> subus x, y
23370 // x > y ? x-y : 0 --> subus x, y
23371 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23372 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23373 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23374
23375 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23376 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23377 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23378 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23379 // If the RHS is a constant we have to reverse the const
23380 // canonicalization.
23381 // x > C-1 ? x+-C : 0 --> subus x, C
23382 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23383 CondRHSConst->getAPIntValue() ==
23384 (-OpRHSConst->getAPIntValue() - 1))
23385 return DAG.getNode(
23386 X86ISD::SUBUS, DL, VT, OpLHS,
23387 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23388
23389 // Another special case: If C was a sign bit, the sub has been
23390 // canonicalized into a xor.
23391 // FIXME: Would it be better to use computeKnownBits to determine
23392 // whether it's safe to decanonicalize the xor?
23393 // x s< 0 ? x^C : 0 --> subus x, C
23394 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23395 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23396 OpRHSConst->getAPIntValue().isSignBit())
23397 // Note that we have to rebuild the RHS constant here to ensure we
23398 // don't rely on particular values of undef lanes.
23399 return DAG.getNode(
23400 X86ISD::SUBUS, DL, VT, OpLHS,
23401 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23402 }
23403 }
23404 }
23405
23406 // Try to match a min/max vector operation.
23407 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23408 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23409 unsigned Opc = ret.first;
23410 bool NeedSplit = ret.second;
23411
23412 if (Opc && NeedSplit) {
23413 unsigned NumElems = VT.getVectorNumElements();
23414 // Extract the LHS vectors
23415 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23416 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23417
23418 // Extract the RHS vectors
23419 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23420 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23421
23422 // Create min/max for each subvector
23423 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23424 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23425
23426 // Merge the result
23427 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23428 } else if (Opc)
23429 return DAG.getNode(Opc, DL, VT, LHS, RHS);
23430 }
23431
23432 // Simplify vector selection if condition value type matches vselect
23433 // operand type
23434 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23435 assert(Cond.getValueType().isVector() &&
23436 "vector select expects a vector selector!");
23437
23438 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23439 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23440
23441 // Try to invert the condition if the true value is not all 1s and the
23442 // false value is not all 0s.
23443 if (!TValIsAllOnes && !FValIsAllZeros &&
23444 // Check if the selector will be produced by CMPP*/PCMP*
23445 Cond.getOpcode() == ISD::SETCC &&
23446 // Check if SETCC has already been promoted
23447 TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23448 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23449 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23450
23451 if (TValIsAllZeros || FValIsAllOnes) {
23452 SDValue CC = Cond.getOperand(2);
23453 ISD::CondCode NewCC =
23454 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23455 Cond.getOperand(0).getValueType().isInteger());
23456 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23457 std::swap(LHS, RHS);
23458 TValIsAllOnes = FValIsAllOnes;
23459 FValIsAllZeros = TValIsAllZeros;
23460 }
23461 }
23462
23463 if (TValIsAllOnes || FValIsAllZeros) {
23464 SDValue Ret;
23465
23466 if (TValIsAllOnes && FValIsAllZeros)
23467 Ret = Cond;
23468 else if (TValIsAllOnes)
23469 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23470 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23471 else if (FValIsAllZeros)
23472 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23473 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23474
23475 return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23476 }
23477 }
23478
23479 // If we know that this node is legal then we know that it is going to be
23480 // matched by one of the SSE/AVX BLEND instructions. These instructions only
23481 // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23482 // to simplify previous instructions.
23483 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23484 !DCI.isBeforeLegalize() &&
23485 // We explicitly check against v8i16 and v16i16 because, although
23486 // they're marked as Custom, they might only be legal when Cond is a
23487 // build_vector of constants. This will be taken care in a later
23488 // condition.
23489 (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23490 VT != MVT::v8i16) &&
23491 // Don't optimize vector of constants. Those are handled by
23492 // the generic code and all the bits must be properly set for
23493 // the generic optimizer.
23494 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23495 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23496
23497 // Don't optimize vector selects that map to mask-registers.
23498 if (BitWidth == 1)
23499 return SDValue();
23500
23501 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23502 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23503
23504 APInt KnownZero, KnownOne;
23505 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23506 DCI.isBeforeLegalizeOps());
23507 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23508 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23509 TLO)) {
23510 // If we changed the computation somewhere in the DAG, this change
23511 // will affect all users of Cond.
23512 // Make sure it is fine and update all the nodes so that we do not
23513 // use the generic VSELECT anymore. Otherwise, we may perform
23514 // wrong optimizations as we messed up with the actual expectation
23515 // for the vector boolean values.
23516 if (Cond != TLO.Old) {
23517 // Check all uses of that condition operand to check whether it will be
23518 // consumed by non-BLEND instructions, which may depend on all bits being
23519 // set properly.
23520 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23521 I != E; ++I)
23522 if (I->getOpcode() != ISD::VSELECT)
23523 // TODO: Add other opcodes eventually lowered into BLEND.
23524 return SDValue();
23525
23526 // Update all the users of the condition, before committing the change,
23527 // so that the VSELECT optimizations that expect the correct vector
23528 // boolean value will not be triggered.
23529 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23530 I != E; ++I)
23531 DAG.ReplaceAllUsesOfValueWith(
23532 SDValue(*I, 0),
23533 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23534 Cond, I->getOperand(1), I->getOperand(2)));
23535 DCI.CommitTargetLoweringOpt(TLO);
23536 return SDValue();
23537 }
23538 // At this point, only Cond is changed. Change the condition
23539 // just for N to keep the opportunity to optimize all other
23540 // users their own way.
23541 DAG.ReplaceAllUsesOfValueWith(
23542 SDValue(N, 0),
23543 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23544 TLO.New, N->getOperand(1), N->getOperand(2)));
23545 return SDValue();
23546 }
23547 }
23548
23549 // We should generate an X86ISD::BLENDI from a vselect if its argument
23550 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23551 // constants. This specific pattern gets generated when we split a
23552 // selector for a 512 bit vector in a machine without AVX512 (but with
23553 // 256-bit vectors), during legalization:
23554 //
23555 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23556 //
23557 // Iff we find this pattern and the build_vectors are built from
23558 // constants, we translate the vselect into a shuffle_vector that we
23559 // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23560 if ((N->getOpcode() == ISD::VSELECT ||
23561 N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23562 !DCI.isBeforeLegalize()) {
23563 SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23564 if (Shuffle.getNode())
23565 return Shuffle;
23566 }
23567
23568 return SDValue();
23569}
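The select-of-constants folds inside PerformSELECTCombine rest on simple arithmetic identities over a 0/1 condition: a power-of-two/zero select is a shift of the zero-extended condition, adjacent constants reduce to an add, and a small "fast multiplier" difference plus a base maps onto an LEA-style multiply-add. A scalar sketch with hypothetical constants:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t c = 0; c <= 1; ++c) {
    assert((c ? 8u : 0u) == (c << 3));     // C ? 8 : 0   -> zext(C) << 3
    assert((c ? 6u : 5u) == (c + 5));      // cst+1 : cst -> zext(C) + cst
    assert((c ? 13u : 4u) == (c * 9 + 4)); // diff 9 is a fast multiplier: cond*9 + base -> LEA
  }
  return 0;
}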
23570
23571// Check whether a boolean test is testing a boolean value generated by
23572// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23573// code.
23574//
23575// Simplify the following patterns:
23576// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23577// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23578// to (Op EFLAGS Cond)
23579//
23580// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23581// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23582// to (Op EFLAGS !Cond)
23583//
23584// where Op could be BRCOND or CMOV.
23585//
23586static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23587 // Quit unless this is a CMP, or a SUB whose value result is unused.
23588 if (Cmp.getOpcode() != X86ISD::CMP &&
23589 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23590 return SDValue();
23591
23592 // Quit if not used as a boolean value.
23593 if (CC != X86::COND_E && CC != X86::COND_NE)
23594 return SDValue();
23595
23596 // Check CMP operands. One of them should be 0 or 1 and the other should be
23597 // a SetCC or extended from it.
23598 SDValue Op1 = Cmp.getOperand(0);
23599 SDValue Op2 = Cmp.getOperand(1);
23600
23601 SDValue SetCC;
23602 const ConstantSDNode* C = nullptr;
23603 bool needOppositeCond = (CC == X86::COND_E);
23604 bool checkAgainstTrue = false; // Is it a comparison against 1?
23605
23606 if ((C = dyn_cast<ConstantSDNode>(Op1)))
23607 SetCC = Op2;
23608 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23609 SetCC = Op1;
23610 else // Quit if neither operand is a constant.
23611 return SDValue();
23612
23613 if (C->getZExtValue() == 1) {
23614 needOppositeCond = !needOppositeCond;
23615 checkAgainstTrue = true;
23616 } else if (C->getZExtValue() != 0)
23617 // Quit if the constant is neither 0 nor 1.
23618 return SDValue();
23619
23620 bool truncatedToBoolWithAnd = false;
23621 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23622 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23623 SetCC.getOpcode() == ISD::TRUNCATE ||
23624 SetCC.getOpcode() == ISD::AND) {
23625 if (SetCC.getOpcode() == ISD::AND) {
23626 int OpIdx = -1;
23627 ConstantSDNode *CS;
23628 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23629 CS->getZExtValue() == 1)
23630 OpIdx = 1;
23631 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23632 CS->getZExtValue() == 1)
23633 OpIdx = 0;
23634 if (OpIdx == -1)
23635 break;
23636 SetCC = SetCC.getOperand(OpIdx);
23637 truncatedToBoolWithAnd = true;
23638 } else
23639 SetCC = SetCC.getOperand(0);
23640 }
23641
23642 switch (SetCC.getOpcode()) {
23643 case X86ISD::SETCC_CARRY:
23644 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23645 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23646 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23647 // truncated to i1 using 'and'.
23648 if (checkAgainstTrue && !truncatedToBoolWithAnd)
23649 break;
23650 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23651        "Invalid use of SETCC_CARRY!");
23652 // FALL THROUGH
23653 case X86ISD::SETCC:
23654 // Set the condition code or opposite one if necessary.
23655 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23656 if (needOppositeCond)
23657 CC = X86::GetOppositeBranchCondition(CC);
23658 return SetCC.getOperand(1);
23659 case X86ISD::CMOV: {
23660 // Check whether false/true value has canonical one, i.e. 0 or 1.
23661 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23662 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23663 // Quit if true value is not a constant.
23664 if (!TVal)
23665 return SDValue();
23666 // Quit if false value is not a constant.
23667 if (!FVal) {
23668 SDValue Op = SetCC.getOperand(0);
23669 // Skip 'zext' or 'trunc' node.
23670 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23671 Op.getOpcode() == ISD::TRUNCATE)
23672 Op = Op.getOperand(0);
23673 // A special case for rdrand/rdseed, where 0 is set if false cond is
23674 // found.
23675 if ((Op.getOpcode() != X86ISD::RDRAND &&
23676 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23677 return SDValue();
23678 }
23679 // Quit if false value is not the constant 0 or 1.
23680 bool FValIsFalse = true;
23681 if (FVal && FVal->getZExtValue() != 0) {
23682 if (FVal->getZExtValue() != 1)
23683 return SDValue();
23684 // If FVal is 1, opposite cond is needed.
23685 needOppositeCond = !needOppositeCond;
23686 FValIsFalse = false;
23687 }
23688 // Quit if TVal is not the constant opposite of FVal.
23689 if (FValIsFalse && TVal->getZExtValue() != 1)
23690 return SDValue();
23691 if (!FValIsFalse && TVal->getZExtValue() != 0)
23692 return SDValue();
23693 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23694 if (needOppositeCond)
23695 CC = X86::GetOppositeBranchCondition(CC);
23696 return SetCC.getOperand(3);
23697 }
23698 }
23699
23700 return SDValue();
23701}
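
// Illustrative sketch, not part of X86ISelLowering.cpp: the scalar equivalence
// that checkBoolTestSetCCCombine above exploits. A SETCC materializes its
// condition as 0 or 1, so comparing that value against 0 or 1 and testing
// EQ/NE is the same as testing the condition (or its opposite) directly.
#include <cassert>
#include <initializer_list>

int main() {
  for (bool cond : {false, true}) {
    int v = cond ? 1 : 0;        // what an X86ISD::SETCC materializes
    assert((v == 1) == cond);    // (CMP (SETCC cond) 1) EQ  -> cond
    assert((v != 0) == cond);    // (CMP (SETCC cond) 0) NEQ -> cond
    assert((v == 0) == !cond);   // (CMP (SETCC cond) 0) EQ  -> !cond
    assert((v != 1) == !cond);   // (CMP (SETCC cond) 1) NEQ -> !cond
  }
  return 0;
}
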
23702
23703/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23704static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23705 TargetLowering::DAGCombinerInfo &DCI,
23706 const X86Subtarget *Subtarget) {
23707 SDLoc DL(N);
23708
23709 // If the flag operand isn't dead, don't touch this CMOV.
23710 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23711 return SDValue();
23712
23713 SDValue FalseOp = N->getOperand(0);
23714 SDValue TrueOp = N->getOperand(1);
23715 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23716 SDValue Cond = N->getOperand(3);
23717
23718 if (CC == X86::COND_E || CC == X86::COND_NE) {
23719 switch (Cond.getOpcode()) {
23720 default: break;
23721 case X86ISD::BSR:
23722 case X86ISD::BSF:
23723 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
23724 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23725 return (CC == X86::COND_E) ? FalseOp : TrueOp;
23726 }
23727 }
23728
23729 SDValue Flags;
23730
23731 Flags = checkBoolTestSetCCCombine(Cond, CC);
23732 if (Flags.getNode() &&
23733 // Extra check as FCMOV only supports a subset of X86 cond.
23734 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23735 SDValue Ops[] = { FalseOp, TrueOp,
23736 DAG.getConstant(CC, MVT::i8), Flags };
23737 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23738 }
23739
23740 // If this is a select between two integer constants, try to do some
23741 // optimizations. Note that the operands are ordered the opposite of SELECT
23742 // operands.
23743 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23744 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23745 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23746 // larger than FalseC (the false value).
23747 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23748 CC = X86::GetOppositeBranchCondition(CC);
23749 std::swap(TrueC, FalseC);
23750 std::swap(TrueOp, FalseOp);
23751 }
23752
23753 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
23754 // This is efficient for any integer data type (including i8/i16) and
23755 // shift amount.
23756 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23757 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23758 DAG.getConstant(CC, MVT::i8), Cond);
23759
23760 // Zero extend the condition if needed.
23761 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23762
23763 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23764 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23765 DAG.getConstant(ShAmt, MVT::i8));
23766 if (N->getNumValues() == 2) // Dead flag value?
23767 return DCI.CombineTo(N, Cond, SDValue());
23768 return Cond;
23769 }
23770
23771 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
23772 // for any integer data type, including i8/i16.
23773 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23774 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23775 DAG.getConstant(CC, MVT::i8), Cond);
23776
23777 // Zero extend the condition if needed.
23778 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23779 FalseC->getValueType(0), Cond);
23780 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23781 SDValue(FalseC, 0));
23782
23783 if (N->getNumValues() == 2) // Dead flag value?
23784 return DCI.CombineTo(N, Cond, SDValue());
23785 return Cond;
23786 }
23787
23788 // Optimize cases that will turn into an LEA instruction. This requires
23789 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23790 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23791 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23792 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23793
23794 bool isFastMultiplier = false;
23795 if (Diff < 10) {
23796 switch ((unsigned char)Diff) {
23797 default: break;
23798 case 1: // result = add base, cond
23799 case 2: // result = lea base( , cond*2)
23800 case 3: // result = lea base(cond, cond*2)
23801 case 4: // result = lea base( , cond*4)
23802 case 5: // result = lea base(cond, cond*4)
23803 case 8: // result = lea base( , cond*8)
23804 case 9: // result = lea base(cond, cond*8)
23805 isFastMultiplier = true;
23806 break;
23807 }
23808 }
23809
23810 if (isFastMultiplier) {
23811 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23812 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23813 DAG.getConstant(CC, MVT::i8), Cond);
23814 // Zero extend the condition if needed.
23815 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23816 Cond);
23817 // Scale the condition by the difference.
23818 if (Diff != 1)
23819 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23820 DAG.getConstant(Diff, Cond.getValueType()));
23821
23822 // Add the base if non-zero.
23823 if (FalseC->getAPIntValue() != 0)
23824 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23825 SDValue(FalseC, 0));
23826 if (N->getNumValues() == 2) // Dead flag value?
23827 return DCI.CombineTo(N, Cond, SDValue());
23828 return Cond;
23829 }
23830 }
23831 }
23832 }
23833
23834 // Handle these cases:
23835 // (select (x != c), e, c) -> (select (x != c), e, x),
23836 // (select (x == c), c, e) -> (select (x == c), x, e)
23837 // where the c is an integer constant, and the "select" is the combination
23838 // of CMOV and CMP.
23839 //
23840 // The rationale for this change is that a conditional move from a constant
23841 // needs two instructions, whereas a conditional move from a register needs
23842 // only one instruction.
23843 //
23844 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23845 // some instruction-combining opportunities. This opt needs to be
23846 // postponed as late as possible.
23847 //
23848 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23849 // the DCI.xxxx conditions are provided to postpone the optimization as
23850 // late as possible.
23851
23852 ConstantSDNode *CmpAgainst = nullptr;
23853 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23854 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23855 !isa<ConstantSDNode>(Cond.getOperand(0))) {
23856
23857 if (CC == X86::COND_NE &&
23858 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23859 CC = X86::GetOppositeBranchCondition(CC);
23860 std::swap(TrueOp, FalseOp);
23861 }
23862
23863 if (CC == X86::COND_E &&
23864 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23865 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23866 DAG.getConstant(CC, MVT::i8), Cond };
23867 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
23868 }
23869 }
23870 }
23871
23872 return SDValue();
23873}
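
// A standalone sketch (not LLVM code) of the constant-select identities used
// by PerformCMOVCombine above: a power-of-two/zero select becomes a shifted
// setcc, adjacent constants become setcc plus the base, and an LEA-friendly
// difference (here 9) becomes a scaled setcc plus the base.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (bool c : {false, true}) {
    uint32_t z = c ? 1u : 0u;                 // zext(setcc(C))
    assert((c ? 8u : 0u) == (z << 3));        // C ? 8 : 0       -> zext(setcc(C)) << 3
    assert((c ? 6u : 5u) == (z + 5u));        // C ? cst+1 : cst -> zext(setcc(C)) + cst
    assert((c ? 13u : 4u) == (z * 9u + 4u));  // diff of 9 is a fast (LEA) multiplier
  }
  return 0;
}
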
23874
23875static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
23876 const X86Subtarget *Subtarget) {
23877 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
23878 switch (IntNo) {
23879 default: return SDValue();
23880 // SSE/AVX/AVX2 blend intrinsics.
23881 case Intrinsic::x86_avx2_pblendvb:
23882 case Intrinsic::x86_avx2_pblendw:
23883 case Intrinsic::x86_avx2_pblendd_128:
23884 case Intrinsic::x86_avx2_pblendd_256:
23885 // Don't try to simplify this intrinsic if we don't have AVX2.
23886 if (!Subtarget->hasAVX2())
23887 return SDValue();
23888 // FALL-THROUGH
23889 case Intrinsic::x86_avx_blend_pd_256:
23890 case Intrinsic::x86_avx_blend_ps_256:
23891 case Intrinsic::x86_avx_blendv_pd_256:
23892 case Intrinsic::x86_avx_blendv_ps_256:
23893 // Don't try to simplify this intrinsic if we don't have AVX.
23894 if (!Subtarget->hasAVX())
23895 return SDValue();
23896 // FALL-THROUGH
23897 case Intrinsic::x86_sse41_pblendw:
23898 case Intrinsic::x86_sse41_blendpd:
23899 case Intrinsic::x86_sse41_blendps:
23900 case Intrinsic::x86_sse41_blendvps:
23901 case Intrinsic::x86_sse41_blendvpd:
23902 case Intrinsic::x86_sse41_pblendvb: {
23903 SDValue Op0 = N->getOperand(1);
23904 SDValue Op1 = N->getOperand(2);
23905 SDValue Mask = N->getOperand(3);
23906
23907 // Don't try to simplify this intrinsic if we don't have SSE4.1.
23908 if (!Subtarget->hasSSE41())
23909 return SDValue();
23910
23911 // fold (blend A, A, Mask) -> A
23912 if (Op0 == Op1)
23913 return Op0;
23914 // fold (blend A, B, allZeros) -> A
23915 if (ISD::isBuildVectorAllZeros(Mask.getNode()))
23916 return Op0;
23917 // fold (blend A, B, allOnes) -> B
23918 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23919 return Op1;
23920
23921 // Simplify the case where the mask is a constant i32 value.
23922 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
23923 if (C->isNullValue())
23924 return Op0;
23925 if (C->isAllOnesValue())
23926 return Op1;
23927 }
23928
23929 return SDValue();
23930 }
23931
23932 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
23933 case Intrinsic::x86_sse2_psrai_w:
23934 case Intrinsic::x86_sse2_psrai_d:
23935 case Intrinsic::x86_avx2_psrai_w:
23936 case Intrinsic::x86_avx2_psrai_d:
23937 case Intrinsic::x86_sse2_psra_w:
23938 case Intrinsic::x86_sse2_psra_d:
23939 case Intrinsic::x86_avx2_psra_w:
23940 case Intrinsic::x86_avx2_psra_d: {
23941 SDValue Op0 = N->getOperand(1);
23942 SDValue Op1 = N->getOperand(2);
23943 EVT VT = Op0.getValueType();
23944 assert(VT.isVector() && "Expected a vector type!");
23945
23946 if (isa<BuildVectorSDNode>(Op1))
23947 Op1 = Op1.getOperand(0);
23948
23949 if (!isa<ConstantSDNode>(Op1))
23950 return SDValue();
23951
23952 EVT SVT = VT.getVectorElementType();
23953 unsigned SVTBits = SVT.getSizeInBits();
23954
23955 ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
23956 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
23957 uint64_t ShAmt = C.getZExtValue();
23958
23959 // Don't try to convert this shift into an ISD::SRA if the shift
23960 // count is bigger than or equal to the element size.
23961 if (ShAmt >= SVTBits)
23962 return SDValue();
23963
23964 // Trivial case: if the shift count is zero, then fold this
23965 // into the first operand.
23966 if (ShAmt == 0)
23967 return Op0;
23968
23969 // Replace this packed shift intrinsic with a target independent
23970 // shift dag node.
23971 SDValue Splat = DAG.getConstant(C, VT);
23972 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
23973 }
23974 }
23975}
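
// Illustrative sketch, not from the LLVM sources: a per-lane view of the blend
// folds performed by the intrinsic combine above. An all-zeros mask selects the
// first operand, an all-ones mask selects the second, and identical operands
// make the mask irrelevant. (The psrai cases similarly reduce to a plain
// per-element arithmetic shift once the count is a small constant.)
#include <cassert>

static int blendLane(int a, int b, bool maskBit) { return maskBit ? b : a; }

int main() {
  int A = 3, B = 7;
  assert(blendLane(A, A, true) == A && blendLane(A, A, false) == A); // (blend A, A, Mask) -> A
  assert(blendLane(A, B, false) == A);                               // all-zeros mask     -> A
  assert(blendLane(A, B, true) == B);                                // all-ones mask      -> B
  return 0;
}
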
23976
23977/// PerformMulCombine - Optimize a single multiply with constant into two
23978/// in order to implement it with two cheaper instructions, e.g.
23979/// LEA + SHL, LEA + LEA.
23980static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
23981 TargetLowering::DAGCombinerInfo &DCI) {
23982 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
23983 return SDValue();
23984
23985 EVT VT = N->getValueType(0);
23986 if (VT != MVT::i64 && VT != MVT::i32)
23987 return SDValue();
23988
23989 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23990 if (!C)
23991 return SDValue();
23992 uint64_t MulAmt = C->getZExtValue();
23993 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
23994 return SDValue();
23995
23996 uint64_t MulAmt1 = 0;
23997 uint64_t MulAmt2 = 0;
23998 if ((MulAmt % 9) == 0) {
23999 MulAmt1 = 9;
24000 MulAmt2 = MulAmt / 9;
24001 } else if ((MulAmt % 5) == 0) {
24002 MulAmt1 = 5;
24003 MulAmt2 = MulAmt / 5;
24004 } else if ((MulAmt % 3) == 0) {
24005 MulAmt1 = 3;
24006 MulAmt2 = MulAmt / 3;
24007 }
24008 if (MulAmt2 &&
24009 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24010 SDLoc DL(N);
24011
24012 if (isPowerOf2_64(MulAmt2) &&
24013 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24014 // If the second multiplier is a power of two, issue it first. We want the multiply by
24015 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24016 // is an add.
24017 std::swap(MulAmt1, MulAmt2);
24018
24019 SDValue NewMul;
24020 if (isPowerOf2_64(MulAmt1))
24021 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24022 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24023 else
24024 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24025 DAG.getConstant(MulAmt1, VT));
24026
24027 if (isPowerOf2_64(MulAmt2))
24028 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24029 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24030 else
24031 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24032 DAG.getConstant(MulAmt2, VT));
24033
24034 // Do not add new nodes to DAG combiner worklist.
24035 DCI.CombineTo(N, NewMul, false);
24036 }
24037 return SDValue();
24038}
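
// Illustrative sketch, not part of X86ISelLowering.cpp: the decomposition that
// PerformMulCombine above relies on. A multiply by 40 can be issued as a
// multiply by 5 (an LEA) followed by a shift by 3, and a multiply by 45 as two
// LEA-style multiplies (by 9 and by 5).
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 12345;
  assert(x * 40 == (x * 5) << 3);  // MulAmt = 40: MulAmt1 = 5 (LEA), MulAmt2 = 8 (SHL)
  assert(x * 45 == (x * 9) * 5);   // MulAmt = 45: MulAmt1 = 9,       MulAmt2 = 5
  return 0;
}
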
24039
24040static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24041 SDValue N0 = N->getOperand(0);
24042 SDValue N1 = N->getOperand(1);
24043 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24044 EVT VT = N0.getValueType();
24045
24046 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24047 // since the result of setcc_c is all zero's or all ones.
24048 if (VT.isInteger() && !VT.isVector() &&
24049 N1C && N0.getOpcode() == ISD::AND &&
24050 N0.getOperand(1).getOpcode() == ISD::Constant) {
24051 SDValue N00 = N0.getOperand(0);
24052 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24053 ((N00.getOpcode() == ISD::ANY_EXTEND ||
24054 N00.getOpcode() == ISD::ZERO_EXTEND) &&
24055 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24056 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24057 APInt ShAmt = N1C->getAPIntValue();
24058 Mask = Mask.shl(ShAmt);
24059 if (Mask != 0)
24060 return DAG.getNode(ISD::AND, SDLoc(N), VT,
24061 N00, DAG.getConstant(Mask, VT));
24062 }
24063 }
24064
24065 // Hardware support for vector shifts is sparse which makes us scalarize the
24066 // vector operations in many cases. Also, on Sandy Bridge, ADD is faster than
24067 // shl.
24068 // (shl V, 1) -> add V,V
24069 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24070 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24071 assert(N0.getValueType().isVector() && "Invalid vector shift type");
24072 // We shift all of the values by one. In many cases we do not have
24073 // hardware support for this operation. This is better expressed as an ADD
24074 // of two values.
24075 if (N1SplatC->getZExtValue() == 1)
24076 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24077 }
24078
24079 return SDValue();
24080}
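
// A standalone sketch (not LLVM code) of the two SHL rewrites above in scalar
// form. Because SETCC_CARRY produces all zeros or all ones, shifting the masked
// value is the same as masking with the shifted constant; and a shift-left by
// one is just an add of the value to itself.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint32_t setcc_c : {0u, ~0u}) {   // SETCC_CARRY result: all zeros or all ones
    uint32_t c1 = 0xFF, c2 = 4;
    assert(((setcc_c & c1) << c2) == (setcc_c & (c1 << c2)));
  }
  uint32_t v = 0x1234;
  assert((v << 1) == v + v);             // (shl V, 1) -> add V, V
  return 0;
}
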
24081
24082/// \brief Returns a vector of 0s if the input node is a vector logical
24083/// shift by a constant amount which is known to be bigger than or equal
24084/// to the vector element size in bits.
24085static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24086 const X86Subtarget *Subtarget) {
24087 EVT VT = N->getValueType(0);
24088
24089 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24090 (!Subtarget->hasInt256() ||
24091 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24092 return SDValue();
24093
24094 SDValue Amt = N->getOperand(1);
24095 SDLoc DL(N);
24096 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24097 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24098 APInt ShiftAmt = AmtSplat->getAPIntValue();
24099 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24100
24101 // SSE2/AVX2 logical shifts always return a vector of 0s
24102 // if the shift amount is bigger than or equal to
24103 // the element size. The constant shift amount will be
24104 // encoded as an 8-bit immediate.
24105 if (ShiftAmt.trunc(8).uge(MaxAmount))
24106 return getZeroVector(VT, Subtarget, DAG, DL);
24107 }
24108
24109 return SDValue();
24110}
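
// Illustrative sketch only (a model, not LLVM or SSE intrinsic code): the
// hardware behaviour performShiftToAllZeros above depends on. An SSE2/AVX2
// packed logical shift with an immediate count greater than or equal to the
// element width produces all zeros, unlike a generic ISD::SHL/SRL where such a
// count is undefined. One 16-bit PSLLW lane is modelled here.
#include <cassert>
#include <cstdint>

static uint16_t psllwLane(uint16_t x, unsigned count) {
  if (count >= 16) return 0;                    // count >= element size -> zero lane
  return static_cast<uint16_t>(x << count);
}

int main() {
  assert(psllwLane(0xABCD, 16) == 0);           // the fold to a zero vector
  assert(psllwLane(0xABCD, 3) == static_cast<uint16_t>(0xABCD << 3));
  return 0;
}
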
24111
24112/// PerformShiftCombine - Combine shifts.
24113static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24114 TargetLowering::DAGCombinerInfo &DCI,
24115 const X86Subtarget *Subtarget) {
24116 if (N->getOpcode() == ISD::SHL) {
24117 SDValue V = PerformSHLCombine(N, DAG);
24118 if (V.getNode()) return V;
24119 }
24120
24121 if (N->getOpcode() != ISD::SRA) {
24122 // Try to fold this logical shift into a zero vector.
24123 SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24124 if (V.getNode()) return V;
24125 }
24126
24127 return SDValue();
24128}
24129
24130// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
24131// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24132// and friends. Likewise for OR -> CMPNEQSS.
24133static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24134 TargetLowering::DAGCombinerInfo &DCI,
24135 const X86Subtarget *Subtarget) {
24136 unsigned opcode;
24137
24138 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24139 // we're requiring SSE2 for both.
24140 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24141 SDValue N0 = N->getOperand(0);
24142 SDValue N1 = N->getOperand(1);
24143 SDValue CMP0 = N0->getOperand(1);
24144 SDValue CMP1 = N1->getOperand(1);
24145 SDLoc DL(N);
24146
24147 // The SETCCs should both refer to the same CMP.
24148 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24149 return SDValue();
24150
24151 SDValue CMP00 = CMP0->getOperand(0);
24152 SDValue CMP01 = CMP0->getOperand(1);
24153 EVT VT = CMP00.getValueType();
24154
24155 if (VT == MVT::f32 || VT == MVT::f64) {
24156 bool ExpectingFlags = false;
24157 // Check for any users that want flags:
24158 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24159 !ExpectingFlags && UI != UE; ++UI)
24160 switch (UI->getOpcode()) {
24161 default:
24162 case ISD::BR_CC:
24163 case ISD::BRCOND:
24164 case ISD::SELECT:
24165 ExpectingFlags = true;
24166 break;
24167 case ISD::CopyToReg:
24168 case ISD::SIGN_EXTEND:
24169 case ISD::ZERO_EXTEND:
24170 case ISD::ANY_EXTEND:
24171 break;
24172 }
24173
24174 if (!ExpectingFlags) {
24175 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24176 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24177
24178 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24179 X86::CondCode tmp = cc0;
24180 cc0 = cc1;
24181 cc1 = tmp;
24182 }
24183
24184 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
24185 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24186 // FIXME: need symbolic constants for these magic numbers.
24187 // See X86ATTInstPrinter.cpp:printSSECC().
24188 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24189 if (Subtarget->hasAVX512()) {
24190 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24191 CMP01, DAG.getConstant(x86cc, MVT::i8));
24192 if (N->getValueType(0) != MVT::i1)
24193 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24194 FSetCC);
24195 return FSetCC;
24196 }
24197 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24198 CMP00.getValueType(), CMP00, CMP01,
24199 DAG.getConstant(x86cc, MVT::i8));
24200
24201 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24202 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24203
24204 if (is64BitFP && !Subtarget->is64Bit()) {
24205 // On a 32-bit target, we cannot bitcast the 64-bit float to a
24206 // 64-bit integer, since that's not a legal type. Since
24207 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
24208 // bits, but can do this little dance to extract the lowest 32 bits
24209 // and work with those going forward.
24210 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24211 OnesOrZeroesF);
24212 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24213 Vector64);
24214 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24215 Vector32, DAG.getIntPtrConstant(0));
24216 IntVT = MVT::i32;
24217 }
24218
24219 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24220 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24221 DAG.getConstant(1, IntVT));
24222 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24223 return OneBitOfTruth;
24224 }
24225 }
24226 }
24227 }
24228 return SDValue();
24229}
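
// Illustrative sketch, not from the LLVM sources: the tail end of CMPEQCombine
// above in scalar form. CMPEQSS/CMPNEQSS write an all-ones or all-zeros mask
// the width of the float; AND-ing its bits with 1 and truncating to i8 yields
// the same boolean the original pair of SETCCs computed.
#include <cassert>
#include <cstdint>

int main() {
  float a = 1.5f, b = 1.5f;
  uint32_t onesOrZeroes = (a == b) ? 0xFFFFFFFFu : 0u;            // what CMPEQSS produces
  uint8_t oneBitOfTruth = static_cast<uint8_t>(onesOrZeroes & 1u); // AND 1, truncate to i8
  assert(oneBitOfTruth == (a == b ? 1 : 0));
  return 0;
}
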
24230
24231/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
24232/// so it can be folded inside ANDNP.
24233static bool CanFoldXORWithAllOnes(const SDNode *N) {
24234 EVT VT = N->getValueType(0);
24235
24236 // Match direct AllOnes for 128 and 256-bit vectors
24237 if (ISD::isBuildVectorAllOnes(N))
24238 return true;
24239
24240 // Look through a bit convert.
24241 if (N->getOpcode() == ISD::BITCAST)
24242 N = N->getOperand(0).getNode();
24243
24244 // Sometimes the operand may come from an insert_subvector building a 256-bit
24245 // allones vector
24246 if (VT.is256BitVector() &&
24247 N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24248 SDValue V1 = N->getOperand(0);
24249 SDValue V2 = N->getOperand(1);
24250
24251 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24252 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24253 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24254 ISD::isBuildVectorAllOnes(V2.getNode()))
24255 return true;
24256 }
24257
24258 return false;
24259}
24260
24261// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24262// register. In most cases we actually compare or select YMM-sized registers
24263// and mixing the two types creates horrible code. This method optimizes
24264// some of the transition sequences.
24265static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24266 TargetLowering::DAGCombinerInfo &DCI,
24267 const X86Subtarget *Subtarget) {
24268 EVT VT = N->getValueType(0);
24269 if (!VT.is256BitVector())
24270 return SDValue();
24271
24272 assert((N->getOpcode() == ISD::ANY_EXTEND ||
24273         N->getOpcode() == ISD::ZERO_EXTEND ||
24274         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24275
24276 SDValue Narrow = N->getOperand(0);
24277 EVT NarrowVT = Narrow->getValueType(0);
24278 if (!NarrowVT.is128BitVector())
24279 return SDValue();
24280
24281 if (Narrow->getOpcode() != ISD::XOR &&
24282 Narrow->getOpcode() != ISD::AND &&
24283 Narrow->getOpcode() != ISD::OR)
24284 return SDValue();
24285
24286 SDValue N0 = Narrow->getOperand(0);
24287 SDValue N1 = Narrow->getOperand(1);
24288 SDLoc DL(Narrow);
24289
24290 // The left side has to be a trunc.
24291 if (N0.getOpcode() != ISD::TRUNCATE)
24292 return SDValue();
24293
24294 // The type of the truncated inputs.
24295 EVT WideVT = N0->getOperand(0)->getValueType(0);
24296 if (WideVT != VT)
24297 return SDValue();
24298
24299 // The right side has to be a 'trunc' or a constant vector.
24300 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24301 ConstantSDNode *RHSConstSplat = nullptr;
24302 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24303 RHSConstSplat = RHSBV->getConstantSplatNode();
24304 if (!RHSTrunc && !RHSConstSplat)
24305 return SDValue();
24306
24307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24308
24309 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24310 return SDValue();
24311
24312 // Set N0 and N1 to hold the inputs to the new wide operation.
24313 N0 = N0->getOperand(0);
24314 if (RHSConstSplat) {
24315 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24316 SDValue(RHSConstSplat, 0));
24317 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24318 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24319 } else if (RHSTrunc) {
24320 N1 = N1->getOperand(0);
24321 }
24322
24323 // Generate the wide operation.
24324 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24325 unsigned Opcode = N->getOpcode();
24326 switch (Opcode) {
24327 case ISD::ANY_EXTEND:
24328 return Op;
24329 case ISD::ZERO_EXTEND: {
24330 unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24331 APInt Mask = APInt::getAllOnesValue(InBits);
24332 Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24333 return DAG.getNode(ISD::AND, DL, VT,
24334 Op, DAG.getConstant(Mask, VT));
24335 }
24336 case ISD::SIGN_EXTEND:
24337 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24338 Op, DAG.getValueType(NarrowVT));
24339 default:
24340 llvm_unreachable("Unexpected opcode");
24341 }
24342}
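
// A standalone sketch (not LLVM code) of the ZERO_EXTEND case handled by
// WidenMaskArithmetic above: zero-extending the narrow logic op is the same as
// performing the op on the wide values and masking off the high bits, when the
// narrow inputs were truncations of those wide values. Shown for a 16-bit AND
// widened to 32 bits.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t wideX = 0xABCD1234u, wideY = 0x0F0F00FFu;
  uint16_t nx = static_cast<uint16_t>(wideX);  // the trunc of the wide value
  uint16_t ny = static_cast<uint16_t>(wideY);
  uint32_t narrowThenExtend = static_cast<uint32_t>(static_cast<uint16_t>(nx & ny));
  uint32_t wideThenMask = (wideX & wideY) & 0xFFFFu;   // wide op, then AND with the zext'd all-ones mask
  assert(narrowThenExtend == wideThenMask);
  return 0;
}
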
24343
24344static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24345 TargetLowering::DAGCombinerInfo &DCI,
24346 const X86Subtarget *Subtarget) {
24347 EVT VT = N->getValueType(0);
24348 if (DCI.isBeforeLegalizeOps())
24349 return SDValue();
24350
24351 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24352 if (R.getNode())
24353 return R;
24354
24355 // Create BEXTR instructions
24356 // BEXTR is ((X >> imm) & (2**size-1))
24357 if (VT == MVT::i32 || VT == MVT::i64) {
24358 SDValue N0 = N->getOperand(0);
24359 SDValue N1 = N->getOperand(1);
24360 SDLoc DL(N);
24361
24362 // Check for BEXTR.
24363 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24364 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24365 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24366 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24367 if (MaskNode && ShiftNode) {
24368 uint64_t Mask = MaskNode->getZExtValue();
24369 uint64_t Shift = ShiftNode->getZExtValue();
24370 if (isMask_64(Mask)) {
24371 uint64_t MaskSize = CountPopulation_64(Mask);
24372 if (Shift + MaskSize <= VT.getSizeInBits())
24373 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24374 DAG.getConstant(Shift | (MaskSize << 8), VT));
24375 }
24376 }
24377 } // BEXTR
24378
24379 return SDValue();
24380 }
24381
24382 // Want to form ANDNP nodes:
24383 // 1) In the hopes of then easily combining them with OR and AND nodes
24384 // to form PBLEND/PSIGN.
24385 // 2) To match ANDN packed intrinsics
24386 if (VT != MVT::v2i64 && VT != MVT::v4i64)
24387 return SDValue();
24388
24389 SDValue N0 = N->getOperand(0);
24390 SDValue N1 = N->getOperand(1);
24391 SDLoc DL(N);
24392
24393 // Check LHS for vnot
24394 if (N0.getOpcode() == ISD::XOR &&
24395 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24396 CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24397 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24398
24399 // Check RHS for vnot
24400 if (N1.getOpcode() == ISD::XOR &&
24401 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24402 CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24403 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24404
24405 return SDValue();
24406}
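
// Illustrative sketch, not part of X86ISelLowering.cpp: the BEXTR pattern
// recognized by PerformAndCombine above. ((X >> Shift) & ((1 << MaskSize) - 1))
// extracts MaskSize bits starting at bit Shift, and the control operand is
// encoded as Shift | (MaskSize << 8).
#include <cassert>
#include <cstdint>

static uint64_t bextrModel(uint64_t x, uint32_t control) {
  unsigned shift = control & 0xFF;
  unsigned maskSize = (control >> 8) & 0xFF;
  uint64_t mask = maskSize >= 64 ? ~0ULL : ((1ULL << maskSize) - 1);
  return (x >> shift) & mask;
}

int main() {
  uint64_t x = 0x123456789ABCDEF0ULL;
  uint32_t control = 8u | (16u << 8);            // Shift = 8, MaskSize = 16
  assert(bextrModel(x, control) == ((x >> 8) & 0xFFFFu));
  return 0;
}
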
24407
24408static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24409 TargetLowering::DAGCombinerInfo &DCI,
24410 const X86Subtarget *Subtarget) {
24411 if (DCI.isBeforeLegalizeOps())
24412 return SDValue();
24413
24414 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24415 if (R.getNode())
24416 return R;
24417
24418 SDValue N0 = N->getOperand(0);
24419 SDValue N1 = N->getOperand(1);
24420 EVT VT = N->getValueType(0);
24421
24422 // look for psign/blend
24423 if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24424 if (!Subtarget->hasSSSE3() ||
24425 (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24426 return SDValue();
24427
24428 // Canonicalize pandn to RHS
24429 if (N0.getOpcode() == X86ISD::ANDNP)
24430 std::swap(N0, N1);
24431 // or (and (m, y), (pandn m, x))
24432 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24433 SDValue Mask = N1.getOperand(0);
24434 SDValue X = N1.getOperand(1);
24435 SDValue Y;
24436 if (N0.getOperand(0) == Mask)
24437 Y = N0.getOperand(1);
24438 if (N0.getOperand(1) == Mask)
24439 Y = N0.getOperand(0);
24440
24441 // Check to see if the mask appeared in both the AND and the ANDNP.
24442 if (!Y.getNode())
24443 return SDValue();
24444
24445 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24446 // Look through mask bitcast.
24447 if (Mask.getOpcode() == ISD::BITCAST)
24448 Mask = Mask.getOperand(0);
24449 if (X.getOpcode() == ISD::BITCAST)
24450 X = X.getOperand(0);
24451 if (Y.getOpcode() == ISD::BITCAST)
24452 Y = Y.getOperand(0);
24453
24454 EVT MaskVT = Mask.getValueType();
24455
24456 // Validate that the Mask operand is a vector sra node.
24457 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24458 // there is no psrai.b
24459 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24460 unsigned SraAmt = ~0;
24461 if (Mask.getOpcode() == ISD::SRA) {
24462 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24463 if (auto *AmtConst = AmtBV->getConstantSplatNode())
24464 SraAmt = AmtConst->getZExtValue();
24465 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24466 SDValue SraC = Mask.getOperand(1);
24467 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
24468 }
24469 if ((SraAmt + 1) != EltBits)
24470 return SDValue();
24471
24472 SDLoc DL(N);
24473
24474 // Now we know we at least have a pblendvb with the mask val. See if
24475 // we can form a psignb/w/d.
24476 // psign = x.type == y.type == mask.type && y = sub(0, x);
24477 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24478 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24479 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24480 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24481        "Unsupported VT for PSIGN");
24482 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24483 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24484 }
24485 // PBLENDVB only available on SSE 4.1
24486 if (!Subtarget->hasSSE41())
24487 return SDValue();
24488
24489 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24490
24491 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24492 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24493 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24494 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24495 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24496 }
24497 }
24498
24499 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24500 return SDValue();
24501
24502 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24503 MachineFunction &MF = DAG.getMachineFunction();
24504 bool OptForSize = MF.getFunction()->getAttributes().
24505 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24506
24507 // SHLD/SHRD instructions have lower register pressure, but on some
24508 // platforms they have higher latency than the equivalent
24509 // series of shifts/or that would otherwise be generated.
24510 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24511 // have higher latencies and we are not optimizing for size.
24512 if (!OptForSize && Subtarget->isSHLDSlow())
24513 return SDValue();
24514
24515 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24516 std::swap(N0, N1);
24517 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24518 return SDValue();
24519 if (!N0.hasOneUse() || !N1.hasOneUse())
24520 return SDValue();
24521
24522 SDValue ShAmt0 = N0.getOperand(1);
24523 if (ShAmt0.getValueType() != MVT::i8)
24524 return SDValue();
24525 SDValue ShAmt1 = N1.getOperand(1);
24526 if (ShAmt1.getValueType() != MVT::i8)
24527 return SDValue();
24528 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24529 ShAmt0 = ShAmt0.getOperand(0);
24530 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24531 ShAmt1 = ShAmt1.getOperand(0);
24532
24533 SDLoc DL(N);
24534 unsigned Opc = X86ISD::SHLD;
24535 SDValue Op0 = N0.getOperand(0);
24536 SDValue Op1 = N1.getOperand(0);
24537 if (ShAmt0.getOpcode() == ISD::SUB) {
24538 Opc = X86ISD::SHRD;
24539 std::swap(Op0, Op1);
24540 std::swap(ShAmt0, ShAmt1);
24541 }
24542
24543 unsigned Bits = VT.getSizeInBits();
24544 if (ShAmt1.getOpcode() == ISD::SUB) {
24545 SDValue Sum = ShAmt1.getOperand(0);
24546 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24547 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24548 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24549 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24550 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24551 return DAG.getNode(Opc, DL, VT,
24552 Op0, Op1,
24553 DAG.getNode(ISD::TRUNCATE, DL,
24554 MVT::i8, ShAmt0));
24555 }
24556 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24557 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24558 if (ShAmt0C &&
24559 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24560 return DAG.getNode(Opc, DL, VT,
24561 N0.getOperand(0), N1.getOperand(0),
24562 DAG.getNode(ISD::TRUNCATE, DL,
24563 MVT::i8, ShAmt0));
24564 }
24565
24566 return SDValue();
24567}
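
// Illustrative sketch, not from the LLVM sources: the funnel-shift identity
// behind the SHLD fold in PerformOrCombine above. For 0 < c < 64,
// (x << c) | (y >> (64 - c)) shifts the 128-bit concatenation x:y left by c and
// keeps the high 64 bits, which is exactly what SHLD computes. The reference
// computation assumes the GCC/Clang __int128 extension.
#include <cassert>
#include <cstdint>

static uint64_t shldModel(uint64_t x, uint64_t y, unsigned c) {
  return (x << c) | (y >> (64 - c));   // valid for 0 < c < 64
}

int main() {
  uint64_t x = 0x0123456789ABCDEFULL, y = 0xFEDCBA9876543210ULL;
  unsigned c = 4;
  unsigned __int128 cat = (static_cast<unsigned __int128>(x) << 64) | y;
  assert(shldModel(x, y, c) == static_cast<uint64_t>(cat >> (64 - c)));
  return 0;
}
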
24568
24569// Generate NEG and CMOV for integer abs.
24570static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24571 EVT VT = N->getValueType(0);
24572
24573 // Since X86 does not have CMOV for 8-bit integer, we don't convert
24574 // 8-bit integer abs to NEG and CMOV.
24575 if (VT.isInteger() && VT.getSizeInBits() == 8)
24576 return SDValue();
24577
24578 SDValue N0 = N->getOperand(0);
24579 SDValue N1 = N->getOperand(1);
24580 SDLoc DL(N);
24581
24582 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24583 // and change it to SUB and CMOV.
24584 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24585 N0.getOpcode() == ISD::ADD &&
24586 N0.getOperand(1) == N1 &&
24587 N1.getOpcode() == ISD::SRA &&
24588 N1.getOperand(0) == N0.getOperand(0))
24589 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24590 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24591 // Generate SUB & CMOV.
24592 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24593 DAG.getConstant(0, VT), N0.getOperand(0));
24594
24595 SDValue Ops[] = { N0.getOperand(0), Neg,
24596 DAG.getConstant(X86::COND_GE, MVT::i8),
24597 SDValue(Neg.getNode(), 1) };
24598 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24599 }
24600 return SDValue();
24601}
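
// A standalone sketch (not LLVM code) of the branch-free abs pattern matched
// above: with Y = X >> 31 (arithmetic shift, all ones for negative X),
// (X + Y) ^ Y equals |X|. It relies on arithmetic right shift of negative
// values, which mainstream compilers provide and C++20 guarantees; INT_MIN is
// excluded since its negation overflows either way.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static int32_t absPattern(int32_t x) {
  int32_t y = x >> 31;          // all ones if x is negative, all zeros otherwise
  return (x + y) ^ y;
}

int main() {
  for (int32_t x : {0, 1, -1, 42, -42, 2147483647})
    assert(absPattern(x) == (x < 0 ? -x : x));
  return 0;
}
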
24602
24603 // PerformXorCombine - Do target-specific DAG combines on XOR nodes
24604 // (currently just the integer-abs pattern above).
24604static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24605 TargetLowering::DAGCombinerInfo &DCI,
24606 const X86Subtarget *Subtarget) {
24607 if (DCI.isBeforeLegalizeOps())
24608 return SDValue();
24609
24610 if (Subtarget->hasCMov()) {
24611 SDValue RV = performIntegerAbsCombine(N, DAG);
24612 if (RV.getNode())
24613 return RV;
24614 }
24615
24616 return SDValue();
24617}
24618
24619/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24620static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24621 TargetLowering::DAGCombinerInfo &DCI,
24622 const X86Subtarget *Subtarget) {
24623 LoadSDNode *Ld = cast<LoadSDNode>(N);
24624 EVT RegVT = Ld->getValueType(0);
24625 EVT MemVT = Ld->getMemoryVT();
24626 SDLoc dl(Ld);
24627 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24628
24629 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24630 // into two 16-byte operations.
24631 ISD::LoadExtType Ext = Ld->getExtensionType();
24632 unsigned Alignment = Ld->getAlignment();
24633 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24634 if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24635 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24636 unsigned NumElems = RegVT.getVectorNumElements();
24637 if (NumElems < 2)
24638 return SDValue();
24639
24640 SDValue Ptr = Ld->getBasePtr();
24641 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24642
24643 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24644 NumElems/2);
24645 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24646 Ld->getPointerInfo(), Ld->isVolatile(),
24647 Ld->isNonTemporal(), Ld->isInvariant(),
24648 Alignment);
24649 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24650 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24651 Ld->getPointerInfo(), Ld->isVolatile(),
24652 Ld->isNonTemporal(), Ld->isInvariant(),
24653 std::min(16U, Alignment));
24654 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24655 Load1.getValue(1),
24656 Load2.getValue(1));
24657
24658 SDValue NewVec = DAG.getUNDEF(RegVT);
24659 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24660 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24661 return DCI.CombineTo(N, NewVec, TF, true);
24662 }
24663
24664 return SDValue();
24665}
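
// Illustrative sketch only (plain memcpy, not LLVM code): the shape of the
// split performed by PerformLOADCombine above. A 32-byte unaligned load is
// replaced by two 16-byte loads at offsets 0 and 16, whose results are
// re-inserted into the 256-bit value.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t src[32], dst[32];
  for (int i = 0; i < 32; ++i) src[i] = static_cast<uint8_t>(i);
  std::memcpy(dst, src, 16);            // Load1 at the base pointer
  std::memcpy(dst + 16, src + 16, 16);  // Load2 at base pointer + 16
  assert(std::memcmp(dst, src, 32) == 0);
  return 0;
}
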
24666
24667/// PerformMLOADCombine - Resolve extending loads
24668static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24669 TargetLowering::DAGCombinerInfo &DCI,
24670 const X86Subtarget *Subtarget) {
24671 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24672 if (Mld->getExtensionType() != ISD::SEXTLOAD)
24673 return SDValue();
24674
24675 EVT VT = Mld->getValueType(0);
24676 unsigned NumElems = VT.getVectorNumElements();
24677 EVT LdVT = Mld->getMemoryVT();
24678 SDLoc dl(Mld);
24679
24680 assert(LdVT != VT && "Cannot extend to the same type");
24681 unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24682 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24683 // From, To sizes and ElemCount must be pow of two
24684 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24685         "Unexpected size for extending masked load");
24686
24687 unsigned SizeRatio = ToSz / FromSz;
24688 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24689
24690 // Create a type on which we perform the shuffle
24691 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24692 LdVT.getScalarType(), NumElems*SizeRatio);
24693 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24694
24695 // Convert Src0 value
24696 SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24697 if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24698 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24699 for (unsigned i = 0; i != NumElems; ++i)
24700 ShuffleVec[i] = i * SizeRatio;
24701
24702 // Can't shuffle using an illegal type.
24703 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24704         && "WideVecVT should be legal");
24705 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24706 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24707 }
24708 // Prepare the new mask
24709 SDValue NewMask;
24710 SDValue Mask = Mld->getMask();
24711 if (Mask.getValueType() == VT) {
24712 // Mask and original value have the same type
24713 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24714 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24715 for (unsigned i = 0; i != NumElems; ++i)
24716 ShuffleVec[i] = i * SizeRatio;
24717 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24718 ShuffleVec[i] = NumElems*SizeRatio;
24719 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24720 DAG.getConstant(0, WideVecVT),
24721 &ShuffleVec[0]);
24722 }
24723 else {
24724 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24725 unsigned WidenNumElts = NumElems*SizeRatio;
24726 unsigned MaskNumElts = VT.getVectorNumElements();
24727 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24728 WidenNumElts);
24729
24730 unsigned NumConcat = WidenNumElts / MaskNumElts;
24731 SmallVector<SDValue, 16> Ops(NumConcat);
24732 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24733 Ops[0] = Mask;
24734 for (unsigned i = 1; i != NumConcat; ++i)
24735 Ops[i] = ZeroVal;
24736
24737 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24738 }
24739
24740 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24741 Mld->getBasePtr(), NewMask, WideSrc0,
24742 Mld->getMemoryVT(), Mld->getMemOperand(),
24743 ISD::NON_EXTLOAD);
24744 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24745 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24746
24747}
24748/// PerformMSTORECombine - Resolve truncating stores
24749static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24750 const X86Subtarget *Subtarget) {
24751 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24752 if (!Mst->isTruncatingStore())
24753 return SDValue();
24754
24755 EVT VT = Mst->getValue().getValueType();
24756 unsigned NumElems = VT.getVectorNumElements();
24757 EVT StVT = Mst->getMemoryVT();
24758 SDLoc dl(Mst);
24759
24760 assert(StVT != VT && "Cannot truncate to the same type");
24761 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24762 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24763
24764 // From, To sizes and ElemCount must be pow of two
24765 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24766         "Unexpected size for truncating masked store");
24767 // We are going to use the original vector elt for storing.
24768 // Accumulated smaller vector elements must be a multiple of the store size.
24769 assert (((NumElems * FromSz) % ToSz) == 0 &&
24770         "Unexpected ratio for truncating masked store");
24771
24772 unsigned SizeRatio = FromSz / ToSz;
24773 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24774
24775 // Create a type on which we perform the shuffle
24776 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24777 StVT.getScalarType(), NumElems*SizeRatio);
24778
24779 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24780
24781 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
24782 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24783 for (unsigned i = 0; i != NumElems; ++i)
24784 ShuffleVec[i] = i * SizeRatio;
24785
24786 // Can't shuffle using an illegal type.
24787 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24788         && "WideVecVT should be legal");
24789
24790 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24791 DAG.getUNDEF(WideVecVT),
24792 &ShuffleVec[0]);
24793
24794 SDValue NewMask;
24795 SDValue Mask = Mst->getMask();
24796 if (Mask.getValueType() == VT) {
24797 // Mask and original value have the same type
24798 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24799 for (unsigned i = 0; i != NumElems; ++i)
24800 ShuffleVec[i] = i * SizeRatio;
24801 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24802 ShuffleVec[i] = NumElems*SizeRatio;
24803 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24804 DAG.getConstant(0, WideVecVT),
24805 &ShuffleVec[0]);
24806 }
24807 else {
24808 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24809 unsigned WidenNumElts = NumElems*SizeRatio;
24810 unsigned MaskNumElts = VT.getVectorNumElements();
24811 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24812 WidenNumElts);
24813
24814 unsigned NumConcat = WidenNumElts / MaskNumElts;
24815 SmallVector<SDValue, 16> Ops(NumConcat);
24816 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24817 Ops[0] = Mask;
24818 for (unsigned i = 1; i != NumConcat; ++i)
24819 Ops[i] = ZeroVal;
24820
24821 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24822 }
24823
24824 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
24825 NewMask, StVT, Mst->getMemOperand(), false);
24826}
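
// Illustrative sketch, not part of X86ISelLowering.cpp: the shuffle mask built
// by the masked load/store combines above. For NumElems = 4 and SizeRatio = 2
// each narrow element is placed at the start of its wide slot and the remaining
// lanes stay undef (-1), giving {0, 2, 4, 6, -1, -1, -1, -1}.
#include <cassert>
#include <vector>

int main() {
  unsigned NumElems = 4, SizeRatio = 2;
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);  // -1 marks an undef lane
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;                        // narrow element -> start of wide slot
  assert((ShuffleVec == std::vector<int>{0, 2, 4, 6, -1, -1, -1, -1}));
  return 0;
}
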
24827/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24828static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24829 const X86Subtarget *Subtarget) {
24830 StoreSDNode *St = cast<StoreSDNode>(N);
24831 EVT VT = St->getValue().getValueType();
24832 EVT StVT = St->getMemoryVT();
24833 SDLoc dl(St);
24834 SDValue StoredVal = St->getOperand(1);
24835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24836
24837 // If we are saving a concatenation of two XMM registers and 32-byte stores
24838 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24839 unsigned Alignment = St->getAlignment();
24840 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24841 if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24842 StVT == VT && !IsAligned) {
24843 unsigned NumElems = VT.getVectorNumElements();
24844 if (NumElems < 2)
24845 return SDValue();
24846
24847 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24848 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24849
24850 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24851 SDValue Ptr0 = St->getBasePtr();
24852 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24853
24854 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24855 St->getPointerInfo(), St->isVolatile(),
24856 St->isNonTemporal(), Alignment);
24857 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24858 St->getPointerInfo(), St->isVolatile(),
24859 St->isNonTemporal(),
24860 std::min(16U, Alignment));
24861 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24862 }
24863
24864 // Optimize trunc store (of multiple scalars) to shuffle and store.
24865 // First, pack all of the elements in one place. Next, store to memory
24866 // in fewer chunks.
24867 if (St->isTruncatingStore() && VT.isVector()) {
24868 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24869 unsigned NumElems = VT.getVectorNumElements();
24870 assert(StVT != VT && "Cannot truncate to the same type");
24871 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24872 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24873
24874 // From, To sizes and ElemCount must be pow of two
24875 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24876 // We are going to use the original vector elt for storing.
24877 // Accumulated smaller vector elements must be a multiple of the store size.
24878 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24879
24880 unsigned SizeRatio = FromSz / ToSz;
24881
24882 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24883
24884 // Create a type on which we perform the shuffle
24885 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24886 StVT.getScalarType(), NumElems*SizeRatio);
24887
24888 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24889
24890 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24891 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24892 for (unsigned i = 0; i != NumElems; ++i)
24893 ShuffleVec[i] = i * SizeRatio;
24894
24895 // Can't shuffle using an illegal type.
24896 if (!TLI.isTypeLegal(WideVecVT))
24897 return SDValue();
24898
24899 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24900 DAG.getUNDEF(WideVecVT),
24901 &ShuffleVec[0]);
24902 // At this point all of the data is stored at the bottom of the
24903 // register. We now need to save it to mem.
24904
24905 // Find the largest store unit
24906 MVT StoreType = MVT::i8;
24907 for (MVT Tp : MVT::integer_valuetypes()) {
24908 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24909 StoreType = Tp;
24910 }
24911
24912 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
24913 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24914 (64 <= NumElems * ToSz))
24915 StoreType = MVT::f64;
24916
24917 // Bitcast the original vector into a vector of store-size units
24918 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24919 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24920 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24921 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24922 SmallVector<SDValue, 8> Chains;
24923 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24924 TLI.getPointerTy());
24925 SDValue Ptr = St->getBasePtr();
24926
24927 // Perform one or more big stores into memory.
24928 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24929 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24930 StoreType, ShuffWide,
24931 DAG.getIntPtrConstant(i));
24932 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24933 St->getPointerInfo(), St->isVolatile(),
24934 St->isNonTemporal(), St->getAlignment());
24935 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24936 Chains.push_back(Ch);
24937 }
24938
24939 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24940 }
24941
24942 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
24943 // the FP state in cases where an emms may be missing.
24944 // A preferable solution to the general problem is to figure out the right
24945 // places to insert EMMS. This qualifies as a quick hack.
24946
24947 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
24948 if (VT.getSizeInBits() != 64)
24949 return SDValue();
24950
24951 const Function *F = DAG.getMachineFunction().getFunction();
24952 bool NoImplicitFloatOps = F->getAttributes().
24953 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
24954 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
24955 && Subtarget->hasSSE2();
24956 if ((VT.isVector() ||
24957 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
24958 isa<LoadSDNode>(St->getValue()) &&
24959 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
24960 St->getChain().hasOneUse() && !St->isVolatile()) {
24961 SDNode* LdVal = St->getValue().getNode();
24962 LoadSDNode *Ld = nullptr;
24963 int TokenFactorIndex = -1;
24964 SmallVector<SDValue, 8> Ops;
24965 SDNode* ChainVal = St->getChain().getNode();
24966 // Must be a store of a load. We currently handle two cases: the load
24967 // is a direct child, and it's under an intervening TokenFactor. It is
24968 // possible to dig deeper under nested TokenFactors.
24969 if (ChainVal == LdVal)
24970 Ld = cast<LoadSDNode>(St->getChain());
24971 else if (St->getValue().hasOneUse() &&
24972 ChainVal->getOpcode() == ISD::TokenFactor) {
24973 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
24974 if (ChainVal->getOperand(i).getNode() == LdVal) {
24975 TokenFactorIndex = i;
24976 Ld = cast<LoadSDNode>(St->getValue());
24977 } else
24978 Ops.push_back(ChainVal->getOperand(i));
24979 }
24980 }
24981
24982 if (!Ld || !ISD::isNormalLoad(Ld))
24983 return SDValue();
24984
24985 // If this is not the MMX case, i.e. we are just turning i64 load/store
24986 // into f64 load/store, avoid the transformation if there are multiple
24987 // uses of the loaded value.
24988 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
24989 return SDValue();
24990
24991 SDLoc LdDL(Ld);
24992 SDLoc StDL(N);
24993 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
24994 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
24995 // pair instead.
24996 if (Subtarget->is64Bit() || F64IsLegal) {
24997 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
24998 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
24999 Ld->getPointerInfo(), Ld->isVolatile(),
25000 Ld->isNonTemporal(), Ld->isInvariant(),
25001 Ld->getAlignment());
25002 SDValue NewChain = NewLd.getValue(1);
25003 if (TokenFactorIndex != -1) {
25004 Ops.push_back(NewChain);
25005 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25006 }
25007 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25008 St->getPointerInfo(),
25009 St->isVolatile(), St->isNonTemporal(),
25010 St->getAlignment());
25011 }
25012
25013 // Otherwise, lower to two pairs of 32-bit loads / stores.
25014 SDValue LoAddr = Ld->getBasePtr();
25015 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25016 DAG.getConstant(4, MVT::i32));
25017
25018 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25019 Ld->getPointerInfo(),
25020 Ld->isVolatile(), Ld->isNonTemporal(),
25021 Ld->isInvariant(), Ld->getAlignment());
25022 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25023 Ld->getPointerInfo().getWithOffset(4),
25024 Ld->isVolatile(), Ld->isNonTemporal(),
25025 Ld->isInvariant(),
25026 MinAlign(Ld->getAlignment(), 4));
25027
25028 SDValue NewChain = LoLd.getValue(1);
25029 if (TokenFactorIndex != -1) {
25030 Ops.push_back(LoLd);
25031 Ops.push_back(HiLd);
25032 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25033 }
25034
25035 LoAddr = St->getBasePtr();
25036 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25037 DAG.getConstant(4, MVT::i32));
25038
25039 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25040 St->getPointerInfo(),
25041 St->isVolatile(), St->isNonTemporal(),
25042 St->getAlignment());
25043 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25044 St->getPointerInfo().getWithOffset(4),
25045 St->isVolatile(),
25046 St->isNonTemporal(),
25047 MinAlign(St->getAlignment(), 4));
25048 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25049 }
25050 return SDValue();
25051}
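As an aside, the pack mask built in the truncating-store path above can be illustrated with a minimal standalone sketch (plain C++, assuming a hypothetical v8i32 -> v8i16 truncating store, so SizeRatio is 2):

// A minimal sketch of the pack-mask computation used by the truncating-store
// path above, for an assumed v8i32 -> v8i16 truncating store (SizeRatio = 2).
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 8;               // elements in the source vector
  const unsigned FromSz = 32, ToSz = 16;     // element sizes in bits
  const unsigned SizeRatio = FromSz / ToSz;  // 2: each i32 covers two i16 slots

  // After bitcasting v8i32 to v16i16, the low half of source element i lives
  // at wide index i * SizeRatio (little endian); the mask gathers those lanes
  // into positions 0..NumElems-1 and leaves the rest undef (-1).
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  for (int M : ShuffleVec)
    std::printf("%d ", M);
  std::printf("\n");  // prints: 0 2 4 6 8 10 12 14 -1 -1 -1 -1 -1 -1 -1 -1
}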
25052
25053/// Return 'true' if this vector operation is "horizontal"
25054/// and return the operands for the horizontal operation in LHS and RHS. A
25055/// horizontal operation performs the binary operation on successive elements
25056/// of its first operand, then on successive elements of its second operand,
25057/// returning the resulting values in a vector. For example, if
25058/// A = < float a0, float a1, float a2, float a3 >
25059/// and
25060/// B = < float b0, float b1, float b2, float b3 >
25061/// then the result of doing a horizontal operation on A and B is
25062/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25063/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25064/// A horizontal-op B, for some already available A and B, and if so then LHS is
25065/// set to A, RHS to B, and the routine returns 'true'.
25066/// Note that the binary operation should have the property that if one of the
25067/// operands is UNDEF then the result is UNDEF.
25068static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25069 // Look for the following pattern: if
25070 // A = < float a0, float a1, float a2, float a3 >
25071 // B = < float b0, float b1, float b2, float b3 >
25072 // and
25073 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25074 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25075 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25076 // which is A horizontal-op B.
25077
25078 // At least one of the operands should be a vector shuffle.
25079 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25080 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25081 return false;
25082
25083 MVT VT = LHS.getSimpleValueType();
25084
25085 assert((VT.is128BitVector() || VT.is256BitVector()) &&
25086        "Unsupported vector type for horizontal add/sub");
25087
25088 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25089 // operate independently on 128-bit lanes.
25090 unsigned NumElts = VT.getVectorNumElements();
25091 unsigned NumLanes = VT.getSizeInBits()/128;
25092 unsigned NumLaneElts = NumElts / NumLanes;
25093 assert((NumLaneElts % 2 == 0) &&
25094        "Vector type should have an even number of elements in each lane");
25095 unsigned HalfLaneElts = NumLaneElts/2;
25096
25097 // View LHS in the form
25098 // LHS = VECTOR_SHUFFLE A, B, LMask
25099 // If LHS is not a shuffle then pretend it is the shuffle
25100 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25101 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25102 // type VT.
25103 SDValue A, B;
25104 SmallVector<int, 16> LMask(NumElts);
25105 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25106 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25107 A = LHS.getOperand(0);
25108 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25109 B = LHS.getOperand(1);
25110 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25111 std::copy(Mask.begin(), Mask.end(), LMask.begin());
25112 } else {
25113 if (LHS.getOpcode() != ISD::UNDEF)
25114 A = LHS;
25115 for (unsigned i = 0; i != NumElts; ++i)
25116 LMask[i] = i;
25117 }
25118
25119 // Likewise, view RHS in the form
25120 // RHS = VECTOR_SHUFFLE C, D, RMask
25121 SDValue C, D;
25122 SmallVector<int, 16> RMask(NumElts);
25123 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25124 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25125 C = RHS.getOperand(0);
25126 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25127 D = RHS.getOperand(1);
25128 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25129 std::copy(Mask.begin(), Mask.end(), RMask.begin());
25130 } else {
25131 if (RHS.getOpcode() != ISD::UNDEF)
25132 C = RHS;
25133 for (unsigned i = 0; i != NumElts; ++i)
25134 RMask[i] = i;
25135 }
25136
25137 // Check that the shuffles are both shuffling the same vectors.
25138 if (!(A == C && B == D) && !(A == D && B == C))
25139 return false;
25140
25141 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25142 if (!A.getNode() && !B.getNode())
25143 return false;
25144
25145 // If A and B occur in reverse order in RHS, then "swap" them (which means
25146 // rewriting the mask).
25147 if (A != C)
25148 CommuteVectorShuffleMask(RMask, NumElts);
25149
25150 // At this point LHS and RHS are equivalent to
25151 // LHS = VECTOR_SHUFFLE A, B, LMask
25152 // RHS = VECTOR_SHUFFLE A, B, RMask
25153 // Check that the masks correspond to performing a horizontal operation.
25154 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25155 for (unsigned i = 0; i != NumLaneElts; ++i) {
25156 int LIdx = LMask[i+l], RIdx = RMask[i+l];
25157
25158 // Ignore any UNDEF components.
25159 if (LIdx < 0 || RIdx < 0 ||
25160 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25161 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25162 continue;
25163
25164 // Check that successive elements are being operated on. If not, this is
25165 // not a horizontal operation.
25166 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25167 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25168 if (!(LIdx == Index && RIdx == Index + 1) &&
25169 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25170 return false;
25171 }
25172 }
25173
25174 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25175 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25176 return true;
25177}
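For reference, the element pairing that isHorizontalBinOp checks for can be modelled with a short standalone scalar sketch (plain C++, one 128-bit lane of v4f32 assumed): the low half of each result lane pairs successive elements of the first source, the high half pairs successive elements of the second.

// Standalone scalar model of the horizontal-op layout recognized above:
// result = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > per 128-bit lane.
#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40}, R;
  const unsigned NumLaneElts = 4, HalfLaneElts = NumLaneElts / 2;
  for (unsigned i = 0; i != NumLaneElts; ++i) {
    const auto &Src = (i < HalfLaneElts) ? A : B;   // each lane is split between srcs
    unsigned Base = 2 * (i % HalfLaneElts);         // successive elements are combined
    R[i] = Src[Base] + Src[Base + 1];
  }
  for (float V : R)
    std::printf("%g ", V);   // prints: 3 7 30 70
  std::printf("\n");
}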
25178
25179/// Do target-specific dag combines on floating point adds.
25180static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25181 const X86Subtarget *Subtarget) {
25182 EVT VT = N->getValueType(0);
25183 SDValue LHS = N->getOperand(0);
25184 SDValue RHS = N->getOperand(1);
25185
25186 // Try to synthesize horizontal adds from adds of shuffles.
25187 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25188 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25189 isHorizontalBinOp(LHS, RHS, true))
25190 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25191 return SDValue();
25192}
25193
25194/// Do target-specific dag combines on floating point subs.
25195static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25196 const X86Subtarget *Subtarget) {
25197 EVT VT = N->getValueType(0);
25198 SDValue LHS = N->getOperand(0);
25199 SDValue RHS = N->getOperand(1);
25200
25201 // Try to synthesize horizontal subs from subs of shuffles.
25202 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25203 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25204 isHorizontalBinOp(LHS, RHS, false))
25205 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25206 return SDValue();
25207}
25208
25209/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25210static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25211 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25212 // F[X]OR(0.0, x) -> x
25213 // F[X]OR(x, 0.0) -> x
25214 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25215 if (C->getValueAPF().isPosZero())
25216 return N->getOperand(1);
25217 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25218 if (C->getValueAPF().isPosZero())
25219 return N->getOperand(0);
25220 return SDValue();
25221}
25222
25223/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25224static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25225 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25226
25227 // Only perform optimizations if UnsafeMath is used.
25228 if (!DAG.getTarget().Options.UnsafeFPMath)
25229 return SDValue();
25230
25231 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25232 // into FMINC and FMAXC, which are Commutative operations.
25233 unsigned NewOp = 0;
25234 switch (N->getOpcode()) {
25235 default: llvm_unreachable("unknown opcode");
25236 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
25237 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
25238 }
25239
25240 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25241 N->getOperand(0), N->getOperand(1));
25242}
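A brief standalone illustration (plain C++, modelling minss, where the second source operand is returned whenever the comparison is unordered) of why the rewrite to the commutative FMINC is gated on unsafe-math:

// Scalar model of x86 minss: result = a < b ? a : b, so the second operand is
// returned when a NaN is involved. FMIN is therefore not commutative in
// general, and only UnsafeFPMath permits switching to the commutative FMINC.
#include <cmath>
#include <cstdio>

static float x86_min(float a, float b) { return a < b ? a : b; }

int main() {
  float nan = std::nanf("");
  std::printf("%g %g\n", x86_min(nan, 1.0f), x86_min(1.0f, nan));
  // prints: 1 nan  -- swapping the operands changes the result
}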
25243
25244/// Do target-specific dag combines on X86ISD::FAND nodes.
25245static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25246 // FAND(0.0, x) -> 0.0
25247 // FAND(x, 0.0) -> 0.0
25248 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25249 if (C->getValueAPF().isPosZero())
25250 return N->getOperand(0);
25251 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25252 if (C->getValueAPF().isPosZero())
25253 return N->getOperand(1);
25254 return SDValue();
25255}
25256
25257/// Do target-specific dag combines on X86ISD::FANDN nodes
25258static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25259 // FANDN(x, 0.0) -> 0.0
25260 // FANDN(0.0, x) -> x
25261 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25262 if (C->getValueAPF().isPosZero())
25263 return N->getOperand(1);
25264 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25265 if (C->getValueAPF().isPosZero())
25266 return N->getOperand(1);
25267 return SDValue();
25268}
25269
25270static SDValue PerformBTCombine(SDNode *N,
25271 SelectionDAG &DAG,
25272 TargetLowering::DAGCombinerInfo &DCI) {
25273 // BT ignores high bits in the bit index operand.
25274 SDValue Op1 = N->getOperand(1);
25275 if (Op1.hasOneUse()) {
25276 unsigned BitWidth = Op1.getValueSizeInBits();
25277 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25278 APInt KnownZero, KnownOne;
25279 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25280 !DCI.isBeforeLegalizeOps());
25281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25282 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25283 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25284 DCI.CommitTargetLoweringOpt(TLO);
25285 }
25286 return SDValue();
25287}
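As a small aside, the fact this combine relies on can be sketched in standalone form (plain C++, register form of BT assumed): the bit-test index is taken modulo the operand width, so only the low Log2(BitWidth) bits of the index are demanded.

// Scalar model of BT with a 32-bit register operand: the bit index is
// interpreted modulo 32, so bits of Op1 above Log2_32(BitWidth) are dead,
// which is what lets SimplifyDemandedBits shrink the index operand above.
#include <cassert>
#include <cstdint>

static bool bt32(uint32_t Src, uint32_t BitIdx) {
  return (Src >> (BitIdx & 31)) & 1u;   // only the low 5 bits of BitIdx matter
}

int main() {
  uint32_t V = 0xAu;                       // bits 1 and 3 set
  assert(bt32(V, 3) == bt32(V, 3 + 32));   // high bits of the index are ignored
  assert(bt32(V, 1) && !bt32(V, 2));
}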
25288
25289static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25290 SDValue Op = N->getOperand(0);
25291 if (Op.getOpcode() == ISD::BITCAST)
25292 Op = Op.getOperand(0);
25293 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25294 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25295 VT.getVectorElementType().getSizeInBits() ==
25296 OpVT.getVectorElementType().getSizeInBits()) {
25297 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25298 }
25299 return SDValue();
25300}
25301
25302static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25303 const X86Subtarget *Subtarget) {
25304 EVT VT = N->getValueType(0);
25305 if (!VT.isVector())
25306 return SDValue();
25307
25308 SDValue N0 = N->getOperand(0);
25309 SDValue N1 = N->getOperand(1);
25310 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25311 SDLoc dl(N);
25312
25313 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
25314 // SSE and AVX2 since there is no sign-extended shift right
25315 // operation on a vector with 64-bit elements.
25316 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25317 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25318 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25319 N0.getOpcode() == ISD::SIGN_EXTEND)) {
25320 SDValue N00 = N0.getOperand(0);
25321
25322 // EXTLOAD has a better solution on AVX2:
25323 // it may be replaced with an X86ISD::VSEXT node.
25324 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25325 if (!ISD::isNormalLoad(N00.getNode()))
25326 return SDValue();
25327
25328 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25329 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25330 N00, N1);
25331 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25332 }
25333 }
25334 return SDValue();
25335}
25336
25337static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25338 TargetLowering::DAGCombinerInfo &DCI,
25339 const X86Subtarget *Subtarget) {
25340 SDValue N0 = N->getOperand(0);
25341 EVT VT = N->getValueType(0);
25342
25343 // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25344 // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25345 // This exposes the sext to the sdivrem lowering, so that it directly extends
25346 // from AH (which we otherwise need to do contortions to access).
25347 if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25348 N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25349 SDLoc dl(N);
25350 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25351 SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25352 N0.getOperand(0), N0.getOperand(1));
25353 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25354 return R.getValue(1);
25355 }
25356
25357 if (!DCI.isBeforeLegalizeOps())
25358 return SDValue();
25359
25360 if (!Subtarget->hasFp256())
25361 return SDValue();
25362
25363 if (VT.isVector() && VT.getSizeInBits() == 256) {
25364 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25365 if (R.getNode())
25366 return R;
25367 }
25368
25369 return SDValue();
25370}
25371
25372static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25373 const X86Subtarget* Subtarget) {
25374 SDLoc dl(N);
25375 EVT VT = N->getValueType(0);
25376
25377 // Let legalize expand this if it isn't a legal type yet.
25378 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25379 return SDValue();
25380
25381 EVT ScalarVT = VT.getScalarType();
25382 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25383 (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25384 return SDValue();
25385
25386 SDValue A = N->getOperand(0);
25387 SDValue B = N->getOperand(1);
25388 SDValue C = N->getOperand(2);
25389
25390 bool NegA = (A.getOpcode() == ISD::FNEG);
25391 bool NegB = (B.getOpcode() == ISD::FNEG);
25392 bool NegC = (C.getOpcode() == ISD::FNEG);
25393
25394 // Negative multiplication when NegA xor NegB
25395 bool NegMul = (NegA != NegB);
25396 if (NegA)
25397 A = A.getOperand(0);
25398 if (NegB)
25399 B = B.getOperand(0);
25400 if (NegC)
25401 C = C.getOperand(0);
25402
25403 unsigned Opcode;
25404 if (!NegMul)
25405 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25406 else
25407 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25408
25409 return DAG.getNode(Opcode, dl, VT, A, B, C);
25410}
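For clarity, the opcode selection above can be checked with a scalar sketch (plain C++ doubles, hypothetical helper names standing in for the FMADD/FMSUB/FNMADD/FNMSUB semantics): negating either multiplicand flips to the negated-multiply forms, and negating the addend flips add to sub.

// Scalar reference for the FMA opcode selection above (a*b + c and friends).
#include <cassert>

static double fmadd (double a, double b, double c) { return  a * b + c; }
static double fmsub (double a, double b, double c) { return  a * b - c; }
static double fnmadd(double a, double b, double c) { return -a * b + c; }
static double fnmsub(double a, double b, double c) { return -a * b - c; }

int main() {
  double a = 2, b = 3, c = 5;
  // fma(-a, b, c): NegMul, addend not negated -> FNMADD.
  assert(fmadd(-a, b, c) == fnmadd(a, b, c));
  // fma(a, -b, -c): NegMul and NegC -> FNMSUB.
  assert(fmadd(a, -b, -c) == fnmsub(a, b, c));
  // fma(a, b, -c): only the addend negated -> FMSUB.
  assert(fmadd(a, b, -c) == fmsub(a, b, c));
}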
25411
25412static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25413 TargetLowering::DAGCombinerInfo &DCI,
25414 const X86Subtarget *Subtarget) {
25415 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
25416 // (and (i32 x86isd::setcc_carry), 1)
25417 // This eliminates the zext. This transformation is necessary because
25418 // ISD::SETCC is always legalized to i8.
25419 SDLoc dl(N);
25420 SDValue N0 = N->getOperand(0);
25421 EVT VT = N->getValueType(0);
25422
25423 if (N0.getOpcode() == ISD::AND &&
25424 N0.hasOneUse() &&
25425 N0.getOperand(0).hasOneUse()) {
25426 SDValue N00 = N0.getOperand(0);
25427 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25428 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25429 if (!C || C->getZExtValue() != 1)
25430 return SDValue();
25431 return DAG.getNode(ISD::AND, dl, VT,
25432 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25433 N00.getOperand(0), N00.getOperand(1)),
25434 DAG.getConstant(1, VT));
25435 }
25436 }
25437
25438 if (N0.getOpcode() == ISD::TRUNCATE &&
25439 N0.hasOneUse() &&
25440 N0.getOperand(0).hasOneUse()) {
25441 SDValue N00 = N0.getOperand(0);
25442 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25443 return DAG.getNode(ISD::AND, dl, VT,
25444 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25445 N00.getOperand(0), N00.getOperand(1)),
25446 DAG.getConstant(1, VT));
25447 }
25448 }
25449 if (VT.is256BitVector()) {
25450 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25451 if (R.getNode())
25452 return R;
25453 }
25454
25455 // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25456 // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25457 // This exposes the zext to the udivrem lowering, so that it directly extends
25458 // from AH (which we otherwise need to do contortions to access).
25459 if (N0.getOpcode() == ISD::UDIVREM &&
25460 N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25461 (VT == MVT::i32 || VT == MVT::i64)) {
25462 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25463 SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25464 N0.getOperand(0), N0.getOperand(1));
25465 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25466 return R.getValue(1);
25467 }
25468
25469 return SDValue();
25470}
25471
25472// Optimize x == -y --> x+y == 0
25473// x != -y --> x+y != 0
25474static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25475 const X86Subtarget* Subtarget) {
25476 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25477 SDValue LHS = N->getOperand(0);
25478 SDValue RHS = N->getOperand(1);
25479 EVT VT = N->getValueType(0);
25480 SDLoc DL(N);
25481
25482 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25483 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25484 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25485 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25486 LHS.getValueType(), RHS, LHS.getOperand(1));
25487 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25488 addV, DAG.getConstant(0, addV.getValueType()), CC);
25489 }
25490 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25491 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25492 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25493 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25494 RHS.getValueType(), LHS, RHS.getOperand(1));
25495 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25496 addV, DAG.getConstant(0, addV.getValueType()), CC);
25497 }
25498
25499 if (VT.getScalarType() == MVT::i1) {
25500 bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25501 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25502 bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25503 if (!IsSEXT0 && !IsVZero0)
25504 return SDValue();
25505 bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25506 (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25507 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25508
25509 if (!IsSEXT1 && !IsVZero1)
25510 return SDValue();
25511
25512 if (IsSEXT0 && IsVZero1) {
25513 assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25514 if (CC == ISD::SETEQ)
25515 return DAG.getNOT(DL, LHS.getOperand(0), VT);
25516 return LHS.getOperand(0);
25517 }
25518 if (IsSEXT1 && IsVZero0) {
25519 assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25520 if (CC == ISD::SETEQ)
25521 return DAG.getNOT(DL, RHS.getOperand(0), VT);
25522 return RHS.getOperand(0);
25523 }
25524 }
25525
25526 return SDValue();
25527}
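The identity behind the first rewrite above admits a quick standalone spot check (plain C++ on unsigned 32-bit values, which share the DAG's two's-complement wraparound):

// x == -y is equivalent to x + y == 0 in modular arithmetic; spot-check it.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t x : Samples)
    for (uint32_t y : Samples)
      assert((x == (0u - y)) == (x + y == 0u));
}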
25528
25529static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25530 const X86Subtarget *Subtarget) {
25531 SDLoc dl(N);
25532 MVT VT = N->getOperand(1)->getSimpleValueType(0);
25533 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25534        "X86insertps is only defined for v4x32");
25535
25536 SDValue Ld = N->getOperand(1);
25537 if (MayFoldLoad(Ld)) {
25538 // Extract the countS bits from the immediate so we can get the proper
25539 // address when narrowing the vector load to a specific element.
25540 // When the second source op is a memory address, insertps doesn't use
25541 // countS and just gets an f32 from that address.
25542 unsigned DestIndex =
25543 cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25544 Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25545 } else
25546 return SDValue();
25547
25548 // Create this as a scalar to vector to match the instruction pattern.
25549 SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25550 // countS bits are ignored when loading from memory on insertps, which
25551 // means we don't need to explicitly set them to 0.
25552 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25553 LoadScalarToVector, N->getOperand(2));
25554}
25555
25556 // Helper function of PerformSETCCCombine. It materializes "setb reg"
25557// as "sbb reg,reg", since it can be extended without zext and produces
25558// an all-ones bit which is more useful than 0/1 in some cases.
25559static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25560 MVT VT) {
25561 if (VT == MVT::i8)
25562 return DAG.getNode(ISD::AND, DL, VT,
25563 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25564 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25565 DAG.getConstant(1, VT));
25566 assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25567 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25568 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25569 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25570}
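A small standalone sketch (plain C++) of why the sbb form is preferred: subtracting the carry from zero yields 0 or all-ones, which can be used directly as a mask, whereas setb only gives 0 or 1.

// Scalar model of "sbb reg,reg" after a borrow-producing compare: reg - reg
// - CF is 0 - CF, i.e. 0 or 0xFFFFFFFF, already a full-width mask with no zext.
#include <cassert>
#include <cstdint>

static uint32_t setcc_carry(bool CF) { return 0u - static_cast<uint32_t>(CF); }

int main() {
  assert(setcc_carry(false) == 0u);
  assert(setcc_carry(true)  == 0xFFFFFFFFu);
  // The i8 path above then masks with 1 to recover the 0/1 value of setb:
  assert((setcc_carry(true) & 1u) == 1u);
}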
25571
25572// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25573static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25574 TargetLowering::DAGCombinerInfo &DCI,
25575 const X86Subtarget *Subtarget) {
25576 SDLoc DL(N);
25577 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25578 SDValue EFLAGS = N->getOperand(1);
25579
25580 if (CC == X86::COND_A) {
25581 // Try to convert COND_A into COND_B in an attempt to facilitate
25582 // materializing "setb reg".
25583 //
25584 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25585 // cannot take an immediate as its first operand.
25586 //
25587 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25588 EFLAGS.getValueType().isInteger() &&
25589 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25590 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25591 EFLAGS.getNode()->getVTList(),
25592 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25593 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25594 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25595 }
25596 }
25597
25598 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25599 // a zext and produces an all-ones bit which is more useful than 0/1 in some
25600 // cases.
25601 if (CC == X86::COND_B)
25602 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25603
25604 SDValue Flags;
25605
25606 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25607 if (Flags.getNode()) {
25608 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25609 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25610 }
25611
25612 return SDValue();
25613}
25614
25615// Optimize branch condition evaluation.
25616//
25617static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25618 TargetLowering::DAGCombinerInfo &DCI,
25619 const X86Subtarget *Subtarget) {
25620 SDLoc DL(N);
25621 SDValue Chain = N->getOperand(0);
25622 SDValue Dest = N->getOperand(1);
25623 SDValue EFLAGS = N->getOperand(3);
25624 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25625
25626 SDValue Flags;
25627
25628 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25629 if (Flags.getNode()) {
25630 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25631 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25632 Flags);
25633 }
25634
25635 return SDValue();
25636}
25637
25638static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25639 SelectionDAG &DAG) {
25640 // Take advantage of vector comparisons producing 0 or -1 in each lane to
25641 // optimize away operation when it's from a constant.
25642 //
25643 // The general transformation is:
25644 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25645 // AND(VECTOR_CMP(x,y), constant2)
25646 // constant2 = UNARYOP(constant)
25647
25648 // Early exit if this isn't a vector operation, the operand of the
25649 // unary operation isn't a bitwise AND, or if the sizes of the operations
25650 // aren't the same.
25651 EVT VT = N->getValueType(0);
25652 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25653 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25654 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25655 return SDValue();
25656
25657 // Now check that the other operand of the AND is a constant. We could
25658 // make the transformation for non-constant splats as well, but it's unclear
25659 // that would be a benefit as it would not eliminate any operations, just
25660 // perform one more step in scalar code before moving to the vector unit.
25661 if (BuildVectorSDNode *BV =
25662 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25663 // Bail out if the vector isn't a constant.
25664 if (!BV->isConstant())
25665 return SDValue();
25666
25667 // Everything checks out. Build up the new and improved node.
25668 SDLoc DL(N);
25669 EVT IntVT = BV->getValueType(0);
25670 // Create a new constant of the appropriate type for the transformed
25671 // DAG.
25672 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25673 // The AND node needs bitcasts to/from an integer vector type around it.
25674 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25675 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25676 N->getOperand(0)->getOperand(0), MaskConst);
25677 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25678 return Res;
25679 }
25680
25681 return SDValue();
25682}
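A per-lane scalar model of the rewrite described above (plain C++, assuming SINT_TO_FP as the unary op): because each compare lane is either all-zeros or all-ones, converting the constant once up front and AND-ing afterwards produces the same bits.

// One lane of UNARYOP(AND(VECTOR_CMP(x,y), K)) vs. AND(VECTOR_CMP(x,y), K2)
// with K2 = UNARYOP(K), modelled with a uint32_t mask and sint_to_fp.
#include <cassert>
#include <cstdint>
#include <cstring>

static float    bitsToFloat(uint32_t B) { float F;    std::memcpy(&F, &B, 4); return F; }
static uint32_t floatToBits(float F)    { uint32_t B; std::memcpy(&B, &F, 4); return B; }

int main() {
  const int32_t K  = 42;                         // constant lane of the AND
  const float   K2 = static_cast<float>(K);      // constant2 = UNARYOP(constant)
  for (uint32_t Mask : {0u, 0xFFFFFFFFu}) {      // vector compares yield 0 or -1
    float Before = static_cast<float>(static_cast<int32_t>(Mask & uint32_t(K)));
    float After  = bitsToFloat(Mask & floatToBits(K2));
    assert(floatToBits(Before) == floatToBits(After));
  }
}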
25683
25684static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25685 const X86TargetLowering *XTLI) {
25686 // First try to optimize away the conversion entirely when it's
25687 // conditionally from a constant. Vectors only.
25688 SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25689 if (Res != SDValue())
25690 return Res;
25691
25692 // Now move on to more general possibilities.
25693 SDValue Op0 = N->getOperand(0);
25694 EVT InVT = Op0->getValueType(0);
25695
25696 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25697 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25698 SDLoc dl(N);
25699 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25700 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25701 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25702 }
25703
25704 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25705 // a 32-bit target where SSE doesn't support i64->FP operations.
25706 if (Op0.getOpcode() == ISD::LOAD) {
25707 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25708 EVT VT = Ld->getValueType(0);
25709 if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25710 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25711 !XTLI->getSubtarget()->is64Bit() &&
25712 VT == MVT::i64) {
25713 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25714 Ld->getChain(), Op0, DAG);
25715 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25716 return FILDChain;
25717 }
25718 }
25719 return SDValue();
25720}
25721
25722// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25723static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25724 X86TargetLowering::DAGCombinerInfo &DCI) {
25725 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25726 // the result is either zero or one (depending on the input carry bit).
25727 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25728 if (X86::isZeroNode(N->getOperand(0)) &&
25729 X86::isZeroNode(N->getOperand(1)) &&
25730 // We don't have a good way to replace an EFLAGS use, so only do this when
25731 // dead right now.
25732 SDValue(N, 1).use_empty()) {
25733 SDLoc DL(N);
25734 EVT VT = N->getValueType(0);
25735 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25736 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25737 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25738 DAG.getConstant(X86::COND_B,MVT::i8),
25739 N->getOperand(2)),
25740 DAG.getConstant(1, VT));
25741 return DCI.CombineTo(N, Res1, CarryOut);
25742 }
25743
25744 return SDValue();
25745}
25746
25747// fold (add Y, (sete X, 0)) -> adc 0, Y
25748// (add Y, (setne X, 0)) -> sbb -1, Y
25749// (sub (sete X, 0), Y) -> sbb 0, Y
25750// (sub (setne X, 0), Y) -> adc -1, Y
25751static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25752 SDLoc DL(N);
25753
25754 // Look through ZExts.
25755 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25756 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25757 return SDValue();
25758
25759 SDValue SetCC = Ext.getOperand(0);
25760 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25761 return SDValue();
25762
25763 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25764 if (CC != X86::COND_E && CC != X86::COND_NE)
25765 return SDValue();
25766
25767 SDValue Cmp = SetCC.getOperand(1);
25768 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25769 !X86::isZeroNode(Cmp.getOperand(1)) ||
25770 !Cmp.getOperand(0).getValueType().isInteger())
25771 return SDValue();
25772
25773 SDValue CmpOp0 = Cmp.getOperand(0);
25774 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25775 DAG.getConstant(1, CmpOp0.getValueType()));
25776
25777 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25778 if (CC == X86::COND_NE)
25779 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25780 DL, OtherVal.getValueType(), OtherVal,
25781 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25782 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25783 DL, OtherVal.getValueType(), OtherVal,
25784 DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25785}
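For the first of the folds listed above, a standalone scalar check (plain C++): after cmp X,1 the carry flag is set exactly when X is 0 (unsigned X < 1), so adc 0, Y reproduces Y + (X == 0).

// fold (add Y, (sete X, 0)) -> adc 0, Y:
// cmp X, 1 sets CF iff X < 1 (unsigned), i.e. iff X == 0, and adc adds CF in.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0u, 1u, 2u, 0xFFFFFFFFu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples) {
      uint32_t CF  = (X < 1u) ? 1u : 0u;        // carry out of cmp X, 1
      uint32_t Adc = Y + 0u + CF;               // adc 0, Y
      assert(Adc == Y + (X == 0u ? 1u : 0u));   // add Y, (sete X, 0)
    }
}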
25786
25787/// PerformADDCombine - Do target-specific dag combines on integer adds.
25788static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25789 const X86Subtarget *Subtarget) {
25790 EVT VT = N->getValueType(0);
25791 SDValue Op0 = N->getOperand(0);
25792 SDValue Op1 = N->getOperand(1);
25793
25794 // Try to synthesize horizontal adds from adds of shuffles.
25795 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25796 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25797 isHorizontalBinOp(Op0, Op1, true))
25798 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25799
25800 return OptimizeConditionalInDecrement(N, DAG);
25801}
25802
25803static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25804 const X86Subtarget *Subtarget) {
25805 SDValue Op0 = N->getOperand(0);
25806 SDValue Op1 = N->getOperand(1);
25807
25808 // X86 can't encode an immediate LHS of a sub. See if we can push the
25809 // negation into a preceding instruction.
25810 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25811 // If the RHS of the sub is a XOR with one use and a constant, invert the
25812 // immediate. Then add one to the LHS of the sub so we can turn
25813 // X-Y -> X+~Y+1, saving one register.
25814 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25815 isa<ConstantSDNode>(Op1.getOperand(1))) {
25816 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25817 EVT VT = Op0.getValueType();
25818 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25819 Op1.getOperand(0),
25820 DAG.getConstant(~XorC, VT));
25821 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25822 DAG.getConstant(C->getAPIntValue()+1, VT));
25823 }
25824 }
25825
25826 // Try to synthesize horizontal subs from subs of shuffles.
25827 EVT VT = N->getValueType(0);
25828 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25829 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25830 isHorizontalBinOp(Op0, Op1, true))
25831 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25832
25833 return OptimizeConditionalInDecrement(N, DAG);
25834}
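The immediate-LHS rewrite above rests on a modular-arithmetic identity that a standalone check makes explicit (plain C++, unsigned wraparound, hypothetical constants): C - (Y ^ K) equals (Y ^ ~K) + (C + 1).

// X86 can't encode an immediate LHS for sub, so C - (Y ^ K) is rewritten as
// (Y ^ ~K) + (C + 1); a quick spot check of that identity.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 100u, K = 0x0F0Fu;              // hypothetical constants
  const uint32_t Samples[] = {0u, 1u, 0x1234u, 0xFFFFFFFFu};
  for (uint32_t Y : Samples)
    assert(C - (Y ^ K) == (Y ^ ~K) + (C + 1u));
}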
25835
25836 /// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
25837static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25838 TargetLowering::DAGCombinerInfo &DCI,
25839 const X86Subtarget *Subtarget) {
25840 SDLoc DL(N);
25841 MVT VT = N->getSimpleValueType(0);
25842 SDValue Op = N->getOperand(0);
25843 MVT OpVT = Op.getSimpleValueType();
25844 MVT OpEltVT = OpVT.getVectorElementType();
25845 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25846
25847 // (vzext (bitcast (vzext (x)) -> (vzext x)
25848 SDValue V = Op;
25849 while (V.getOpcode() == ISD::BITCAST)
25850 V = V.getOperand(0);
25851
25852 if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25853 MVT InnerVT = V.getSimpleValueType();
25854 MVT InnerEltVT = InnerVT.getVectorElementType();
25855
25856 // If the element sizes match exactly, we can just do one larger vzext. This
25857 // is always an exact type match as vzext operates on integer types.
25858 if (OpEltVT == InnerEltVT) {
25859 assert(OpVT == InnerVT && "Types must match for vzext!");
25860 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25861 }
25862
25863 // The only other way we can combine them is if only a single element of the
25864 // inner vzext is used in the input to the outer vzext.
25865 if (InnerEltVT.getSizeInBits() < InputBits)
25866 return SDValue();
25867
25868 // In this case, the inner vzext is completely dead because we're going to
25869 // only look at bits inside of the low element. Just do the outer vzext on
25870 // a bitcast of the input to the inner.
25871 return DAG.getNode(X86ISD::VZEXT, DL, VT,
25872 DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25873 }
25874
25875 // Check if we can bypass extracting and re-inserting an element of an input
25876 // vector. Essentially:
25877 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25878 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25879 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25880 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25881 SDValue ExtractedV = V.getOperand(0);
25882 SDValue OrigV = ExtractedV.getOperand(0);
25883 if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25884 if (ExtractIdx->getZExtValue() == 0) {
25885 MVT OrigVT = OrigV.getSimpleValueType();
25886 // Extract a subvector if necessary...
25887 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25888 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25889 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25890 OrigVT.getVectorNumElements() / Ratio);
25891 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25892 DAG.getIntPtrConstant(0));
25893 }
25894 Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25895 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25896 }
25897 }
25898
25899 return SDValue();
25900}
25901
25902SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25903 DAGCombinerInfo &DCI) const {
25904 SelectionDAG &DAG = DCI.DAG;
25905 switch (N->getOpcode()) {
25906 default: break;
25907 case ISD::EXTRACT_VECTOR_ELT:
25908 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25909 case ISD::VSELECT:
25910 case ISD::SELECT:
25911 case X86ISD::SHRUNKBLEND:
25912 return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25913 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25914 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
25915 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
25916 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
25917 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
25918 case ISD::SHL:
25919 case ISD::SRA:
25920 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
25921 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
25922 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
25923 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
25924 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
25925 case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
25926 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
25927 case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
25928 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
25929 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
25930 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
25931 case X86ISD::FXOR:
25932 case X86ISD::FOR: return PerformFORCombine(N, DAG);
25933 case X86ISD::FMIN:
25934 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
25935 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
25936 case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
25937 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
25938 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
25939 case ISD::ANY_EXTEND:
25940 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
25941 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
25942 case ISD::SIGN_EXTEND_INREG:
25943 return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
25944 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
25945 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
25946 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
25947 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
25948 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
25949 case X86ISD::SHUFP: // Handle all target specific shuffles
25950 case X86ISD::PALIGNR:
25951 case X86ISD::UNPCKH:
25952 case X86ISD::UNPCKL:
25953 case X86ISD::MOVHLPS:
25954 case X86ISD::MOVLHPS:
25955 case X86ISD::PSHUFB:
25956 case X86ISD::PSHUFD:
25957 case X86ISD::PSHUFHW:
25958 case X86ISD::PSHUFLW:
25959 case X86ISD::MOVSS:
25960 case X86ISD::MOVSD:
25961 case X86ISD::VPERMILPI:
25962 case X86ISD::VPERM2X128:
25963 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
25964 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
25965 case ISD::INTRINSIC_WO_CHAIN:
25966 return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
25967 case X86ISD::INSERTPS: {
25968 if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
25969 return PerformINSERTPSCombine(N, DAG, Subtarget);
25970 break;
25971 }
25972 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
25973 }
25974
25975 return SDValue();
25976}
25977
25978/// isTypeDesirableForOp - Return true if the target has native support for
25979/// the specified value type and it is 'desirable' to use the type for the
25980/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
25981/// instruction encodings are longer and some i16 instructions are slow.
25982bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
25983 if (!isTypeLegal(VT))
25984 return false;
25985 if (VT != MVT::i16)
25986 return true;
25987
25988 switch (Opc) {
25989 default:
25990 return true;
25991 case ISD::LOAD:
25992 case ISD::SIGN_EXTEND:
25993 case ISD::ZERO_EXTEND:
25994 case ISD::ANY_EXTEND:
25995 case ISD::SHL:
25996 case ISD::SRL:
25997 case ISD::SUB:
25998 case ISD::ADD:
25999 case ISD::MUL:
26000 case ISD::AND:
26001 case ISD::OR:
26002 case ISD::XOR:
26003 return false;
26004 }
26005}
26006
26007 /// IsDesirableToPromoteOp - This method queries the target whether it is
26008/// beneficial for dag combiner to promote the specified node. If true, it
26009/// should return the desired promotion type by reference.
26010bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26011 EVT VT = Op.getValueType();
26012 if (VT != MVT::i16)
26013 return false;
26014
26015 bool Promote = false;
26016 bool Commute = false;
26017 switch (Op.getOpcode()) {
26018 default: break;
26019 case ISD::LOAD: {
26020 LoadSDNode *LD = cast<LoadSDNode>(Op);
26021 // If the non-extending load has a single use and it's not live out, then it
26022 // might be folded.
26023 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26024 Op.hasOneUse()*/) {
26025 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26026 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26027 // The only case where we'd want to promote LOAD (rather than it being
26028 // promoted as an operand) is when its only use is live-out.
26029 if (UI->getOpcode() != ISD::CopyToReg)
26030 return false;
26031 }
26032 }
26033 Promote = true;
26034 break;
26035 }
26036 case ISD::SIGN_EXTEND:
26037 case ISD::ZERO_EXTEND:
26038 case ISD::ANY_EXTEND:
26039 Promote = true;
26040 break;
26041 case ISD::SHL:
26042 case ISD::SRL: {
26043 SDValue N0 = Op.getOperand(0);
26044 // Look out for (store (shl (load), x)).
26045 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26046 return false;
26047 Promote = true;
26048 break;
26049 }
26050 case ISD::ADD:
26051 case ISD::MUL:
26052 case ISD::AND:
26053 case ISD::OR:
26054 case ISD::XOR:
26055 Commute = true;
26056 // fallthrough
26057 case ISD::SUB: {
26058 SDValue N0 = Op.getOperand(0);
26059 SDValue N1 = Op.getOperand(1);
26060 if (!Commute && MayFoldLoad(N1))
26061 return false;
26062 // Avoid disabling potential load folding opportunities.
26063 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26064 return false;
26065 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26066 return false;
26067 Promote = true;
26068 }
26069 }
26070
26071 PVT = MVT::i32;
26072 return Promote;
26073}
26074
26075//===----------------------------------------------------------------------===//
26076// X86 Inline Assembly Support
26077//===----------------------------------------------------------------------===//
26078
26079namespace {
26080 // Helper to match a string against a sequence of pieces separated by whitespace.
26081 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26082 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26083
26084 for (unsigned i = 0, e = args.size(); i != e; ++i) {
26085 StringRef piece(*args[i]);
26086 if (!s.startswith(piece)) // Check if the piece matches.
26087 return false;
26088
26089 s = s.substr(piece.size());
26090 StringRef::size_type pos = s.find_first_not_of(" \t");
26091 if (pos == 0) // We matched a prefix.
26092 return false;
26093
26094 s = s.substr(pos);
26095 }
26096
26097 return s.empty();
26098 }
26099 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26100}
26101
26102static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26103
26104 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26105 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26106 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26107 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26108
26109 if (AsmPieces.size() == 3)
26110 return true;
26111 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26112 return true;
26113 }
26114 }
26115 return false;
26116}
26117
26118bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26119 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26120
26121 std::string AsmStr = IA->getAsmString();
26122
26123 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26124 if (!Ty || Ty->getBitWidth() % 16 != 0)
26125 return false;
26126
26127 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26128 SmallVector<StringRef, 4> AsmPieces;
26129 SplitString(AsmStr, AsmPieces, ";\n");
26130
26131 switch (AsmPieces.size()) {
26132 default: return false;
26133 case 1:
26134 // FIXME: this should verify that we are targeting a 486 or better. If not,
26135 // we will turn this bswap into something that will be lowered to logical
26136 // ops instead of emitting the bswap asm. For now, we don't support 486 or
26137 // lower so don't worry about this.
26138 // bswap $0
26139 if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26140 matchAsm(AsmPieces[0], "bswapl", "$0") ||
26141 matchAsm(AsmPieces[0], "bswapq", "$0") ||
26142 matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26143 matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26144 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26145 // No need to check constraints, nothing other than the equivalent of
26146 // "=r,0" would be valid here.
26147 return IntrinsicLowering::LowerToByteSwap(CI);
26148 }
26149
26150 // rorw $$8, ${0:w} --> llvm.bswap.i16
26151 if (CI->getType()->isIntegerTy(16) &&
26152 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26153 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26154 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26155 AsmPieces.clear();
26156 const std::string &ConstraintsStr = IA->getConstraintString();
26157 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26158 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26159 if (clobbersFlagRegisters(AsmPieces))
26160 return IntrinsicLowering::LowerToByteSwap(CI);
26161 }
26162 break;
26163 case 3:
26164 if (CI->getType()->isIntegerTy(32) &&
26165 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26166 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26167 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26168 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26169 AsmPieces.clear();
26170 const std::string &ConstraintsStr = IA->getConstraintString();
26171 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26172 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26173 if (clobbersFlagRegisters(AsmPieces))
26174 return IntrinsicLowering::LowerToByteSwap(CI);
26175 }
26176
26177 if (CI->getType()->isIntegerTy(64)) {
26178 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26179 if (Constraints.size() >= 2 &&
26180 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26181 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26182 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
26183 if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26184 matchAsm(AsmPieces[1], "bswap", "%edx") &&
26185 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26186 return IntrinsicLowering::LowerToByteSwap(CI);
26187 }
26188 }
26189 break;
26190 }
26191 return false;
26192}
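As a concrete illustration of the case-3/i64 pattern above (assuming a 32-bit x86 target and a hypothetical swap64 wrapper name), the classic hand-written 64-bit byte swap below uses the "A" (EDX:EAX) constraint and is exactly the bswap/bswap/xchgl sequence that gets rewritten to the llvm.bswap.i64 intrinsic:

// Illustration only, for a 32-bit x86 target: the three-instruction i64 swap
// that the Constraints[0] == "A" / Constraints[1] == "0" check above targets.
static inline unsigned long long swap64(unsigned long long x) {
  __asm__("bswap %%eax\n\t"
          "bswap %%edx\n\t"
          "xchgl %%eax, %%edx"
          : "=A"(x)
          : "0"(x));
  return x;
}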
26193
26194/// getConstraintType - Given a constraint letter, return the type of
26195/// constraint it is for this target.
26196X86TargetLowering::ConstraintType
26197X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26198 if (Constraint.size() == 1) {
26199 switch (Constraint[0]) {
26200 case 'R':
26201 case 'q':
26202 case 'Q':
26203 case 'f':
26204 case 't':
26205 case 'u':
26206 case 'y':
26207 case 'x':
26208 case 'Y':
26209 case 'l':
26210 return C_RegisterClass;
26211 case 'a':
26212 case 'b':
26213 case 'c':
26214 case 'd':
26215 case 'S':
26216 case 'D':
26217 case 'A':
26218 return C_Register;
26219 case 'I':
26220 case 'J':
26221 case 'K':
26222 case 'L':
26223 case 'M':
26224 case 'N':
26225 case 'G':
26226 case 'C':
26227 case 'e':
26228 case 'Z':
26229 return C_Other;
26230 default:
26231 break;
26232 }
26233 }
26234 return TargetLowering::getConstraintType(Constraint);
26235}
26236
26237/// Examine constraint type and operand type and determine a weight value.
26238/// This object must already have been set up with the operand type
26239/// and the current alternative constraint selected.
26240TargetLowering::ConstraintWeight
26241 X86TargetLowering::getSingleConstraintMatchWeight(
26242 AsmOperandInfo &info, const char *constraint) const {
26243 ConstraintWeight weight = CW_Invalid;
26244 Value *CallOperandVal = info.CallOperandVal;
26245 // If we don't have a value, we can't do a match,
26246 // but allow it at the lowest weight.
26247 if (!CallOperandVal)
26248 return CW_Default;
26249 Type *type = CallOperandVal->getType();
26250 // Look at the constraint type.
26251 switch (*constraint) {
26252 default:
26253 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26254 case 'R':
26255 case 'q':
26256 case 'Q':
26257 case 'a':
26258 case 'b':
26259 case 'c':
26260 case 'd':
26261 case 'S':
26262 case 'D':
26263 case 'A':
26264 if (CallOperandVal->getType()->isIntegerTy())
26265 weight = CW_SpecificReg;
26266 break;
26267 case 'f':
26268 case 't':
26269 case 'u':
26270 if (type->isFloatingPointTy())
26271 weight = CW_SpecificReg;
26272 break;
26273 case 'y':
26274 if (type->isX86_MMXTy() && Subtarget->hasMMX())
26275 weight = CW_SpecificReg;
26276 break;
26277 case 'x':
26278 case 'Y':
26279 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26280 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26281 weight = CW_Register;
26282 break;
26283 case 'I':
26284 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26285 if (C->getZExtValue() <= 31)
26286 weight = CW_Constant;
26287 }
26288 break;
26289 case 'J':
26290 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26291 if (C->getZExtValue() <= 63)
26292 weight = CW_Constant;
26293 }
26294 break;
26295 case 'K':
26296 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26297 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26298 weight = CW_Constant;
26299 }
26300 break;
26301 case 'L':
26302 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26303 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26304 weight = CW_Constant;
26305 }
26306 break;
26307 case 'M':
26308 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26309 if (C->getZExtValue() <= 3)
26310 weight = CW_Constant;
26311 }
26312 break;
26313 case 'N':
26314 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26315 if (C->getZExtValue() <= 0xff)
26316 weight = CW_Constant;
26317 }
26318 break;
26319 case 'G':
26320 case 'C':
26321 if (dyn_cast<ConstantFP>(CallOperandVal)) {
26322 weight = CW_Constant;
26323 }
26324 break;
26325 case 'e':
26326 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26327 if ((C->getSExtValue() >= -0x80000000LL) &&
26328 (C->getSExtValue() <= 0x7fffffffLL))
26329 weight = CW_Constant;
26330 }
26331 break;
26332 case 'Z':
26333 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26334 if (C->getZExtValue() <= 0xffffffff)
26335 weight = CW_Constant;
26336 }
26337 break;
26338 }
26339 return weight;
26340}
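For example, the 'N' range check above corresponds to the unsigned 8-bit immediate accepted by the one-byte port forms of in/out (users typically write the combined constraint "Nd" so larger port numbers can fall back to DX). A rough illustration with a hypothetical outb_port wrapper:

// Illustration only: port 0x80 fits the 'N' range (<= 0xff), so the constant
// gets CW_Constant weight here; a port number above 0xff would not.
static inline void outb_port(unsigned char value) {
  __asm__ volatile("outb %0, %1" : : "a"(value), "N"(0x80));
}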
26341
26342/// LowerXConstraint - try to replace an X constraint, which matches anything,
26343/// with another that has more specific requirements based on the type of the
26344/// corresponding operand.
26345const char *X86TargetLowering::
26346LowerXConstraint(EVT ConstraintVT) const {
26347 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26348 // 'f' like normal targets.
26349 if (ConstraintVT.isFloatingPoint()) {
26350 if (Subtarget->hasSSE2())
26351 return "Y";
26352 if (Subtarget->hasSSE1())
26353 return "x";
26354 }
26355
26356 return TargetLowering::LowerXConstraint(ConstraintVT);
26357}
26358
26359/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26360/// vector. If it is invalid, don't add anything to Ops.
26361void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26362 std::string &Constraint,
26363 std::vector<SDValue>&Ops,
26364 SelectionDAG &DAG) const {
26365 SDValue Result;
26366
26367 // Only support length 1 constraints for now.
26368 if (Constraint.length() > 1) return;
26369
26370 char ConstraintLetter = Constraint[0];
26371 switch (ConstraintLetter) {
26372 default: break;
26373 case 'I':
26374 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26375 if (C->getZExtValue() <= 31) {
26376 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26377 break;
26378 }
26379 }
26380 return;
26381 case 'J':
26382 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26383 if (C->getZExtValue() <= 63) {
26384 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26385 break;
26386 }
26387 }
26388 return;
26389 case 'K':
26390 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26391 if (isInt<8>(C->getSExtValue())) {
26392 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26393 break;
26394 }
26395 }
26396 return;
26397 case 'L':
26398 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26399 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26400 (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26401 Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26402 break;
26403 }
26404 }
26405 return;
26406 case 'M':
26407 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26408 if (C->getZExtValue() <= 3) {
26409 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26410 break;
26411 }
26412 }
26413 return;
26414 case 'N':
26415 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26416 if (C->getZExtValue() <= 255) {
26417 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26418 break;
26419 }
26420 }
26421 return;
26422 case 'O':
26423 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26424 if (C->getZExtValue() <= 127) {
26425 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26426 break;
26427 }
26428 }
26429 return;
26430 case 'e': {
26431 // 32-bit signed value
26432 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26433 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26434 C->getSExtValue())) {
26435 // Widen to 64 bits here to get it sign extended.
26436 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26437 break;
26438 }
26439 // FIXME gcc accepts some relocatable values here too, but only in certain
26440 // memory models; it's complicated.
26441 }
26442 return;
26443 }
26444 case 'Z': {
26445 // 32-bit unsigned value
26446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26447 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26448 C->getZExtValue())) {
26449 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26450 break;
26451 }
26452 }
26453 // FIXME gcc accepts some relocatable values here too, but only in certain
26454 // memory models; it's complicated.
26455 return;
26456 }
26457 case 'i': {
26458 // Literal immediates are always ok.
26459 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26460 // Widen to 64 bits here to get it sign extended.
26461 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26462 break;
26463 }
26464
26465 // In any sort of PIC mode addresses need to be computed at runtime by
26466 // adding in a register or some sort of table lookup. These can't
26467 // be used as immediates.
26468 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26469 return;
26470
26471 // If we are in non-pic codegen mode, we allow the address of a global (with
26472 // an optional displacement) to be used with 'i'.
26473 GlobalAddressSDNode *GA = nullptr;
26474 int64_t Offset = 0;
26475
26476 // Match either (GA), (GA+C), (GA+C1+C2), etc.
26477 while (1) {
26478 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26479 Offset += GA->getOffset();
26480 break;
26481 } else if (Op.getOpcode() == ISD::ADD) {
26482 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26483 Offset += C->getZExtValue();
26484 Op = Op.getOperand(0);
26485 continue;
26486 }
26487 } else if (Op.getOpcode() == ISD::SUB) {
26488 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26489 Offset += -C->getZExtValue();
26490 Op = Op.getOperand(0);
26491 continue;
26492 }
26493 }
26494
26495 // Otherwise, this isn't something we can handle, reject it.
26496 return;
26497 }
26498
26499 const GlobalValue *GV = GA->getGlobal();
26500 // If we require an extra load to get this address, as in PIC mode, we
26501 // can't accept it.
26502 if (isGlobalStubReference(
26503 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26504 return;
26505
26506 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26507 GA->getValueType(0), Offset);
26508 break;
26509 }
26510 }
26511
26512 if (Result.getNode()) {
26513 Ops.push_back(Result);
26514 return;
26515 }
26516 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26517}
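The 'e' case above only admits immediates that fit in a sign-extended 32-bit field, which is what 64-bit ALU encodings such as addq can carry directly. A rough illustration, assuming an x86-64 target and a hypothetical add_imm wrapper:

// Illustration only: 1000000 fits in a sign-extended 32-bit immediate, so the
// range check above accepts it; something like (1LL << 40) would be rejected
// and could not be passed through the 'e' constraint.
static inline long long add_imm(long long x) {
  __asm__("addq %1, %0" : "+r"(x) : "e"(1000000));
  return x;
}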
26518
26519std::pair<unsigned, const TargetRegisterClass*>
26520X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26521 MVT VT) const {
26522 // First, see if this is a constraint that directly corresponds to an LLVM
26523 // register class.
26524 if (Constraint.size() == 1) {
26525 // GCC Constraint Letters
26526 switch (Constraint[0]) {
26527 default: break;
26528 // TODO: Slight differences here in allocation order and leaving
26529 // RIP in the class. Do they matter any more here than they do
26530 // in the normal allocation?
26531 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26532 if (Subtarget->is64Bit()) {
26533 if (VT == MVT::i32 || VT == MVT::f32)
26534 return std::make_pair(0U, &X86::GR32RegClass);
26535 if (VT == MVT::i16)
26536 return std::make_pair(0U, &X86::GR16RegClass);
26537 if (VT == MVT::i8 || VT == MVT::i1)
26538 return std::make_pair(0U, &X86::GR8RegClass);
26539 if (VT == MVT::i64 || VT == MVT::f64)
26540 return std::make_pair(0U, &X86::GR64RegClass);
26541 break;
26542 }
26543 // 32-bit fallthrough
26544 case 'Q': // Q_REGS
26545 if (VT == MVT::i32 || VT == MVT::f32)
26546 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26547 if (VT == MVT::i16)
26548 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26549 if (VT == MVT::i8 || VT == MVT::i1)
26550 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26551 if (VT == MVT::i64)
26552 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26553 break;
26554 case 'r': // GENERAL_REGS
26555 case 'l': // INDEX_REGS
26556 if (VT == MVT::i8 || VT == MVT::i1)
26557 return std::make_pair(0U, &X86::GR8RegClass);
26558 if (VT == MVT::i16)
26559 return std::make_pair(0U, &X86::GR16RegClass);
26560 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26561 return std::make_pair(0U, &X86::GR32RegClass);
26562 return std::make_pair(0U, &X86::GR64RegClass);
26563 case 'R': // LEGACY_REGS
26564 if (VT == MVT::i8 || VT == MVT::i1)
26565 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26566 if (VT == MVT::i16)
26567 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26568 if (VT == MVT::i32 || !Subtarget->is64Bit())
26569 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26570 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26571 case 'f': // FP Stack registers.
26572 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26573 // value to the correct fpstack register class.
26574 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26575 return std::make_pair(0U, &X86::RFP32RegClass);
26576 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26577 return std::make_pair(0U, &X86::RFP64RegClass);
26578 return std::make_pair(0U, &X86::RFP80RegClass);
26579 case 'y': // MMX_REGS if MMX allowed.
26580 if (!Subtarget->hasMMX()) break;
26581 return std::make_pair(0U, &X86::VR64RegClass);
26582 case 'Y': // SSE_REGS if SSE2 allowed
26583 if (!Subtarget->hasSSE2()) break;
26584 // FALL THROUGH.
26585 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26586 if (!Subtarget->hasSSE1()) break;
26587
26588 switch (VT.SimpleTy) {
26589 default: break;
26590 // Scalar SSE types.
26591 case MVT::f32:
26592 case MVT::i32:
26593 return std::make_pair(0U, &X86::FR32RegClass);
26594 case MVT::f64:
26595 case MVT::i64:
26596 return std::make_pair(0U, &X86::FR64RegClass);
26597 // Vector types.
26598 case MVT::v16i8:
26599 case MVT::v8i16:
26600 case MVT::v4i32:
26601 case MVT::v2i64:
26602 case MVT::v4f32:
26603 case MVT::v2f64:
26604 return std::make_pair(0U, &X86::VR128RegClass);
26605 // AVX types.
26606 case MVT::v32i8:
26607 case MVT::v16i16:
26608 case MVT::v8i32:
26609 case MVT::v4i64:
26610 case MVT::v8f32:
26611 case MVT::v4f64:
26612 return std::make_pair(0U, &X86::VR256RegClass);
26613 case MVT::v8f64:
26614 case MVT::v16f32:
26615 case MVT::v16i32:
26616 case MVT::v8i64:
26617 return std::make_pair(0U, &X86::VR512RegClass);
26618 }
26619 break;
26620 }
26621 }
26622
26623 // Use the default implementation in TargetLowering to convert the register
26624 // constraint into a member of a register class.
26625 std::pair<unsigned, const TargetRegisterClass*> Res;
26626 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26627
26628 // Not found as a standard register?
26629 if (!Res.second) {
26630    // Map {st(0)} .. {st(7)} to the corresponding FP stack register FP0 .. FP7.
26631 if (Constraint.size() == 7 && Constraint[0] == '{' &&
26632 tolower(Constraint[1]) == 's' &&
26633 tolower(Constraint[2]) == 't' &&
26634 Constraint[3] == '(' &&
26635 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26636 Constraint[5] == ')' &&
26637 Constraint[6] == '}') {
26638
26639 Res.first = X86::FP0+Constraint[4]-'0';
26640 Res.second = &X86::RFP80RegClass;
26641 return Res;
26642 }
26643
26644 // GCC allows "st(0)" to be called just plain "st".
26645 if (StringRef("{st}").equals_lower(Constraint)) {
26646 Res.first = X86::FP0;
26647 Res.second = &X86::RFP80RegClass;
26648 return Res;
26649 }
26650
26651 // flags -> EFLAGS
26652 if (StringRef("{flags}").equals_lower(Constraint)) {
26653 Res.first = X86::EFLAGS;
26654 Res.second = &X86::CCRRegClass;
26655 return Res;
26656 }
26657
26658 // 'A' means EAX + EDX.
26659 if (Constraint == "A") {
26660 Res.first = X86::EAX;
26661 Res.second = &X86::GR32_ADRegClass;
26662 return Res;
26663 }
26664 return Res;
26665 }
26666
26667 // Otherwise, check to see if this is a register class of the wrong value
26668  // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
26669 // turn into {ax},{dx}.
26670 if (Res.second->hasType(VT))
26671 return Res; // Correct type already, nothing to do.
26672
26673 // All of the single-register GCC register classes map their values onto
26674 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
26675 // really want an 8-bit or 32-bit register, map to the appropriate register
26676 // class and return the appropriate register.
26677 if (Res.second == &X86::GR16RegClass) {
26678 if (VT == MVT::i8 || VT == MVT::i1) {
26679 unsigned DestReg = 0;
26680 switch (Res.first) {
26681 default: break;
26682 case X86::AX: DestReg = X86::AL; break;
26683 case X86::DX: DestReg = X86::DL; break;
26684 case X86::CX: DestReg = X86::CL; break;
26685 case X86::BX: DestReg = X86::BL; break;
26686 }
26687 if (DestReg) {
26688 Res.first = DestReg;
26689 Res.second = &X86::GR8RegClass;
26690 }
26691 } else if (VT == MVT::i32 || VT == MVT::f32) {
26692 unsigned DestReg = 0;
26693 switch (Res.first) {
26694 default: break;
26695 case X86::AX: DestReg = X86::EAX; break;
26696 case X86::DX: DestReg = X86::EDX; break;
26697 case X86::CX: DestReg = X86::ECX; break;
26698 case X86::BX: DestReg = X86::EBX; break;
26699 case X86::SI: DestReg = X86::ESI; break;
26700 case X86::DI: DestReg = X86::EDI; break;
26701 case X86::BP: DestReg = X86::EBP; break;
26702 case X86::SP: DestReg = X86::ESP; break;
26703 }
26704 if (DestReg) {
26705 Res.first = DestReg;
26706 Res.second = &X86::GR32RegClass;
26707 }
26708 } else if (VT == MVT::i64 || VT == MVT::f64) {
26709 unsigned DestReg = 0;
26710 switch (Res.first) {
26711 default: break;
26712 case X86::AX: DestReg = X86::RAX; break;
26713 case X86::DX: DestReg = X86::RDX; break;
26714 case X86::CX: DestReg = X86::RCX; break;
26715 case X86::BX: DestReg = X86::RBX; break;
26716 case X86::SI: DestReg = X86::RSI; break;
26717 case X86::DI: DestReg = X86::RDI; break;
26718 case X86::BP: DestReg = X86::RBP; break;
26719 case X86::SP: DestReg = X86::RSP; break;
26720 }
26721 if (DestReg) {
26722 Res.first = DestReg;
26723 Res.second = &X86::GR64RegClass;
26724 }
26725 }
26726 } else if (Res.second == &X86::FR32RegClass ||
26727 Res.second == &X86::FR64RegClass ||
26728 Res.second == &X86::VR128RegClass ||
26729 Res.second == &X86::VR256RegClass ||
26730 Res.second == &X86::FR32XRegClass ||
26731 Res.second == &X86::FR64XRegClass ||
26732 Res.second == &X86::VR128XRegClass ||
26733 Res.second == &X86::VR256XRegClass ||
26734 Res.second == &X86::VR512RegClass) {
26735 // Handle references to XMM physical registers that got mapped into the
26736 // wrong class. This can happen with constraints like {xmm0} where the
26737 // target independent register mapper will just pick the first match it can
26738 // find, ignoring the required type.
26739
26740 if (VT == MVT::f32 || VT == MVT::i32)
26741 Res.second = &X86::FR32RegClass;
26742 else if (VT == MVT::f64 || VT == MVT::i64)
26743 Res.second = &X86::FR64RegClass;
26744 else if (X86::VR128RegClass.hasType(VT))
26745 Res.second = &X86::VR128RegClass;
26746 else if (X86::VR256RegClass.hasType(VT))
26747 Res.second = &X86::VR256RegClass;
26748 else if (X86::VR512RegClass.hasType(VT))
26749 Res.second = &X86::VR512RegClass;
26750 }
26751
26752 return Res;
26753}
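As an illustration of the register-class choices above (hypothetical carry_of_add wrapper), the "q" constraint below keeps the i8 result in one of a/b/c/d on 32-bit targets (the GR8_ABCD_LRegClass case), i.e. the registers that actually have a byte sub-register there:

// Illustration only: "setc %0" needs a byte register, so the output uses "q".
static inline unsigned char carry_of_add(unsigned a, unsigned b) {
  unsigned char c;
  __asm__("addl %2, %1\n\t"
          "setc %0"
          : "=q"(c), "+r"(a)
          : "r"(b)
          : "cc");
  return c;
}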
26754
26755int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26756 Type *Ty) const {
26757 // Scaling factors are not free at all.
26758 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26759 // will take 2 allocations in the out of order engine instead of 1
26760 // for plain addressing mode, i.e. inst (reg1).
26761 // E.g.,
26762  // vaddps (%rsi,%rdx), %ymm0, %ymm1
26763 // Requires two allocations (one for the load, one for the computation)
26764 // whereas:
26765 // vaddps (%rsi), %ymm0, %ymm1
26766  // Requires just one allocation, freeing an allocation for other operations
26767  // and leaving fewer micro-operations to execute.
26768 //
26769 // For some X86 architectures, this is even worse because for instance for
26770 // stores, the complex addressing mode forces the instruction to use the
26771 // "load" ports instead of the dedicated "store" port.
26772 // E.g., on Haswell:
26773 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26774 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26775 if (isLegalAddressingMode(AM, Ty))
26776    // Scale represents reg2 * scale, so charge a cost of 1
26777    // as soon as a second (index) register is used.
26778 return AM.Scale != 0;
26779 return -1;
26780}
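A minimal sketch of the cost rule above (illustration only, not the LLVM interface; names are hypothetical): a legal mode is charged one extra unit as soon as it uses a scaled index register, and an illegal mode is reported as -1.

struct AddrModeSketch {
  bool HasBaseReg;  // e.g. %rsi in (%rsi,%rdx,4)
  long long Scale;  // 0 when there is no index register
};

static int scalingFactorCostSketch(const AddrModeSketch &AM, bool IsLegal) {
  if (IsLegal)
    return AM.Scale != 0;  // (%rsi) -> 0, (%rsi,%rdx,4) -> 1
  return -1;               // unsupported addressing mode
}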
26781
26782bool X86TargetLowering::isTargetFTOL() const {
26783 return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
26784}