LLVM 22.0.0git
X86FixupInstTuning.cpp
Go to the documentation of this file.
1//===-- X86FixupInstTuning.cpp - replace instructions ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file does a tuning pass replacing slower machine instructions
10// with faster ones. We do this here, as opposed to during normal ISel, as
11// attempting to get the "right" instruction can break patterns. This pass
12// is not meant to search for special cases where an instruction can be transformed
13// to another; it is only meant to do transformations where the old instruction
14// is always replaceable with the new instruction. For example:
15//
16// `vpermq ymm` -> `vshufd ymm`
17// -- BAD, not always valid (lane cross/non-repeated mask)
18//
19// `vpermilps ymm` -> `vshufd ymm`
20// -- GOOD, always replaceable
21//
22//===----------------------------------------------------------------------===//
23
24#include "X86.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/Statistic.h"
30
31using namespace llvm;
32
33#define DEBUG_TYPE "x86-fixup-inst-tuning"
34
35STATISTIC(NumInstChanges, "Number of instructions changes");
36
37namespace {
38class X86FixupInstTuningPass : public MachineFunctionPass {
39public:
40 static char ID;
41
42 X86FixupInstTuningPass() : MachineFunctionPass(ID) {}
43
44 StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }
45
46 bool runOnMachineFunction(MachineFunction &MF) override;
47 bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
49
50 // This pass runs after regalloc and doesn't support VReg operands.
51 MachineFunctionProperties getRequiredProperties() const override {
52 return MachineFunctionProperties().setNoVRegs();
53 }
54
55private:
56 const X86InstrInfo *TII = nullptr;
57 const X86Subtarget *ST = nullptr;
58 const MCSchedModel *SM = nullptr;
59};
60} // end anonymous namespace
61
62char X86FixupInstTuningPass::ID = 0;
63
64INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
65
67 return new X86FixupInstTuningPass();
68}
69
/// Answers "is the new value strictly better (smaller) than the current one?"
/// for two optional measurements.
///
/// \returns std::nullopt when either operand is empty or when the two values
/// do not differ, so the caller can move on to its next criterion; otherwise
/// the result of `*NewVal < *CurVal`.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  const bool BothKnown = NewVal.has_value() && CurVal.has_value();
  if (!BothKnown)
    return std::nullopt;

  // Identical measurements cannot break a tie.
  const bool Differ = *NewVal != *CurVal;
  if (!Differ)
    return std::nullopt;

  return *NewVal < *CurVal;
}
77
78bool X86FixupInstTuningPass::processInstruction(
81 MachineInstr &MI = *I;
82 unsigned Opc = MI.getOpcode();
83 unsigned NumOperands = MI.getDesc().getNumOperands();
84 bool OptSize = MF.getFunction().hasOptSize();
85
86 auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
87 // We already checked that SchedModel exists in `NewOpcPreferable`.
89 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
90 };
91
92 auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
93 // We already checked that SchedModel exists in `NewOpcPreferable`.
95 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
96 };
97
98 auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
99 if (unsigned Size = TII->get(Opcode).getSize())
100 return Size;
101 // Zero size means we where unable to compute it.
102 return std::nullopt;
103 };
104
105 auto NewOpcPreferable = [&](unsigned NewOpc,
106 bool ReplaceInTie = true) -> bool {
107 std::optional<bool> Res;
108 if (SM->hasInstrSchedModel()) {
109 // Compare tput -> lat -> code size.
110 Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
111 if (Res.has_value())
112 return *Res;
113
114 Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
115 if (Res.has_value())
116 return *Res;
117 }
118
119 Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
120 if (Res.has_value())
121 return *Res;
122
123 // We either have either were unable to get tput/lat/codesize or all values
124 // were equal. Return specified option for a tie.
125 return ReplaceInTie;
126 };
127
128 // `vpermilpd r, i` -> `vshufpd r, r, i`
129 // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
130 // `vshufpd` is always as fast or faster than `vpermilpd` and takes
131 // 1 less byte of code size for VEX and EVEX encoding.
132 auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
133 if (!NewOpcPreferable(NewOpc))
134 return false;
135 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
136 {
137 unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
138 MI.removeOperand(NumOperands - 1);
139 MI.addOperand(MI.getOperand(NumOperands - 2));
140 MI.setDesc(TII->get(NewOpc));
141 MI.addOperand(MachineOperand::CreateImm(MaskImm));
142 }
143 LLVM_DEBUG(dbgs() << " With: " << MI);
144 return true;
145 };
146
147 // `vpermilps r, i` -> `vshufps r, r, i`
148 // `vpermilps r, i, k` -> `vshufps r, r, i, k`
149 // `vshufps` is always as fast or faster than `vpermilps` and takes
150 // 1 less byte of code size for VEX and EVEX encoding.
151 auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
152 if (!NewOpcPreferable(NewOpc))
153 return false;
154 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
155 {
156 unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
157 MI.removeOperand(NumOperands - 1);
158 MI.addOperand(MI.getOperand(NumOperands - 2));
159 MI.setDesc(TII->get(NewOpc));
160 MI.addOperand(MachineOperand::CreateImm(MaskImm));
161 }
162 LLVM_DEBUG(dbgs() << " With: " << MI);
163 return true;
164 };
165
166 // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
167 // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
168 // byte of code size.
169 auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
170 // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
171 // `vpshufd` saves a byte of code size.
172 if (!ST->hasNoDomainDelayShuffle() ||
173 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
174 return false;
175 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
176 {
177 MI.setDesc(TII->get(NewOpc));
178 }
179 LLVM_DEBUG(dbgs() << " With: " << MI);
180 return true;
181 };
182
183 // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
184 // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
185 // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
186 // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
187 // `vunpcklpd r, m` -> `vunpcklqdq r, m, k`
188 // `vunpckhpd r, m` -> `vunpckhqdq r, m, k`
189 // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
190 // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
191 // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
192 // -> `vunpck{l|h}qdq`
193 // 2) If `vshufpd` faster than `vunpck{l|h}pd`
194 // -> `vshufpd`
195 //
196 // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay)
197 auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
198 if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
199 return false;
200 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
201 {
202 MI.setDesc(TII->get(NewOpc));
203 MI.addOperand(MachineOperand::CreateImm(MaskImm));
204 }
205 LLVM_DEBUG(dbgs() << " With: " << MI);
206 return true;
207 };
208
209 auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool {
210 // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
211 // downside to the integer unpck, but if someone doesn't specify exact
212 // target we won't find it faster.
213 if (!ST->hasNoDomainDelayShuffle() ||
214 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
215 return false;
216 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
217 {
218 MI.setDesc(TII->get(NewOpc));
219 }
220 LLVM_DEBUG(dbgs() << " With: " << MI);
221 return true;
222 };
223
224 auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
225 unsigned NewOpc) -> bool {
226 if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
227 return true;
228 return ProcessUNPCK(NewOpc, 0x00);
229 };
230 auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
231 unsigned NewOpc) -> bool {
232 if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
233 return true;
234 return ProcessUNPCK(NewOpc, 0xff);
235 };
236
237 auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
238 return ProcessUNPCKToIntDomain(NewOpcIntDomain);
239 };
240
241 auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
242 return ProcessUNPCKToIntDomain(NewOpc);
243 };
244
245 auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
246 if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
247 return false;
248 // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
249 APInt MaskW =
250 APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
251 APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
252 if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
253 return false;
254 APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
255 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
256 {
257 MI.setDesc(TII->get(MovOpc));
258 MI.removeOperand(NumOperands - 1);
259 MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
260 }
261 LLVM_DEBUG(dbgs() << " With: " << MI);
262 return true;
263 };
264
265 auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
266 unsigned MovImm) -> bool {
267 if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
268 return false;
269 if (!OptSize && !NewOpcPreferable(MovOpc))
270 return false;
271 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
272 {
273 MI.setDesc(TII->get(MovOpc));
274 MI.removeOperand(NumOperands - 1);
275 }
276 LLVM_DEBUG(dbgs() << " With: " << MI);
277 return true;
278 };
279
280 // Is ADD(X,X) more efficient than SHL(X,1)?
281 auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
282 if (MI.getOperand(NumOperands - 1).getImm() != 1)
283 return false;
284 if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
285 return false;
286 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
287 {
288 MI.setDesc(TII->get(AddOpc));
289 MI.removeOperand(NumOperands - 1);
290 MI.addOperand(MI.getOperand(NumOperands - 2));
291 }
292 LLVM_DEBUG(dbgs() << " With: " << MI);
293 return false;
294 };
295
296 switch (Opc) {
297 case X86::BLENDPDrri:
298 return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
299 case X86::VBLENDPDrri:
300 return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
301
302 case X86::BLENDPSrri:
303 return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
304 ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
305 case X86::VBLENDPSrri:
306 return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
307 ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
308
309 case X86::VPBLENDWrri:
310 // TODO: Add X86::VPBLENDWrmi handling
311 // TODO: Add X86::VPBLENDWYrri handling
312 // TODO: Add X86::VPBLENDWYrmi handling
313 return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
314
315 case X86::VPERMILPDri:
316 return ProcessVPERMILPDri(X86::VSHUFPDrri);
317 case X86::VPERMILPDYri:
318 return ProcessVPERMILPDri(X86::VSHUFPDYrri);
319 case X86::VPERMILPDZ128ri:
320 return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
321 case X86::VPERMILPDZ256ri:
322 return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
323 case X86::VPERMILPDZri:
324 return ProcessVPERMILPDri(X86::VSHUFPDZrri);
325 case X86::VPERMILPDZ128rikz:
326 return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
327 case X86::VPERMILPDZ256rikz:
328 return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
329 case X86::VPERMILPDZrikz:
330 return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
331 case X86::VPERMILPDZ128rik:
332 return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
333 case X86::VPERMILPDZ256rik:
334 return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
335 case X86::VPERMILPDZrik:
336 return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
337
338 case X86::VPERMILPSri:
339 return ProcessVPERMILPSri(X86::VSHUFPSrri);
340 case X86::VPERMILPSYri:
341 return ProcessVPERMILPSri(X86::VSHUFPSYrri);
342 case X86::VPERMILPSZ128ri:
343 return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
344 case X86::VPERMILPSZ256ri:
345 return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
346 case X86::VPERMILPSZri:
347 return ProcessVPERMILPSri(X86::VSHUFPSZrri);
348 case X86::VPERMILPSZ128rikz:
349 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
350 case X86::VPERMILPSZ256rikz:
351 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
352 case X86::VPERMILPSZrikz:
353 return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
354 case X86::VPERMILPSZ128rik:
355 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
356 case X86::VPERMILPSZ256rik:
357 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
358 case X86::VPERMILPSZrik:
359 return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
360 case X86::VPERMILPSmi:
361 return ProcessVPERMILPSmi(X86::VPSHUFDmi);
362 case X86::VPERMILPSYmi:
363 // TODO: See if there is a more generic way we can test if the replacement
364 // instruction is supported.
365 return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
366 case X86::VPERMILPSZ128mi:
367 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
368 case X86::VPERMILPSZ256mi:
369 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
370 case X86::VPERMILPSZmi:
371 return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
372 case X86::VPERMILPSZ128mikz:
373 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
374 case X86::VPERMILPSZ256mikz:
375 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
376 case X86::VPERMILPSZmikz:
377 return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
378 case X86::VPERMILPSZ128mik:
379 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
380 case X86::VPERMILPSZ256mik:
381 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
382 case X86::VPERMILPSZmik:
383 return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
384
385 case X86::MOVLHPSrr:
386 case X86::UNPCKLPDrr:
387 return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
388 case X86::VMOVLHPSrr:
389 case X86::VUNPCKLPDrr:
390 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
391 case X86::VUNPCKLPDYrr:
392 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
393 // VMOVLHPS is always 128 bits.
394 case X86::VMOVLHPSZrr:
395 case X86::VUNPCKLPDZ128rr:
396 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
397 case X86::VUNPCKLPDZ256rr:
398 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
399 case X86::VUNPCKLPDZrr:
400 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
401 case X86::VUNPCKLPDZ128rrk:
402 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
403 case X86::VUNPCKLPDZ256rrk:
404 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
405 case X86::VUNPCKLPDZrrk:
406 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
407 case X86::VUNPCKLPDZ128rrkz:
408 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
409 case X86::VUNPCKLPDZ256rrkz:
410 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
411 case X86::VUNPCKLPDZrrkz:
412 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
413 case X86::UNPCKHPDrr:
414 return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
415 case X86::VUNPCKHPDrr:
416 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
417 case X86::VUNPCKHPDYrr:
418 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
419 case X86::VUNPCKHPDZ128rr:
420 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
421 case X86::VUNPCKHPDZ256rr:
422 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
423 case X86::VUNPCKHPDZrr:
424 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
425 case X86::VUNPCKHPDZ128rrk:
426 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
427 case X86::VUNPCKHPDZ256rrk:
428 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
429 case X86::VUNPCKHPDZrrk:
430 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
431 case X86::VUNPCKHPDZ128rrkz:
432 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
433 case X86::VUNPCKHPDZ256rrkz:
434 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
435 case X86::VUNPCKHPDZrrkz:
436 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
437 case X86::UNPCKLPDrm:
438 return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
439 case X86::VUNPCKLPDrm:
440 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
441 case X86::VUNPCKLPDYrm:
442 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
443 case X86::VUNPCKLPDZ128rm:
444 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
445 case X86::VUNPCKLPDZ256rm:
446 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
447 case X86::VUNPCKLPDZrm:
448 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
449 case X86::VUNPCKLPDZ128rmk:
450 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
451 case X86::VUNPCKLPDZ256rmk:
452 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
453 case X86::VUNPCKLPDZrmk:
454 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
455 case X86::VUNPCKLPDZ128rmkz:
456 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
457 case X86::VUNPCKLPDZ256rmkz:
458 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
459 case X86::VUNPCKLPDZrmkz:
460 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
461 case X86::UNPCKHPDrm:
462 return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
463 case X86::VUNPCKHPDrm:
464 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
465 case X86::VUNPCKHPDYrm:
466 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
467 case X86::VUNPCKHPDZ128rm:
468 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
469 case X86::VUNPCKHPDZ256rm:
470 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
471 case X86::VUNPCKHPDZrm:
472 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
473 case X86::VUNPCKHPDZ128rmk:
474 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
475 case X86::VUNPCKHPDZ256rmk:
476 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
477 case X86::VUNPCKHPDZrmk:
478 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
479 case X86::VUNPCKHPDZ128rmkz:
480 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
481 case X86::VUNPCKHPDZ256rmkz:
482 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
483 case X86::VUNPCKHPDZrmkz:
484 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
485
486 case X86::UNPCKLPSrr:
487 return ProcessUNPCKPS(X86::PUNPCKLDQrr);
488 case X86::VUNPCKLPSrr:
489 return ProcessUNPCKPS(X86::VPUNPCKLDQrr);
490 case X86::VUNPCKLPSYrr:
491 return ProcessUNPCKPS(X86::VPUNPCKLDQYrr);
492 case X86::VUNPCKLPSZ128rr:
493 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr);
494 case X86::VUNPCKLPSZ256rr:
495 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr);
496 case X86::VUNPCKLPSZrr:
497 return ProcessUNPCKPS(X86::VPUNPCKLDQZrr);
498 case X86::VUNPCKLPSZ128rrk:
499 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk);
500 case X86::VUNPCKLPSZ256rrk:
501 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk);
502 case X86::VUNPCKLPSZrrk:
503 return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk);
504 case X86::VUNPCKLPSZ128rrkz:
505 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz);
506 case X86::VUNPCKLPSZ256rrkz:
507 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz);
508 case X86::VUNPCKLPSZrrkz:
509 return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz);
510 case X86::UNPCKHPSrr:
511 return ProcessUNPCKPS(X86::PUNPCKHDQrr);
512 case X86::VUNPCKHPSrr:
513 return ProcessUNPCKPS(X86::VPUNPCKHDQrr);
514 case X86::VUNPCKHPSYrr:
515 return ProcessUNPCKPS(X86::VPUNPCKHDQYrr);
516 case X86::VUNPCKHPSZ128rr:
517 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr);
518 case X86::VUNPCKHPSZ256rr:
519 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr);
520 case X86::VUNPCKHPSZrr:
521 return ProcessUNPCKPS(X86::VPUNPCKHDQZrr);
522 case X86::VUNPCKHPSZ128rrk:
523 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk);
524 case X86::VUNPCKHPSZ256rrk:
525 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk);
526 case X86::VUNPCKHPSZrrk:
527 return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk);
528 case X86::VUNPCKHPSZ128rrkz:
529 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz);
530 case X86::VUNPCKHPSZ256rrkz:
531 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz);
532 case X86::VUNPCKHPSZrrkz:
533 return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz);
534 case X86::UNPCKLPSrm:
535 return ProcessUNPCKPS(X86::PUNPCKLDQrm);
536 case X86::VUNPCKLPSrm:
537 return ProcessUNPCKPS(X86::VPUNPCKLDQrm);
538 case X86::VUNPCKLPSYrm:
539 return ProcessUNPCKPS(X86::VPUNPCKLDQYrm);
540 case X86::VUNPCKLPSZ128rm:
541 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm);
542 case X86::VUNPCKLPSZ256rm:
543 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm);
544 case X86::VUNPCKLPSZrm:
545 return ProcessUNPCKPS(X86::VPUNPCKLDQZrm);
546 case X86::VUNPCKLPSZ128rmk:
547 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk);
548 case X86::VUNPCKLPSZ256rmk:
549 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk);
550 case X86::VUNPCKLPSZrmk:
551 return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk);
552 case X86::VUNPCKLPSZ128rmkz:
553 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz);
554 case X86::VUNPCKLPSZ256rmkz:
555 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz);
556 case X86::VUNPCKLPSZrmkz:
557 return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz);
558 case X86::UNPCKHPSrm:
559 return ProcessUNPCKPS(X86::PUNPCKHDQrm);
560 case X86::VUNPCKHPSrm:
561 return ProcessUNPCKPS(X86::VPUNPCKHDQrm);
562 case X86::VUNPCKHPSYrm:
563 return ProcessUNPCKPS(X86::VPUNPCKHDQYrm);
564 case X86::VUNPCKHPSZ128rm:
565 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm);
566 case X86::VUNPCKHPSZ256rm:
567 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm);
568 case X86::VUNPCKHPSZrm:
569 return ProcessUNPCKPS(X86::VPUNPCKHDQZrm);
570 case X86::VUNPCKHPSZ128rmk:
571 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk);
572 case X86::VUNPCKHPSZ256rmk:
573 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk);
574 case X86::VUNPCKHPSZrmk:
575 return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk);
576 case X86::VUNPCKHPSZ128rmkz:
577 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz);
578 case X86::VUNPCKHPSZ256rmkz:
579 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
580 case X86::VUNPCKHPSZrmkz:
581 return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
582
583 case X86::PSLLWri:
584 return ProcessShiftLeftToAdd(X86::PADDWrr);
585 case X86::VPSLLWri:
586 return ProcessShiftLeftToAdd(X86::VPADDWrr);
587 case X86::VPSLLWYri:
588 return ProcessShiftLeftToAdd(X86::VPADDWYrr);
589 case X86::VPSLLWZ128ri:
590 return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
591 case X86::VPSLLWZ256ri:
592 return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
593 case X86::VPSLLWZri:
594 return ProcessShiftLeftToAdd(X86::VPADDWZrr);
595 case X86::PSLLDri:
596 return ProcessShiftLeftToAdd(X86::PADDDrr);
597 case X86::VPSLLDri:
598 return ProcessShiftLeftToAdd(X86::VPADDDrr);
599 case X86::VPSLLDYri:
600 return ProcessShiftLeftToAdd(X86::VPADDDYrr);
601 case X86::VPSLLDZ128ri:
602 return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
603 case X86::VPSLLDZ256ri:
604 return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
605 case X86::VPSLLDZri:
606 return ProcessShiftLeftToAdd(X86::VPADDDZrr);
607 case X86::PSLLQri:
608 return ProcessShiftLeftToAdd(X86::PADDQrr);
609 case X86::VPSLLQri:
610 return ProcessShiftLeftToAdd(X86::VPADDQrr);
611 case X86::VPSLLQYri:
612 return ProcessShiftLeftToAdd(X86::VPADDQYrr);
613 case X86::VPSLLQZ128ri:
614 return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
615 case X86::VPSLLQZ256ri:
616 return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
617 case X86::VPSLLQZri:
618 return ProcessShiftLeftToAdd(X86::VPADDQZrr);
619
620 default:
621 return false;
622 }
623}
624
625bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
626 LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
627 bool Changed = false;
628 ST = &MF.getSubtarget<X86Subtarget>();
629 TII = ST->getInstrInfo();
630 SM = &ST->getSchedModel();
631
632 for (MachineBasicBlock &MBB : MF) {
633 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
634 if (processInstruction(MF, MBB, I)) {
635 ++NumInstChanges;
636 Changed = true;
637 }
638 }
639 }
640 LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
641 return Changed;
642}
const TargetInstrInfo & TII
MachineBasicBlock & MBB
Function Alias Analysis false
#define DEBUG_TYPE
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
#define T
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static std::optional< bool > CmpOptionals(T NewVal, T CurVal)
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
unsigned getSchedClass() const
Return the scheduling class for this instruction.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition MCInstrInfo.h:90
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
static MachineOperand CreateImm(int64_t Val)
bool hasNoDomainDelayShuffle() const
const X86InstrInfo * getInstrInfo() const override
bool hasAVX2() const
Changed
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition APInt.cpp:3009
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionPass * createX86FixupInstTuning()
Return a pass that replaces equivalent slower instructions with faster ones.
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
static LLVM_ABI int computeInstrLatency(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Returns the latency value for the scheduling class.
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)